using System;
using Unity.Collections.LowLevel.Unsafe;
namespace Unity.Collections
{
/// <summary>
/// Kinds of format errors.
/// </summary>
public enum FormatError
{
    /// <summary>
    /// No error.
    /// </summary>
    None = 0,

    /// <summary>
    /// The target storage does not have sufficient capacity.
    /// Note that the format's write failed. It did not truncate.
    /// </summary>
    Overflow = 1,

    /// <summary>
    /// The source format specifier is not itself correctly formatted, or
    /// format specifier tokens were found outside of accepted usage.
    /// Note that the format's write failed.
    /// </summary>
    BadFormatSpecifier = 2,
}
/// <summary>
/// Kinds of parse errors.
/// </summary>
public enum ParseError
{
    /// <summary>
    /// No parse error.
    /// </summary>
    None = 0,

    /// <summary>
    /// The text parsed does not form a number.
    /// </summary>
    Syntax = 1,

    /// <summary>
    /// The number exceeds the range of the target type.
    /// The number was either truncated, or failed to write entirely.
    /// </summary>
    Overflow = 2,

    /// <summary>
    /// The number exceeds the precision of the target type.
    /// </summary>
    Underflow = 3,
}
/// <summary>
/// Kinds of copy errors.
/// </summary>
public enum CopyError
{
    /// <summary>
    /// No copy error.
    /// </summary>
    None = 0,

    /// <summary>
    /// The target storage does not have sufficient capacity.
    /// Unless stated in the API comment, assume that the write operation was partially applied.
    /// </summary>
    Truncation = 1,
}
/// <summary>
/// Kinds of conversion errors.
/// </summary>
public enum ConversionError
{
    /// <summary>
    /// No conversion error.
    /// </summary>
    None = 0,

    /// <summary>
    /// The target storage does not have sufficient capacity.
    /// For copy operations, the value was either truncated into the target storage, or failed to write entirely.
    /// </summary>
    Overflow = 1,

    /// <summary>
    /// The bytes do not form a valid character.
    /// </summary>
    Encoding = 2,

    /// <summary>
    /// The rune is not a valid code point.
    /// </summary>
    CodePoint = 3,
}
/// <summary>
/// Provides utility methods for UTF-8, UTF-16, UCS-4 (a.k.a. UTF-32), and WTF-8.
/// </summary>
[GenerateTestsForBurstCompatibility]
public unsafe struct Unicode
{
/// <summary>
/// A single Unicode character, stored as its integer code point.
/// </summary>
[GenerateTestsForBurstCompatibility]
public struct Rune
{
    /// <summary>
    /// The code point.
    /// </summary>
    /// <value>The code point.</value>
    public int value;

    /// <summary>
    /// Creates a Rune that wraps the given code point.
    /// </summary>
    /// <remarks>No validation is performed: you are responsible for the code point being valid.</remarks>
    /// <param name="codepoint">The code point.</param>
    public Rune(int codepoint)
    {
        this.value = codepoint;
    }

    /// <summary>
    /// Converts a char to a rune holding the same numeric value.
    /// </summary>
    /// <remarks>Because a char is 16-bit, it can only represent the first 2^16 code points, not all 1.1 million.</remarks>
    /// <param name="codepoint">A code point.</param>
    /// <returns>A rune.</returns>
    public static implicit operator Rune(char codepoint) => new Rune(codepoint);

    /// <summary>
    /// Evaluates whether two runes hold the same code point.
    /// </summary>
    /// <param name="lhs">The left-hand side.</param>
    /// <param name="rhs">The right-hand side.</param>
    /// <returns>True if the left-hand side's value is equal to the right-hand side's value.</returns>
    public static bool operator ==(Rune lhs, Rune rhs) => lhs.value == rhs.value;

    /// <summary>
    /// Evaluates whether two runes hold different code points.
    /// </summary>
    /// <param name="lhs">The left-hand side.</param>
    /// <param name="rhs">The right-hand side.</param>
    /// <returns>True if the left-hand side's value is not equal to the right-hand side's value.</returns>
    public static bool operator !=(Rune lhs, Rune rhs) => lhs.value != rhs.value;

    /// <summary>
    /// Returns true if the value stored in this Rune is equal to an object.
    /// </summary>
    /// <remarks>Can only be equal if the object is itself a Rune.</remarks>
    /// <param name="obj">An object to compare with.</param>
    /// <returns>True if the object is a Rune with the same value as this one.</returns>
    [ExcludeFromBurstCompatTesting("Takes managed object")]
    public override bool Equals(object obj)
    {
        return obj is Rune other && other.value == value;
    }

    /// <summary>
    /// A hash used for comparisons.
    /// </summary>
    /// <returns>The code point itself, which serves as the hash code.</returns>
    public override int GetHashCode() => value;

    /// <summary>
    /// Returns true if a rune is a numerical digit character.
    /// </summary>
    /// <param name="r">The rune.</param>
    /// <returns>True if the rune is one of the characters '0' through '9'.</returns>
    public static bool IsDigit(Rune r) => r.IsDigit();

    // True when the value is below 0x80. Negative values also satisfy this comparison.
    internal bool IsAscii()
    {
        return value <= 0x7F;
    }

    // True when the value is below 0x100. Negative values also satisfy this comparison.
    internal bool IsLatin1()
    {
        return value <= 0xFF;
    }

    // True when the value is one of the ASCII digits '0'..'9'.
    internal bool IsDigit()
    {
        return '0' <= value && value <= '9';
    }

    // True when the value is one of the whitespace code points enumerated below.
    internal bool IsWhiteSpace()
    {
        // https://en.wikipedia.org/wiki/Whitespace_character#Unicode
        switch (value)
        {
            case 0x0009: // CHARACTER TABULATION
            case 0x000A: // LINE FEED
            case 0x000B: // LINE TABULATION
            case 0x000C: // FORM FEED
            case 0x000D: // CARRIAGE RETURN
            case 0x0020: // SPACE
            case 0x0085: // NEXT LINE
            case 0x00A0: // NO-BREAK SPACE
            case 0x1680: // OGHAM SPACE MARK
            case 0x2000: // EN QUAD
            case 0x2001: // EM QUAD
            case 0x2002: // EN SPACE
            case 0x2003: // EM SPACE
            case 0x2004: // THREE-PER-EM SPACE
            case 0x2005: // FOUR-PER-EM SPACE
            case 0x2006: // SIX-PER-EM SPACE
            case 0x2007: // FIGURE SPACE
            case 0x2008: // PUNCTUATION SPACE
            case 0x2009: // THIN SPACE
            case 0x200A: // HAIR SPACE
            case 0x2028: // LINE SEPARATOR
            case 0x2029: // PARAGRAPH SEPARATOR
            case 0x202F: // NARROW NO-BREAK SPACE
            case 0x205F: // MEDIUM MATHEMATICAL SPACE
            case 0x3000: // IDEOGRAPHIC SPACE
                return true;
            default:
                return false;
        }
    }

    // Maps 'A'..'Z' to 'a'..'z'; every other value is returned unchanged.
    internal Rune ToLowerAscii()
    {
        // The unsigned compare folds "value >= 'A' && value <= 'Z'" into a single test.
        bool isUpper = (uint)(value - 'A') <= (uint)('Z' - 'A');
        return isUpper ? new Rune(value + 0x20) : this;
    }

    // Maps 'a'..'z' to 'A'..'Z'; every other value is returned unchanged.
    internal Rune ToUpperAscii()
    {
        // The unsigned compare folds "value >= 'a' && value <= 'z'" into a single test.
        bool isLower = (uint)(value - 'a') <= (uint)('z' - 'a');
        return isLower ? new Rune(value - 0x20) : this;
    }

    /// <summary>
    /// Returns the number of bytes required to encode this rune as UTF-8.
    /// </summary>
    /// <returns>The number of bytes required to encode this rune as UTF-8. If the rune's codepoint
    /// is invalid, returns 4 (the maximum possible encoding length).</returns>
    public int LengthInUtf8Bytes()
    {
        // Negative values and everything above the 3-byte range report the maximum size.
        if (value < 0 || value > 0xFFFF)
        {
            return 4;
        }

        if (value > 0x7FF)
        {
            return 3;
        }

        if (value > 0x7F)
        {
            return 2;
        }

        return 1;
    }
}
/// <summary>The maximum value of a valid UNICODE code point</summary>
public const int kMaximumValidCodePoint = 0x10FFFF;

/// <summary>
/// Returns true if a code point is valid.
/// </summary>
/// <remarks>
/// A code point is accepted when it lies in the range 0 through <see cref="kMaximumValidCodePoint"/>.
/// Values in the surrogate range (0xD800 through 0xDFFF) are not rejected by this check.
/// </remarks>
/// <param name="codepoint">A code point.</param>
/// <returns>True if a code point is valid.</returns>
public static bool IsValidCodePoint(int codepoint)
{
    return codepoint >= 0 && codepoint <= kMaximumValidCodePoint;
}
/// <summary>
/// Returns true if the byte is not a UTF-8 trailing (continuation) byte.
/// </summary>
/// <remarks>A trailing byte has the bit pattern 10xxxxxx; any other pattern makes this return true.</remarks>
/// <param name="b">The byte.</param>
/// <returns>True if the top two bits of the byte are anything other than binary 10.</returns>
public static bool NotTrailer(byte b)
{
    int topTwoBits = b >> 6;
    return topTwoBits != 0b10;
}
/// <summary>
/// The Unicode character U+FFFD (�).
/// </summary>
/// <remarks>This character is used to stand-in for characters that can't be rendered.</remarks>
/// <value>A rune whose value is 0xFFFD.</value>
public static Rune ReplacementCharacter => new Rune(0xFFFD);
/// <summary>
/// The null rune value.
/// </summary>
/// <remarks>In this package, the "bad rune" is used as a null character. Its value is 0.</remarks>
/// <value>A rune whose value is 0.</value>
public static Rune BadRune => new Rune(0);
/// <summary>
/// Reads a UTF-8 encoded character from a buffer.
/// </summary>
/// <param name="rune">Outputs the character read. If the read fails, outputs <see cref="ReplacementCharacter"/>.</param>
/// <param name="buffer">The buffer of bytes to read.</param>
/// <param name="index">Reference to a byte index into the buffer. If the read succeeds, index is incremented by the
/// size in bytes of the character read. If the read fails, index is incremented by 1, except when not even one byte
/// is available at index (index + 1 exceeds capacity), in which case index is left unchanged.</param>
/// <param name="capacity">The size in bytes of the buffer. Used to check that the read is in bounds.</param>
/// <returns><see cref="ConversionError.None"/> if the read succeeds. Otherwise, returns
/// <see cref="ConversionError.Overflow"/> or <see cref="ConversionError.Encoding"/>.</returns>
public static ConversionError Utf8ToUcs(out Rune rune, byte* buffer, ref int index, int capacity)
{
    // Every failure path below leaves this value in place.
    rune = ReplacementCharacter;

    if (index + 1 > capacity)
    {
        return ConversionError.Overflow;
    }

    byte lead = buffer[index];

    // 0xxxxxxx: one byte.
    if ((lead & 0x80) == 0x00)
    {
        rune.value = lead;
        index += 1;
        return ConversionError.None;
    }

    // 110xxxxx 10xxxxxx: two bytes.
    if ((lead & 0xE0) == 0xC0)
    {
        if (index + 2 > capacity)
        {
            index += 1;
            return ConversionError.Overflow;
        }

        byte second = buffer[index + 1];
        int decoded = ((lead & 0x1F) << 6) | (second & 0x3F);

        // Reject values that would have fit in one byte, and a second byte that is not 10xxxxxx.
        if (decoded < 0x80 || NotTrailer(second))
        {
            index += 1;
            return ConversionError.Encoding;
        }

        rune.value = decoded;
        index += 2;
        return ConversionError.None;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx: three bytes.
    if ((lead & 0xF0) == 0xE0)
    {
        if (index + 3 > capacity)
        {
            index += 1;
            return ConversionError.Overflow;
        }

        byte second = buffer[index + 1];
        byte third = buffer[index + 2];
        int decoded = ((lead & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F);

        // Reject values that would have fit in two bytes, invalid code points, and malformed trailing bytes.
        if (decoded < 0x800 || !IsValidCodePoint(decoded) || NotTrailer(second) || NotTrailer(third))
        {
            index += 1;
            return ConversionError.Encoding;
        }

        rune.value = decoded;
        index += 3;
        return ConversionError.None;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: four bytes.
    if ((lead & 0xF8) == 0xF0)
    {
        if (index + 4 > capacity)
        {
            index += 1;
            return ConversionError.Overflow;
        }

        byte second = buffer[index + 1];
        byte third = buffer[index + 2];
        byte fourth = buffer[index + 3];
        int decoded = ((lead & 0x07) << 18) | ((second & 0x3F) << 12) | ((third & 0x3F) << 6) | (fourth & 0x3F);

        // Reject values that would have fit in three bytes, invalid code points, and malformed trailing bytes.
        if (decoded < 0x10000 || !IsValidCodePoint(decoded) || NotTrailer(second) || NotTrailer(third) || NotTrailer(fourth))
        {
            index += 1;
            return ConversionError.Encoding;
        }

        rune.value = decoded;
        index += 4;
        return ConversionError.None;
    }

    // The lead byte matches none of the patterns above (a stray 10xxxxxx byte or 11111xxx).
    index += 1;
    return ConversionError.Encoding;
}
// Steps index backwards one byte at a time, starting with the byte just before the incoming index,
// and stops at the first byte that is not of the form 10xxxxxx. Returns that position, which is also
// left in index. When index is zero or negative (on entry, or after stepping back over 10xxxxxx bytes
// all the way to position 0), returns 0; index itself is not modified past that point.
static int FindUtf8CharStartInReverse(byte* ptr, ref int index)
{
    while (index > 0)
    {
        index -= 1;

        bool isContinuation = (ptr[index] & 0xC0) == 0x80;
        if (!isContinuation)
        {
            return index;
        }
    }

    return 0;
}
// Moves index backwards to a character start found by FindUtf8CharStartInReverse and decodes the
// character at that position with Utf8ToUcs. On return, index holds the start position that was found;
// it is not advanced past the decoded character. If the position found equals the incoming index
// (which happens when the incoming index is 0), outputs ReplacementCharacter and returns
// ConversionError.Overflow.
//
// NOTE(review): index is decremented once here and then again inside FindUtf8CharStartInReverse before
// any byte is tested, so for an incoming index of 2 or more the byte at (index - 1) is never tested as a
// possible character start. Confirm against callers that this is the intended contract.
internal static ConversionError Utf8ToUcsReverse(out Rune rune, byte* buffer, ref int index, int capacity)
{
    int startingIndex = index;

    index -= 1;
    index = FindUtf8CharStartInReverse(buffer, ref index);

    if (index != startingIndex)
    {
        // Decode through a scratch copy so the caller's index stays on the character start.
        int readCursor = index;
        return Utf8ToUcs(out rune, buffer, ref readCursor, capacity);
    }

    rune = ReplacementCharacter;
    return ConversionError.Overflow;
}
/// <summary>
/// Returns true if a char is a Unicode leading surrogate.
/// </summary>
/// <param name="c">The char.</param>
/// <returns>True if the char is in the range 0xD800 through 0xDBFF.</returns>
static bool IsLeadingSurrogate(char c)
{
    // All values in 0xD800..0xDBFF share the top six bits 110110.
    return (c & 0xFC00) == 0xD800;
}
/// <summary>
/// Returns true if a char is a Unicode trailing surrogate.
/// </summary>
/// <param name="c">The char.</param>
/// <returns>True if the char is in the range 0xDC00 through 0xDFFF.</returns>
static bool IsTrailingSurrogate(char c)
{
    // All values in 0xDC00..0xDFFF share the top six bits 110111.
    return (c & 0xFC00) == 0xDC00;
}
/// <summary>
/// Reads a UTF-16 encoded character from a buffer.
/// </summary>
/// <remarks>
/// A surrogate that is not part of a complete leading/trailing pair within the buffer is not treated as
/// an error: it is output as a rune holding that single char's value.
/// </remarks>
/// <param name="rune">Outputs the character read. If the read fails, outputs <see cref="ReplacementCharacter"/>.</param>
/// <param name="buffer">The buffer of chars to read.</param>
/// <param name="index">Reference to a char index into the buffer. If the read succeeds, index is incremented by the
/// size in chars of the character read. If the read fails, index is not incremented.</param>
/// <param name="capacity">The size in chars of the buffer. Used to check that the read is in bounds.</param>
/// <returns><see cref="ConversionError.None"/> if the read succeeds. Otherwise, returns
/// <see cref="ConversionError.Overflow"/>.</returns>
public static ConversionError Utf16ToUcs(out Rune rune, char* buffer, ref int index, int capacity)
{
    rune = ReplacementCharacter;

    if (index + 1 > capacity)
    {
        return ConversionError.Overflow;
    }

    char first = buffer[index];

    // A pair is only attempted when a second char is in bounds.
    if (IsLeadingSurrogate(first) && index + 2 <= capacity)
    {
        char second = buffer[index + 1];
        if (IsTrailingSurrogate(second))
        {
            // Combine the low ten bits of each half, then add the supplementary-plane offset.
            int combined = ((first & 0x03FF) << 10) | (second & 0x03FF);
            rune.value = combined + 0x10000;
            index += 2;
            return ConversionError.None;
        }
    }

    // Ordinary char, or a surrogate without a partner: pass its value through as-is.
    rune.value = first;
    index += 1;
    return ConversionError.None;
}
// Reads one rune from a buffer of runes. If index is within capacity, outputs buffer[index], advances
// index by one and returns ConversionError.None. Otherwise outputs ReplacementCharacter, leaves index
// unchanged and returns ConversionError.Overflow.
internal static ConversionError UcsToUcs(out Rune rune, Rune* buffer, ref int index, int capacity)
{
    rune = ReplacementCharacter;

    bool inBounds = index + 1 <= capacity;
    if (!inBounds)
    {
        return ConversionError.Overflow;
    }

    rune = buffer[index];
    index += 1;
    return ConversionError.None;
}
/// <summary>
/// Writes a rune to a buffer as a UTF-8 encoded character.
/// </summary>
/// <param name="buffer">The buffer to write to.</param>
/// <param name="index">Reference to a byte index into the buffer. If the write succeeds, index is incremented by the
/// size in bytes of the character written. If the write fails, index is not incremented.</param>
/// <param name="capacity">The size in bytes of the buffer. Used to check that the write is in bounds.</param>
/// <param name="rune">The rune to encode.</param>
/// <returns><see cref="ConversionError.None"/> if the write succeeds. Otherwise, returns
/// <see cref="ConversionError.CodePoint"/> (the rune fails <see cref="IsValidCodePoint"/>),
/// <see cref="ConversionError.Overflow"/> (the encoded bytes do not fit), or
/// <see cref="ConversionError.Encoding"/> (the value is above 0x1FFFFF).</returns>
public static ConversionError UcsToUtf8(byte* buffer, ref int index, int capacity, Rune rune)
{
    int codePoint = rune.value;

    if (!IsValidCodePoint(codePoint))
    {
        return ConversionError.CodePoint;
    }

    if (index + 1 > capacity)
    {
        return ConversionError.Overflow;
    }

    // Work out the encoded length first; nothing is written until the full sequence is known to fit.
    int byteCount;
    if (codePoint <= 0x7F)
    {
        byteCount = 1;
    }
    else if (codePoint <= 0x7FF)
    {
        byteCount = 2;
    }
    else if (codePoint <= 0xFFFF)
    {
        byteCount = 3;
    }
    else if (codePoint <= 0x1FFFFF)
    {
        byteCount = 4;
    }
    else
    {
        return ConversionError.Encoding;
    }

    if (index + byteCount > capacity)
    {
        return ConversionError.Overflow;
    }

    switch (byteCount)
    {
        case 1:
            buffer[index++] = (byte)codePoint;
            break;

        case 2:
            buffer[index++] = (byte)(0xC0 | (codePoint >> 6));
            buffer[index++] = (byte)(0x80 | (codePoint & 0x3F));
            break;

        case 3:
            buffer[index++] = (byte)(0xE0 | (codePoint >> 12));
            buffer[index++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));
            buffer[index++] = (byte)(0x80 | (codePoint & 0x3F));
            break;

        default:
            buffer[index++] = (byte)(0xF0 | (codePoint >> 18));
            buffer[index++] = (byte)(0x80 | ((codePoint >> 12) & 0x3F));
            buffer[index++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));
            buffer[index++] = (byte)(0x80 | (codePoint & 0x3F));
            break;
    }

    return ConversionError.None;
}
/// <summary>
/// Writes a rune to a buffer as a UTF-16 encoded character.
/// </summary>
/// <param name="buffer">The buffer of chars to write to.</param>
/// <param name="index">Reference to a char index into the buffer. If the write succeeds, index is incremented by the
/// size in chars of the character written. If the write fails, index is not incremented.</param>
/// <param name="capacity">The size in chars of the buffer. Used to check that the write is in bounds.</param>
/// <param name="rune">The rune to encode.</param>
/// <returns><see cref="ConversionError.None"/> if the write succeeds. Otherwise, returns
/// <see cref="ConversionError.CodePoint"/> (the rune fails <see cref="IsValidCodePoint"/>),
/// <see cref="ConversionError.Overflow"/> (the encoded chars do not fit), or
/// <see cref="ConversionError.Encoding"/> (the value cannot be expressed as a surrogate pair).</returns>
public static ConversionError UcsToUtf16(char* buffer, ref int index, int capacity, Rune rune)
{
    int codePoint = rune.value;

    if (!IsValidCodePoint(codePoint))
    {
        return ConversionError.CodePoint;
    }

    if (index + 1 > capacity)
    {
        return ConversionError.Overflow;
    }

    // Values below 0x10000 are stored directly in a single char.
    if (codePoint < 0x10000)
    {
        buffer[index++] = (char)codePoint;
        return ConversionError.None;
    }

    // Everything else needs a surrogate pair, i.e. two chars.
    if (index + 2 > capacity)
    {
        return ConversionError.Overflow;
    }

    int offset = codePoint - 0x10000;
    if (offset >= (1 << 20))
    {
        // More than the 20 bits a surrogate pair can carry.
        return ConversionError.Encoding;
    }

    buffer[index++] = (char)(0xD800 | (offset >> 10));
    buffer[index++] = (char)(0xDC00 | (offset & 0x3FF));
    return ConversionError.None;
}
/// <summary>
/// Copies UTF-16 characters from one buffer to another buffer as UTF-8.
/// </summary>
/// <remarks>Assumes the source data is valid UTF-16. Only an overflow reported while writing stops the copy.</remarks>
/// <param name="utf16Buffer">The source buffer.</param>
/// <param name="utf16Length">The number of chars to read from the source.</param>
/// <param name="utf8Buffer">The destination buffer.</param>
/// <param name="utf8Length">Outputs the number of bytes written to the destination.</param>
/// <param name="utf8Capacity">The size in bytes of the destination buffer.</param>
/// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns
/// <see cref="ConversionError.Overflow"/>.</returns>
public static ConversionError Utf16ToUtf8(char* utf16Buffer, int utf16Length, byte* utf8Buffer, out int utf8Length, int utf8Capacity)
{
    utf8Length = 0;

    int readOffset = 0;
    while (readOffset < utf16Length)
    {
        // Decode one character, then re-encode it; the decoder advances readOffset.
        Utf16ToUcs(out Rune decoded, utf16Buffer, ref readOffset, utf16Length);

        ConversionError writeResult = UcsToUtf8(utf8Buffer, ref utf8Length, utf8Capacity, decoded);
        if (writeResult == ConversionError.Overflow)
        {
            return ConversionError.Overflow;
        }
    }

    return ConversionError.None;
}
/// <summary>
/// Copies UTF-8 characters from one buffer to another.
/// </summary>
/// <remarks>
/// Assumes the source data is valid UTF-8. When the destination can hold the whole source, the bytes are
/// copied verbatim. Otherwise the source is decoded and re-encoded one character at a time until a
/// character no longer fits.
/// </remarks>
/// <param name="srcBuffer">The source buffer.</param>
/// <param name="srcLength">The number of bytes to read from the source.</param>
/// <param name="destBuffer">The destination buffer.</param>
/// <param name="destLength">Outputs the number of bytes written to the destination.</param>
/// <param name="destCapacity">The size in bytes of the destination buffer.</param>
/// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns
/// <see cref="ConversionError.Overflow"/>.</returns>
public static ConversionError Utf8ToUtf8(byte* srcBuffer, int srcLength, byte* destBuffer, out int destLength, int destCapacity)
{
    if (destCapacity < srcLength)
    {
        // TODO even in this case, it's possible to MemCpy all but the last 3 bytes that fit, and then by
        // TODO looking at only the high bits of the last 3 bytes that fit, decide how many of the 3 to append.
        destLength = 0;

        int readOffset = 0;
        while (readOffset < srcLength)
        {
            Utf8ToUcs(out Rune decoded, srcBuffer, ref readOffset, srcLength);

            ConversionError writeResult = UcsToUtf8(destBuffer, ref destLength, destCapacity, decoded);
            if (writeResult == ConversionError.Overflow)
            {
                return ConversionError.Overflow;
            }
        }

        return ConversionError.None;
    }

    // Everything fits: copy the raw bytes without decoding them.
    UnsafeUtility.MemCpy(destBuffer, srcBuffer, srcLength);
    destLength = srcLength;
    return ConversionError.None;
}
/// <summary>
/// Copies UTF-8 characters from one buffer to another as UTF-16.
/// </summary>
/// <remarks>Assumes the source data is valid UTF-8. Only an overflow reported while writing stops the copy.</remarks>
/// <param name="utf8Buffer">The source buffer.</param>
/// <param name="utf8Length">The number of bytes to read from the source.</param>
/// <param name="utf16Buffer">The destination buffer.</param>
/// <param name="utf16Length">Outputs the number of chars written to the destination.</param>
/// <param name="utf16Capacity">The size in chars of the destination buffer.</param>
/// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns
/// <see cref="ConversionError.Overflow"/>.</returns>
public static ConversionError Utf8ToUtf16(byte* utf8Buffer, int utf8Length, char* utf16Buffer, out int utf16Length, int utf16Capacity)
{
    utf16Length = 0;

    int readOffset = 0;
    while (readOffset < utf8Length)
    {
        // Decode one character, then re-encode it; the decoder advances readOffset.
        Utf8ToUcs(out Rune decoded, utf8Buffer, ref readOffset, utf8Length);

        ConversionError writeResult = UcsToUtf16(utf16Buffer, ref utf16Length, utf16Capacity, decoded);
        if (writeResult == ConversionError.Overflow)
        {
            return ConversionError.Overflow;
        }
    }

    return ConversionError.None;
}
// Counts the bytes in the first utf8Length bytes of utf8Buffer that are not of the form 10xxxxxx,
// i.e. one per byte that is not a UTF-8 continuation byte. Scanning stops early once maxRunes such
// bytes have been counted.
static int CountRunes(byte* utf8Buffer, int utf8Length, int maxRunes = int.MaxValue)
{
    int counted = 0;
    int offset = 0;

    while (counted < maxRunes && offset < utf8Length)
    {
        byte current = utf8Buffer[offset];
        offset += 1;

        if ((current & 0xC0) != 0x80)
        {
            counted += 1;
        }
    }

    return counted;
}
}
}