using System; using Unity.Collections.LowLevel.Unsafe; namespace Unity.Collections { ///

/// <summary>
/// Identifies why a format operation failed.
/// </summary>
public enum FormatError
{
    /// <summary>
    /// No error.
    /// </summary>
    None = 0,

    /// <summary>
    /// The target storage does not have sufficient capacity.
    /// The format's write failed outright; the output was not truncated.
    /// </summary>
    Overflow = 1,

    /// <summary>
    /// The source format specifier is itself malformed, or format specifier
    /// tokens were found outside of accepted usage.
    /// The format's write failed.
    /// </summary>
    BadFormatSpecifier = 2,
}

/// <summary>
/// Identifies why a parse operation failed.
/// </summary>
public enum ParseError
{
    /// <summary>
    /// No parse error.
    /// </summary>
    None = 0,

    /// <summary>
    /// The parsed text does not form a number.
    /// </summary>
    Syntax = 1,

    /// <summary>
    /// The number exceeds the range of the target type.
    /// The number was either truncated, or it failed to write entirely.
    /// </summary>
    Overflow = 2,

    /// <summary>
    /// The number exceeds the precision of the target type.
    /// </summary>
    Underflow = 3,
}

/// <summary>
/// Identifies why a copy operation failed.
/// </summary>
public enum CopyError
{
    /// <summary>
    /// No copy error.
    /// </summary>
    None = 0,

    /// <summary>
    /// The target storage does not have sufficient capacity.
    /// Unless the API comment states otherwise, assume that the write
    /// operation was partially applied.
    /// </summary>
    Truncation = 1,
}

/// <summary>
/// Identifies why a conversion between text encodings failed.
/// </summary>
public enum ConversionError
{
    /// <summary>
    /// No conversion error.
    /// </summary>
    None = 0,

    /// <summary>
    /// The target storage does not have sufficient capacity.
    /// For copy operations, the value was either truncated into the target
    /// storage, or it failed to write entirely.
    /// </summary>
    Overflow = 1,

    /// <summary>
    /// The bytes do not form a valid character.
    /// </summary>
    Encoding = 2,

    /// <summary>
    /// The rune is not a valid code point.
    /// </summary>
    CodePoint = 3,
}

/// <summary>
/// Provides utility methods for UTF-8, UTF-16, UCS-4 (a.k.a. UTF-32), and WTF-8.
/// </summary>

[GenerateTestsForBurstCompatibility] public unsafe struct Unicode { ///

/// <summary>
/// Representation of a Unicode character as a code point.
/// </summary>
[GenerateTestsForBurstCompatibility]
public struct Rune
{
    /// <summary>
    /// The code point.
    /// </summary>
    public int value;

    /// <summary>
    /// Initializes and returns an instance of Rune.
    /// </summary>
    /// <remarks>The code point is stored as given; you are responsible for it being valid.</remarks>
    /// <param name="codepoint">The code point.</param>
    public Rune(int codepoint)
    {
        value = codepoint;
    }

    /// <summary>
    /// Returns a rune holding the numeric value of a char.
    /// </summary>
    /// <remarks>Because a char is 16-bit, it can only represent the first 2^16 code points, not all 1.1 million.</remarks>
    /// <param name="codepoint">A code point.</param>
    /// <returns>A rune.</returns>
    public static implicit operator Rune(char codepoint)
    {
        return new Rune(codepoint);
    }

    /// <summary>
    /// Evaluates if one rune is equal to the other.
    /// </summary>
    /// <param name="lhs">The left-hand side.</param>
    /// <param name="rhs">The right-hand side.</param>
    /// <returns>True if both runes hold the same <see cref="value"/>.</returns>
    public static bool operator ==(Rune lhs, Rune rhs) => lhs.value == rhs.value;

    /// <summary>
    /// Returns true if the value stored in this Rune is equal to an object.
    /// </summary>
    /// <remarks>Can only be equal if the object is itself a Rune.</remarks>
    /// <param name="obj">An object to compare with.</param>
    /// <returns>True if the object is a Rune holding the same <see cref="value"/>.</returns>
    [ExcludeFromBurstCompatTesting("Takes managed object")]
    public override bool Equals(object obj)
    {
        return obj is Rune other && other.value == value;
    }

    /// <summary>
    /// A hash used for comparisons.
    /// </summary>
    /// <returns>The code point itself, which serves as the hash code.</returns>
    public override int GetHashCode() => value;

    /// <summary>
    /// Evaluates if one rune is not equal to the other.
    /// </summary>
    /// <param name="lhs">The left-hand side.</param>
    /// <param name="rhs">The right-hand side.</param>
    /// <returns>True if the runes hold different <see cref="value"/>s.</returns>
    public static bool operator !=(Rune lhs, Rune rhs) => lhs.value != rhs.value;

    /// <summary>
    /// Returns true if a rune is a numerical digit character.
    /// </summary>
    /// <param name="r">The rune.</param>
    /// <returns>True if the rune is one of the characters '0' through '9'.</returns>
    public static bool IsDigit(Rune r) => r.IsDigit();

    // True for every value below 0x80 (this includes negative values).
    internal bool IsAscii() => value < 0x80;

    // True for every value below 0x100 (this includes negative values).
    internal bool IsLatin1() => value < 0x100;

    // True only for the ASCII digits '0'..'9'.
    internal bool IsDigit() => '0' <= value && value <= '9';

    // Whitespace test; the code point list follows
    // https://en.wikipedia.org/wiki/Whitespace_character#Unicode
    internal bool IsWhiteSpace()
    {
        int cp = value;

        if (cp < 0x100)
        {
            // CHARACTER TABULATION (U+0009), LINE FEED (U+000A), LINE TABULATION (U+000B),
            // FORM FEED (U+000C), CARRIAGE RETURN (U+000D)
            bool isControlWhitespace = cp >= 0x9 && cp <= 0xD;

            return isControlWhitespace
                || cp == ' '
                || cp == 0x85    // NEXT LINE
                || cp == 0xA0;   // NO-BREAK SPACE
        }

        // EN QUAD (U+2000) through HAIR SPACE (U+200A): the contiguous block of
        // typographic spaces (en/em quads and spaces, three/four/six-per-em,
        // figure, punctuation, thin and hair space).
        if (cp >= 0x2000 && cp <= 0x200A)
        {
            return true;
        }

        switch (cp)
        {
            case 0x1680: // OGHAM SPACE MARK
            case 0x2028: // LINE SEPARATOR
            case 0x2029: // PARAGRAPH SEPARATOR
            case 0x202F: // NARROW NO-BREAK SPACE
            case 0x205F: // MEDIUM MATHEMATICAL SPACE
            case 0x3000: // IDEOGRAPHIC SPACE
                return true;
            default:
                return false;
        }
    }

    // Maps 'A'..'Z' to 'a'..'z'; every other value is returned unchanged.
    internal Rune ToLowerAscii()
    {
        // The unsigned cast folds "value >= 'A' && value <= 'Z'" into a single comparison.
        bool isUpper = (uint)(value - 'A') <= ('Z' - 'A');
        return new Rune(isUpper ? value + 0x20 : value);
    }

    // Maps 'a'..'z' to 'A'..'Z'; every other value is returned unchanged.
    internal Rune ToUpperAscii()
    {
        // The unsigned cast folds "value >= 'a' && value <= 'z'" into a single comparison.
        bool isLower = (uint)(value - 'a') <= ('z' - 'a');
        return new Rune(isLower ? value - 0x20 : value);
    }

    /// <summary>
    /// Returns the number of bytes required to encode this rune as UTF-8.
    /// </summary>
    /// <returns>The number of bytes required to encode this rune as UTF-8. If the rune's codepoint
    /// is invalid, returns 4 (the maximum possible encoding length).</returns>
    public int LengthInUtf8Bytes()
    {
        // Negative values and everything above U+FFFF report the maximum size.
        if (value < 0 || value > 0xFFFF)
        {
            return 4;
        }

        if (value > 0x7FF)
        {
            return 3;
        }

        return value > 0x7F ? 2 : 1;
    }
}

/// <summary>
/// The maximum value of a valid Unicode code point (U+10FFFF).
/// </summary>
public const int kMaximumValidCodePoint = 0x10FFFF;

/// <summary>
/// Returns true if a code point is valid.
/// </summary>
/// <param name="codepoint">A code point.</param>
/// <returns>True if the code point lies in the range 0 through <see cref="kMaximumValidCodePoint"/> inclusive.</returns>
public static bool IsValidCodePoint(int codepoint)
{
    // Surrogate values (0xD800-0xDFFF) are accepted: the check that would reject
    // them is disabled in this code.
    // NOTE(review): presumably so WTF-8 data can round-trip - confirm.
    return codepoint >= 0 && codepoint <= kMaximumValidCodePoint;
}

/// <summary>
/// Returns true if the byte is not a UTF-8 continuation (trailing) byte.
/// </summary>
/// <remarks>A continuation byte has the bit pattern 10xxxxxx.</remarks>
/// <param name="b">The byte.</param>
/// <returns>True if the top two bits of the byte are anything other than binary 10.</returns>
public static bool NotTrailer(byte b) => (b >> 6) != 0x2;

/// <summary>
/// The Unicode replacement character (U+FFFD).
/// </summary>
/// <remarks>This character is used to stand in for characters that can't be rendered.</remarks>
/// <value>A rune whose value is 0xFFFD.</value>
public static Rune ReplacementCharacter
{
    get { return new Rune(0xFFFD); }
}

/// <summary>
/// The null rune value.
/// </summary>
/// <remarks>In this package, the "bad rune" is used as a null character. It represents no valid code point.</remarks>
/// <value>A rune whose value is 0.</value>
public static Rune BadRune
{
    get { return new Rune(0); }
}

/// <summary>
/// Reads a UTF-8 encoded character from a buffer.
/// </summary>
/// <param name="rune">Outputs the character read. If the read fails, outputs <see cref="ReplacementCharacter"/>.</param>
/// <param name="buffer">The buffer of bytes to read.</param>
/// <param name="index">Reference to a byte index into the buffer. If the read succeeds, index is incremented by the
/// size in bytes of the character read. If the read fails, index is incremented by 1, except when no byte at all
/// is available at index (index + 1 exceeds capacity), in which case index is left unchanged.</param>
/// <param name="capacity">The size in bytes of the buffer. Used to check that the read is in bounds.</param>
/// <returns><see cref="ConversionError.None"/> if the read succeeds. Otherwise, returns
/// <see cref="ConversionError.Overflow"/> (the sequence would extend past capacity) or
/// <see cref="ConversionError.Encoding"/> (bad lead byte, bad continuation byte, overlong form, or out-of-range value).</returns>
public static ConversionError Utf8ToUcs(out Rune rune, byte* buffer, ref int index, int capacity)
{
    // Every failure path below leaves the replacement character in the output.
    rune = ReplacementCharacter;

    if (capacity < index + 1)
    {
        return ConversionError.Overflow;
    }

    byte lead = buffer[index];

    // 0xxxxxxx: single byte.
    if ((lead & 0x80) == 0x00)
    {
        rune.value = lead;
        index += 1;
        return ConversionError.None;
    }

    // 110xxxxx: two bytes.
    if ((lead & 0xE0) == 0xC0)
    {
        if (capacity < index + 2)
        {
            index += 1;
            return ConversionError.Overflow;
        }

        byte second = buffer[index + 1];
        int decoded = ((lead & 0x1F) << 6) | (second & 0x3F);

        // A value below 0x80 would have fit in one byte, so this form is overlong.
        if (decoded < 0x80 || NotTrailer(second))
        {
            index += 1;
            return ConversionError.Encoding;
        }

        rune.value = decoded;
        index += 2;
        return ConversionError.None;
    }

    // 1110xxxx: three bytes.
    if ((lead & 0xF0) == 0xE0)
    {
        if (capacity < index + 3)
        {
            index += 1;
            return ConversionError.Overflow;
        }

        byte second = buffer[index + 1];
        byte third = buffer[index + 2];
        int decoded = ((lead & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F);

        // A value below 0x800 would have fit in two bytes, so this form is overlong.
        if (decoded < 0x800 || !IsValidCodePoint(decoded) || NotTrailer(second) || NotTrailer(third))
        {
            index += 1;
            return ConversionError.Encoding;
        }

        rune.value = decoded;
        index += 3;
        return ConversionError.None;
    }

    // 11110xxx: four bytes.
    if ((lead & 0xF8) == 0xF0)
    {
        if (capacity < index + 4)
        {
            index += 1;
            return ConversionError.Overflow;
        }

        byte second = buffer[index + 1];
        byte third = buffer[index + 2];
        byte fourth = buffer[index + 3];
        int decoded = ((lead & 0x07) << 18) | ((second & 0x3F) << 12) | ((third & 0x3F) << 6) | (fourth & 0x3F);

        // A value below 0x10000 would have fit in three bytes, so this form is overlong.
        if (decoded < 0x10000 || !IsValidCodePoint(decoded) || NotTrailer(second) || NotTrailer(third) || NotTrailer(fourth))
        {
            index += 1;
            return ConversionError.Encoding;
        }

        rune.value = decoded;
        index += 4;
        return ConversionError.None;
    }

    // Anything else (a stray continuation byte, or a lead byte of 0xF8 and above) is not a valid start.
    index += 1;
    return ConversionError.Encoding;
}

// Steps index backwards one byte at a time until it rests on a byte that is not a
// UTF-8 continuation byte (10xxxxxx), and returns that position.
// If index is already 0 or less when a step is due, returns 0 and leaves index as it is.
static int FindUtf8CharStartInReverse(byte* ptr, ref int index)
{
    while (true)
    {
        if (index <= 0)
        {
            return 0;
        }

        --index;

        if ((ptr[index] & 0xC0) != 0x80)
        {
            return index;
        }
    }
}

// Moves index backwards to the start of a UTF-8 sequence and decodes the character found there.
// On return, index holds the start position of that sequence; the decoder advances a private copy.
// Returns ConversionError.Overflow (with the replacement character) when the recomputed index equals
// the incoming index, which is what happens when the incoming index is 0.
// NOTE(review): index is decremented here and then again inside FindUtf8CharStartInReverse before the
// first byte is examined, so the byte at (incoming index - 1) is never tested - confirm against callers
// that this is the intended contract.
internal static ConversionError Utf8ToUcsReverse(out Rune rune, byte* buffer, ref int index, int capacity)
{
    int entryIndex = index;

    --index;
    index = FindUtf8CharStartInReverse(buffer, ref index);

    if (index == entryIndex)
    {
        rune = ReplacementCharacter;
        return ConversionError.Overflow;
    }

    // Decode through a scratch cursor so the caller's index stays at the sequence start.
    int readCursor = index;
    return Utf8ToUcs(out rune, buffer, ref readCursor, capacity);
}

/// <summary>
/// Returns true if a char is a Unicode leading surrogate.
/// </summary>
/// <param name="c">The char.</param>
/// <returns>True if the char lies in the range 0xD800 through 0xDBFF inclusive.</returns>
static bool IsLeadingSurrogate(char c)
{
    // 0xD800..0xDBFF is exactly the set of 16-bit values whose top six bits are 110110.
    return (c & 0xFC00) == 0xD800;
}

/// <summary>
/// Returns true if a char is a Unicode trailing surrogate.
/// </summary>
/// <param name="c">The char.</param>
/// <returns>True if the char lies in the range 0xDC00 through 0xDFFF inclusive.</returns>
static bool IsTrailingSurrogate(char c)
{
    // 0xDC00..0xDFFF is exactly the set of 16-bit values whose top six bits are 110111.
    return (c & 0xFC00) == 0xDC00;
}

/// <summary>
/// Reads a UTF-16 encoded character from a buffer.
/// </summary>
/// <remarks>
/// A leading surrogate that is not followed by a trailing surrogate, or that is the last char within
/// capacity, is returned as-is as a single-char rune rather than being reported as an error.
/// </remarks>
/// <param name="rune">Outputs the character read. If the read fails, outputs <see cref="ReplacementCharacter"/>.</param>
/// <param name="buffer">The buffer of chars to read.</param>
/// <param name="index">Reference to a char index into the buffer. If the read succeeds, index is incremented by the
/// size in chars of the character read. If the read fails, index is not incremented.</param>
/// <param name="capacity">The size in chars of the buffer. Used to check that the read is in bounds.</param>
/// <returns><see cref="ConversionError.None"/> if the read succeeds. Otherwise, returns <see cref="ConversionError.Overflow"/>.</returns>
public static ConversionError Utf16ToUcs(out Rune rune, char* buffer, ref int index, int capacity)
{
    rune = ReplacementCharacter;

    if (capacity < index + 1)
    {
        return ConversionError.Overflow;
    }

    char first = buffer[index];
    bool pairFits = index + 2 <= capacity;

    if (pairFits && IsLeadingSurrogate(first))
    {
        char second = buffer[index + 1];

        if (IsTrailingSurrogate(second))
        {
            // Each surrogate carries 10 payload bits; the pair encodes (code point - 0x10000).
            int payload = ((first & 0x03FF) << 10) | (second & 0x03FF);
            rune.value = payload + 0x10000;
            index += 2;
            return ConversionError.None;
        }
    }

    // Not a complete surrogate pair: pass the single char through unchanged.
    rune.value = first;
    index += 1;
    return ConversionError.None;
}

// Reads one rune from a buffer of runes.
// On success outputs buffer[index], advances index by 1 and returns ConversionError.None.
// If index + 1 exceeds capacity, outputs the replacement character, leaves index untouched
// and returns ConversionError.Overflow.
internal static ConversionError UcsToUcs(out Rune rune, Rune* buffer, ref int index, int capacity)
{
    if (capacity < index + 1)
    {
        rune = ReplacementCharacter;
        return ConversionError.Overflow;
    }

    rune = buffer[index];
    index += 1;
    return ConversionError.None;
}

/// <summary>
/// Writes a rune to a buffer as a UTF-8 encoded character.
/// </summary>
/// <param name="buffer">The buffer to write to.</param>
/// <param name="index">Reference to a byte index into the buffer. If the write succeeds, index is incremented by the
/// size in bytes of the character written. If the write fails, index is not incremented.</param>
/// <param name="capacity">The size in bytes of the buffer. Used to check that the write is in bounds.</param>
/// <param name="rune">The rune to encode.</param>
/// <returns><see cref="ConversionError.None"/> if the write succeeds. Otherwise, returns
/// <see cref="ConversionError.CodePoint"/>, <see cref="ConversionError.Overflow"/>, or <see cref="ConversionError.Encoding"/>.</returns>
public static ConversionError UcsToUtf8(byte* buffer, ref int index, int capacity, Rune rune)
{
    int cp = rune.value;

    if (!IsValidCodePoint(cp))
    {
        return ConversionError.CodePoint;
    }

    // Every encoding needs at least one byte.
    if (capacity < index + 1)
    {
        return ConversionError.Overflow;
    }

    // One byte: 0xxxxxxx
    if (cp <= 0x7F)
    {
        buffer[index] = (byte)cp;
        index += 1;
        return ConversionError.None;
    }

    // Two bytes: 110xxxxx 10xxxxxx
    if (cp <= 0x7FF)
    {
        if (capacity < index + 2)
        {
            return ConversionError.Overflow;
        }

        buffer[index] = (byte)(0xC0 | (cp >> 6));
        buffer[index + 1] = (byte)(0x80 | (cp & 0x3F));
        index += 2;
        return ConversionError.None;
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if (cp <= 0xFFFF)
    {
        if (capacity < index + 3)
        {
            return ConversionError.Overflow;
        }

        buffer[index] = (byte)(0xE0 | (cp >> 12));
        buffer[index + 1] = (byte)(0x80 | ((cp >> 6) & 0x3F));
        buffer[index + 2] = (byte)(0x80 | (cp & 0x3F));
        index += 3;
        return ConversionError.None;
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (cp <= 0x1FFFFF)
    {
        if (capacity < index + 4)
        {
            return ConversionError.Overflow;
        }

        buffer[index] = (byte)(0xF0 | (cp >> 18));
        buffer[index + 1] = (byte)(0x80 | ((cp >> 12) & 0x3F));
        buffer[index + 2] = (byte)(0x80 | ((cp >> 6) & 0x3F));
        buffer[index + 3] = (byte)(0x80 | (cp & 0x3F));
        index += 4;
        return ConversionError.None;
    }

    // Values that need more than 21 bits cannot be expressed in four bytes.
    return ConversionError.Encoding;
}

/// <summary>
/// Writes a rune to a buffer as a UTF-16 encoded character.
/// </summary>
/// <param name="buffer">The buffer of chars to write to.</param>
/// <param name="index">Reference to a char index into the buffer. If the write succeeds, index is incremented by the
/// size in chars of the character written. If the write fails, index is not incremented.</param>
/// <param name="capacity">The size in chars of the buffer. Used to check that the write is in bounds.</param>
/// <param name="rune">The rune to encode.</param>
/// <returns><see cref="ConversionError.None"/> if the write succeeds. Otherwise, returns
/// <see cref="ConversionError.CodePoint"/>, <see cref="ConversionError.Overflow"/>, or <see cref="ConversionError.Encoding"/>.</returns>
public static ConversionError UcsToUtf16(char* buffer, ref int index, int capacity, Rune rune)
{
    int cp = rune.value;

    if (!IsValidCodePoint(cp))
    {
        return ConversionError.CodePoint;
    }

    if (capacity < index + 1)
    {
        return ConversionError.Overflow;
    }

    // Values below 0x10000 fit in a single char.
    if (cp < 0x10000)
    {
        buffer[index] = (char)cp;
        index += 1;
        return ConversionError.None;
    }

    // Everything else needs a surrogate pair, i.e. two chars.
    if (capacity < index + 2)
    {
        return ConversionError.Overflow;
    }

    int payload = cp - 0x10000;

    // A surrogate pair carries 20 payload bits at most.
    if (payload >= 0x100000)
    {
        return ConversionError.Encoding;
    }

    buffer[index] = (char)(0xD800 | (payload >> 10));
    buffer[index + 1] = (char)(0xDC00 | (payload & 0x3FF));
    index += 2;
    return ConversionError.None;
}

/// <summary>
/// Copies UTF-16 characters from one buffer to another buffer as UTF-8.
/// </summary>
/// <remarks>Assumes the source data is valid UTF-16.</remarks>
/// <param name="utf16Buffer">The source buffer.</param>
/// <param name="utf16Length">The number of chars to read from the source.</param>
/// <param name="utf8Buffer">The destination buffer.</param>
/// <param name="utf8Length">Outputs the number of bytes written to the destination.</param>
/// <param name="utf8Capacity">The size in bytes of the destination buffer.</param>
/// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns <see cref="ConversionError.Overflow"/>.</returns>
public static ConversionError Utf16ToUtf8(char* utf16Buffer, int utf16Length, byte* utf8Buffer, out int utf8Length, int utf8Capacity)
{
    utf8Length = 0;

    int readPosition = 0;
    while (readPosition < utf16Length)
    {
        // The decode status is not inspected; only a destination overflow stops the copy.
        Rune decoded;
        Utf16ToUcs(out decoded, utf16Buffer, ref readPosition, utf16Length);

        ConversionError writeStatus = UcsToUtf8(utf8Buffer, ref utf8Length, utf8Capacity, decoded);
        if (writeStatus == ConversionError.Overflow)
        {
            return ConversionError.Overflow;
        }
    }

    return ConversionError.None;
}

/// <summary>
/// Copies UTF-8 characters from one buffer to another.
/// </summary>
/// <remarks>Assumes the source data is valid UTF-8.</remarks>
/// <param name="srcBuffer">The source buffer.</param>
/// <param name="srcLength">The number of bytes to read from the source.</param>
/// <param name="destBuffer">The destination buffer.</param>
/// <param name="destLength">Outputs the number of bytes written to the destination.</param>
/// <param name="destCapacity">The size in bytes of the destination buffer.</param>
/// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, returns <see cref="ConversionError.Overflow"/>.</returns>
public static ConversionError Utf8ToUtf8(byte* srcBuffer, int srcLength, byte* destBuffer, out int destLength, int destCapacity)
{
    // Fast path: the whole source fits, so copy the bytes verbatim without decoding.
    if (srcLength <= destCapacity)
    {
        UnsafeUtility.MemCpy(destBuffer, srcBuffer, srcLength);
        destLength = srcLength;
        return ConversionError.None;
    }

    // TODO even in this case, it's possible to MemCpy all but the last 3 bytes that fit, and then by looking at only
    // TODO the high bits of the last 3 bytes that fit, decide how many of the 3 to append. but that requires a
    // TODO little UNICODE presence of mind that nobody has today.

    // Slow path: re-encode one character at a time so the copy stops on a character boundary.
    destLength = 0;

    int readPosition = 0;
    while (readPosition < srcLength)
    {
        // The decode status is not inspected; only a destination overflow stops the copy.
        Rune decoded;
        Utf8ToUcs(out decoded, srcBuffer, ref readPosition, srcLength);

        ConversionError writeStatus = UcsToUtf8(destBuffer, ref destLength, destCapacity, decoded);
        if (writeStatus == ConversionError.Overflow)
        {
            return ConversionError.Overflow;
        }
    }

    return ConversionError.None;
}

/// <summary>
/// Copies UTF-8 characters from one buffer to another as UTF-16.
/// </summary>
/// <remarks>Assumes the source data is valid UTF-8.</remarks>
/// <param name="utf8Buffer">The source buffer.</param>
/// <param name="utf8Length">The number of bytes to read from the source.</param>
/// <param name="utf16Buffer">The destination buffer.</param>
/// <param name="utf16Length">Outputs the number of chars written to the destination.</param>
/// <param name="utf16Capacity">The size in chars of the destination buffer.</param>
/// <returns><see cref="ConversionError.None"/> if the copy fully completes. Otherwise, <see cref="ConversionError.Overflow"/>.</returns>
public static ConversionError Utf8ToUtf16(byte* utf8Buffer, int utf8Length, char* utf16Buffer, out int utf16Length, int utf16Capacity)
{
    utf16Length = 0;

    int readPosition = 0;
    while (readPosition < utf8Length)
    {
        // The decode status is not inspected; only a destination overflow stops the copy.
        Rune decoded;
        Utf8ToUcs(out decoded, utf8Buffer, ref readPosition, utf8Length);

        ConversionError writeStatus = UcsToUtf16(utf16Buffer, ref utf16Length, utf16Capacity, decoded);
        if (writeStatus == ConversionError.Overflow)
        {
            return ConversionError.Overflow;
        }
    }

    return ConversionError.None;
}

// Counts the bytes in the first utf8Length bytes of utf8Buffer that are not UTF-8
// continuation bytes (10xxxxxx), i.e. the bytes that start a character.
// Scanning stops early once maxRunes such bytes have been counted.
static int CountRunes(byte* utf8Buffer, int utf8Length, int maxRunes = int.MaxValue)
{
    int runeCount = 0;
    int offset = 0;

    while (runeCount < maxRunes && offset < utf8Length)
    {
        bool startsCharacter = (utf8Buffer[offset] & 0xC0) != 0x80;
        if (startsCharacter)
        {
            runeCount++;
        }

        ++offset;
    }

    return runeCount;
}
} // struct Unicode
} // namespace Unity.Collections