10 #ifndef __UT_UnicodeImpl__ 
   11 #define __UT_UnicodeImpl__ 
   25 getCharacterInfo(
utf32 cp)
 
   33             return block[cp & 255];
 
   41     return ((
uchar(c) & 0x80) == 0);
 
   45 isContinuation(
utf8 c)
 
   47     return ((
uchar(c) & 0xC0) == 0x80);
 
   51 isValidLeading(
utf8 c)
 
   54            ((
uchar(c) & 0xE0) == 0xC0) ||
 
   55            ((
uchar(c) & 0xF0) == 0xE0) ||
 
   56            ((
uchar(c) & 0xF8) == 0xF0);
 
   60 getContinuationCount(
utf8 c)
 
   62     if ((
uchar(c) & 0xE0) == 0xC0)
 
   64     else if ((
uchar(c) & 0xF0) == 0xE0)
 
   66     else if ((
uchar(c) & 0xF8) == 0xF0)
 
   76     return isValidLeading(octet) || isContinuation(octet);
 
   97     int         cont_bytes = getContinuationCount(c);
 
  106     static const utf32 least_values[4] = { 0x0, 0x80, 0x800, 0x10000 };
 
  107     utf32       least_value = least_values[cont_bytes];
 
  111     int         shift = cont_bytes * 6;
 
  120         result |= (c & 
mask) << shift;
 
  122         if (cont_bytes-- == 0)
 
  128         if (!isContinuation(c))
 
  155         if (buf && buflen >= 1)
 
  159     else if (cp < 0x00000800)
 
  161         if (buf && buflen >= 2)
 
  163             buf[0] = 0xC0 | 
utf8(cp >> 6);
 
  164             buf[1] = 0x80 | 
utf8(cp & 0x3F);
 
  168     else if (cp < 0x00010000)
 
  174         if (buf && buflen >= 3)
 
  176             buf[0] = 0xE0 | 
utf8(cp >> 12);
 
  177             buf[1] = 0x80 | 
utf8((cp >> 6) & 0x3F);
 
  178             buf[2] = 0x80 | 
utf8(cp & 0x3F);
 
  182     else if (cp < 0x110000)
 
  184         if (buf && buflen >= 4)
 
  186             buf[0] = 0xF0 | 
utf8(cp >> 18);
 
  187             buf[1] = 0x80 | 
utf8((cp >> 12) & 0x3F);
 
  188             buf[2] = 0x80 | 
utf8((cp >> 6) & 0x3F);
 
  189             buf[3] = 0x80 | 
utf8(cp & 0x3F);
 
  207             return (c & 0xFF) << 8 | (c >> 8);
 
  221     utf16       c0 = norm16(str[0], big_endian);
 
  229     utf16       c1 = norm16(str[1], big_endian);
 
  230     if ((c0 >= 0xD800 && c0 < 0xDC00) && (c1 >= 0xDC00 && c1 < 0xE000))
 
  232         static const utf32 offset = ((0xD800 << 10) + 0xDC00) - 0x10000; 
 
  254         if (buf && buflen >= 
sizeof(
utf16))
 
  260         if (buf && buflen >= 
sizeof(
utf16[2]))
 
  263             buf[0] = 
utf16(0xD800 | ((cp >> 10) & 0x03FF));   
 
  264             buf[1] = 
utf16(0xDC00 | (cp & 0x03FF));   
 
  275     return const_cast<utf8 *
>(
next(const_cast<const utf8 *>(current)));
 
  298     if (isContinuation(c))
 
  305         while (isContinuation(*current++) && nb_cont--) { }
 
  310         if (!isValidLeading(*current))
 
  315     else if ((nb_cont = getContinuationCount(c)) == 0)
 
  323     for (
int i = 0; i < nb_cont; i++)
 
  325         if (!isContinuation(*current++))
 
  329     if (*current && !isValidLeading(*current))
 
  338     return const_cast<utf8 *
>(
prev(start, const_cast<const utf8 *>(current)));
 
  344     if (!current || !start)
 
  348     if (start >= current)
 
  354     if ( isValidLeading(current[0]) && 
isASCII(current[-1]))
 
  362     if (isValidLeading(*current))
 
  366     else if (!isContinuation(*current))
 
  374     while(current >= start && isContinuation(*current))
 
  385     if (nb_cont > getContinuationCount(*current))
 
  391 template<
bool backward>
 
  395     if (!at || (backward && start == at))
 
  398     const auto is_dot = [](
utf32 cp) { 
return cp == 
'.'; };
 
  401     const auto is_alnum = [](
utf32 cp)
 
  404                cp == 
'_' || cp == 
'@';
 
  406     const auto is_punct = [](
utf32 cp)
 
  408         return cp == 
'{' || cp == 
'[' || cp == 
'(' || cp == 
';' ||
 
  409                cp == 
'}' || cp == 
']' || cp == 
')' || cp == 
',' || cp == 
'.' ||
 
  410                cp == 
'\n' || cp == 
'\r';
 
  414     const auto read = [](
const utf8 *at)
 
  419     const auto is_numerical = [&](
const utf8 *
start, 
const utf8 *at)
 
  423         return  is_digit(p) && is_digit(
n);
 
  428     if (!
first && backward)
 
  439     enum State { SPACE, ALNUM, PUNCT, OTHER };
 
  441                 : is_punct(
first) ? PUNCT : OTHER;
 
  444     if (state == PUNCT && is_dot(
first) && is_numerical(start, at))
 
  447     const auto is_continuous = [&](
auto &&cp)
 
  451             case SPACE: 
return is_space(cp);
 
  452             case ALNUM: 
return is_alnum(cp);
 
  453             case PUNCT: 
return false;
 
  454             default   : 
return !is_space(cp) && !is_alnum(cp) && !is_punct(cp);
 
  470             if (!(is_dot(cp) && is_numerical(start, at)) &&
 
  487             if (!(is_dot(cp) && is_numerical(start, p)) &&
 
  501     return utFindWordBoundary<false>(
start, current);
 
  507     return utFindWordBoundary<true>(
start, current);
 
  513     return fixpos(start, const_cast<const utf8 *&>(current));
 
  519     if (isContinuation(*current))
 
  520         return prev(start, current) != NULL;
 
  551         return strlen((
const char *)start);
 
  553         return exint(end - start);
 
  566         return (
utf8 *)::strdup((
const char *)start);
 
  573         ::memcpy(buf, start, length);
 
  583     if (cp == 0 || !start)
 
  592             return (
const utf8 *)strchr((
const char *)start, 
char(cp));
 
  595             while(start < end && *start != cp)
 
  597             return start == end ? NULL : 
start;
 
  615             if (!end || next < end)
 
  635         return (
const utf8 *)::strstr((
const char *)start, (
const char *)str);
 
  639         size_t           len = ::strlen((
const char *)str);
 
  643         if (!len || (end - start) < len)
 
  647         while(start < (end - len))
 
  715     return (cp >= 0x04E00 && cp <= 0x09FFF) ||  
 
  716            (cp >= 0x03400 && cp <= 0x04DBF) ||  
 
  717            (cp >= 0x20000 && cp <= 0x2A6D6);    
 
  751     init(NULL, NULL, NULL);
 
  757     init(start, end, transform);
 
  778 void UT_Unicode::iterator::init(
const utf8 *start, 
const utf8 *
end,
 
  785     if (start && isValidLeading(*start) && (!end || start < end))
 
  793         myStart = myEnd = myNext = myNext2 = NULL;
 
  806     else if (to < myStart || (myEnd && to > myEnd))
 
  813     myNext = 
convert(myCurrent, myCP);
 
  818         if (myTransform && myCP)
 
  819             myCP = myTransform->transformCodepoint(myCP);
 
  820         if (!myEnd || (myNext < myEnd))
 
  822             myNext2 = 
convert(myNext, myNextCP);
 
  823             if (myTransform && myNextCP)
 
  824                 myNextCP = myTransform->transformCodepoint(myNextCP);
 
  835     if (!myStart || !myCP || (myEnd && myCurrent >= myEnd))
 
  843         if(!myEnd || (myNext < myEnd))
 
  845             myNext2 = 
convert(myNext, myNextCP);
 
  846             if (myTransform && myNextCP)
 
  847                 myNextCP = myTransform->transformCodepoint(myNextCP);
 
  861     if (!myStart || myCurrent == myStart)
 
  871         if (myTransform && cp)
 
  872             cp = myTransform->transformCodepoint(cp);
 
  884 #endif // __UT_UnicodeImpl__ 
static bool isUpper(utf32 cp)
 
SYS_FORCE_INLINE const_iterator begin() const 
 
GLenum GLuint GLenum GLsizei const GLchar * buf
 
static bool isValidCodePoint(utf32 cp)
 
static bool isSpace(utf32 cp, bool break_only=true)
 
static bool isLower(utf32 cp)
 
static bool isAlpha(utf32 cp)
 
unsigned int myComplement
 
SYS_NO_DISCARD_RESULT SYS_FORCE_INLINE const_iterator end() const 
Returns a constant iterator pointing to the end of the string. 
 
GLuint GLsizei GLsizei * length
 
static bool isFromSupplementaryPlane(utf32 cp)
 
static bool isPunct(utf32 cp)
 
**But if you need a result
 
void reset(const utf8 *to=0)
 
void read(T &in, bool &v)
 
const char * c_str() const 
 
static const utf8 * prev(const utf8 *start, const utf8 *current)
 
SYS_FORCE_INLINE const_iterator end() const 
 
static bool isUTF8(utf8 octet)
 
A utility class to do read-only operations on a subset of an existing string. 
 
static const utf8 * convert(const utf8 *str, utf32 &cp)
 
static bool isDigit(utf32 cp)
 
static bool isWordDelimiter(utf32 cp)
 
static bool isSurrogatePair(utf32 cp)
 
static const utf8 * nextWord(const utf8 *start, const utf8 *current)
 
static exint count(const utf8 *start, const utf8 *end=0)
Returns the number of code points this variable encoding represents. 
 
static const utf8 * next(const utf8 *current)
 
GA_API const UT_StringHolder transform
 
static bool isAlnum(utf32 cp)
 
static bool fixpos(const utf8 *start, const utf8 *&current)
 
static exint length(const utf8 *start, const utf8 *end=0)
 
static const utf8 * prevWord(const utf8 *start, const utf8 *current)
 
SYS_NO_DISCARD_RESULT SYS_FORCE_INLINE const_iterator begin() const 
Returns a constant iterator pointing to the beginning of the string. 
 
static bool isASCII(utf32 cp)
 
static bool isCJK(utf32 cp)
 
static const utf8 * find(utf32 cp, const utf8 *start, const utf8 *end=0)
 
static utf32 replacementCodePoint()
 
UT_API UT_UnicodeCharacter * theUnicodeTable[256]
 
static utf32 toLower(utf32 cp)
 
PXR_NAMESPACE_OPEN_SCOPE typedef unsigned char uchar
 
static utf8 * duplicate(const utf8 *start, const utf8 *end=0)
 
static utf32 toUpper(utf32 cp)