HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_UnicodeImpl.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_UnicodeImpl.h (RE Library, C++)
7  *
8  */
9 
10 #ifndef __UT_UnicodeImpl__
11 #define __UT_UnicodeImpl__
12 
13 // TODO: Make the UTF8 portion table-driven.
14 
15 #include <string.h>
16 
17 #include "UT_Assert.h"
18 #include "UT_UnicodeTable.h"
19 
20 // ============================================================================
21 
22 namespace /* anonymous */ {
23 
24 const UT_UnicodeCharacter &
25 getCharacterInfo(utf32 cp)
26 {
27  static UT_UnicodeCharacter empty = {0,0};
28 
29  if (cp < 65536)
30  {
31  UT_UnicodeCharacter *block = theUnicodeTable[cp >> 8];
32  if (block)
33  return block[cp & 255];
34  }
35  return empty;
36 }
37 
38 static inline bool
39 isASCII(utf8 c)
40 {
41  return ((uchar(c) & 0x80) == 0);
42 }
43 
44 static inline bool
45 isContinuation(utf8 c)
46 {
47  return ((uchar(c) & 0xC0) == 0x80);
48 }
49 
50 static inline bool
51 isValidLeading(utf8 c)
52 {
53  return isASCII(c) ||
54  ((uchar(c) & 0xE0) == 0xC0) ||
55  ((uchar(c) & 0xF0) == 0xE0) ||
56  ((uchar(c) & 0xF8) == 0xF0);
57 }
58 
59 static inline int
60 getContinuationCount(utf8 c)
61 {
62  if ((uchar(c) & 0xE0) == 0xC0)
63  return 1;
64  else if ((uchar(c) & 0xF0) == 0xE0)
65  return 2;
66  else if ((uchar(c) & 0xF8) == 0xF0)
67  return 3;
68  return 0;
69 }
70 
71 }
72 
73 bool
75 {
76  return isValidLeading(octet) || isContinuation(octet);
77 }
78 
79 
80 const utf8 *
81 UT_Unicode::convert(const utf8 *str, utf32 &cp)
82 {
83  cp = 0;
84  if (!str)
85  return NULL;
86 
87  utf8 c = *str++;
88 
89  // Quick ASCII check.
90  if (isASCII(c))
91  {
92  cp = c;
93  return str;
94  }
95 
96  // Get the number of expected continuation bytes.
97  int cont_bytes = getContinuationCount(c);
98 
99  // Is the leading byte broken?
100  if (cont_bytes == 0)
101  return NULL;
102 
103  // The minimum value representable by continuation byte count. Any value
104  // below the minimum value is illegal. This is to avoid multiple encodings
105  // of the same value (e.g. '.' -> 0x
106  static const utf32 least_values[4] = { 0x0, 0x80, 0x800, 0x10000 };
107  utf32 least_value = least_values[cont_bytes];
108 
109  // The amount of shift is determined by the number of continuation
110  // bytes. The shift is progressively reduced as more bytes are read.
111  int shift = cont_bytes * 6;
112 
113  // Set up the initial mask for the data in the leading byte.
114  utf8 mask = 0x3F >> cont_bytes;
115 
116  utf32 result;
117  result = 0;
118  for(;;)
119  {
120  result |= (c & mask) << shift;
121 
122  if (cont_bytes-- == 0)
123  break;
124 
125  c = *str++;
126 
127  // Make sure the continuation byte is of the right form.
128  if (!isContinuation(c))
129  return NULL;
130 
131  // Every continuation byte has the same mask but contributes
132  // six bits lower than the byte before.
133  mask = 0x3F;
134  shift -= 6;
135  }
136 
137  // If the code point is not valid, return the current string position but
138  // a zero code point. I.e. the encoding is correct, but the code point
139  // is not, for the given encoding.
140  if (result < least_value || !isValidCodePoint(result))
141  cp = replacementCodePoint(); // The replacement character
142  else
143  cp = result;
144 
145  return str;
146 }
147 
148 
149 
// Encodes one code point as UTF-8.
//
// NOTE(review): the declarator line of this definition is missing from this
// listing. The parameter names `cp`, `buf` and `buflen` are taken from the
// body; confirm the exact signature against the class declaration.
//
// Returns the number of bytes the encoding occupies (1 to 4), or 0 when
// isSurrogatePair(cp) holds or `cp` is 0x110000 or greater. Bytes are stored
// only when `buf` is non-null and `buflen` is at least that count; the count
// is returned whether or not anything was stored.
150 int
152 {
153  if (cp < 0x00080)
154  {
155  if (buf && buflen >= 1)
156  buf[0] = utf8(cp);
157  return 1;
158  }
159  else if (cp < 0x00000800)
160  {
161  if (buf && buflen >= 2)
162  {
163  buf[0] = 0xC0 | utf8(cp >> 6);
164  buf[1] = 0x80 | utf8(cp & 0x3F);
165  }
166  return 2;
167  }
168  else if (cp < 0x00010000)
169  {
170  // We don't encode surrogate pairs.
171  if (isSurrogatePair(cp))
172  return 0;
173 
174  if (buf && buflen >= 3)
175  {
176  buf[0] = 0xE0 | utf8(cp >> 12);
177  buf[1] = 0x80 | utf8((cp >> 6) & 0x3F);
178  buf[2] = 0x80 | utf8(cp & 0x3F);
179  }
180  return 3;
181  }
182  else if (cp < 0x110000)
183  {
184  if (buf && buflen >= 4)
185  {
186  buf[0] = 0xF0 | utf8(cp >> 18);
187  buf[1] = 0x80 | utf8((cp >> 12) & 0x3F);
188  buf[2] = 0x80 | utf8((cp >> 6) & 0x3F);
189  buf[3] = 0x80 | utf8(cp & 0x3F);
190  }
191  return 4;
192  }
193  else
194  {
195  // 0x10FFFF is the greatest code point value allowed by Unicode and
196  // hence UTF-8 encodings.
197  return 0;
198  }
199 }
200 
201 //
202 namespace
203 {
204  static utf16 norm16(utf16 c, bool big_endian)
205  {
206  if (big_endian)
207  return (c & 0xFF) << 8 | (c >> 8);
208  else
209  return c;
210  }
211 }
212 
213 const utf16 *
214 UT_Unicode::convert(const utf16 *str, utf32 &cp, bool big_endian)
215 {
216  cp = 0;
217  if (!str)
218  return NULL;
219 
220  // Check non-surrogate characters first
221  utf16 c0 = norm16(str[0], big_endian);
222 
223  if (!isSurrogatePair(c0))
224  {
225  cp = utf32(c0);
226  return str + 1;
227  }
228 
229  utf16 c1 = norm16(str[1], big_endian);
230  if ((c0 >= 0xD800 && c0 < 0xDC00) && (c1 >= 0xDC00 && c1 < 0xE000))
231  {
232  static const utf32 offset = ((0xD800 << 10) + 0xDC00) - 0x10000;
233  cp = utf32((c0 << 10) + c1) - offset;
234  return str + 2;
235  }
236  else
237  {
238  // The second character wasn't a surrogate pair character, so skip
239  // over the first of the invalid pair.
240  cp = replacementCodePoint();
241  return str + 1;
242  }
243 }
244 
// Encodes one code point as UTF-16.
//
// NOTE(review): the declarator line of this definition is missing from this
// listing. The parameter names `cp`, `buf` and `buflen` are taken from the
// body; confirm the exact signature against the class declaration.
//
// `buflen` is compared against sizeof(utf16) and sizeof(utf16[2]), so it is
// a size in bytes, and the return value is likewise a byte count: 2 for a
// single unit, 4 for a surrogate pair, 0 when isValidCodePoint(cp) is false.
// Units are stored only when `buf` is non-null and `buflen` is large enough.
245 int
247 {
248  if (!isValidCodePoint(cp))
249  return 0;
250 
251  // Characters outside the 64K range are encoded as surrogate pairs.
252  if (!isFromSupplementaryPlane(cp))
253  {
254  if (buf && buflen >= sizeof(utf16))
255  buf[0] = utf16(cp);
256  return 2;
257  }
258  else
259  {
260  if (buf && buflen >= sizeof(utf16[2]))
261  {
262  cp -= 0x10000;
263  buf[0] = utf16(0xD800 | ((cp >> 10) & 0x03FF));
264  buf[1] = utf16(0xDC00 | (cp & 0x03FF));
265  }
266  return 4;
267  }
268 }
269 
270 
271 
// Mutable-pointer convenience form: forwards `current` to the
// const-pointer next() and casts the result back to a non-const pointer.
// NOTE(review): the declarator line is missing from this listing; it
// presumably mirrors the prev(const utf8 *, utf8 *) wrapper -- confirm.
272 utf8 *
274 {
275  return const_cast<utf8 *>(next(const_cast<const utf8 *>(current)));
276 }
277 
278 const utf8 *
279 UT_Unicode::next(const utf8 *current)
280 {
281  if (!current)
282  return NULL;
283 
284  utf8 c = *current;
285 
286  // End of string already?
287  if (c == 0)
288  return current;
289 
290  // Quick check for plain ASCII.
291  if (isASCII(c))
292  {
293  current++;
294  return current;
295  }
296 
297  int nb_cont;
298  if (isContinuation(c))
299  {
300  // Are we inside a continuation byte? Then we'll have to scan forward
301  // until we reach a non-continuation byte or end. If we scan forward
302  // more than two bytes, then the continuation is invalid.
303  nb_cont = 2;
304  current++;
305  while (isContinuation(*current++) && nb_cont--) { }
306 
307  if (nb_cont == 0)
308  return NULL;
309 
310  if (!isValidLeading(*current))
311  return NULL;
312 
313  return current;
314  }
315  else if ((nb_cont = getContinuationCount(c)) == 0)
316  {
317  // We didn't encounter a valid byte. We probably got passed a non-UTF8
318  // encoded string.
319  return NULL;
320  }
321 
322  current++;
323  for (int i = 0; i < nb_cont; i++)
324  {
325  if (!isContinuation(*current++))
326  return NULL;
327  }
328 
329  if (*current && !isValidLeading(*current))
330  return NULL;
331 
332  return current;
333 }
334 
335 utf8 *
336 UT_Unicode::prev(const utf8 *start, utf8 *current)
337 {
338  return const_cast<utf8 *>(prev(start, const_cast<const utf8 *>(current)));
339 }
340 
341 const utf8 *
342 UT_Unicode::prev(const utf8 *start, const utf8 *current)
343 {
344  if (!current || !start)
345  return NULL;
346 
347  // Already at the start (or beyond) ?
348  if (start >= current)
349  return NULL;
350 
351  // If the current byte is either ASCII or UTF8 leading byte, and the
352  // previous is ASCII, we can skip right back to it. Otherwise the previous
353  // character must be a part of an UTF8 encoding, or garbage.
354  if ( isValidLeading(current[0]) && isASCII(current[-1]))
355  {
356  current--;
357  return current;
358  }
359 
360  // If we're currently on an ASCII character or a leading byte, go one back
361  // and try passing over the continuation bytes until we hit a leading byte.
362  if (isValidLeading(*current))
363  {
364  current--;
365  }
366  else if (!isContinuation(*current))
367  {
368  // Previous character is garbage. There's no safe way to go back and
369  // expect the current character to be valid.
370  return NULL;
371  }
372 
373  int nb_cont = 0;
374  while(current >= start && isContinuation(*current))
375  {
376  current--;
377  nb_cont++;
378  }
379 
380  // We got no continuation bytes. We should've gotten at least one.
381  if (nb_cont == 0)
382  return NULL;
383 
384  // We passed too many continuation bytes. The encoding is garbage.
385  if (nb_cont > getContinuationCount(*current))
386  return NULL;
387 
388  return current;
389 }
390 
// Finds the word boundary next to `at`, scanning forward when `backward` is
// false and backward (never before `start`) when it is true.
//
// The character at `at` is classified as SPACE, ALNUM, PUNCT or OTHER and
// the scan continues while the characters belong to the same group. ALNUM
// also includes '_' and '@'. A '.' that has a digit on both sides is treated
// as part of the run. A PUNCT character never continues a run.
//
// Returns `at` unchanged when it is null, when scanning backward from
// `start`, or when no character can be read there. Otherwise:
//  - forward:  the position where the run ends (the first character that
//              does not continue it, or where reading yields zero);
//  - backward: the earliest position whose character still continues the
//              run.
391 template<bool backward>
392 static const utf8 *
393 utFindWordBoundary(const utf8 *start, const utf8 *at)
394 {
395  if (!at || (backward && start == at))
396  return at;
397 
398  const auto is_dot = [](utf32 cp) { return cp == '.'; };
399  const auto is_digit = [](utf32 cp) { return UT_Unicode::isDigit(cp); };
400  const auto is_space = [](utf32 cp) { return UT_Unicode::isSpace(cp); };
401  const auto is_alnum = [](utf32 cp)
402  {
403  return UT_Unicode::isAlnum(cp) ||
404  cp == '_' || cp == '@';
405  };
406  const auto is_punct = [](utf32 cp)
407  {
408  return cp == '{' || cp == '[' || cp == '(' || cp == ';' ||
409  cp == '}' || cp == ']' || cp == ')' || cp == ',' || cp == '.' ||
410  cp == '\n' || cp == '\r';
411  };
412 
413  // Read UTF32 -- null and conversion error will be NUL
414  const auto read = [](const utf8 *at)
415  {
416  utf32 cp;
417  return !at ? 0 : UT_Unicode::convert(at, cp) ? cp : 0;
418  };
// True when the characters immediately before and after `at` are both
// digits.
419  const auto is_numerical = [&](const utf8 *start, const utf8 *at)
420  {
421  auto p = read(UT_Unicode::prev(start, at));
422  auto n = read(UT_Unicode::next(at));
423  return is_digit(p) && is_digit(n);
424  };
425 
426  // Check the first character
427  auto first = read(at);
428  if (!first && backward)
429  {
430  // After the end of line; try to use the previous character.
431  at = UT_Unicode::prev(start, at);
432  first = read(at);
433  }
434 
435  if (!first)
436  return at;
437 
438  // 4 types of grouping
439  enum State { SPACE, ALNUM, PUNCT, OTHER };
440  State state = is_space(first) ? SPACE : is_alnum(first) ? ALNUM
441  : is_punct(first) ? PUNCT : OTHER;
442 
443  // It starts with dot, but still part of ALNUM, e.g., 1.2
444  if (state == PUNCT && is_dot(first) && is_numerical(start, at))
445  state = ALNUM;
446 
// Whether `cp` belongs to the same group as the first character.
447  const auto is_continuous = [&](auto &&cp)
448  {
449  switch (state)
450  {
451  case SPACE: return is_space(cp);
452  case ALNUM: return is_alnum(cp);
453  case PUNCT: return false;
454  default : return !is_space(cp) && !is_alnum(cp) && !is_punct(cp);
455  }
456  };
457 
458  if (!backward)
459  {
460  // Try to move forward until it's no longer the same type as the
461  // first character. NUL or broken character will exit the loop.
462  at = UT_Unicode::next(at);
463  for (;; at = UT_Unicode::next(at))
464  {
465  auto cp = read(at);
466  if (!cp)
467  break;
468 
469  // When a dot is sandwiched by digits, e.g., 1.1 -> continuous
470  if (!(is_dot(cp) && is_numerical(start, at)) &&
471  !is_continuous(cp))
472  break;
473  }
474  }
475  else
476  {
477  // Try to move backward. It stops at (before) the boundary unlike
478  // the forward loop above.
479  const utf8 *p = UT_Unicode::prev(start, at);
480  for (;; p = UT_Unicode::prev(start, at))
481  {
482  auto cp = read(p);
483  if (!cp)
484  break;
485 
486  // When a dot is sandwiched by digits, e.g., 1.1 -> continuous
487  if (!(is_dot(cp) && is_numerical(start, p)) &&
488  !is_continuous(cp))
489  break;
490 
// `p` still belongs to the run, so it becomes the new candidate result.
491  at = p;
492  }
493  }
494 
495  return at;
496 }
497 
498 const utf8 *
499 UT_Unicode::nextWord(const utf8 *start, const utf8 *current)
500 {
501  return utFindWordBoundary<false>(start, current);
502 }
503 
504 const utf8 *
505 UT_Unicode::prevWord(const utf8 *start, const utf8 *current)
506 {
507  return utFindWordBoundary<true>(start, current);
508 }
509 
510 bool
511 UT_Unicode::fixpos(const utf8 *start, utf8 *&current)
512 {
513  return fixpos(start, const_cast<const utf8 *&>(current));
514 }
515 
516 bool
517 UT_Unicode::fixpos(const utf8 *start, const utf8 *&current)
518 {
519  if (isContinuation(*current))
520  return prev(start, current) != NULL;
521 
522  return true;
523 }
524 
525 exint
526 UT_Unicode::count(const utf8 *start, const utf8 *end)
527 {
528  if (!start)
529  return 0;
530 
531  exint nb_cp = 0;
532 
533  UT_ASSERT(isValidLeading(*start));
534  UT_ASSERT(!end || isValidLeading(*end));
535 
536  for(iterator it(start, end); !it.atEnd(); ++it)
537  nb_cp++;
538  return nb_cp;
539 }
540 
541 exint
542 UT_Unicode::length(const utf8 *start, const utf8 *end)
543 {
544  if (!start)
545  return 0;
546 
547  UT_ASSERT(isValidLeading(*start));
548  UT_ASSERT(!end || isValidLeading(*end));
549 
550  if (!end)
551  return strlen((const char *)start);
552  else
553  return exint(end - start);
554 }
555 
556 inline utf8 *
557 UT_Unicode::duplicate(const utf8 *start, const utf8 *end)
558 {
559  if (!start)
560  return NULL;
561 
562  UT_ASSERT(isValidLeading(*start));
563  UT_ASSERT(!end || isValidLeading(*end));
564 
565  if (!end)
566  return (utf8 *)::strdup((const char *)start);
567  else
568  {
569  size_t length = (end - start);
570  utf8 *buf = (utf8 *)malloc(length + 1);
571 
572 
573  ::memcpy(buf, start, length);
574  buf[length] = '\0';
575  return buf;
576  }
577 }
578 
579 
580 const utf8 *
581 UT_Unicode::find(utf32 cp, const utf8 *start, const utf8 *end)
582 {
583  if (cp == 0 || !start)
584  return NULL;
585 
586  UT_ASSERT(isValidLeading(*start));
587  UT_ASSERT(!end || isValidLeading(*end));
588 
589  if (isASCII(cp))
590  {
591  if (!end)
592  return (const utf8 *)strchr((const char *)start, char(cp));
593  else
594  {
595  while(start < end && *start != cp)
596  start++;
597  return start == end ? NULL : start;
598  }
599  }
600  else
601  {
602  const utf8 *pos = start;
603  while (pos)
604  {
605  const utf8 *next;
606  utf32 ccp;
607 
608  next = convert(pos, ccp);
609  if (!ccp)
610  break;
611 
612  if (cp == ccp)
613  return pos;
614 
615  if (!end || next < end)
616  pos = next;
617  else
618  break;
619  }
620  return NULL;
621  }
622 }
623 
624 const utf8 *
625 UT_Unicode::find(const utf8 *str, const utf8 *start, const utf8 *end)
626 {
627  if (!str || !start)
628  return NULL;
629 
630  UT_ASSERT(isValidLeading(*start));
631  UT_ASSERT(!end || isValidLeading(*end));
632 
633  if (!end)
634  {
635  return (const utf8 *)::strstr((const char *)start, (const char *)str);
636  }
637  else
638  {
639  size_t len = ::strlen((const char *)str);
640 
641  // If the section given is shorter than the string to search for,
642  // or the search string is empty, bail early.
643  if (!len || (end - start) < len)
644  return NULL;
645 
646  const utf8 *find = str;
647  while(start < (end - len))
648  {
649  if (*find == *start)
650  {
651 
652 // if (strncmp())
653  return NULL;
654  }
655 
656  start = next(start);
657  }
658  }
659  return NULL;
660 }
661 
662 
663 
664 
665 
666 bool
667 UT_Unicode::isSpace(utf32 cp, bool break_only)
668 {
669  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
670  return (c.myCategory == UT_UNICODE_SPACE ||
671  (!break_only && c.myCategory == UT_UNICODE_SPACE_NONBREAK));
672 }
673 
674 bool
676 {
677  return getCharacterInfo(cp).myCategory == UT_UNICODE_NUMBER;
678 }
679 
680 bool
682 {
683  return isUpper(cp) || isLower(cp);
684 }
685 
686 bool
688 {
689  return isAlpha(cp) || isDigit(cp);
690 }
691 
692 bool
694 {
695  return getCharacterInfo(cp).myCategory == UT_UNICODE_PUNCTUATION;
696 }
697 
// Tests the table category of `cp` against UT_UNICODE_LT_UPPER.
// NOTE(review): this listing has lost the declarator line and the second
// operand of the `||` below, so the definition is incomplete as shown. The
// category test suggests this is isUpper(utf32 cp) -- confirm against the
// original source before relying on it.
698 bool
700 {
701  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
702  return c.myCategory == UT_UNICODE_LT_UPPER ||
704 }
705 
706 bool
708 {
709  return getCharacterInfo(cp).myCategory == UT_UNICODE_LT_LOWER;
710 }
711 
712 ///
714 {
715  return (cp >= 0x04E00 && cp <= 0x09FFF) || // CJK Unified Ideographs
716  (cp >= 0x03400 && cp <= 0x04DBF) || // - Extension A
717  (cp >= 0x20000 && cp <= 0x2A6D6); // - Extension B
718 }
719 
720 
// Case mapping: returns the table's myComplement value for `cp` when the
// condition below holds, and `cp` itself otherwise.
// NOTE(review): this listing has lost the declarator line and the rest of
// the `if` condition after `c.myComplement &&`. Whether this is toLower or
// toUpper cannot be told from the lines shown -- confirm against the
// original source.
721 utf32
723 {
724  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
725  if (c.myComplement &&
728  return c.myComplement;
729  return cp;
730 }
731 
// Case mapping: returns the table's myComplement value for `cp` when the
// condition below holds, and `cp` itself otherwise.
// NOTE(review): this listing has lost the declarator line and the rest of
// the `if` condition after `c.myComplement &&`. Whether this is toLower or
// toUpper cannot be told from the lines shown -- confirm against the
// original source.
732 utf32
734 {
735  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
736  if (c.myComplement &&
738  return c.myComplement;
739  return cp;
740 }
741 
742 bool
744 {
745  return (!cp || cp == '/' || UT_Unicode::isSpace(cp));
746 }
747 
748 // UT_Unicode::iterator
// NOTE(review): the five bodies below have lost their declarator lines in
// this listing. Each one only forwards to init(); they appear to be the
// iterator's constructors -- confirm parameter types against the class
// declaration.

// Forwards null start, end and transform to init().
750 {
751  init(NULL, NULL, NULL);
752 }
753 
// Forwards an explicit start/end pointer pair and a transform.
756 {
757  init(start, end, transform);
758 }
759 
// Forwards the begin()/end() range of `str` and a transform.
762 {
763  init(str.begin(), str.end(), transform);
764 }
765 
// Forwards str.c_str() with a null end and a transform.
768 {
769  init(str.c_str(), nullptr, transform);
770 }
771 
// Forwards the begin()/end() range of `str` and a transform.
774 {
775  init(str.begin(), str.end(), transform);
776 }
777 
// Shared initialization: clears the current position and both cached code
// points, stores `transform`, then either adopts the range and calls
// reset(), or nulls the range and look-ahead pointers.
//
// The range is adopted only when `start` is non-null, *start is a valid
// leading byte, and `end` is either null or after `start`.
// NOTE(review): the second line of the declarator (the `transform`
// parameter) is missing from this listing -- confirm its type.
778 void UT_Unicode::iterator::init(const utf8 *start, const utf8 *end,
780 {
781  myCurrent = NULL;
782  myCP = myNextCP = 0;
783  myTransform = transform;
784 
785  if (start && isValidLeading(*start) && (!end || start < end))
786  {
787  myStart = start;
788  myEnd = end;
789  reset();
790  }
791  else
792  {
793  myStart = myEnd = myNext = myNext2 = NULL;
794  }
795 }
796 
797 
798 void
800 {
801  if (!myStart)
802  return;
803 
804  if (!to)
805  to = myStart;
806  else if (to < myStart || (myEnd && to > myEnd))
807  return;
808 
809  // Make sure we start at a decent place.
810  UT_Unicode::fixpos(myStart, to);
811 
812  myCurrent = to;
813  myNext = convert(myCurrent, myCP);
814  if (!myCP)
815  myNext = NULL;
816  else
817  {
818  if (myTransform && myCP)
819  myCP = myTransform->transformCodepoint(myCP);
820  if (!myEnd || (myNext < myEnd))
821  {
822  myNext2 = convert(myNext, myNextCP);
823  if (myTransform && myNextCP)
824  myNextCP = myTransform->transformCodepoint(myNextCP);
825  }
826  else
827  myNextCP = 0;
828  }
829 }
830 
// Moves the iterator one code point forward by shifting the cached
// look-ahead (myNextCP / myNext / myNext2) into the current slot and then
// decoding a new look-ahead when the next position is inside the range.
//
// Returns true without moving when the iterator has no range, the current
// code point is zero, or the current position is at or past the end. After
// moving, returns true when the new current code point is non-zero and
// false otherwise.
// NOTE(review): the declarator line is missing from this listing; the
// function name is inferred from the body -- confirm.
831 bool
833 {
834  // Invalid iterator or at the end already?
835  if (!myStart || !myCP || (myEnd && myCurrent >= myEnd))
836  return true;
837 
838  myCP = myNextCP;
839  myCurrent = myNext;
840  myNext = myNext2;
841  if (myCP)
842  {
843  if(!myEnd || (myNext < myEnd))
844  {
845  myNext2 = convert(myNext, myNextCP);
846  if (myTransform && myNextCP)
847  myNextCP = myTransform->transformCodepoint(myNextCP);
848  }
849  else
850  myNextCP = 0;
851  return true;
852  }
853  else
854  return false;
855 }
856 
// Moves the iterator one code point backward. The previous position is
// found with UT_Unicode::prev() and decoded; on success the current values
// are shifted into the look-ahead slots and the decoded (and, when a
// transform is set, transformed) code point becomes current.
//
// Returns false, leaving the iterator unchanged, when it has no range, is
// already at the start, or the previous position is missing or decodes to
// zero. convert() is called before `prev` is checked, which is safe because
// convert() returns NULL with a zero code point for a null input.
// NOTE(review): the declarator line is missing from this listing; the
// function name is inferred from the body -- confirm.
857 bool
859 {
860  // Invalid iterator or at the start already?
861  if (!myStart || myCurrent == myStart)
862  return false;
863 
864  const utf8 *prev;
865  utf32 cp;
866 
867  prev = UT_Unicode::prev(myStart, myCurrent);
868  UT_Unicode::convert(prev, cp);
869  if (prev && cp)
870  {
871  if (myTransform && cp)
872  cp = myTransform->transformCodepoint(cp);
873  myNextCP = myCP;
874  myCP = cp;
875  myNext2 = myNext;
876  myNext = myCurrent;
877  myCurrent = prev;
878  return true;
879  }
880  else
881  return false;
882 }
883 
884 #endif // __UT_UnicodeImpl__
GLint first
Definition: glcorearb.h:405
static bool isUpper(utf32 cp)
SYS_FORCE_INLINE const_iterator begin() const
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition: glcorearb.h:2540
static bool isValidCodePoint(utf32 cp)
Definition: UT_Unicode.h:162
static bool isSpace(utf32 cp, bool break_only=true)
static bool isLower(utf32 cp)
GLuint start
Definition: glcorearb.h:475
unsigned short utf16
Definition: SYS_Types.h:56
static bool isAlpha(utf32 cp)
int64 exint
Definition: SYS_Types.h:125
unsigned int myComplement
SYS_NO_DISCARD_RESULT SYS_FORCE_INLINE const_iterator end() const
Returns a constant iterator pointing to the end of the string.
GLuint GLsizei GLsizei * length
Definition: glcorearb.h:795
static bool isFromSupplementaryPlane(utf32 cp)
Definition: UT_Unicode.h:156
static bool isPunct(utf32 cp)
**But if you need a result
Definition: thread.h:613
void reset(const utf8 *to=0)
void read(T &in, bool &v)
Definition: ImfXdr.h:418
unsigned int myCategory
const char * c_str() const
Definition: UT_String.h:508
static const utf8 * prev(const utf8 *start, const utf8 *current)
SYS_FORCE_INLINE const_iterator end() const
static bool isUTF8(utf8 octet)
A utility class to do read-only operations on a subset of an existing string.
Definition: UT_StringView.h:39
static const utf8 * convert(const utf8 *str, utf32 &cp)
GLdouble n
Definition: glcorearb.h:2008
GLintptr offset
Definition: glcorearb.h:665
static bool isDigit(utf32 cp)
GLboolean reset
Definition: glad.h:5138
static bool isWordDelimiter(utf32 cp)
unsigned int utf32
Definition: SYS_Types.h:58
static bool isSurrogatePair(utf32 cp)
Definition: UT_Unicode.h:151
GLuint GLuint end
Definition: glcorearb.h:475
static const utf8 * nextWord(const utf8 *start, const utf8 *current)
static exint count(const utf8 *start, const utf8 *end=0)
Returns the number of code points this variable encoding represents.
GLint GLuint mask
Definition: glcorearb.h:124
static const utf8 * next(const utf8 *current)
GA_API const UT_StringHolder transform
static bool isAlnum(utf32 cp)
static bool fixpos(const utf8 *start, const utf8 *&current)
static exint length(const utf8 *start, const utf8 *end=0)
static const utf8 * prevWord(const utf8 *start, const utf8 *current)
SYS_NO_DISCARD_RESULT SYS_FORCE_INLINE const_iterator begin() const
Returns a constant iterator pointing to the beginning of the string.
static bool isASCII(utf32 cp)
Definition: UT_Unicode.h:177
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:156
char utf8
Definition: SYS_Types.h:52
static bool isCJK(utf32 cp)
static const utf8 * find(utf32 cp, const utf8 *start, const utf8 *end=0)
bool atEnd() const
Definition: UT_Unicode.h:261
static utf32 replacementCodePoint()
Definition: UT_Unicode.h:143
UT_API UT_UnicodeCharacter * theUnicodeTable[256]
static utf32 toLower(utf32 cp)
PXR_NAMESPACE_OPEN_SCOPE typedef unsigned char uchar
Definition: inttypes.h:58
static utf8 * duplicate(const utf8 *start, const utf8 *end=0)
static utf32 toUpper(utf32 cp)