HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_UnicodeImpl.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_UnicodeImpl.h (RE Library, C++)
7  *
8  */
9 
10 #ifndef __UT_UnicodeImpl__
11 #define __UT_UnicodeImpl__
12 
13 // TODO: Make the UTF8 portion table-driven.
14 
15 #include <string.h>
16 
17 #include "UT_Assert.h"
18 #include "UT_UnicodeTable.h"
19 
20 // ============================================================================
21 
22 namespace /* anonymous */ {
23 
24 const UT_UnicodeCharacter &
25 getCharacterInfo(utf32 cp)
26 {
27  static UT_UnicodeCharacter empty = {0,0};
28 
29  if (cp < 65536)
30  {
31  UT_UnicodeCharacter *block = theUnicodeTable[cp >> 8];
32  if (block)
33  return block[cp & 255];
34  }
35  return empty;
36 }
37 
38 static inline bool
39 isASCII(utf8 c)
40 {
41  return ((uchar(c) & 0x80) == 0);
42 }
43 
44 static inline bool
45 isContinuation(utf8 c)
46 {
47  return ((uchar(c) & 0xC0) == 0x80);
48 }
49 
50 static inline bool
51 isValidLeading(utf8 c)
52 {
53  return isASCII(c) ||
54  ((uchar(c) & 0xE0) == 0xC0) ||
55  ((uchar(c) & 0xF0) == 0xE0) ||
56  ((uchar(c) & 0xF8) == 0xF0);
57 }
58 
59 static inline int
60 getContinuationCount(utf8 c)
61 {
62  if ((uchar(c) & 0xE0) == 0xC0)
63  return 1;
64  else if ((uchar(c) & 0xF0) == 0xE0)
65  return 2;
66  else if ((uchar(c) & 0xF8) == 0xF0)
67  return 3;
68  return 0;
69 }
70 
71 }
72 
// Returns true when `octet` is accepted by isValidLeading() or by
// isContinuation(), i.e. it could appear somewhere in a UTF-8 stream.
// NOTE(review): the declarator line (listing line 74) is missing from this
// listing; presumably `UT_Unicode::isUTF8(utf8 octet)` -- confirm against
// UT_Unicode.h.
73 bool
75 {
76  return isValidLeading(octet) || isContinuation(octet);
77 }
78 
79 
80 const utf8 *
81 UT_Unicode::convert(const utf8 *str, utf32 &cp)
82 {
83  cp = 0;
84  if (!str)
85  return NULL;
86 
87  utf8 c = *str++;
88 
89  // Quick ASCII check.
90  if (isASCII(c))
91  {
92  cp = c;
93  return str;
94  }
95 
96  // Get the number of expected continuation bytes.
97  int cont_bytes = getContinuationCount(c);
98 
99  // Is the leading byte broken?
100  if (cont_bytes == 0)
101  return NULL;
102 
103  // The minimum value representable by continuation byte count. Any value
104  // below the minimum value is illegal. This is to avoid multiple encodings
105  // of the same value (e.g. '.' -> 0x
106  static const utf32 least_values[4] = { 0x0, 0x80, 0x800, 0x10000 };
107  utf32 least_value = least_values[cont_bytes];
108 
109  // The amount of shift is determined by the number of continuation
110  // bytes. The shift is progressively reduced as more bytes are read.
111  int shift = cont_bytes * 6;
112 
113  // Set up the initial mask for the data in the leading byte.
114  utf8 mask = 0x3F >> cont_bytes;
115 
116  utf32 result;
117  result = 0;
118  for(;;)
119  {
120  result |= (c & mask) << shift;
121 
122  if (cont_bytes-- == 0)
123  break;
124 
125  c = *str++;
126 
127  // Make sure the continuation byte is of the right form.
128  if (!isContinuation(c))
129  return NULL;
130 
131  // Every continuation byte has the same mask but contributes
132  // six bits lower than the byte before.
133  mask = 0x3F;
134  shift -= 6;
135  }
136 
137  // If the code point is not valid, return the current string position but
138  // a zero code point. I.e. the encoding is correct, but the code point
139  // is not, for the given encoding.
140  if (result < least_value || !isValidCodePoint(result))
141  cp = replacementCodePoint(); // The replacement character
142  else
143  cp = result;
144 
145  return str;
146 }
147 
148 
149 
// Encodes code point `cp` as UTF-8.
//
// Returns the number of octets the encoding occupies (1 to 4). Returns 0 when
// `cp` lies in the three-octet range and isSurrogatePair(cp) is true, or when
// `cp` is 0x110000 or above. The octets are stored in `buf` only when `buf`
// is non-null and `buflen` is at least that count; the count is returned
// either way. No terminator is written.
// NOTE(review): the declarator line (listing line 151) is missing from this
// listing, so the parameter types of cp/buf/buflen are not visible here.
150 int
152 {
153  if (cp < 0x00080)
154  {
155  if (buf && buflen >= 1)
156  buf[0] = utf8(cp);
157  return 1;
158  }
159  else if (cp < 0x00000800)
160  {
161  if (buf && buflen >= 2)
162  {
163  buf[0] = 0xC0 | utf8(cp >> 6);
164  buf[1] = 0x80 | utf8(cp & 0x3F);
165  }
166  return 2;
167  }
168  else if (cp < 0x00010000)
169  {
170  // We don't encode surrogate pairs.
171  if (isSurrogatePair(cp))
172  return 0;
173 
174  if (buf && buflen >= 3)
175  {
176  buf[0] = 0xE0 | utf8(cp >> 12);
177  buf[1] = 0x80 | utf8((cp >> 6) & 0x3F);
178  buf[2] = 0x80 | utf8(cp & 0x3F);
179  }
180  return 3;
181  }
182  else if (cp < 0x110000)
183  {
184  if (buf && buflen >= 4)
185  {
186  buf[0] = 0xF0 | utf8(cp >> 18);
187  buf[1] = 0x80 | utf8((cp >> 12) & 0x3F);
188  buf[2] = 0x80 | utf8((cp >> 6) & 0x3F);
189  buf[3] = 0x80 | utf8(cp & 0x3F);
190  }
191  return 4;
192  }
193  else
194  {
195  /// 0x10FFFF is the greatest code point value allowed by Unicode and
196  // hence UTF-8 encodings.
197  return 0;
198  }
199 }
200 
201 //
202 namespace
203 {
204  static utf16 norm16(utf16 c, bool big_endian)
205  {
206  if (big_endian)
207  return (c & 0xFF) << 8 | (c >> 8);
208  else
209  return c;
210  }
211 }
212 
213 const utf16 *
214 UT_Unicode::convert(const utf16 *str, utf32 &cp, bool big_endian)
215 {
216  cp = 0;
217  if (!str)
218  return NULL;
219 
220  // Check non-surrogate characters first
221  utf16 c0 = norm16(str[0], big_endian);
222 
223  if (!isSurrogatePair(c0))
224  {
225  cp = utf32(c0);
226  return str + 1;
227  }
228 
229  utf16 c1 = norm16(str[1], big_endian);
230  if ((c0 >= 0xD800 && c0 < 0xDC00) && (c1 >= 0xDC00 && c1 < 0xE000))
231  {
232  static const utf32 offset = ((0xD800 << 10) + 0xDC00) - 0x10000;
233  cp = utf32((c0 << 10) + c1) - offset;
234  return str + 2;
235  }
236  else
237  {
238  // The second character wasn't a surrogate pair character, so skip
239  // over the first of the invalid pair.
240  cp = replacementCodePoint();
241  return str + 1;
242  }
243 }
244 
// Encodes code point `cp` as UTF-16.
//
// Returns 0 when isValidCodePoint(cp) is false. Otherwise returns 2 when
// isFromSupplementaryPlane(cp) is false (one unit) and 4 when it is true
// (a high/low surrogate pair). Units are stored in `buf` only when `buf` is
// non-null and `buflen` is large enough; the size is returned either way.
// NOTE(review): `buflen` is compared against sizeof(utf16) and
// sizeof(utf16[2]), so both it and the return value look like byte counts
// rather than unit counts -- confirm against the declaration. The declarator
// line (listing line 246) is missing from this listing.
245 int
247 {
248  if (!isValidCodePoint(cp))
249  return 0;
250 
251  // Characters outside the 64K range are encoded as surrogate pairs.
252  if (!isFromSupplementaryPlane(cp))
253  {
254  if (buf && buflen >= sizeof(utf16))
255  buf[0] = utf16(cp);
256  return 2;
257  }
258  else
259  {
260  if (buf && buflen >= sizeof(utf16[2]))
261  {
262  cp -= 0x10000;
263  buf[0] = utf16(0xD800 | ((cp >> 10) & 0x03FF));
264  buf[1] = utf16(0xDC00 | (cp & 0x03FF));
265  }
266  return 4;
267  }
268 }
269 
270 
271 
// Mutable-pointer convenience wrapper: forwards `current` to the
// const-pointer next() and casts the constness back off the result.
// NOTE(review): the declarator line (listing line 273) is missing from this
// listing; presumably `UT_Unicode::next(utf8 *current)` -- confirm.
272 utf8 *
274 {
275  return const_cast<utf8 *>(next(const_cast<const utf8 *>(current)));
276 }
277 
278 const utf8 *
279 UT_Unicode::next(const utf8 *current)
280 {
281  if (!current)
282  return NULL;
283 
284  utf8 c = *current;
285 
286  // End of string already?
287  if (c == 0)
288  return current;
289 
290  // Quick check for plain ASCII.
291  if (isASCII(c))
292  {
293  current++;
294  return current;
295  }
296 
297  int nb_cont;
298  if (isContinuation(c))
299  {
300  // Are we inside a continuation byte? Then we'll have to scan forward
301  // until we reach a non-continuation byte or end. If we scan forward
302  // more than two bytes, then the continuation is invalid.
303  nb_cont = 2;
304  current++;
305  while (isContinuation(*current++) && nb_cont--) { }
306 
307  if (nb_cont == 0)
308  return NULL;
309 
310  if (!isValidLeading(*current))
311  return NULL;
312 
313  return current;
314  }
315  else if ((nb_cont = getContinuationCount(c)) == 0)
316  {
317  // We didn't encounter a valid byte. We probably got passed a non-UTF8
318  // encoded string.
319  return NULL;
320  }
321 
322  current++;
323  for (int i = 0; i < nb_cont; i++)
324  {
325  if (!isContinuation(*current++))
326  return NULL;
327  }
328 
329  if (*current && !isValidLeading(*current))
330  return NULL;
331 
332  return current;
333 }
334 
335 utf8 *
336 UT_Unicode::prev(const utf8 *start, utf8 *current)
337 {
338  return const_cast<utf8 *>(prev(start, const_cast<const utf8 *>(current)));
339 }
340 
341 const utf8 *
342 UT_Unicode::prev(const utf8 *start, const utf8 *current)
343 {
344  if (!current || !start)
345  return NULL;
346 
347  // Already at the start (or beyond) ?
348  if (start >= current)
349  return NULL;
350 
351  // If the current byte is either ASCII or UTF8 leading byte, and the
352  // previous is ASCII, we can skip right back to it. Otherwise the previous
353  // character must be a part of an UTF8 encoding, or garbage.
354  if ( isValidLeading(current[0]) && isASCII(current[-1]))
355  {
356  current--;
357  return current;
358  }
359 
360  // If we're currently on an ASCII character or a leading byte, go one back
361  // and try passing over the continuation bytes until we hit a leading byte.
362  if (isValidLeading(*current))
363  {
364  current--;
365  }
366  else if (!isContinuation(*current))
367  {
368  // Previous character is garbage. There's no safe way to go back and
369  // expect the current character to be valid.
370  return NULL;
371  }
372 
373  int nb_cont = 0;
374  while(current >= start && isContinuation(*current))
375  {
376  current--;
377  nb_cont++;
378  }
379 
380  // We got no continuation bytes. We should've gotten at least one.
381  if (nb_cont == 0)
382  return NULL;
383 
384  // We passed too many continuation bytes. The encoding is garbage.
385  if (nb_cont > getContinuationCount(*current))
386  return NULL;
387 
388  return current;
389 }
390 
391 
392 bool
393 UT_Unicode::fixpos(const utf8 *start, utf8 *&current)
394 {
395  return fixpos(start, const_cast<const utf8 *&>(current));
396 }
397 
398 bool
399 UT_Unicode::fixpos(const utf8 *start, const utf8 *&current)
400 {
401  if (isContinuation(*current))
402  return prev(start, current) != NULL;
403 
404  return true;
405 }
406 
// Counts the steps a UT_Unicode iterator constructed over (start, end) takes
// before atEnd() becomes true. Returns 0 when `start` is null.
// The assertions check that `start`, and `end` when given, sit on an octet
// accepted by isValidLeading().
// NOTE(review): the declarator line (listing line 408) is missing from this
// listing; presumably `UT_Unicode::count(const utf8 *start, const utf8 *end)`
// -- confirm against UT_Unicode.h.
407 exint
409 {
410  if (!start)
411  return 0;
412 
413  exint nb_cp = 0;
414 
415  UT_ASSERT(isValidLeading(*start));
416  UT_ASSERT(!end || isValidLeading(*end));
417 
418  for(iterator it(start, end); !it.atEnd(); ++it)
419  nb_cp++;
420  return nb_cp;
421 }
422 
// Returns the size of the text in octets (not code points): strlen(start)
// when `end` is null, otherwise the pointer difference end - start.
// Returns 0 when `start` is null.
// NOTE(review): the declarator line (listing line 424) is missing from this
// listing; presumably `UT_Unicode::length(const utf8 *start, const utf8 *end)`
// -- confirm against UT_Unicode.h.
423 exint
425 {
426  if (!start)
427  return 0;
428 
429  UT_ASSERT(isValidLeading(*start));
430  UT_ASSERT(!end || isValidLeading(*end));
431 
432  if (!end)
433  return strlen((const char *)start);
434  else
435  return exint(end - start);
436 }
437 
// Returns a newly allocated, NUL-terminated copy of the text. With a null
// `end` the whole string is copied with strdup(); otherwise exactly
// end - start octets are copied into a malloc()ed buffer and a terminator is
// appended. Returns NULL when `start` is null.
// NOTE(review): the malloc() result is passed to memcpy() without a null
// check, so an allocation failure is dereferenced -- worth guarding.
// NOTE(review): the declarator line (listing line 439) is missing from this
// listing; presumably `UT_Unicode::duplicate(const utf8 *start,
// const utf8 *end)` -- confirm. The caller presumably releases the result
// with free(); confirm against the declaration's documentation.
438 inline utf8 *
440 {
441  if (!start)
442  return NULL;
443 
444  UT_ASSERT(isValidLeading(*start));
445  UT_ASSERT(!end || isValidLeading(*end));
446 
447  if (!end)
448  return (utf8 *)::strdup((const char *)start);
449  else
450  {
451  size_t length = (end - start);
452  utf8 *buf = (utf8 *)malloc(length + 1);
453 
454 
455  ::memcpy(buf, start, length);
456  buf[length] = '\0';
457  return buf;
458  }
459 }
460 
461 
// Searches the text beginning at `start` for code point `cp` and returns the
// position of its first occurrence, or NULL when it is not found, when `cp`
// is 0, or when `start` is null.
//
// For a `cp` accepted by isASCII() the search is a plain octet scan: strchr()
// when `end` is null, otherwise a loop bounded by `end`. For any other `cp`
// the text is decoded one code point at a time with convert(); decoding
// stops at a zero code point (which convert() also reports for malformed
// input) or once the next position reaches `end`.
// NOTE(review): the declarator line (listing line 463) is missing from this
// listing; presumably `UT_Unicode::find(utf32 cp, const utf8 *start,
// const utf8 *end)` -- confirm against UT_Unicode.h.
462 const utf8 *
464 {
465  if (cp == 0 || !start)
466  return NULL;
467 
468  UT_ASSERT(isValidLeading(*start));
469  UT_ASSERT(!end || isValidLeading(*end));
470 
471  if (isASCII(cp))
472  {
473  if (!end)
474  return (const utf8 *)strchr((const char *)start, char(cp));
475  else
476  {
477  while(start < end && *start != cp)
478  start++;
479  return start == end ? NULL : start;
480  }
481  }
482  else
483  {
484  const utf8 *pos = start;
485  while (pos)
486  {
487  const utf8 *next;
488  utf32 ccp;
489 
490  next = convert(pos, ccp);
491  if (!ccp)
492  break;
493 
494  if (cp == ccp)
495  return pos;
496 
497  if (!end || next < end)
498  pos = next;
499  else
500  break;
501  }
502  return NULL;
503  }
504 }
505 
506 const utf8 *
507 UT_Unicode::find(const utf8 *str, const utf8 *start, const utf8 *end)
508 {
509  if (!str || !start)
510  return NULL;
511 
512  UT_ASSERT(isValidLeading(*start));
513  UT_ASSERT(!end || isValidLeading(*end));
514 
515  if (!end)
516  {
517  return (const utf8 *)::strstr((const char *)start, (const char *)str);
518  }
519  else
520  {
521  size_t len = ::strlen((const char *)str);
522 
523  // If the section given is shorter than the string to search for,
524  // or the search string is empty, bail early.
525  if (!len || (end - start) < len)
526  return NULL;
527 
528  const utf8 *find = str;
529  while(start < (end - len))
530  {
531  if (*find == *start)
532  {
533 
534 // if (strncmp())
535  return NULL;
536  }
537 
538  start = next(start);
539  }
540  }
541  return NULL;
542 }
543 
544 
545 
546 
547 
548 bool
549 UT_Unicode::isSpace(utf32 cp, bool break_only)
550 {
551  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
552  return (c.myCategory == UT_UNICODE_SPACE ||
553  (!break_only && c.myCategory == UT_UNICODE_SPACE_NONBREAK));
554 }
555 
// True when the table category of `cp` is UT_UNICODE_NUMBER.
// NOTE(review): the declarator line (listing line 557) is missing from this
// listing; presumably `UT_Unicode::isDigit(utf32 cp)` -- confirm.
556 bool
558 {
559  return getCharacterInfo(cp).myCategory == UT_UNICODE_NUMBER;
560 }
561 
// True when `cp` is accepted by isUpper() or isLower().
// NOTE(review): the declarator line (listing line 563) is missing from this
// listing; presumably `UT_Unicode::isAlpha(utf32 cp)` -- confirm.
562 bool
564 {
565  return isUpper(cp) || isLower(cp);
566 }
567 
// True when `cp` is accepted by isAlpha() or isDigit().
// NOTE(review): the declarator line (listing line 569) is missing from this
// listing; presumably `UT_Unicode::isAlnum(utf32 cp)` -- confirm.
568 bool
570 {
571  return isAlpha(cp) || isDigit(cp);
572 }
573 
// True when the table category of `cp` is UT_UNICODE_PUNCTUATION.
// NOTE(review): the declarator line (listing line 575) is missing from this
// listing; presumably `UT_Unicode::isPunct(utf32 cp)` -- confirm.
574 bool
576 {
577  return getCharacterInfo(cp).myCategory == UT_UNICODE_PUNCTUATION;
578 }
579 
// True when the table category of `cp` is UT_UNICODE_LT_UPPER, or when a
// second condition holds that is not visible here.
// NOTE(review): this listing is missing the declarator line (listing line
// 581) and the right-hand operand of the `||` (listing line 585); presumably
// this is `UT_Unicode::isUpper(utf32 cp)` -- restore both lines from the
// original file before relying on this definition.
580 bool
582 {
583  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
584  return c.myCategory == UT_UNICODE_LT_UPPER ||
586 }
587 
// True when the table category of `cp` is UT_UNICODE_LT_LOWER.
// NOTE(review): the declarator line (listing line 589) is missing from this
// listing; presumably `UT_Unicode::isLower(utf32 cp)` -- confirm.
588 bool
590 {
591  return getCharacterInfo(cp).myCategory == UT_UNICODE_LT_LOWER;
592 }
593 
// True when `cp` falls in one of three hard-coded ranges, labelled below as
// CJK Unified Ideographs, Extension A and Extension B. No table lookup is
// involved.
// NOTE(review): the return-type and declarator lines (listing line 595, and
// whatever followed the `///` on line 594) are missing from this listing;
// presumably `bool UT_Unicode::isCJK(utf32 cp)` -- confirm.
594 ///
596 {
597  return (cp >= 0x04E00 && cp <= 0x09FFF) || // CJK Unified Ideographs
598  (cp >= 0x03400 && cp <= 0x04DBF) || // - Extension A
599  (cp >= 0x20000 && cp <= 0x2A6D6); // - Extension B
600 }
601 
602 
// Case mapping: returns the table's myComplement value for `cp` when it is
// non-zero and a further condition holds; otherwise returns `cp` unchanged.
// NOTE(review): this listing is missing the declarator line (listing line
// 604) and the rest of the `if` condition (listing lines 608-609), so it is
// not possible to tell from here whether this is toLower() or toUpper() or
// what the extra condition tests -- restore the lines from the original file.
603 utf32
605 {
606  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
607  if (c.myComplement &&
610  return c.myComplement;
611  return cp;
612 }
613 
// Case mapping: returns the table's myComplement value for `cp` when it is
// non-zero and a further condition holds; otherwise returns `cp` unchanged.
// NOTE(review): this listing is missing the declarator line (listing line
// 615) and the rest of the `if` condition (listing line 619), so it is not
// possible to tell from here whether this is toLower() or toUpper() or what
// the extra condition tests -- restore the lines from the original file.
614 utf32
616 {
617  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
618  if (c.myComplement &&
620  return c.myComplement;
621  return cp;
622 }
623 
// True when `cp` is zero, is the '/' character, or is accepted by
// UT_Unicode::isSpace() with its default arguments.
// NOTE(review): the declarator line (listing line 625) is missing from this
// listing; presumably `UT_Unicode::isWordDelimiter(utf32 cp)` -- confirm.
624 bool
626 {
627  return (!cp || cp == '/' || UT_Unicode::isSpace(cp));
628 }
629 
// UT_Unicode::iterator
//
// The five bodies below all delegate to init(). They look like the
// iterator's constructors: one taking nothing (everything null), one taking
// a start/end pointer pair, two taking a string-like object with
// begin()/end(), and one taking an object with c_str() (iterated up to its
// terminator, since a null end is passed).
// NOTE(review): every declarator line is missing from this listing (listing
// lines 631, 636-637, 642-643, 648-649, 654-655), so the parameter types are
// not visible here -- restore them from the original file.
630 // UT_Unicode::iterator
632 {
633  init(NULL, NULL, NULL);
634 }
635 
638 {
639  init(start, end, transform);
640 }
641 
644 {
645  init(str.begin(), str.end(), transform);
646 }
647 
650 {
651  init(str.c_str(), nullptr, transform);
652 }
653 
656 {
657  init(str.begin(), str.end(), transform);
658 }
659 
// Shared set-up for the iterator: clears the current position and both
// cached code points, stores `transform`, and then either adopts the range
// and calls reset(), or marks the iterator invalid by nulling myStart, myEnd,
// myNext and myNext2.
// The range is adopted only when `start` is non-null, points at an octet
// accepted by isValidLeading(), and (when `end` is given) lies before `end`.
// NOTE(review): the second line of the declarator (listing line 661, which
// declares `transform`) is missing from this listing.
660 void UT_Unicode::iterator::init(const utf8 *start, const utf8 *end,
662 {
663  myCurrent = NULL;
664  myCP = myNextCP = 0;
665  myTransform = transform;
666 
667  if (start && isValidLeading(*start) && (!end || start < end))
668  {
669  myStart = start;
670  myEnd = end;
671  reset();
672  }
673  else
674  {
675  myStart = myEnd = myNext = myNext2 = NULL;
676  }
677 }
678 
679 
// Repositions the iterator at `to`, or at myStart when `to` is null, and
// refills the cached state: the code point at the new position (myCP), the
// position and code point after it (myNext, myNextCP) and the position after
// that (myNext2).
// Does nothing when the iterator is invalid (myStart is null) or when `to`
// lies before myStart or after myEnd. When the first decoded code point is
// zero, myNext is set to NULL and the look-ahead is not filled in. Decoded
// non-zero code points are passed through myTransform when one is set.
// The return value of fixpos() is ignored.
// NOTE(review): the declarator line (listing line 681) is missing from this
// listing; presumably `UT_Unicode::iterator::reset(const utf8 *to)` --
// confirm against UT_Unicode.h.
680 void
682 {
683  if (!myStart)
684  return;
685 
686  if (!to)
687  to = myStart;
688  else if (to < myStart || (myEnd && to > myEnd))
689  return;
690 
691  // Make sure we start at a decent place.
692  UT_Unicode::fixpos(myStart, to);
693 
694  myCurrent = to;
695  myNext = convert(myCurrent, myCP);
696  if (!myCP)
697  myNext = NULL;
698  else
699  {
700  if (myTransform && myCP)
701  myCP = myTransform->transformCodepoint(myCP);
702  if (!myEnd || (myNext < myEnd))
703  {
704  myNext2 = convert(myNext, myNextCP);
705  if (myTransform && myNextCP)
706  myNextCP = myTransform->transformCodepoint(myNextCP);
707  }
708  else
709  myNextCP = 0;
710  }
711 }
712 
// Moves the iterator forward by one code point: the cached look-ahead
// (myNextCP / myNext / myNext2) becomes the current state, and a new
// look-ahead is decoded unless the new myNext has reached myEnd, in which
// case myNextCP becomes 0.
// Returns true when the iterator is invalid or already at the end (the guard
// below returns true without moving), true when the new current code point
// is non-zero, and false when it is zero.
// NOTE(review): the declarator line (listing line 714) is missing from this
// listing, so the member's name is not visible here; "advance" is a guess.
713 bool
715 {
716  // Invalid iterator or at the end already?
717  if (!myStart || !myCP || (myEnd && myCurrent >= myEnd))
718  return true;
719 
720  myCP = myNextCP;
721  myCurrent = myNext;
722  myNext = myNext2;
723  if (myCP)
724  {
725  if(!myEnd || (myNext < myEnd))
726  {
727  myNext2 = convert(myNext, myNextCP);
728  if (myTransform && myNextCP)
729  myNextCP = myTransform->transformCodepoint(myNextCP);
730  }
731  else
732  myNextCP = 0;
733  return true;
734  }
735  else
736  return false;
737 }
738 
// Moves the iterator back by one code point: the previous position is found
// with UT_Unicode::prev() and decoded, the old current state becomes the
// look-ahead, and the decoded (and, when myTransform is set, transformed)
// code point becomes the current one.
// Returns false without moving when the iterator is invalid, is already at
// myStart, no previous position could be found, or the previous code point
// decodes to zero. convert() is called before `prev` is checked; it accepts
// a null pointer and then reports a zero code point.
// NOTE(review): the declarator line (listing line 740) is missing from this
// listing, so the member's name is not visible here; "retreat" is a guess.
739 bool
741 {
742  // Invalid iterator or at the start already?
743  if (!myStart || myCurrent == myStart)
744  return false;
745 
746  const utf8 *prev;
747  utf32 cp;
748 
749  prev = UT_Unicode::prev(myStart, myCurrent);
750  UT_Unicode::convert(prev, cp);
751  if (prev && cp)
752  {
753  if (myTransform && cp)
754  cp = myTransform->transformCodepoint(cp);
755  myNextCP = myCP;
756  myCP = cp;
757  myNext2 = myNext;
758  myNext = myCurrent;
759  myCurrent = prev;
760  return true;
761  }
762  else
763  return false;
764 }
765 
766 #endif // __UT_UnicodeImpl__
static bool isUpper(utf32 cp)
SYS_FORCE_INLINE const_iterator begin() const
static bool isValidCodePoint(utf32 cp)
Definition: UT_Unicode.h:136
static bool isSpace(utf32 cp, bool break_only=true)
static bool isLower(utf32 cp)
GLuint GLenum GLenum transform
Definition: glew.h:14742
unsigned short utf16
Definition: SYS_Types.h:56
static bool isAlpha(utf32 cp)
int64 exint
Definition: SYS_Types.h:125
unsigned int myComplement
SYS_NO_DISCARD_RESULT SYS_FORCE_INLINE const_iterator end() const
Returns a constant iterator pointing to the end of the string.
static bool isFromSupplementaryPlane(utf32 cp)
Definition: UT_Unicode.h:130
static bool isPunct(utf32 cp)
void reset(const utf8 *to=0)
GLenum GLint GLuint mask
Definition: glew.h:1845
GLboolean reset
Definition: glew.h:4959
unsigned int myCategory
const char * c_str() const
Definition: UT_String.h:505
static const utf8 * prev(const utf8 *start, const utf8 *current)
SYS_FORCE_INLINE const_iterator end() const
static bool isUTF8(utf8 octet)
A utility class to do read-only operations on a subset of an existing string.
Definition: UT_StringView.h:40
static const utf8 * convert(const utf8 *str, utf32 &cp)
static bool isDigit(utf32 cp)
static bool isWordDelimiter(utf32 cp)
unsigned int utf32
Definition: SYS_Types.h:58
static bool isSurrogatePair(utf32 cp)
Definition: UT_Unicode.h:125
GLuint GLuint end
Definition: glew.h:1253
static exint count(const utf8 *start, const utf8 *end=0)
Returns the number of code points this variable encoding represents.
const GLfloat * c
Definition: glew.h:16296
GLuint GLsizei GLsizei * length
Definition: glew.h:1825
static const utf8 * next(const utf8 *current)
GLuint start
Definition: glew.h:1253
static bool isAlnum(utf32 cp)
static bool fixpos(const utf8 *start, const utf8 *&current)
static exint length(const utf8 *start, const utf8 *end=0)
SYS_NO_DISCARD_RESULT SYS_FORCE_INLINE const_iterator begin() const
Returns a constant iterator pointing to the beginning of the string.
static bool isASCII(utf32 cp)
Definition: UT_Unicode.h:151
GLuint64EXT * result
Definition: glew.h:14007
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:135
char utf8
Definition: SYS_Types.h:52
static bool isCJK(utf32 cp)
static const utf8 * find(utf32 cp, const utf8 *start, const utf8 *end=0)
bool atEnd() const
Definition: UT_Unicode.h:235
static utf32 replacementCodePoint()
Definition: UT_Unicode.h:117
GLenum GLuint GLsizei const GLchar * buf
Definition: glew.h:2580
UT_API UT_UnicodeCharacter * theUnicodeTable[256]
static utf32 toLower(utf32 cp)
PXR_NAMESPACE_OPEN_SCOPE typedef unsigned char uchar
Definition: inttypes.h:58
static utf8 * duplicate(const utf8 *start, const utf8 *end=0)
static utf32 toUpper(utf32 cp)
GLenum GLsizei len
Definition: glew.h:7752
GLintptr offset
Definition: glew.h:1682