HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
UT_UnicodeImpl.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_UnicodeImpl.h (RE Library, C++)
7  *
8  */
9 
10 #ifndef __UT_UnicodeImpl__
11 #define __UT_UnicodeImpl__
12 
13 // TODO: Make the UTF8 portion table-driven.
14 
15 #include <string.h>
16 
17 #include "UT_Assert.h"
18 #include "UT_UnicodeTable.h"
19 
20 // ============================================================================
21 
22 namespace /* anonymous */ {
23 
24 const UT_UnicodeCharacter &
25 getCharacterInfo(utf32 cp)
26 {
27  static UT_UnicodeCharacter empty = {0,0};
28 
29  if (cp < 65536)
30  {
31  UT_UnicodeCharacter *block = theUnicodeTable[cp >> 8];
32  if (block)
33  return block[cp & 255];
34  }
35  return empty;
36 }
37 
38 static inline bool
39 isASCII(utf8 c)
40 {
41  return ((uchar(c) & 0x80) == 0);
42 }
43 
44 static inline bool
45 isContinuation(utf8 c)
46 {
47  return ((uchar(c) & 0xC0) == 0x80);
48 }
49 
50 static inline bool
51 isValidLeading(utf8 c)
52 {
53  return isASCII(c) ||
54  ((uchar(c) & 0xE0) == 0xC0) ||
55  ((uchar(c) & 0xF0) == 0xE0) ||
56  ((uchar(c) & 0xF8) == 0xF0);
57 }
58 
59 static inline int
60 getContinuationCount(utf8 c)
61 {
62  if ((uchar(c) & 0xE0) == 0xC0)
63  return 1;
64  else if ((uchar(c) & 0xF0) == 0xE0)
65  return 2;
66  else if ((uchar(c) & 0xF8) == 0xF0)
67  return 3;
68  return 0;
69 }
70 
71 }
72 
73 bool
75 {
76  return isValidLeading(octet) || isContinuation(octet);
77 }
78 
79 
80 const utf8 *
81 UT_Unicode::convert(const utf8 *str, utf32 &cp)
82 {
83  cp = 0;
84  if (!str)
85  return NULL;
86 
87  utf8 c = *str++;
88 
89  // Quick ASCII check.
90  if (isASCII(c))
91  {
92  cp = c;
93  return str;
94  }
95 
96  // Get the number of expected continuation bytes.
97  int cont_bytes = getContinuationCount(c);
98 
99  // Is the leading byte broken?
100  if (cont_bytes == 0)
101  return NULL;
102 
103  // The minimum value representable by continuation byte count. Any value
104  // below the minimum value is illegal. This is to avoid multiple encodings
105  // of the same value (e.g. '.' -> 0x
106  static const utf32 least_values[4] = { 0x0, 0x80, 0x800, 0x10000 };
107  utf32 least_value = least_values[cont_bytes];
108 
109  // The amount of shift is determined by the number of continuation
110  // bytes. The shift is progressively reduced as more bytes are read.
111  int shift = cont_bytes * 6;
112 
113  // Set up the initial mask for the data in the leading byte.
114  utf8 mask = 0x3F >> cont_bytes;
115 
116  utf32 result;
117  result = 0;
118  for(;;)
119  {
120  result |= (c & mask) << shift;
121 
122  if (cont_bytes-- == 0)
123  break;
124 
125  c = *str++;
126 
127  // Make sure the continuation byte is of the right form.
128  if (!isContinuation(c))
129  return NULL;
130 
131  // Every continuation byte has the same mask but contributes
132  // six bits lower than the byte before.
133  mask = 0x3F;
134  shift -= 6;
135  }
136 
137  // If the code point is not valid, return the current string position but
138  // a zero code point. I.e. the encoding is correct, but the code point
139  // is not, for the given encoding.
140  if (result < least_value || !isValidCodePoint(result))
141  cp = replacementCodePoint(); // The replacement character
142  else
143  cp = result;
144 
145  return str;
146 }
147 
148 
149 
150 int
152 {
153  if (cp < 0x00080)
154  {
155  if (buf && buflen >= 1)
156  buf[0] = utf8(cp);
157  return 1;
158  }
159  else if (cp < 0x00000800)
160  {
161  if (buf && buflen >= 2)
162  {
163  buf[0] = 0xC0 | utf8(cp >> 6);
164  buf[1] = 0x80 | utf8(cp & 0x3F);
165  }
166  return 2;
167  }
168  else if (cp < 0x00010000)
169  {
170  // We don't encode surrogate pairs.
171  if (isSurrogatePair(cp))
172  return 0;
173 
174  if (buf && buflen >= 3)
175  {
176  buf[0] = 0xE0 | utf8(cp >> 12);
177  buf[1] = 0x80 | utf8((cp >> 6) & 0x3F);
178  buf[2] = 0x80 | utf8(cp & 0x3F);
179  }
180  return 3;
181  }
182  else if (cp < 0x110000)
183  {
184  if (buf && buflen >= 4)
185  {
186  buf[0] = 0xF0 | utf8(cp >> 18);
187  buf[1] = 0x80 | utf8((cp >> 12) & 0x3F);
188  buf[2] = 0x80 | utf8((cp >> 6) & 0x3F);
189  buf[3] = 0x80 | utf8(cp & 0x3F);
190  }
191  return 4;
192  }
193  else
194  {
195  /// 0x10FFFF is the greatest code point value allowed by Unicode and
196  // hence UTF-8 encodings.
197  return 0;
198  }
199 }
200 
201 //
202 namespace
203 {
204  static utf16 norm16(utf16 c, bool big_endian)
205  {
206  if (big_endian)
207  return (c & 0xFF) << 8 | (c >> 8);
208  else
209  return c;
210  }
211 }
212 
213 const utf16 *
214 UT_Unicode::convert(const utf16 *str, utf32 &cp, bool big_endian)
215 {
216  cp = 0;
217  if (!str)
218  return NULL;
219 
220  // Check non-surrogate characters first
221  utf16 c0 = norm16(str[0], big_endian);
222 
223  if (!isSurrogatePair(c0))
224  {
225  cp = utf32(c0);
226  return str + 1;
227  }
228 
229  utf16 c1 = norm16(str[1], big_endian);
230  if ((c0 >= 0xD800 && c0 < 0xDC00) && (c1 >= 0xDC00 && c1 < 0xE000))
231  {
232  static const utf32 offset = ((0xD800 << 10) + 0xDC00) - 0x10000;
233  cp = utf32((c0 << 10) + c1) - offset;
234  return str + 2;
235  }
236  else
237  {
238  // The second character wasn't a surrogate pair character, so skip
239  // over the first of the invalid pair.
240  cp = replacementCodePoint();
241  return str + 1;
242  }
243 }
244 
245 int
247 {
248  if (!isValidCodePoint(cp))
249  return 0;
250 
251  // Characters outside the 64K range are encoded as surrogate pairs.
252  if (!isFromSupplementaryPlane(cp))
253  {
254  if (buf && buflen >= sizeof(utf16))
255  buf[0] = utf16(cp);
256  return 2;
257  }
258  else
259  {
260  if (buf && buflen >= sizeof(utf16[2]))
261  {
262  cp -= 0x10000;
263  buf[0] = utf16(0xD800 | ((cp >> 10) & 0x03FF));
264  buf[1] = utf16(0xDC00 | (cp & 0x03FF));
265  }
266  return 4;
267  }
268 }
269 
270 
271 
272 utf8 *
274 {
275  return const_cast<utf8 *>(next(const_cast<const utf8 *>(current)));
276 }
277 
278 const utf8 *
279 UT_Unicode::next(const utf8 *current)
280 {
281  if (!current)
282  return NULL;
283 
284  utf8 c = *current;
285 
286  // End of string already?
287  if (c == 0)
288  return current;
289 
290  // Quick check for plain ASCII.
291  if (isASCII(c))
292  {
293  current++;
294  return current;
295  }
296 
297  int nb_cont;
298  if (isContinuation(c))
299  {
300  // Are we inside a continuation byte? Then we'll have to scan forward
301  // until we reach a non-continuation byte or end. If we scan forward
302  // more than two bytes, then the continuation is invalid.
303  nb_cont = 2;
304  current++;
305  while (isContinuation(*current++) && nb_cont--) { }
306 
307  if (nb_cont == 0)
308  return NULL;
309 
310  if (!isValidLeading(*current))
311  return NULL;
312 
313  return current;
314  }
315  else if ((nb_cont = getContinuationCount(c)) == 0)
316  {
317  // We didn't encounter a valid byte. We probably got passed a non-UTF8
318  // encoded string.
319  return NULL;
320  }
321 
322  current++;
323  for (int i = 0; i < nb_cont; i++)
324  {
325  if (!isContinuation(*current++))
326  return NULL;
327  }
328 
329  if (*current && !isValidLeading(*current))
330  return NULL;
331 
332  return current;
333 }
334 
335 utf8 *
336 UT_Unicode::prev(const utf8 *start, utf8 *current)
337 {
338  return const_cast<utf8 *>(prev(start, const_cast<const utf8 *>(current)));
339 }
340 
341 const utf8 *
342 UT_Unicode::prev(const utf8 *start, const utf8 *current)
343 {
344  if (!current || !start)
345  return NULL;
346 
347  // Already at the start (or beyond) ?
348  if (start >= current)
349  return NULL;
350 
351  // If the current byte is either ASCII or UTF8 leading byte, and the
352  // previous is ASCII, we can skip right back to it. Otherwise the previous
353  // character must be a part of an UTF8 encoding, or garbage.
354  if ( isValidLeading(current[0]) && isASCII(current[-1]))
355  {
356  current--;
357  return current;
358  }
359 
360  // If we're currently on an ASCII character or a leading byte, go one back
361  // and try passing over the continuation bytes until we hit a leading byte.
362  if (isValidLeading(*current))
363  {
364  current--;
365  }
366  else if (!isContinuation(*current))
367  {
368  // Previous character is garbage. There's no safe way to go back and
369  // expect the current character to be valid.
370  return NULL;
371  }
372 
373  int nb_cont = 0;
374  while(current >= start && isContinuation(*current))
375  {
376  current--;
377  nb_cont++;
378  }
379 
380  // We got no continuation bytes. We should've gotten at least one.
381  if (nb_cont == 0)
382  return NULL;
383 
384  // We passed too many continuation bytes. The encoding is garbage.
385  if (nb_cont > getContinuationCount(*current))
386  return NULL;
387 
388  return current;
389 }
390 
391 
392 bool
393 UT_Unicode::fixpos(const utf8 *start, utf8 *&current)
394 {
395  return fixpos(start, const_cast<const utf8 *&>(current));
396 }
397 
398 bool
399 UT_Unicode::fixpos(const utf8 *start, const utf8 *&current)
400 {
401  if (isContinuation(*current))
402  return prev(start, current) != NULL;
403 
404  return true;
405 }
406 
407 exint
409 {
410  if (!start)
411  return 0;
412 
413  exint nb_cp = 0;
414 
415  UT_ASSERT(isValidLeading(*start));
416  UT_ASSERT(!end || isValidLeading(*end));
417 
418  for(iterator it(start, end); !it.atEnd(); ++it)
419  nb_cp++;
420  return nb_cp;
421 }
422 
423 exint
425 {
426  if (!start)
427  return 0;
428 
429  UT_ASSERT(isValidLeading(*start));
430  UT_ASSERT(!end || isValidLeading(*end));
431 
432  if (!end)
433  return strlen((const char *)start);
434  else
435  return exint(end - start);
436 }
437 
438 inline utf8 *
440 {
441  if (!start)
442  return NULL;
443 
444  UT_ASSERT(isValidLeading(*start));
445  UT_ASSERT(!end || isValidLeading(*end));
446 
447  if (!end)
448  return (utf8 *)::strdup((const char *)start);
449  else
450  {
451  size_t length = (end - start);
452  utf8 *buf = (utf8 *)malloc(length + 1);
453 
454 
455  ::memcpy(buf, start, length);
456  buf[length] = '\0';
457  return buf;
458  }
459 }
460 
461 
462 const utf8 *
464 {
465  if (cp == 0 || !start)
466  return NULL;
467 
468  UT_ASSERT(isValidLeading(*start));
469  UT_ASSERT(!end || isValidLeading(*end));
470 
471  if (isASCII(cp))
472  {
473  if (!end)
474  return (const utf8 *)strchr((const char *)start, char(cp));
475  else
476  {
477  while(start < end && *start != cp)
478  start++;
479  return start == end ? NULL : start;
480  }
481  }
482  else
483  {
484  for(;;)
485  {
486  const utf8 *pos = start, *next;
487  utf32 ccp;
488 
489  next = convert(start, ccp);
490  if (!ccp)
491  break;
492 
493  if (cp == ccp)
494  return pos;
495 
496  pos = next;
497  }
498  return NULL;
499  }
500 }
501 
502 const utf8 *
503 UT_Unicode::find(const utf8 *str, const utf8 *start, const utf8 *end)
504 {
505  if (!str || !start)
506  return NULL;
507 
508  UT_ASSERT(isValidLeading(*start));
509  UT_ASSERT(!end || isValidLeading(*end));
510 
511  if (!end)
512  {
513  return (const utf8 *)::strstr((const char *)start, (const char *)str);
514  }
515  else
516  {
517  size_t len = ::strlen((const char *)str);
518 
519  // If the section given is shorter than the string to search for,
520  // or the search string is empty, bail early.
521  if (!len || (end - start) < len)
522  return NULL;
523 
524  const utf8 *find = str;
525  while(start < (end - len))
526  {
527  if (*find == *start)
528  {
529 
530 // if (strncmp())
531  return NULL;
532  }
533 
534  start = next(start);
535  }
536  }
537  return NULL;
538 }
539 
540 
541 
542 
543 
544 bool
545 UT_Unicode::isSpace(utf32 cp, bool break_only)
546 {
547  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
548  return (c.myCategory == UT_UNICODE_SPACE ||
549  (!break_only && c.myCategory == UT_UNICODE_SPACE_NONBREAK));
550 }
551 
552 bool
554 {
555  return getCharacterInfo(cp).myCategory == UT_UNICODE_NUMBER;
556 }
557 
558 bool
560 {
561  return isUpper(cp) || isLower(cp);
562 }
563 
564 bool
566 {
567  return isAlpha(cp) || isDigit(cp);
568 }
569 
570 bool
572 {
573  return getCharacterInfo(cp).myCategory == UT_UNICODE_PUNCTUATION;
574 }
575 
576 bool
578 {
579  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
580  return c.myCategory == UT_UNICODE_LT_UPPER ||
582 }
583 
584 bool
586 {
587  return getCharacterInfo(cp).myCategory == UT_UNICODE_LT_LOWER;
588 }
589 
590 ///
592 {
593  return (cp >= 0x04E00 && cp <= 0x09FFF) || // CJK Unified Ideographs
594  (cp >= 0x03400 && cp <= 0x04DBF) || // - Extension A
595  (cp >= 0x20000 && cp <= 0x2A6D6); // - Extension B
596 }
597 
598 
599 utf32
601 {
602  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
603  if (c.myComplement &&
606  return c.myComplement;
607  return cp;
608 }
609 
610 utf32
612 {
613  const UT_UnicodeCharacter &c = getCharacterInfo(cp);
614  if (c.myComplement &&
616  return c.myComplement;
617  return cp;
618 }
619 
620 
621 // UT_Unicode::iterator
623 {
624  init(NULL, NULL, NULL);
625 }
626 
629 {
630  init(start, end, transform);
631 }
632 
635 {
636  init(str.begin(), str.end(), transform);
637 }
638 
639 void UT_Unicode::iterator::init(const utf8 *start, const utf8 *end,
641 {
642  myCurrent = NULL;
643  myCP = myNextCP = 0;
644  myTransform = transform;
645 
646  if (start && isValidLeading(*start) && (!end || start < end))
647  {
648  myStart = start;
649  myEnd = end;
650  reset();
651  }
652  else
653  {
654  myStart = myEnd = myNext = myNext2 = NULL;
655  }
656 }
657 
658 
659 void
661 {
662  if (!myStart)
663  return;
664 
665  if (!to)
666  to = myStart;
667  else if (to < myStart || (myEnd && to > myEnd))
668  return;
669 
670  // Make sure we start at a decent place.
671  UT_Unicode::fixpos(myStart, to);
672 
673  myCurrent = to;
674  myNext = convert(myCurrent, myCP);
675  if (!myCP)
676  myNext = NULL;
677  else
678  {
679  if (myTransform && myCP)
680  myCP = myTransform->transformCodepoint(myCP);
681  if (!myEnd || (myNext < myEnd))
682  {
683  myNext2 = convert(myNext, myNextCP);
684  if (myTransform && myNextCP)
685  myNextCP = myTransform->transformCodepoint(myNextCP);
686  }
687  else
688  myNextCP = 0;
689  }
690 }
691 
692 bool
694 {
695  // Invalid iterator or at the end already?
696  if (!myStart || !myCP || (myEnd && myCurrent >= myEnd))
697  return true;
698 
699  myCP = myNextCP;
700  myCurrent = myNext;
701  myNext = myNext2;
702  if (myCP)
703  {
704  if(!myEnd || (myNext < myEnd))
705  {
706  myNext2 = convert(myNext, myNextCP);
707  if (myTransform && myNextCP)
708  myNextCP = myTransform->transformCodepoint(myNextCP);
709  }
710  else
711  myNextCP = 0;
712  return true;
713  }
714  else
715  return false;
716 }
717 
718 bool
720 {
721  // Invalid iterator or at the start already?
722  if (!myStart || myCurrent == myStart)
723  return false;
724 
725  const utf8 *prev;
726  utf32 cp;
727 
728  prev = UT_Unicode::prev(myStart, myCurrent);
729  UT_Unicode::convert(prev, cp);
730  if (prev && cp)
731  {
732  if (myTransform && cp)
733  cp = myTransform->transformCodepoint(cp);
734  myNextCP = myCP;
735  myCP = cp;
736  myNext2 = myNext;
737  myNext = myCurrent;
738  myCurrent = prev;
739  return true;
740  }
741  else
742  return false;
743 }
744 
745 #endif // __UT_UnicodeImpl__
static bool isUpper(utf32 cp)
unsigned char uchar
Definition: SYS_Types.h:31
static bool isValidCodePoint(utf32 cp)
Definition: UT_Unicode.h:135
static bool isSpace(utf32 cp, bool break_only=true)
char utf8
Definition: SYS_Types.h:40
unsigned int utf32
Definition: SYS_Types.h:42
static bool isLower(utf32 cp)
GLuint start
Definition: glcorearb.h:474
static bool isAlpha(utf32 cp)
unsigned int myComplement
const_iterator begin() const
Returns a constant iterator pointing to the beginning of the string.
GLint GLuint mask
Definition: glcorearb.h:123
static bool isFromSupplementaryPlane(utf32 cp)
Definition: UT_Unicode.h:129
static bool isPunct(utf32 cp)
void reset(const utf8 *to=0)
png_uint_32 i
Definition: png.h:2877
unsigned int myCategory
static const utf8 * prev(const utf8 *start, const utf8 *current)
static bool isUTF8(utf8 octet)
A utility class to do read-only operations on a subset of an existing string.
Definition: UT_StringView.h:30
static const utf8 * convert(const utf8 *str, utf32 &cp)
static bool isDigit(utf32 cp)
#define UT_ASSERT(ZZ)
Definition: UT_Assert.h:102
int64 exint
Definition: SYS_Types.h:109
static bool isSurrogatePair(utf32 cp)
Definition: UT_Unicode.h:124
GLuint GLuint end
Definition: glcorearb.h:474
GLintptr offset
Definition: glcorearb.h:664
static exint count(const utf8 *start, const utf8 *end=0)
Returns the number of code points this variable encoding represents.
static const utf8 * next(const utf8 *current)
GA_API const UT_StringHolder transform
static bool isAlnum(utf32 cp)
GLenum GLuint GLenum GLsizei const GLchar * buf
Definition: glcorearb.h:2539
static bool fixpos(const utf8 *start, const utf8 *&current)
static exint length(const utf8 *start, const utf8 *end=0)
unsigned short utf16
Definition: SYS_Types.h:41
static bool isASCII(utf32 cp)
Definition: UT_Unicode.h:150
static bool isCJK(utf32 cp)
const_iterator end() const
Returns a constant iterator pointing to the end of the string.
static const utf8 * find(utf32 cp, const utf8 *start, const utf8 *end=0)
bool atEnd() const
Definition: UT_Unicode.h:228
static utf32 replacementCodePoint()
Definition: UT_Unicode.h:116
UT_API UT_UnicodeCharacter * theUnicodeTable[256]
static utf32 toLower(utf32 cp)
static utf8 * duplicate(const utf8 *start, const utf8 *end=0)
static utf32 toUpper(utf32 cp)
GLuint GLsizei GLsizei * length
Definition: glcorearb.h:794