HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_Unicode.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_Unicode.h (RE Library, C++)
7  *
8  */
9 
10 #ifndef __UT_Unicode__
11 #define __UT_Unicode__
12 
13 #include <SYS/SYS_Types.h>
14 #include "UT_StringView.h"
15 #include "UT_StringHolder.h"
16 
17 /// Specifies the maximum possible size of a UTF8 encoding for any given
18 /// code point.
19 #define UT_UTF8_MAX_ENCODING_LEN 4
20 
21 // Helper function to cast unsigned char literals to UTF-8. The pointer is only reinterpreted as a const utf8 pointer; no bytes are copied or converted.
22 static inline const utf8 *UTF8(const uchar *lit)
23 {
24  return reinterpret_cast<const utf8 *>(lit);
25 }
26 
27 /// Helper functions for Unicode and the UTF-8 variable length encoding.
29 {
30 public:
31  /// Parses a code point from a UTF-8 encoding and returns it as a single
32  /// code point value. Returns a pointer to the next encoding if the
33  /// current one got successfully decoded. If the decoding fails, it
34  /// returns @c NULL and cp is set to zero.
35  static inline const utf8 *convert(const utf8 *str, utf32 &cp);
36 
37  /// Converts a code point to its UTF-8 encoding. If no buffer is given,
38  /// returns the number of characters needed to store the resulting
39  /// encoded sequence. Does not write out a terminating zero but moves the
40  /// pointer to where the next character after the sequence should be
41  /// written.
42  static inline int convert(utf32 cp, utf8 *str, exint buflen);
43 
44  /// Given a current location in a buffer, moves to the next character.
45  /// If the location is inside a UTF-8 multi-character encoding (i.e. not at
46  /// the beginning of one), it moves to the next encoded character start
47  /// after.
48  /// If the current location is already at the terminating NUL character
49  /// the function does nothing and just returns the current pointer.
50  /// If it is unable to move successfully to the next encoded character (e.g.
51  /// it's already at the end of the string, or the encoding is garbage and
52  /// no recovery is possible) the function returns NULL.
53  static inline const utf8 *next(const utf8 *current);
54  static inline utf8 *next(utf8 *current);
55 
56  /// Given a location in a buffer, moves to the previous character,
57  /// unless already at the beginning of the string, as defined by 'start'.
58  /// If the location is inside a UTF-8 multi-character encoding, it moves to
59  /// the beginning of that encoding.
60  /// If going back lands on an invalid character, it encounters bad
61  /// encoding (e.g. too many continuation bytes), or it's already at the
62  /// start, the function returns NULL.
63  static inline const utf8 *prev(const utf8 *start, const utf8 *current);
64  static inline utf8 *prev(const utf8 *start, utf8 *current);
65 
66  /// Given a location in a buffer, moves after the end of the word. This
67  /// is done by grouping characters that are considered continuous.
68  /// There are 4 types of groups:
69  /// 1. space
70  /// 2. alphanumeric: ASCII, including `_@`
71  /// 3. punctuation : `{}[]();,.` and `\\n\\r`
72  /// 4. other : symbols (including non ASCII)
73  /// Punctuation always forms a one-character group; it is never grouped with
74  /// another character. Also, when a dot is sandwiched by digits (e.g., 1.1)
75  /// it's considered continuous.
76  /// Note that this function uses different rules from isWordDelimiter.
77  static inline const utf8 *nextWord(const utf8 *start, const utf8 *current);
78 
79  /// Given a location in a buffer, moves to the beginning of the word.
80  /// This is done by grouping characters that are considered continuous.
81  /// There are 4 types of groups:
82  /// 1. space
83  /// 2. alphanumeric: ASCII, including `_@`
84  /// 3. punctuation : `{}[]();,.` and `\\n\\r`
85  /// 4. other : symbols (including non ASCII)
86  /// Punctuation always forms a one-character group; it is never grouped with
87  /// another character. Also, when a dot is sandwiched by digits (e.g., 1.1)
88  /// it's considered continuous.
89  /// Note that this function uses different rules from isWordDelimiter.
90  static inline const utf8 *prevWord(const utf8 *start, const utf8 *current);
91 
92  /// Given a pointer inside of a string representing variable length
93  /// encoding, moves the pointer so that it points to the beginning of the
94  /// encoding, if not there already.
95  /// Returns @c false if it was unable to fix the position and @c true if
96  /// successful or the position was already valid.
97  static inline bool fixpos(const utf8 *start, const utf8 *&current);
98  static inline bool fixpos(const utf8 *start, utf8 *&current);
99 
100  /// Returns the number of code points this variable encoding represents.
101  static inline exint count(const utf8 *start, const utf8 *end = 0);
102 
103  /// Returns the number of octets for this variable encoding. One octet
104  /// is the same as a byte for UTF-8 encodings.
105  static inline exint length(const utf8 *start, const utf8 *end = 0);
106 
107  /// Duplicates the string using malloc. Use free() to free the resulting
108  /// string. If a NULL pointer is passed, a NULL pointer is returned.
109  static inline utf8 *duplicate(const utf8 *start, const utf8 *end = 0);
110 
111  /// Find a code point in a variable length string and return a pointer to
112  /// it. An optional end point can be supplied, which delineates a search
113  /// range. Otherwise the string is searched up to the terminating NUL.
114  static inline const utf8 *find(utf32 cp,
115  const utf8 *start, const utf8 *end = 0);
116 
117  /// Find a UTF8 encoded string in another UTF8 encoded string and return
118  /// a pointer to the start of the match. Returns NULL if the string
119  /// was not found.
120  static inline const utf8 *find(const utf8 *str,
121  const utf8 *start, const utf8 *end = 0);
122 
123 
124  /// Parses a code point from a UTF-16 encoding and returns it as a single
125  /// code point value. Returns a pointer to the next encoding if the
126  /// current one got successfully decoded. If the decoding fails, it
127  /// returns @c NULL and cp is set to zero.
128  /// Set @c big_endian to true if the incoming UTF-16 string is encoded as
129  /// big endian (UTF-16BE).
130  static inline const utf16 *convert(const utf16 *str, utf32 &cp,
131  bool big_endian = false);
132 
133  /// Converts a code point to its UTF-16LE encoding into the buffer given.
134  /// If no buffer is given, or if the buffer size is too small, returns the
135  /// number of bytes needed to store the resulting encoded sequence.
136  /// @c buflen should be given in bytes, and not number of utf16 entries.
137  /// Does not write out a terminating zero but moves the pointer to where
138  /// the next character after the sequence should be written.
139  static inline int convert(utf32 cp, utf16 *str, exint buflen);
140 
141  /// Returns the replacement character (U+FFFD), which is returned by the convert
142  /// functions, when they encounter an invalid, but recoverable, encoding.
143  static inline utf32 replacementCodePoint()
144  {
145  return 0xFFFD;
146  }
147 
148  /// Returns @c true if the code point given lies in the surrogate range (from 0xD800 up to, but not including, 0xE000). Despite the name, a single value is tested, not a pair. Such a value is a
149  /// valid UTF-16 code unit, since surrogates are used to encode greater-than 0xFFFF
150  /// code points. It is not a valid UTF-32 code point, however.
151  static inline bool isSurrogatePair(utf32 cp)
152  {
153  return cp >= 0xD800 && cp < 0xE000;
154  }
155 
156  static inline bool isFromSupplementaryPlane(utf32 cp)
157  {
158  return cp >= 0x10000 && cp <= 0x10FFFF; // true only for values from 0x10000 through 0x10FFFF inclusive
159  }
160 
161  // Returns true if the given value is a valid Unicode code point, i.e. it is no greater than 0x10FFFF and is not in the surrogate range.
162  static inline bool isValidCodePoint(utf32 cp)
163  {
164  // Unicode is specified up to 0x10FFFF. Surrogate pairs are only valid
165  // for UTF-16, not for UTF-32.
166  return !isSurrogatePair(cp) && cp <= 0x10FFFF;
167  }
168 
169  // Returns true if the code point represents a control character (newline,
170  // tab, etc). Only values up to 0x1F and the value 0x7F are reported; 0x80-0x9F are not.
171  static inline bool isControlChar(utf32 cp)
172  {
173  return cp <= 0x1F || cp == 0x7F;
174  }
175 
176  // Code point validation for different character sets. isASCII accepts values up to and including 0x7F.
177  static inline bool isASCII(utf32 cp)
178  {
179  return cp <= 0x7F;
180  }
181 
182  static inline bool isLatin1(utf32 cp)
183  {
184  return cp <= 0xFF; // accepts values up to and including 0xFF
185  }
186 
187  // Character type queries.
188  static inline bool isSpace(utf32 cp, bool break_only = true);
189  static inline bool isDigit(utf32 cp);
190  static inline bool isAlpha(utf32 cp);
191  static inline bool isAlnum(utf32 cp);
192  static inline bool isPunct(utf32 cp);
193  static inline bool isUpper(utf32 cp);
194  static inline bool isLower(utf32 cp);
195 
196  /// Returns true if the character is from any of the Unicode CJK Unified
197  /// Ideographs blocks.
198  static inline bool isCJK(utf32 cp);
199  static inline utf32 toLower(utf32 cp);
200  static inline utf32 toUpper(utf32 cp);
201 
202  // Return true if the character should act as a delimiter between words.
203  static inline bool isWordDelimiter(utf32 cp);
204 
205  // Returns true if the byte given is a valid UTF-8 octet (any position).
206  static inline bool isUTF8(utf8 octet);
207 
208  class transform
209  {
210  public:
211  /// Transform a code point from one to another. Pure virtual: each concrete transform supplies the mapping.
212  virtual utf32 transformCodepoint(utf32 cp) const = 0;
213 
214  virtual ~transform() {}; // virtual, so deleting through a transform pointer runs the derived destructor
215  };
216 
217 
218  /// Simple string iterator to iterate over an UTF-8 string and peel off
219  /// the code points in sequence. This iterator can always get the 'next'
220  /// code point, which is useful for functions which require sequential
221  /// code point pairs, such as kerning.
222  /// The iterator terminates when the current character is nil, invalid,
223  /// or at the 'end' pointer.
224  class iterator
225  {
226  public:
227  inline iterator();
228 
229  // Create a new iterator starting at 'start' and, optionally, ending
230  // at 'end'. If end is not given, the iterator will stop iterating
231  // at the terminating NUL or if it hits an invalid encoding. If 'end'
232  // is given it should be a valid pointer and follow 'start' in memory.
233  inline iterator(const utf8 *start, const utf8 *end = 0,
234  const UT_Unicode::transform *transform = 0);
235  inline iterator(const UT_StringView &str,
236  const UT_Unicode::transform *transform = 0);
237  inline iterator(const UT_String &str,
238  const UT_Unicode::transform *transform = 0);
239  inline iterator(const UT_StringRef &str,
240  const UT_Unicode::transform *transform = 0);
241 
242  inline void reset(const utf8 *to = 0);
243  inline bool advance();
244  inline bool retreat();
245 
246  /// Returns the pointer to the current UTF-8 sequence. Valid only while
247  /// atEnd() returns false.
248  const utf8 *at() const { return myCurrent; }
249 
250  // Returns a pointer to the next UTF-8 sequence, or the terminating NUL.
251  // Valid only while atEnd() returns false.
252  const utf8 *next() const { return myNext; }
253 
254  // Returns the code point of the encoding at the current location.
255  utf32 getCP() const { return myCP; }
256 
257  // Returns the next code point of the encoding following the current
258  // one, unless it happens to be invalid, in which case it returns zero.
259  utf32 getNextCP() const { return myNextCP; }
260 
261  bool atEnd() const { return myCP == 0; } // at the end once the current code point is zero
262 
263  iterator &operator++() { advance(); return *this; }
264  iterator &operator--() { retreat(); return *this; }
265 
266  private:
267  inline void init(const utf8 *start, const utf8 *end,
268  const UT_Unicode::transform *transform); // NOTE(review): this line was missing from the listing; restored to mirror the constructors' transform argument - confirm against UT_UnicodeImpl.h
269 
270  iterator &operator++(int); // Post-increment is verboten
271 
272  const utf8 *myStart, *myEnd;
273  const utf8 *myCurrent, *myNext, *myNext2;
274  utf32 myCP, myNextCP;
275  const UT_Unicode::transform *myTransform;
276  };
277 };
278 
279 #include "UT_UnicodeImpl.h"
280 
281 #endif // __UT_Unicode__
static bool isUpper(utf32 cp)
static bool isValidCodePoint(utf32 cp)
Definition: UT_Unicode.h:162
static bool isSpace(utf32 cp, bool break_only=true)
Helper functions for Unicode and the UTF-8 variable length encoding.
Definition: UT_Unicode.h:28
static bool isLower(utf32 cp)
GLuint start
Definition: glcorearb.h:475
unsigned short utf16
Definition: SYS_Types.h:56
static bool isAlpha(utf32 cp)
int64 exint
Definition: SYS_Types.h:125
static bool isControlChar(utf32 cp)
Definition: UT_Unicode.h:171
static bool isFromSupplementaryPlane(utf32 cp)
Definition: UT_Unicode.h:156
static bool isPunct(utf32 cp)
void reset(const utf8 *to=0)
iterator & operator++()
Definition: UT_Unicode.h:263
static bool isLatin1(utf32 cp)
Definition: UT_Unicode.h:182
static const utf8 * prev(const utf8 *start, const utf8 *current)
static bool isUTF8(utf8 octet)
A utility class to do read-only operations on a subset of an existing string.
Definition: UT_StringView.h:39
static const utf8 * convert(const utf8 *str, utf32 &cp)
static bool isDigit(utf32 cp)
static bool isWordDelimiter(utf32 cp)
unsigned int utf32
Definition: SYS_Types.h:58
static bool isSurrogatePair(utf32 cp)
Definition: UT_Unicode.h:151
GLuint GLuint end
Definition: glcorearb.h:475
static const utf8 * nextWord(const utf8 *start, const utf8 *current)
static exint count(const utf8 *start, const utf8 *end=0)
Returns the number of code points this variable encoding represents.
utf32 getNextCP() const
Definition: UT_Unicode.h:259
static const utf8 * next(const utf8 *current)
virtual utf32 transformCodepoint(utf32 cp) const =0
Transform a code point from one to another.
static bool isAlnum(utf32 cp)
utf32 getCP() const
Definition: UT_Unicode.h:255
static bool fixpos(const utf8 *start, const utf8 *&current)
static exint length(const utf8 *start, const utf8 *end=0)
const utf8 * next() const
Definition: UT_Unicode.h:252
static const utf8 * prevWord(const utf8 *start, const utf8 *current)
static bool isASCII(utf32 cp)
Definition: UT_Unicode.h:177
char utf8
Definition: SYS_Types.h:52
const utf8 * at() const
Definition: UT_Unicode.h:248
static bool isCJK(utf32 cp)
static const utf8 * find(utf32 cp, const utf8 *start, const utf8 *end=0)
bool atEnd() const
Definition: UT_Unicode.h:261
static utf32 replacementCodePoint()
Definition: UT_Unicode.h:143
static utf32 toLower(utf32 cp)
iterator & operator--()
Definition: UT_Unicode.h:264
unsigned char uchar
Definition: SYS_Types.h:42
static utf8 * duplicate(const utf8 *start, const utf8 *end=0)
static utf32 toUpper(utf32 cp)