HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_Unicode.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_Unicode.h (RE Library, C++)
7  *
8  */
9 
10 #ifndef __UT_Unicode__
11 #define __UT_Unicode__
12 
13 #include <SYS/SYS_Types.h>
14 #include "UT_StringView.h"
15 #include "UT_StringHolder.h"
16 
17 /// Specifies the maximum possible size of a UTF8 encoding for any given
18 /// code point.
19 #define UT_UTF8_MAX_ENCODING_LEN 4
20 
21 // Helper function to reinterpret a pointer to unsigned char data (such as a literal) as a pointer to UTF-8 data. No bytes are copied or converted.
22 static inline const utf8 *UTF8(const uchar *lit)
23 {
24  return reinterpret_cast<const utf8 *>(lit);
25 }
26 
27 /// Helper functions for Unicode and the UTF-8 variable length encoding.
29 {
30 public:
31  /// Parses a code point from a UTF-8 encoding and returns it as a single
32  /// code point value. Returns a pointer to the next encoding if the
33  /// current one got successfully decoded. If the decoding fails, it
34  /// returns @c NULL and cp is set to zero.
35  static inline const utf8 *convert(const utf8 *str, utf32 &cp);
36 
37  /// Converts a code point to its UTF-8 encoding. If no buffer is given,
38  /// returns the number of characters needed to store the resulting
39  /// encoded sequence. Does not write out a terminating zero but moves the
40  /// pointer to where the next character after the sequence should be
41  /// written.
42  static inline int convert(utf32 cp, utf8 *str, exint buflen);
43 
44  /// Given a current location in a buffer, moves to the next character.
45  /// If the location is inside a UTF-8 multi-character encoding (i.e. not at
46  /// the beginning of one), it moves to the next encoded character start
47  /// after.
48  /// If the current location is already at the terminating NUL character
49  /// the function does nothing and just returns the current pointer.
50  /// If it is unable to move successfully to the next encoded character (e.g.
51  /// it's already at the end of the string, or the encoding is garbage and
52  /// no recovery is possible) the function returns NULL.
53  static inline const utf8 *next(const utf8 *current);
54  static inline utf8 *next(utf8 *current);
55 
56  /// Given a location in a buffer, moves to the previous character,
57  /// unless already at the beginning of the string, as defined by 'start'.
58  /// If the location is inside a UTF-8 multi-character encoding, it moves to
59  /// the beginning of that encoding.
60  /// If going back lands on an invalid character, it encounters bad
61  /// encoding (e.g. too many continuation bytes), or it's already at the
62  /// start, the function returns NULL.
63  static inline const utf8 *prev(const utf8 *start, const utf8 *current);
64  static inline utf8 *prev(const utf8 *start, utf8 *current);
65 
66  /// Given a pointer inside of a string representing variable length
67  /// encoding, moves the pointer so that it points to the beginning of the
68  /// encoding, if not there already.
69  /// Returns @c false if it was unable to fix the position and @c true if
70  /// successful or the position was already valid.
71  static inline bool fixpos(const utf8 *start, const utf8 *&current);
72  static inline bool fixpos(const utf8 *start, utf8 *&current);
73 
74  /// Returns the number of code points this variable encoding represents.
75  static inline exint count(const utf8 *start, const utf8 *end = 0);
76 
77  /// Returns the number of octets for this variable encoding. One octet
78  /// is the same as a byte for UTF-8 encodings.
79  static inline exint length(const utf8 *start, const utf8 *end = 0);
80 
81  /// Duplicates the string using malloc. Use free() to free the resulting
82  /// string. If a NULL pointer is passed, a NULL pointer is returned.
83  static inline utf8 *duplicate(const utf8 *start, const utf8 *end = 0);
84 
85  /// Find a code point in a variable length string and return a pointer to
86  /// it. An optional end point can be supplied, which delineates a search
87  /// range. Otherwise the string is searched up to the terminating NUL.
88  static inline const utf8 *find(utf32 cp,
89  const utf8 *start, const utf8 *end = 0);
90 
91  /// Find a UTF8 encoded string in another UTF8 encoded string and return
92  /// a pointer to the start of the match. Returns NULL if the string
93  /// was not found.
94  static inline const utf8 *find(const utf8 *str,
95  const utf8 *start, const utf8 *end = 0);
96 
97 
98  /// Parses a code point from a UTF-16 encoding and returns it as a single
99  /// code point value. Returns a pointer to the next encoding if the
100  /// current one got successfully decoded. If the decoding fails, it
101  /// returns @c NULL and cp is set to zero.
102  /// Set @c big_endian to true if the incoming UTF-16 string is encoded as
103  /// big endian (UTF-16BE).
104  static inline const utf16 *convert(const utf16 *str, utf32 &cp,
105  bool big_endian = false);
106 
107  /// Converts a code point to its UTF-16LE encoding into the buffer given.
108  /// If no buffer is given, or if the buffer size is too small, returns the
109  /// number of bytes needed to store the resulting encoded sequence.
110  /// @c buflen should be given in bytes, and not number of utf16 entries.
111  /// Does not write out a terminating zero but moves the pointer to where
112  /// the next character after the sequence should be written.
113  static inline int convert(utf32 cp, utf16 *str, exint buflen);
114 
115  /// Returns the Unicode replacement character (0xFFFD), which is returned by
116  /// the convert functions when they encounter an invalid, but recoverable, encoding.
117  static inline utf32 replacementCodePoint()
118  {
119  return 0xFFFD;
120  }
121 
122  /// Returns @c true if the code point given lies in the surrogate range
123  /// (0xD800 up to, but not including, 0xE000). Such a value is valid in UTF-16,
124  /// where it is used to encode greater-than 0xFFFF code points. It is not a valid UTF-32 code point, however.
125  static inline bool isSurrogatePair(utf32 cp)
126  {
127  return cp >= 0xD800 && cp < 0xE000;
128  }
129 
130  static inline bool isFromSupplementaryPlane(utf32 cp)
131  {
132  return cp >= 0x10000 && cp <= 0x10FFFF; // true for 0x10000 through 0x10FFFF inclusive, i.e. above the 16-bit range
133  }
134 
135  // Returns true if the given value is a valid Unicode code point: at most 0x10FFFF and not in the surrogate range.
136  static inline bool isValidCodePoint(utf32 cp)
137  {
138  // Unicode is specified up to 0x10FFFF. Surrogate pairs are only valid
139  // for UTF-16, not for UTF-32.
140  return !isSurrogatePair(cp) && cp <= 0x10FFFF;
141  }
142 
143  // Returns true if the code point represents a control character (newline,
144  // tab, etc). Only values up to 0x1F, plus 0x7F, are reported as control characters.
145  static inline bool isControlChar(utf32 cp)
146  {
147  return cp <= 0x1F || cp == 0x7F;
148  }
149 
150  // Code point validation for different character sets.
151  static inline bool isASCII(utf32 cp)
152  {
153  return cp <= 0x7F; // true for code points 0x00 through 0x7F
154  }
155 
156  static inline bool isLatin1(utf32 cp)
157  {
158  return cp <= 0xFF; // true for code points 0x00 through 0xFF
159  }
160 
161  // Character type queries.
162  static inline bool isSpace(utf32 cp, bool break_only = true);
163  static inline bool isDigit(utf32 cp);
164  static inline bool isAlpha(utf32 cp);
165  static inline bool isAlnum(utf32 cp);
166  static inline bool isPunct(utf32 cp);
167  static inline bool isUpper(utf32 cp);
168  static inline bool isLower(utf32 cp);
169 
170  /// Returns true if the character is from any of the Unicode CJK Unified
171  /// Ideographs blocks.
172  static inline bool isCJK(utf32 cp);
173  static inline utf32 toLower(utf32 cp);
174  static inline utf32 toUpper(utf32 cp);
175 
176  // Returns true if the character should act as a delimiter between words.
177  static inline bool isWordDelimiter(utf32 cp);
178 
179  // Returns true if the byte given is a valid UTF-8 octet (any position).
180  static inline bool isUTF8(utf8 octet);
181 
182  class transform // Interface for mapping code points; a pointer to one is accepted by the iterator constructors.
183  {
184  public:
185  /// Transform a code point from one to another. Pure virtual: subclasses supply the mapping.
186  virtual utf32 transformCodepoint(utf32 cp) const = 0;
187 
188  virtual ~transform() {}; // virtual destructor, as this is a polymorphic base class
189  };
190 
191 
192  /// Simple string iterator to iterate over a UTF-8 string and peel off
193  /// the code points in sequence. This iterator can always get the 'next'
194  /// code point, which is useful for functions which require sequential
195  /// code point pairs, such as kerning.
196  /// The iterator terminates when the current character is nil, invalid,
197  /// or at the 'end' pointer.
198  class iterator
199  {
200  public:
201  inline iterator();
202 
203  // Create a new iterator starting at 'start' and, optionally, ending
204  // at 'end'. If 'end' is not given, the iterator will stop
205  // at the terminating NUL or if it hits an invalid encoding. If 'end'
206  // is given it should be a valid pointer and follow 'start' in memory.
207  inline iterator(const utf8 *start, const utf8 *end = 0,
208  const UT_Unicode::transform *transform = 0);
209  inline iterator(const UT_StringView &str,
210  const UT_Unicode::transform *transform = 0);
211  inline iterator(const UT_String &str,
212  const UT_Unicode::transform *transform = 0);
213  inline iterator(const UT_StringRef &str,
214  const UT_Unicode::transform *transform = 0);
215 
216  inline void reset(const utf8 *to = 0);
217  inline bool advance();
218  inline bool retreat();
219 
220  /// Returns the pointer to the current UTF-8 sequence. Valid only while atEnd
221  /// is false. NOTE(review): this read 'is not false', presumably a slip; confirm.
222  const utf8 *at() const { return myCurrent; }
223 
224  // Returns a pointer to the next UTF-8 sequence, or the terminating NUL.
225  // Valid only while atEnd is false. NOTE(review): this read 'is not false', presumably a slip; confirm.
226  const utf8 *next() const { return myNext; }
227 
228  // Returns the code point of the encoding at the current location.
229  utf32 getCP() const { return myCP; }
230 
231  // Returns the next code point of the encoding following the current
232  // one, unless it happens to be invalid, in which case it returns zero.
233  utf32 getNextCP() const { return myNextCP; }
234 
235  bool atEnd() const { return myCP == 0; } // true once the current code point is zero
236 
237  iterator &operator++() { advance(); return *this; } // pre-increment: forwards to advance()
238  iterator &operator--() { retreat(); return *this; } // pre-decrement: forwards to retreat()
239 
240  private:
241  inline void init(const utf8 *start, const utf8 *end, // NOTE(review): the rest of this declaration (listing line 242) is missing from this listing
243 
244  iterator &operator++(int); // Post-increment is deliberately unavailable: declared private
245 
246  const utf8 *myStart, *myEnd;
247  const utf8 *myCurrent, *myNext, *myNext2;
248  utf32 myCP, myNextCP;
249  const UT_Unicode::transform *myTransform;
250  };
251 };
252 
253 #include "UT_UnicodeImpl.h"
254 
255 #endif // __UT_Unicode__
static bool isUpper(utf32 cp)
static bool isValidCodePoint(utf32 cp)
Definition: UT_Unicode.h:136
static bool isSpace(utf32 cp, bool break_only=true)
Helper functions for Unicode and the UTF-8 variable length encoding.
Definition: UT_Unicode.h:28
static bool isLower(utf32 cp)
unsigned short utf16
Definition: SYS_Types.h:56
static bool isAlpha(utf32 cp)
int64 exint
Definition: SYS_Types.h:125
static bool isControlChar(utf32 cp)
Definition: UT_Unicode.h:145
static bool isFromSupplementaryPlane(utf32 cp)
Definition: UT_Unicode.h:130
static bool isPunct(utf32 cp)
void reset(const utf8 *to=0)
iterator & operator++()
Definition: UT_Unicode.h:237
static bool isLatin1(utf32 cp)
Definition: UT_Unicode.h:156
static const utf8 * prev(const utf8 *start, const utf8 *current)
static bool isUTF8(utf8 octet)
A utility class to do read-only operations on a subset of an existing string.
Definition: UT_StringView.h:40
static const utf8 * convert(const utf8 *str, utf32 &cp)
static bool isDigit(utf32 cp)
static bool isWordDelimiter(utf32 cp)
unsigned int utf32
Definition: SYS_Types.h:58
static bool isSurrogatePair(utf32 cp)
Definition: UT_Unicode.h:125
GLuint GLuint end
Definition: glew.h:1253
static exint count(const utf8 *start, const utf8 *end=0)
Returns the number of code points this variable encoding represents.
utf32 getNextCP() const
Definition: UT_Unicode.h:233
static const utf8 * next(const utf8 *current)
virtual utf32 transformCodepoint(utf32 cp) const =0
Transform a code point from one to another.
GLuint start
Definition: glew.h:1253
static bool isAlnum(utf32 cp)
utf32 getCP() const
Definition: UT_Unicode.h:229
static bool fixpos(const utf8 *start, const utf8 *&current)
static exint length(const utf8 *start, const utf8 *end=0)
const utf8 * next() const
Definition: UT_Unicode.h:226
static bool isASCII(utf32 cp)
Definition: UT_Unicode.h:151
char utf8
Definition: SYS_Types.h:52
const utf8 * at() const
Definition: UT_Unicode.h:222
static bool isCJK(utf32 cp)
static const utf8 * find(utf32 cp, const utf8 *start, const utf8 *end=0)
bool atEnd() const
Definition: UT_Unicode.h:235
static utf32 replacementCodePoint()
Definition: UT_Unicode.h:117
static utf32 toLower(utf32 cp)
iterator & operator--()
Definition: UT_Unicode.h:238
unsigned char uchar
Definition: SYS_Types.h:42
static utf8 * duplicate(const utf8 *start, const utf8 *end=0)
static utf32 toUpper(utf32 cp)