HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UT_Unicode.h
Go to the documentation of this file.
1 /*
2  * PROPRIETARY INFORMATION. This software is proprietary to
3  * Side Effects Software Inc., and is not to be reproduced,
4  * transmitted, or disclosed in any way without written permission.
5  *
6  * NAME: UT_Unicode.h (RE Library, C++)
7  *
8  */
9 
10 #ifndef __UT_Unicode__
11 #define __UT_Unicode__
12 
13 #include <SYS/SYS_Types.h>
14 #include "UT_StringView.h"
15 
16 /// Specifies the maximum possible size, in bytes, of the UTF-8 encoding of
17 /// any single code point.
18 #define UT_UTF8_MAX_ENCODING_LEN 4
19 
20 // Helper to view unsigned char data (e.g. a literal) as UTF-8 without copying.
21 static inline const utf8 *UTF8(const uchar *lit)
22 {
23  return reinterpret_cast<const utf8 *>(lit); // pointer cast only; the bytes are untouched
24 }
25 
26 /// Helper functions for Unicode and the UTF-8 variable length encoding.
28 {
29 public:
30  /// Parses a code point from a UTF-8 encoding and returns it as a single
31  /// code point value. Returns a pointer to the next encoding if the
32  /// current one got successfully decoded. If the decoding fails, it
33  /// returns @c NULL and cp is set to zero.
34  static inline const utf8 *convert(const utf8 *str, utf32 &cp);
35 
36  /// Converts a code point to its UTF-8 encoding. If no buffer is given,
37  /// returns the number of characters needed to store the resulting
38  /// encoded sequence. Does not write out a terminating zero but moves the
39  /// pointer to where the next character after the sequence should be
40  /// written.
41  static inline int convert(utf32 cp, utf8 *str, exint buflen);
42 
43  /// Given a current location in a buffer, moves to the next character.
44  /// If the location is inside a UTF-8 multi-character encoding (i.e. not at
45  /// the beginning of one), it moves to the next encoded character start
46  /// after.
47  /// If the current location is already at the terminating NUL character
48  /// the function does nothing and just returns the current pointer.
49  /// If it is unable to move successfully to the next encoded character (e.g.
50  /// it's already at the end of the string, or the encoding is garbage and
51  /// no recovery is possible) the function returns NULL.
52  static inline const utf8 *next(const utf8 *current);
53  static inline utf8 *next(utf8 *current);
54 
55  /// Given a location in a buffer, moves to the previous character,
56  /// unless already at the beginning of the string, as defined by 'start'.
57  /// If the location is inside a UTF-8 multi-character encoding, it moves to
58  /// the beginning of that encoding.
59  /// If going back lands on an invalid character, it encounters bad
60  /// encoding (e.g. too many continuation bytes), or it's already at the
61  /// start, the function returns NULL.
62  static inline const utf8 *prev(const utf8 *start, const utf8 *current);
63  static inline utf8 *prev(const utf8 *start, utf8 *current);
64 
65  /// Given a pointer inside of a string representing variable length
66  /// encoding, moves the pointer so that it points to the beginning of the
67  /// encoding, if not there already.
68  /// Returns @c false if it was unable to fix the position and @c true if
69  /// successful or the position was already valid.
70  static inline bool fixpos(const utf8 *start, const utf8 *&current);
71  static inline bool fixpos(const utf8 *start, utf8 *&current);
72 
73  /// Returns the number of code points this variable encoding represents.
74  static inline exint count(const utf8 *start, const utf8 *end = 0);
75 
76  /// Returns the number of octets for this variable encoding. One octet
77  /// is the same as a byte for UTF-8 encodings.
78  static inline exint length(const utf8 *start, const utf8 *end = 0);
79 
80  /// Duplicates the string using malloc. Use free() to free the resulting
81  /// string. If a NULL pointer is passed, a NULL pointer is returned.
82  static inline utf8 *duplicate(const utf8 *start, const utf8 *end = 0);
83 
84  /// Find a code point in a variable length string and return a pointer to
85  /// it. An optional end point can be supplied, which delineates a search
86  /// range. Otherwise the string is searched up to the terminating NUL.
87  static inline const utf8 *find(utf32 cp,
88  const utf8 *start, const utf8 *end = 0);
89 
90  /// Find a UTF8 encoded string in another UTF8 encoded string and return
91  /// a pointer to the start of the match. Returns NULL if the string
92  /// was not found.
93  static inline const utf8 *find(const utf8 *str,
94  const utf8 *start, const utf8 *end = 0);
95 
96 
97  /// Parses a code point from a UTF-16 encoding and returns it as a single
98  /// code point value. Returns a pointer to the next encoding if the
99  /// current one got successfully decoded. If the decoding fails, it
100  /// returns @c NULL and cp is set to zero.
101  /// Set @c big_endian to true if the incoming UTF-16 string is encoded as
102  /// big endian (UTF-16BE).
103  static inline const utf16 *convert(const utf16 *str, utf32 &cp,
104  bool big_endian = false);
105 
106  /// Converts a code point to its UTF-16LE encoding into the buffer given.
107  /// If no buffer is given, or if the buffer size is too small, returns the
108  /// number of bytes needed to store the resulting encoded sequence.
109  /// @c buflen should be given in bytes, and not number of utf16 entries.
110  /// Does not write out a terminating zero but moves the pointer to where
111  /// the next character after the sequence should be written.
112  static inline int convert(utf32 cp, utf16 *str, exint buflen);
113 
114  /// Returns the replacement code point (0xFFFD). The convert functions
115  /// return it when they encounter an invalid, but recoverable, encoding.
116  static inline utf32 replacementCodePoint()
117  {
118  return 0xFFFD;
119  }
120 
121  /// Returns @c true if the code point lies in the range [0xD800, 0xDFFF].
122  /// Such a value is valid in UTF-16, where it is used to encode code points
123  /// greater than 0xFFFF. It is not a valid UTF-32 code point, however.
124  static inline bool isSurrogatePair(utf32 cp)
125  {
126  return cp >= 0xD800 && cp < 0xE000;
127  }
128  /// Returns @c true if the code point is in the range [0x10000, 0x10FFFF].
129  static inline bool isFromSupplementaryPlane(utf32 cp)
130  {
131  return cp >= 0x10000 && cp <= 0x10FFFF;
132  }
133 
134  /// Returns @c true if @c cp is at most 0x10FFFF and is not a surrogate value.
135  static inline bool isValidCodePoint(utf32 cp)
136  {
137  // Unicode is specified up to 0x10FFFF. Surrogate pairs are only valid
138  // for UTF-16, not for UTF-32.
139  return !isSurrogatePair(cp) && cp <= 0x10FFFF;
140  }
141 
142  /// Returns @c true if the code point is at most 0x1F or equals 0x7F
143  /// (newline, tab, etc). NOTE(review): 0x80-0x9F is not treated as control.
144  static inline bool isControlChar(utf32 cp)
145  {
146  return cp <= 0x1F || cp == 0x7F;
147  }
148 
149  /// Character set check: returns @c true if the code point is at most 0x7F.
150  static inline bool isASCII(utf32 cp)
151  {
152  return cp <= 0x7F;
153  }
154  /// Character set check: returns @c true if the code point is at most 0xFF.
155  static inline bool isLatin1(utf32 cp)
156  {
157  return cp <= 0xFF;
158  }
159 
160  // Character type queries.
161  static inline bool isSpace(utf32 cp, bool break_only = true);
162  static inline bool isDigit(utf32 cp);
163  static inline bool isAlpha(utf32 cp);
164  static inline bool isAlnum(utf32 cp);
165  static inline bool isPunct(utf32 cp);
166  static inline bool isUpper(utf32 cp);
167  static inline bool isLower(utf32 cp);
168 
169  /// Returns true if the character is from any of the Unicode CJK Unified
170  /// Ideographs blocks.
171  static inline bool isCJK(utf32 cp);
172  static inline utf32 toLower(utf32 cp);
173  static inline utf32 toUpper(utf32 cp);
174 
175  // Returns true if the byte given is a valid UTF-8 octet (any position).
176  static inline bool isUTF8(utf8 octet);
177  /// Interface for mapping code points. The iterator constructors accept an optional pointer to one.
178  class transform
179  {
180  public:
181  /// Maps the given code point to another one. Pure virtual, so subclasses must implement it.
182  virtual utf32 transformCodepoint(utf32 cp) const = 0;
183 
184  virtual ~transform() {}; // virtual so subclasses can be destroyed through a transform pointer
185  };
186 
187 
188  /// Simple string iterator to iterate over a UTF-8 string and peel off
189  /// the code points in sequence. This iterator can always get the 'next'
190  /// code point, which is useful for functions which require sequential
191  /// code point pairs, such as kerning.
192  /// The iterator terminates when the current character is nil, invalid,
193  /// or at the 'end' pointer.
194  class iterator
195  {
196  public:
197  inline iterator();
198 
199  // Creates a new iterator starting at 'start' and, optionally, ending
200  // at 'end'. If 'end' is not given, iteration stops at the terminating
201  // NUL or when it hits an invalid encoding. If 'end' is given, it
202  // should be a valid pointer and follow 'start' in memory.
203  inline iterator(const utf8 *start, const utf8 *end = 0,
204  const UT_Unicode::transform *transform = 0);
205 
206  inline iterator(const UT_StringView &str,
207  const UT_Unicode::transform *transform = 0);
208 
209  inline void reset(const utf8 *to = 0);
210  inline bool advance();
211  inline bool retreat();
212 
213  /// Returns the pointer to the current UTF-8 sequence. Presumably only
214  /// valid while atEnd() is false (the original wording read inverted) -- confirm.
215  const utf8 *at() const { return myCurrent; }
216 
217  // Returns a pointer to the next UTF-8 sequence, or the terminating NUL.
218  // Presumably only valid while atEnd() is false (original wording read inverted) -- confirm.
219  const utf8 *next() const { return myNext; }
220 
221  // Returns the code point of the encoding at the current location.
222  utf32 getCP() const { return myCP; }
223 
224  // Returns the next code point of the encoding following the current
225  // one, unless it happens to be invalid, in which case it returns zero.
226  utf32 getNextCP() const { return myNextCP; }
227 
228  bool atEnd() const { return myCP == 0; } // true once the current code point is zero
229 
230  iterator &operator++() { advance(); return *this; } // pre-increment; the result of advance() is discarded
231  iterator &operator--() { retreat(); return *this; } // pre-decrement; the result of retreat() is discarded
232 
233  private:
234  inline void init(const utf8 *start, const utf8 *end, // NOTE(review): listing line 235 (rest of this declaration) is missing from this extract
236 
237  iterator &operator++(int); // Post-increment is verboten: declared private so callers cannot use it
238 
239  const utf8 *myStart, *myEnd;
240  const utf8 *myCurrent, *myNext, *myNext2;
241  utf32 myCP, myNextCP;
242  const UT_Unicode::transform *myTransform;
243  };
244 };
245 
246 #include "UT_UnicodeImpl.h"
247 
248 #endif // __UT_Unicode__
static bool isUpper(utf32 cp)
unsigned char uchar
Definition: SYS_Types.h:38
static bool isValidCodePoint(utf32 cp)
Definition: UT_Unicode.h:135
static bool isSpace(utf32 cp, bool break_only=true)
char utf8
Definition: SYS_Types.h:47
Helper functions for Unicode and the UTF-8 variable length encoding.
Definition: UT_Unicode.h:27
unsigned int utf32
Definition: SYS_Types.h:49
static bool isLower(utf32 cp)
GLuint start
Definition: glcorearb.h:474
static bool isAlpha(utf32 cp)
static bool isControlChar(utf32 cp)
Definition: UT_Unicode.h:144
static bool isFromSupplementaryPlane(utf32 cp)
Definition: UT_Unicode.h:129
static bool isPunct(utf32 cp)
void reset(const utf8 *to=0)
iterator & operator++()
Definition: UT_Unicode.h:230
static bool isLatin1(utf32 cp)
Definition: UT_Unicode.h:155
static const utf8 * prev(const utf8 *start, const utf8 *current)
static bool isUTF8(utf8 octet)
A utility class to do read-only operations on a subset of an existing string.
Definition: UT_StringView.h:31
static const utf8 * convert(const utf8 *str, utf32 &cp)
static bool isDigit(utf32 cp)
int64 exint
Definition: SYS_Types.h:116
static bool isSurrogatePair(utf32 cp)
Definition: UT_Unicode.h:124
GLuint GLuint end
Definition: glcorearb.h:474
static exint count(const utf8 *start, const utf8 *end=0)
Returns the number of code points this variable encoding represents.
utf32 getNextCP() const
Definition: UT_Unicode.h:226
static const utf8 * next(const utf8 *current)
virtual utf32 transformCodepoint(utf32 cp) const =0
Transform a code point from one to another.
static bool isAlnum(utf32 cp)
utf32 getCP() const
Definition: UT_Unicode.h:222
static bool fixpos(const utf8 *start, const utf8 *&current)
static exint length(const utf8 *start, const utf8 *end=0)
const utf8 * next() const
Definition: UT_Unicode.h:219
unsigned short utf16
Definition: SYS_Types.h:48
static bool isASCII(utf32 cp)
Definition: UT_Unicode.h:150
const utf8 * at() const
Definition: UT_Unicode.h:215
static bool isCJK(utf32 cp)
static const utf8 * find(utf32 cp, const utf8 *start, const utf8 *end=0)
bool atEnd() const
Definition: UT_Unicode.h:228
static utf32 replacementCodePoint()
Definition: UT_Unicode.h:116
static utf32 toLower(utf32 cp)
iterator & operator--()
Definition: UT_Unicode.h:231
static utf8 * duplicate(const utf8 *start, const utf8 *end=0)
static utf32 toUpper(utf32 cp)