HDK
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
unicodeUtils.h
Go to the documentation of this file.
1 //
2 // Copyright 2023 Pixar
3 //
4 // Licensed under the terms set forth in the LICENSE.txt file available at
5 // https://openusd.org/license.
6 //
7 #ifndef PXR_BASE_TF_UNICODE_UTILS_H
8 #define PXR_BASE_TF_UNICODE_UTILS_H
9 
10 /// \file tf/unicodeUtils.h
11 /// \ingroup group_tf_String
12 /// Definitions of basic UTF-8 utilities in tf.
13 
14 #include "pxr/pxr.h"
15 #include "pxr/base/tf/api.h"
16 #include "pxr/base/tf/diagnostic.h"
17 
18 #include <ostream>
19 #include <string>
20 #include <string_view>
21 
23 
24 /// \class TfUtf8CodePoint
25 /// \ingroup group_tf_String
26 ///
27 /// Wrapper for a 32-bit code point value that can be encoded as UTF-8.
28 ///
29 /// \code{.cpp}
30 /// // Stream operator overload encodes each code point as UTF-8.
31 /// std::stringstream s;
32 /// s << TfUtf8CodePoint(8747) << " " << TfUtf8CodePoint(120);
33 /// \endcode
34 /// A single `TfUtf8CodePoint` may be converted to a string using
35 /// `TfStringify` as well.
37 public:
38  /// Code points that cannot be decoded or are outside of the valid range
39  /// will be replaced with this value.
40  static constexpr uint32_t ReplacementValue = 0xFFFD;
41 
42  /// Values higher than this will be replaced with the replacement
43  /// code point.
44  static constexpr uint32_t MaximumValue = 0x10FFFF;
45 
46  /// Values in this range (inclusive) cannot be constructed and will be
47  /// replaced by the replacement code point.
48  static constexpr std::pair<uint32_t, uint32_t>
49  SurrogateRange = {0xD800, 0xDFFF};
50 
51  /// Construct a code point initialized to the replacement value
52  constexpr TfUtf8CodePoint() = default;
53 
54  /// Construct a code point from \a value. Values above `MaximumValue` or
55  /// inside `SurrogateRange` (inclusive) are stored as `ReplacementValue`.
56  constexpr explicit TfUtf8CodePoint(uint32_t value) :
57  _value(((value <= MaximumValue) &&
58  ((value < SurrogateRange.first) ||
59  (value > SurrogateRange.second))) ?
60  value : ReplacementValue) {}
61 
/// Returns the stored code point as a raw 32-bit unsigned value.
62  constexpr uint32_t AsUInt32() const { return _value; }
63 
/// Two code points compare equal when their stored values are equal.
64  friend constexpr bool operator==(const TfUtf8CodePoint left,
65  const TfUtf8CodePoint right) {
66  return left._value == right._value;
67  }
/// Two code points compare unequal when their stored values differ.
68  friend constexpr bool operator!=(const TfUtf8CodePoint left,
69  const TfUtf8CodePoint right) {
70  return left._value != right._value;
71  }
72 
73 private:
74  uint32_t _value{ReplacementValue};
75 };
76 
/// Stream insertion for `TfUtf8CodePoint`. Per the class documentation, the
/// code point is written to the stream encoded as UTF-8.
77 TF_API std::ostream& operator<<(std::ostream&, const TfUtf8CodePoint);
78 
79 /// The replacement code point can be used to signal that a code point could
80 /// not be decoded and needed to be replaced.
81 constexpr TfUtf8CodePoint TfUtf8InvalidCodePoint{
83 
84 /// Constructs a TfUtf8CodePoint from an ASCII character (0-127).
86 {
87  return static_cast<unsigned char>(value) < 128 ?
88  TfUtf8CodePoint(static_cast<unsigned char>(value)) :
89  TfUtf8InvalidCodePoint;
90 }
91 
92 /// Defines an iterator over a UTF-8 encoded string that extracts unicode
93 /// code point values.
94 ///
95 /// UTF-8 is a variable length encoding, meaning that one Unicode
96 /// code point can be encoded in UTF-8 as 1, 2, 3, or 4 bytes. This
97 /// iterator takes care of consuming the valid UTF-8 bytes for a
98 /// code point while incrementing.
100 public:
101  using iterator_category = std::forward_iterator_tag;
103  using difference_type = std::ptrdiff_t;
104  using pointer = void;
106 
107  /// Empty tag type modelling the end of iteration: an iterator compares
108  /// equal to it once the iterator's own end condition has been met.
109  class PastTheEndSentinel final {};
110 
111  /// Constructs an iterator that can read UTF-8 character sequences from
112  /// the given starting string_view iterator \a it. \a end is used as a
113  /// guard against reading byte sequences past the end of the source string.
114  ///
115  /// When working with views of substrings, \a end must not point to a
116  /// continuation byte in a valid UTF-8 byte sequence to avoid decoding
117  /// errors.
120  const std::string_view::const_iterator& end) : _it(it), _end(end) {
121  TF_DEV_AXIOM(_it <= _end);
122  }
123 
124  /// Retrieves the current UTF-8 character in the sequence as its Unicode
125  /// code point value. Returns `TfUtf8InvalidCodePoint` when the
126  /// byte sequence pointed to by the iterator cannot be decoded.
127  ///
128  /// A code point might be invalid because it's incorrectly encoded, exceeds
129  /// the maximum allowed value, or is in the disallowed surrogate range.
131  {
132  return TfUtf8CodePoint{_GetCodePoint()};
133  }
134 
135  /// Retrieves the wrapped string iterator.
137  {
138  return this->_it;
139  }
140 
141  /// Determines if two iterators wrap the same underlying string position.
142  /// This intentionally does not consider the end iterator to allow for
143  /// comparison of iterators between different substring views of the
144  /// same underlying string.
145  bool operator== (const TfUtf8CodePointIterator& rhs) const
146  {
147  return (this->_it == rhs._it);
148  }
149 
150  /// Determines if two iterators wrap different underlying string positions.
151  /// This intentionally does not consider the end iterator to allow for
152  /// comparison of iterators between different substring views of the
153  /// same underlying string.
154  bool operator!= (const TfUtf8CodePointIterator& rhs) const
155  {
156  return (this->_it != rhs._it);
157  }
158 
159  /// Advances the iterator logically one UTF-8 character sequence in
160  /// the string. The underlying string iterator will be advanced
161  /// according to the variable length encoding of the next UTF-8
162  /// character, but will never consume non-continuation bytes after
163  /// the current one.
165  {
166  // The increment operator should never be called if it's past
167  // the end. The user is expected to have already checked this
168  // condition.
169  TF_DEV_AXIOM(!_IsPastTheEnd());
170  _EncodingLength increment = _GetEncodingLength();
171  // Note that in cases where the encoding is invalid, we move to the
172  // next byte. This is necessary because otherwise the iterator would
173  // never advance and the end condition of == iterator::end() would
174  // never be satisfied. This means that we increment, even if the
175  // encoding length is 0.
176  ++_it;
177  // Only continuation bytes will be consumed after the first byte.
178  // This avoids consumption of ASCII characters or other starting bytes.
179  auto isContinuation = [](const char c) {
180  const auto uc = static_cast<unsigned char>(c);
181  return (uc >= static_cast<unsigned char>('\x80')) &&
182  (uc < static_cast<unsigned char>('\xc0'));
183  };
184  while ((increment > 1) && !_IsPastTheEnd() && isContinuation(*_it)) {
185  ++_it;
186  --increment;
187  }
188  return *this;
189  }
190 
191  /// Advances the iterator logically one UTF-8 character sequence in
192  /// the string. The underlying string iterator will be advanced
193  /// according to the variable length encoding of the next UTF-8
194  /// character, but will never consume non-continuation bytes after
195  /// the current one.
197  {
198  auto temp = *this;
199  ++(*this);
200  return temp;
201  }
202 
203  /// Checks if the `lhs` iterator is at or past the end for the
204  /// underlying `string_view`
205  friend bool operator==(const TfUtf8CodePointIterator& lhs,
207  {
208  return lhs._IsPastTheEnd();
209  }
210 
212  const TfUtf8CodePointIterator& rhs)
213  {
214  return rhs == lhs;
215  }
216 
/// Negation of the iterator/sentinel equality check: true while the
/// `lhs` iterator does not compare equal to the past-the-end sentinel.
217  friend bool operator!=(const TfUtf8CodePointIterator& lhs,
218  PastTheEndSentinel rhs)
219  {
220  return !(lhs == rhs);
221  }
223  const TfUtf8CodePointIterator& rhs)
224  {
225  return !(lhs == rhs);
226  }
227 
228 private:
229  using _EncodingLength = unsigned char;
230 
231  // Classifies the byte currently pointed to by the iterator as a UTF-8
232  // lead byte and returns the sequence length it announces: 1, 2, 3, or
233  // 4. Returns 0 when the iterator is at or past the end, or when the
234  // byte is in 0x80-0xBF or is 0xF8 or greater. Only this byte is read.
235  _EncodingLength _GetEncodingLength() const
236  {
237  // already at the end, there is no byte to inspect
238  if (_IsPastTheEnd())
239  {
240  return 0;
241  }
242  // determine the encoding length from the high bits of the lead byte
243  // 1-byte sequences start with the bit pattern 0xxxxxxx
244  // 2-byte sequences start with the bit pattern 110xxxxx
245  // 3-byte sequences start with the bit pattern 1110xxxx
246  // 4-byte sequences start with the bit pattern 11110xxx
247  unsigned char x = static_cast<unsigned char>(*_it);
248  if (x < 0x80)
249  {
250  return 1;
251  }
252  else if ((x >= 0xc0) && (x < 0xe0))
253  {
254  return 2;
255  }
256  else if ((x >= 0xe0) && (x < 0xf0))
257  {
258  return 3;
259  }
260  else if ((x >= 0xf0) && (x < 0xf8))
261  {
262  return 4;
263  }
264  else
265  {
266  // 0x80-0xBF or >= 0xF8: not a usable lead byte, report an error
267  return 0;
268  }
269  }
270 
271  // Retrieves the Unicode code point of the UTF-8 encoded sequence at
272  // the iterator's current position and returns it as a raw 32-bit
273  // value; operator* wraps the result in a TfUtf8CodePoint. The
274  // definition is not in this header. NOTE(review): presumably yields the
275  // replacement value for undecodable input, as operator* documents; confirm.
276  TF_API uint32_t _GetCodePoint() const;
277 
278  // Returns true if the iterator is at or past the end and can no longer
279  // be dereferenced.
280  bool _IsPastTheEnd() const
281  {
282  return _it >= _end;
283  }
284 
287 };
288 
289 /// \class TfUtf8CodePointView
290 /// \ingroup group_tf_String
291 ///
292 /// Wrapper for a UTF-8 encoded `std::string_view` that can be iterated over
293 /// as code points instead of bytes.
294 ///
295 /// Because of the variable length encoding, the `TfUtf8CodePointView` iterator is
296 /// a ForwardIterator and is read only.
297 ///
298 /// \code{.cpp}
299 /// std::string value{"∫dx"};
300 /// for (const auto codePoint : TfUtf8CodePointView{value}) {
301 /// if (codePoint == TfUtf8InvalidCodePoint) {
302 /// TF_WARN("String cannot be decoded.");
303 /// break;
304 /// }
305 /// }
306 /// \endcode
307 ///
308 /// The `TfUtf8CodePointView`'s sentinel `end()` is compatible with range
309 /// based for loops and the forthcoming STL ranges library; it avoids
310 /// triplicating the storage for the end iterator. `EndAsIterator()`
311 /// can be used for algorithms that require the begin and end iterators to be
312 /// of the same type but necessarily stores redundant copies of the endpoint.
313 ///
314 /// \code{.cpp}
315 /// if (std::any_of(std::cbegin(codePointView), codePointView.EndAsIterator(),
316 /// [](const auto c) { return c == TfUtf8InvalidCodePoint; }))
317 /// {
318 /// TF_WARN("String cannot be decoded");
319 /// }
320 /// \endcode
321 class TfUtf8CodePointView final {
322 public:
324 
/// Constructs a view over a default-constructed (empty) `std::string_view`.
325  TfUtf8CodePointView() = default;
/// Constructs a code point view over \a view. Only the `std::string_view`
/// is stored, so the referenced bytes must outlive this object.
326  explicit TfUtf8CodePointView(const std::string_view& view) : _view(view) {}
327 
/// Returns a code point iterator positioned at the start of the
/// underlying `string_view` and bounded by its end.
328  inline const_iterator begin() const
329  {
330  return const_iterator{std::cbegin(_view), std::cend(_view)};
331  }
332 
333  /// The sentinel will compare as equal to any iterator at the end
334  /// of the underlying `string_view`
336  {
338  }
339 
/// Equivalent to `begin()`; iteration over the view is always read only.
340  inline const_iterator cbegin() const
341  {
342  return begin();
343  }
344 
345  /// The sentinel will compare as equal to any iterator at the end
346  /// of the underlying `string_view`
348  {
349  return end();
350  }
351 
352  /// Returns true if the underlying `string_view` is empty.
353  bool empty() const
354  {
355  return _view.empty();
356  }
357 
358  /// Returns an iterator of the same type as `begin` that identifies the end
359  /// of the string.
360  ///
361  /// As the end iterator is stored three times, this is slightly heavier
362  /// than using the `PastTheEndSentinel` and should be avoided in performance
363  /// critical code paths. It is provided for convenience when an algorithm
364  /// restricts the iterators to have the same type.
365  ///
366  /// As C++20 ranges exposes more sentinel friendly algorithms, this can
367  /// likely be deprecated in the future.
369  {
370  return const_iterator(std::cend(_view), std::cend(_view));
371  }
372 
373 private:
374  std::string_view _view;
375 };
376 
377 /// Determines whether the given Unicode \a codePoint is in the XID_Start
378 /// character class.
379 ///
380 /// The XID_Start class of characters is derived from the Unicode
381 /// General_Category of uppercase letters, lowercase letters, titlecase
382 /// letters, modifier letters, other letters, letter numbers, plus
383 /// Other_ID_Start, minus Pattern_Syntax and Pattern_White_Space code points.
384 /// That is, the character must have a category of Lu | Ll | Lt | Lm | Lo | Nl
385 ///
386 TF_API
387 bool TfIsUtf8CodePointXidStart(uint32_t codePoint);
388 
389 /// Determines whether the given Unicode \a codePoint is in the XID_Start
390 /// character class by forwarding its raw value to the `uint32_t` overload.
391 /// \overload
392 ///
393 inline bool TfIsUtf8CodePointXidStart(const TfUtf8CodePoint codePoint)
394 {
395  return TfIsUtf8CodePointXidStart(codePoint.AsUInt32());
396 }
397 
398 /// Determines whether the given Unicode \a codePoint is in the XID_Continue
399 /// character class.
400 ///
401 /// The XID_Continue class of characters includes those in XID_Start plus
402 /// characters having the Unicode General Category of nonspacing marks,
403 /// spacing combining marks, decimal numbers, and connector punctuation.
404 /// That is, the character must have a category of
405 /// XID_Start | Nd | Mn | Mc | Pc
406 ///
407 TF_API
408 bool TfIsUtf8CodePointXidContinue(uint32_t codePoint);
409 
410 /// Determines whether the given Unicode \a codePoint is in the XID_Continue
411 /// character class by forwarding its raw value to the `uint32_t` overload.
412 /// \overload
413 ///
414 inline bool TfIsUtf8CodePointXidContinue(const TfUtf8CodePoint codePoint)
415 {
416  return TfIsUtf8CodePointXidContinue(codePoint.AsUInt32());
417 }
418 
420 
421 #endif // PXR_BASE_TF_UNICODE_UTILS_H
value_type operator*() const
Definition: unicodeUtils.h:130
GLint first
Definition: glcorearb.h:405
static constexpr uint32_t MaximumValue
Definition: unicodeUtils.h:44
#define TF_API
Definition: api.h:23
TfUtf8CodePointView(const std::string_view &view)
Definition: unicodeUtils.h:326
static constexpr uint32_t ReplacementValue
Definition: unicodeUtils.h:40
void
Definition: png.h:1083
GLint left
Definition: glcorearb.h:2005
const_iterator cbegin() const
Definition: unicodeUtils.h:340
TfUtf8CodePointView()=default
GLsizei const GLfloat * value
Definition: glcorearb.h:824
std::ptrdiff_t difference_type
Definition: unicodeUtils.h:103
GLdouble right
Definition: glad.h:2817
bool empty() const
Returns true if the underlying view is empty.
Definition: unicodeUtils.h:353
TfUtf8CodePointIterator const_iterator
Definition: unicodeUtils.h:323
TfUtf8CodePointIterator::PastTheEndSentinel cend() const
Definition: unicodeUtils.h:347
TfUtf8CodePointIterator::PastTheEndSentinel end() const
Definition: unicodeUtils.h:335
friend constexpr bool operator==(const TfUtf8CodePoint left, const TfUtf8CodePoint right)
Definition: unicodeUtils.h:64
basic_string_view< char > string_view
Definition: core.h:501
friend bool operator==(PastTheEndSentinel lhs, const TfUtf8CodePointIterator &rhs)
Definition: unicodeUtils.h:211
#define TF_DEV_AXIOM(cond)
constexpr TfUtf8CodePoint(uint32_t value)
Definition: unicodeUtils.h:56
friend bool operator==(const TfUtf8CodePointIterator &lhs, PastTheEndSentinel)
Definition: unicodeUtils.h:205
std::forward_iterator_tag iterator_category
Definition: unicodeUtils.h:101
const_iterator EndAsIterator() const
Definition: unicodeUtils.h:368
TF_API std::ostream & operator<<(std::ostream &, const TfUtf8CodePoint)
GLuint GLuint end
Definition: glcorearb.h:475
TF_API bool TfIsUtf8CodePointXidContinue(uint32_t codePoint)
std::string_view::const_iterator GetBase() const
Retrieves the wrapped string iterator.
Definition: unicodeUtils.h:136
constexpr TfUtf8CodePoint()=default
Construct a code point initialized to the replacement value.
TfUtf8CodePointIterator & operator++()
Definition: unicodeUtils.h:164
GLint GLenum GLint x
Definition: glcorearb.h:409
const_iterator begin() const
Definition: unicodeUtils.h:328
PXR_NAMESPACE_CLOSE_SCOPE PXR_NAMESPACE_OPEN_SCOPE
Definition: path.h:1425
friend bool operator!=(PastTheEndSentinel lhs, const TfUtf8CodePointIterator &rhs)
Definition: unicodeUtils.h:222
const_pointer const_iterator
Definition: string_view.h:87
#define PXR_NAMESPACE_CLOSE_SCOPE
Definition: pxr.h:74
TfUtf8CodePointIterator(const std::string_view::const_iterator &it, const std::string_view::const_iterator &end)
Definition: unicodeUtils.h:118
static constexpr std::pair< uint32_t, uint32_t > SurrogateRange
Definition: unicodeUtils.h:49
constexpr TfUtf8CodePoint TfUtf8CodePointFromAscii(const char value)
Constructs a TfUtf8CodePoint from an ASCII character (0-127).
Definition: unicodeUtils.h:85
bool operator==(const TfUtf8CodePointIterator &rhs) const
Definition: unicodeUtils.h:145
friend bool operator!=(const TfUtf8CodePointIterator &lhs, PastTheEndSentinel rhs)
Definition: unicodeUtils.h:217
TF_API bool TfIsUtf8CodePointXidStart(uint32_t codePoint)
friend constexpr bool operator!=(const TfUtf8CodePoint left, const TfUtf8CodePoint right)
Definition: unicodeUtils.h:68
bool operator!=(const TfUtf8CodePointIterator &rhs) const
Definition: unicodeUtils.h:154
constexpr uint32_t AsUInt32() const
Definition: unicodeUtils.h:62