libdonut  2.3.2
Application framework for cross-platform game development in C++20
unicode.hpp
Go to the documentation of this file.
1 #ifndef DONUT_UNICODE_HPP
2 #define DONUT_UNICODE_HPP
3 
4 #include <array> // std::array
5 #include <cstddef> // std::size_t, std::ptrdiff_t
6 #include <iterator> // std::iterator_traits, std::input_iterator_tag, std::forward_iterator_tag
7 #include <optional> // std::optional
8 #include <string_view> // std::string_view, std::u8string_view
9 #include <type_traits> // std::is_same_v
10 #include <utility> // std::pair
11 
12 namespace donut::unicode {
13 
/**
 * Check if a 32-bit unsigned integer value falls within the valid ranges for a
 * Unicode code point.
 *
 * \param codePoint the value to check.
 *
 * \return true if the value is no greater than U+10FFFF and lies outside of
 *         the surrogate range U+D800..U+DFFF, false otherwise.
 */
[[nodiscard]] constexpr bool isValidCodePoint(char32_t codePoint) noexcept {
	if (codePoint > 0x10FFFF) {
		return false; // Beyond the last Unicode plane.
	}
	const bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
	return !isSurrogate;
}
25 
/**
 * Marker value that the decoding functions in this file return in place of a
 * code point when the input could not be decoded.
 *
 * The value is greater than U+10FFFF, so it can never collide with a
 * successfully decoded code point.
 */
inline constexpr char32_t CODE_POINT_ERROR = 0xFFFFFFFF;
31 
51 template <typename InputIt, typename Sentinel>
52 [[nodiscard]] constexpr std::pair<char32_t, InputIt> decodeCodePointFromUTF8(InputIt it, Sentinel end) {
53  if (it == end) {
54  [[unlikely]] return {CODE_POINT_ERROR, it}; // Reached end.
55  }
56  char32_t codePoint{};
57  const char8_t c0 = static_cast<char8_t>(*it++);
58  if ((c0 & 0b10000000u) == 0) { // 0-127
59  [[likely]] codePoint = c0;
60  } else if ((c0 & 0b11100000u) == 0b11000000u) { // 128-2047
61  if (it == end) {
62  [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
63  }
64  const char8_t c1 = static_cast<char8_t>(*it++);
65  if ((c1 & 0b11000000u) != 0b10000000u) {
66  [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid continuation.
67  }
68  codePoint = ((c0 & 0b00011111u) << 6) | (c1 & 0b00111111u);
69  if (codePoint < 128) {
70  [[unlikely]] return {CODE_POINT_ERROR, it}; // Overlong sequence.
71  }
72  } else if ((c0 & 0b11110000u) == 0b11100000u) { // 2048-65535
73  if (it == end) {
74  [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
75  }
76  const char8_t c1 = static_cast<char8_t>(*it++);
77  if (it == end) {
78  [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
79  }
80  const char8_t c2 = static_cast<char8_t>(*it++);
81  if ((c1 & 0b11000000u) != 0b10000000u || (c2 & 0b11000000u) != 0b10000000u) {
82  [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid continuation.
83  }
84  codePoint = ((c0 & 0b00001111u) << 12) | ((c1 & 0b00111111u) << 6) | (c2 & 0b00111111u);
85  if (codePoint < 2048) {
86  [[unlikely]] return {CODE_POINT_ERROR, it}; // Overlong sequence.
87  }
88  if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
89  [[unlikely]] return {CODE_POINT_ERROR, it}; // Surrogate code point.
90  }
91  } else if ((c0 & 0b11111000u) == 0b11110000u) { // 65536-1114111
92  if (it == end) {
93  [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
94  }
95  const char8_t c1 = static_cast<char8_t>(*it++);
96  if (it == end) {
97  [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
98  }
99  const char8_t c2 = static_cast<char8_t>(*it++);
100  if (it == end) {
101  [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
102  }
103  const char8_t c3 = static_cast<char8_t>(*it++);
104  if ((c1 & 0b11000000u) != 0b10000000u || (c2 & 0b11000000u) != 0b10000000u || (c3 & 0b11000000u) != 0b10000000u) {
105  [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid continuation.
106  }
107  codePoint = ((c0 & 0b00000111u) << 18) | ((c1 & 0b00111111u) << 12) | ((c2 & 0b00111111u) << 6) | (c3 & 0b00111111u);
108  if (codePoint < 65536) {
109  [[unlikely]] return {CODE_POINT_ERROR, it}; // Overlong sequence.
110  }
111  if (codePoint > 1114111) {
112  [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid code point.
113  }
114  } else {
115  [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid code unit.
116  }
117  return {codePoint, it};
118 }
119 
124  std::array<char8_t, 4> codeUnits;
125  std::size_t size;
126 };
127 
141 [[nodiscard]] constexpr EncodeUTF8FromCodePointResult encodeUTF8FromCodePoint(char32_t codePoint) noexcept {
142  if (codePoint <= 0x7F) {
143  [[likely]] return {
144  .codeUnits{
145  static_cast<char8_t>(codePoint),
146  },
147  .size = 1,
148  };
149  }
150  if (codePoint <= 0x7FF) {
151  return {
152  .codeUnits{
153  static_cast<char8_t>((codePoint >> 6) + 192),
154  static_cast<char8_t>((codePoint & 63) + 128),
155  },
156  .size = 2,
157  };
158  }
159  if (codePoint <= 0xFFFF) {
160  return {
161  .codeUnits{
162  static_cast<char8_t>((codePoint >> 12) + 224),
163  static_cast<char8_t>(((codePoint >> 6) & 63) + 128),
164  static_cast<char8_t>((codePoint & 63) + 128),
165  },
166  .size = 3,
167  };
168  }
169  return {
170  .codeUnits{
171  static_cast<char8_t>((codePoint >> 18) + 240),
172  static_cast<char8_t>(((codePoint >> 12) & 63) + 128),
173  static_cast<char8_t>(((codePoint >> 6) & 63) + 128),
174  static_cast<char8_t>((codePoint & 63) + 128),
175  },
176  .size = 4,
177  };
178 }
179 
/**
 * Sentinel type for UTF8Iterator.
 *
 * Carries no state; comparing a UTF8Iterator against it asks the iterator
 * whether it has reached the end of its own input.
 */
struct UTF8Sentinel {};
184 
192 template <typename It, typename Sentinel = It>
194 public:
195  using difference_type = std::ptrdiff_t;
196  using value_type = char32_t;
197  using reference = const value_type&;
198  using pointer = const value_type*;
199  using iterator_category = std::forward_iterator_tag;
201 
202  constexpr UTF8Iterator() = default;
203 
204  constexpr UTF8Iterator(It it, Sentinel end)
205  : it(it)
206  , next(it)
207  , end(end) {
208  ++*this;
209  }
210 
211  [[nodiscard]] constexpr bool operator==(const UTF8Iterator& other) const {
212  return it == other.it;
213  }
214 
215  [[nodiscard]] constexpr bool operator==(const UTF8Sentinel&) const {
216  return it == end;
217  }
218 
219  [[nodiscard]] constexpr reference operator*() const {
220  return codePoint;
221  }
222 
223  [[nodiscard]] constexpr pointer operator->() const {
224  return &**this;
225  }
226 
227  constexpr UTF8Iterator& operator++() {
228  it = next;
229  const auto [newCodePoint, newNext] = decodeCodePointFromUTF8(next, end);
230  codePoint = newCodePoint;
231  next = newNext;
232  return *this;
233  }
234 
235  constexpr UTF8Iterator operator++(int) {
236  UTF8Iterator old = *this;
237  ++*this;
238  return old;
239  }
240 
241  [[nodiscard]] constexpr It base() const {
242  return it;
243  }
244 
245 private:
246  It it{};
247  It next{};
248  Sentinel end{};
249  char32_t codePoint{};
250 };
251 
258 template <typename It, typename Sentinel>
259 requires std::is_same_v<typename std::iterator_traits<It>::iterator_category, std::input_iterator_tag> //
260 class UTF8Iterator<It, Sentinel> {
261 public:
262  using difference_type = std::ptrdiff_t;
263  using value_type = char32_t;
264  using reference = const value_type&;
265  using pointer = const value_type*;
266  using iterator_category = std::input_iterator_tag;
268 
269  constexpr UTF8Iterator() = default;
270 
271  constexpr UTF8Iterator(It it, Sentinel end)
272  : it(it)
273  , end(end) {}
274 
275  [[nodiscard]] constexpr bool operator==(const UTF8Iterator& other) const {
276  if (it == end || other.it == other.end) {
277  return it == other.it && static_cast<bool>(codePoint) == static_cast<bool>(other.codePoint);
278  }
279  ensureCodePoint();
280  other.ensureCodePoint();
281  return it == other.it;
282  }
283 
284  [[nodiscard]] constexpr bool operator==(const UTF8Sentinel&) const {
285  return it == end && !codePoint;
286  }
287 
288  [[nodiscard]] constexpr reference operator*() const {
289  ensureCodePoint();
290  return *codePoint;
291  }
292 
293  [[nodiscard]] constexpr pointer operator->() const {
294  return &**this;
295  }
296 
297  constexpr UTF8Iterator& operator++() {
298  if (!codePoint) {
299  const auto [newCodePoint, newIt] = decodeCodePointFromUTF8(it, end);
300  it = newIt;
301  }
302  codePoint.reset();
303  return *this;
304  }
305 
306  constexpr UTF8Iterator operator++(int) {
307  ensureCodePoint();
308  UTF8Iterator old = *this;
309  codePoint.reset();
310  return old;
311  }
312 
313 private:
314  void ensureCodePoint() const {
315  if (!codePoint) {
316  const auto [newCodePoint, newIt] = decodeCodePointFromUTF8(it, end);
317  codePoint = newCodePoint;
318  it = newIt;
319  }
320  }
321 
322  mutable It it{};
323  Sentinel end{};
324  mutable std::optional<char32_t> codePoint{};
325 };
326 
331 class UTF8View {
332 public:
336  using reference = typename iterator::reference;
337  using pointer = typename iterator::pointer;
339  using sentinel = typename iterator::sentinel;
340 
341  constexpr UTF8View() noexcept = default;
342 
343  constexpr explicit UTF8View(std::u8string_view str) noexcept
344  : it(str.data(), str.data() + str.size()) {}
345 
346  explicit UTF8View(std::string_view str) noexcept
347  : it(reinterpret_cast<const char8_t*>(str.data()), reinterpret_cast<const char8_t*>(str.data() + str.size())) {
348  static_assert(sizeof(char) == sizeof(char8_t));
349  static_assert(alignof(char) == alignof(char8_t));
350  }
351 
352  [[nodiscard]] constexpr const iterator& begin() const noexcept {
353  return it;
354  }
355 
356  [[nodiscard]] constexpr sentinel end() const noexcept { // NOLINT(readability-convert-member-functions-to-static)
357  return {};
358  }
359 
360 private:
362 };
363 
364 } // namespace donut::unicode
365 
366 #endif
constexpr UTF8Iterator & operator++()
Definition: unicode.hpp:297
char32_t value_type
Definition: unicode.hpp:263
std::input_iterator_tag iterator_category
Definition: unicode.hpp:266
constexpr reference operator*() const
Definition: unicode.hpp:288
constexpr UTF8Iterator operator++(int)
Definition: unicode.hpp:306
const value_type & reference
Definition: unicode.hpp:264
constexpr pointer operator->() const
Definition: unicode.hpp:293
constexpr UTF8Iterator(It it, Sentinel end)
Definition: unicode.hpp:271
std::ptrdiff_t difference_type
Definition: unicode.hpp:262
const value_type * pointer
Definition: unicode.hpp:265
constexpr bool operator==(const UTF8Sentinel &) const
Definition: unicode.hpp:284
constexpr bool operator==(const UTF8Iterator &other) const
Definition: unicode.hpp:275
Iterator type for decoding Unicode code points from a UTF-8 string, wrapping an existing iterator for...
Definition: unicode.hpp:193
constexpr UTF8Iterator & operator++()
Definition: unicode.hpp:227
UTF8Sentinel sentinel
Definition: unicode.hpp:200
char32_t value_type
Definition: unicode.hpp:196
constexpr reference operator*() const
Definition: unicode.hpp:219
constexpr UTF8Iterator operator++(int)
Definition: unicode.hpp:235
constexpr It base() const
Definition: unicode.hpp:241
constexpr UTF8Iterator()=default
const value_type & reference
Definition: unicode.hpp:197
constexpr pointer operator->() const
Definition: unicode.hpp:223
constexpr UTF8Iterator(It it, Sentinel end)
Definition: unicode.hpp:204
std::ptrdiff_t difference_type
Definition: unicode.hpp:195
const value_type * pointer
Definition: unicode.hpp:198
constexpr bool operator==(const UTF8Sentinel &) const
Definition: unicode.hpp:215
std::forward_iterator_tag iterator_category
Definition: unicode.hpp:199
constexpr bool operator==(const UTF8Iterator &other) const
Definition: unicode.hpp:211
Non-owning view type for decoding Unicode code points from a contiguous UTF-8 string.
Definition: unicode.hpp:331
typename iterator::sentinel sentinel
Definition: unicode.hpp:339
typename iterator::difference_type difference_type
Definition: unicode.hpp:334
typename iterator::iterator_category iterator_category
Definition: unicode.hpp:338
typename iterator::value_type value_type
Definition: unicode.hpp:335
constexpr const iterator & begin() const noexcept
Definition: unicode.hpp:352
constexpr UTF8View() noexcept=default
constexpr sentinel end() const noexcept
Definition: unicode.hpp:356
typename iterator::pointer pointer
Definition: unicode.hpp:337
typename iterator::reference reference
Definition: unicode.hpp:336
UTF8View(std::string_view str) noexcept
Definition: unicode.hpp:346
Definition: utilities.hpp:165
constexpr char32_t CODE_POINT_ERROR
Invalid code point value, used as a return value in Unicode decoding algorithms for conveying encodin...
Definition: unicode.hpp:30
constexpr std::pair< char32_t, InputIt > decodeCodePointFromUTF8(InputIt it, Sentinel end)
Decode a single Unicode code point from an iterator of UTF-8 code units in a UTF-8-encoded string.
Definition: unicode.hpp:52
constexpr bool isValidCodePoint(char32_t codePoint) noexcept
Check if a 32-bit unsigned integer value falls within the valid ranges for a Unicode code point.
Definition: unicode.hpp:22
constexpr EncodeUTF8FromCodePointResult encodeUTF8FromCodePoint(char32_t codePoint) noexcept
Encode a Unicode code point into a sequence of UTF-8 code units.
Definition: unicode.hpp:141
Result of the encodeUTF8FromCodePoint() function.
Definition: unicode.hpp:123
std::size_t size
The length of the encoded code unit sequence stored in the codeUnits array.
Definition: unicode.hpp:125
std::array< char8_t, 4 > codeUnits
Array of UTF-8 code units that encode the given code point.
Definition: unicode.hpp:124
Sentinel type for UTF8Iterator.
Definition: unicode.hpp:183