libdonut 2.3.6
Application framework for cross-platform game development in C++20
Loading...
Searching...
No Matches
unicode.hpp
Go to the documentation of this file.
1#ifndef DONUT_UNICODE_HPP
2#define DONUT_UNICODE_HPP
3
4#include <array> // std::array
5#include <cstddef> // std::size_t, std::ptrdiff_t
6#include <iterator> // std::iterator_traits, std::input_iterator_tag, std::forward_iterator_tag
7#include <optional> // std::optional
8#include <string_view> // std::string_view, std::u8string_view
9#include <type_traits> // std::is_same_v
10#include <utility> // std::pair
11
12namespace donut::unicode {
13
/**
 * Check if a 32-bit unsigned integer value falls within the valid ranges for a
 * Unicode code point.
 *
 * A value is accepted when it does not exceed 0x10FFFF and does not lie in the
 * surrogate range 0xD800-0xDFFF.
 *
 * \param codePoint the value to check.
 *
 * \return true if the value is a valid code point, false otherwise.
 */
[[nodiscard]] constexpr bool isValidCodePoint(char32_t codePoint) noexcept {
	constexpr char32_t LAST_CODE_POINT = 0x10FFFF;
	constexpr char32_t FIRST_SURROGATE = 0xD800;
	constexpr char32_t LAST_SURROGATE = 0xDFFF;

	const bool isSurrogate = codePoint >= FIRST_SURROGATE && codePoint <= LAST_SURROGATE;
	return codePoint <= LAST_CODE_POINT && !isSurrogate;
}
25
/**
 * Invalid code point value, used as a return value in the Unicode decoding
 * algorithms of this header to signal that no code point could be decoded.
 *
 * The value 0xFFFFFFFF lies above the largest Unicode code point (0x10FFFF),
 * so it can never be the result of a successful decode.
 */
inline constexpr char32_t CODE_POINT_ERROR{0xFFFFFFFF};
31
51template <typename InputIt, typename Sentinel>
52[[nodiscard]] constexpr std::pair<char32_t, InputIt> decodeCodePointFromUTF8(InputIt it, Sentinel end) {
53 if (it == end) {
54 [[unlikely]] return {CODE_POINT_ERROR, it}; // Reached end.
55 }
56 char32_t codePoint{};
57 const char8_t c0 = static_cast<char8_t>(*it++);
58 if ((c0 & 0b10000000u) == 0) { // 0-127
59 [[likely]] codePoint = c0;
60 } else if ((c0 & 0b11100000u) == 0b11000000u) { // 128-2047
61 if (it == end) {
62 [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
63 }
64 const char8_t c1 = static_cast<char8_t>(*it++);
65 if ((c1 & 0b11000000u) != 0b10000000u) {
66 [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid continuation.
67 }
68 codePoint = ((c0 & 0b00011111u) << 6) | (c1 & 0b00111111u);
69 if (codePoint < 128) {
70 [[unlikely]] return {CODE_POINT_ERROR, it}; // Overlong sequence.
71 }
72 } else if ((c0 & 0b11110000u) == 0b11100000u) { // 2048-65535
73 if (it == end) {
74 [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
75 }
76 const char8_t c1 = static_cast<char8_t>(*it++);
77 if (it == end) {
78 [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
79 }
80 const char8_t c2 = static_cast<char8_t>(*it++);
81 if ((c1 & 0b11000000u) != 0b10000000u || (c2 & 0b11000000u) != 0b10000000u) {
82 [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid continuation.
83 }
84 codePoint = ((c0 & 0b00001111u) << 12) | ((c1 & 0b00111111u) << 6) | (c2 & 0b00111111u);
85 if (codePoint < 2048) {
86 [[unlikely]] return {CODE_POINT_ERROR, it}; // Overlong sequence.
87 }
88 if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
89 [[unlikely]] return {CODE_POINT_ERROR, it}; // Surrogate code point.
90 }
91 } else if ((c0 & 0b11111000u) == 0b11110000u) { // 65536-1114111
92 if (it == end) {
93 [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
94 }
95 const char8_t c1 = static_cast<char8_t>(*it++);
96 if (it == end) {
97 [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
98 }
99 const char8_t c2 = static_cast<char8_t>(*it++);
100 if (it == end) {
101 [[unlikely]] return {CODE_POINT_ERROR, it}; // Missing continuation.
102 }
103 const char8_t c3 = static_cast<char8_t>(*it++);
104 if ((c1 & 0b11000000u) != 0b10000000u || (c2 & 0b11000000u) != 0b10000000u || (c3 & 0b11000000u) != 0b10000000u) {
105 [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid continuation.
106 }
107 codePoint = ((c0 & 0b00000111u) << 18) | ((c1 & 0b00111111u) << 12) | ((c2 & 0b00111111u) << 6) | (c3 & 0b00111111u);
108 if (codePoint < 65536) {
109 [[unlikely]] return {CODE_POINT_ERROR, it}; // Overlong sequence.
110 }
111 if (codePoint > 1114111) {
112 [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid code point.
113 }
114 } else {
115 [[unlikely]] return {CODE_POINT_ERROR, it}; // Invalid code unit.
116 }
117 return {codePoint, it};
118}
119
/**
 * Result of the encodeUTF8FromCodePoint() function.
 */
struct EncodeUTF8FromCodePointResult {
	/**
	 * Array of UTF-8 code units that encode the given code point.
	 *
	 * Only the first #size elements are part of the encoded sequence.
	 */
	std::array<char8_t, 4> codeUnits;

	/**
	 * The length of the encoded code unit sequence stored in the codeUnits
	 * array.
	 */
	std::size_t size;
};
127
141[[nodiscard]] constexpr EncodeUTF8FromCodePointResult encodeUTF8FromCodePoint(char32_t codePoint) noexcept {
142 if (codePoint <= 0x7F) {
143 [[likely]] return {
144 .codeUnits{
145 static_cast<char8_t>(codePoint),
146 },
147 .size = 1,
148 };
149 }
150 if (codePoint <= 0x7FF) {
151 return {
152 .codeUnits{
153 static_cast<char8_t>((codePoint >> 6) + 192),
154 static_cast<char8_t>((codePoint & 63) + 128),
155 },
156 .size = 2,
157 };
158 }
159 if (codePoint <= 0xFFFF) {
160 return {
161 .codeUnits{
162 static_cast<char8_t>((codePoint >> 12) + 224),
163 static_cast<char8_t>(((codePoint >> 6) & 63) + 128),
164 static_cast<char8_t>((codePoint & 63) + 128),
165 },
166 .size = 3,
167 };
168 }
169 return {
170 .codeUnits{
171 static_cast<char8_t>((codePoint >> 18) + 240),
172 static_cast<char8_t>(((codePoint >> 12) & 63) + 128),
173 static_cast<char8_t>(((codePoint >> 6) & 63) + 128),
174 static_cast<char8_t>((codePoint & 63) + 128),
175 },
176 .size = 4,
177 };
178}
179
/**
 * Sentinel type for UTF8Iterator.
 *
 * An empty tag type: comparing a UTF8Iterator against a value of this type
 * asks the iterator whether it has reached the end of its input.
 */
struct UTF8Sentinel {};
184
192template <typename It, typename Sentinel = It>
194public:
195 using difference_type = std::ptrdiff_t;
196 using value_type = char32_t;
197 using reference = const value_type&;
198 using pointer = const value_type*;
199 using iterator_category = std::forward_iterator_tag;
201
202 constexpr UTF8Iterator() = default;
203
204 constexpr UTF8Iterator(It it, Sentinel end)
205 : it(it)
206 , next(it)
207 , end(end) {
208 ++*this;
209 }
210
211 [[nodiscard]] constexpr bool operator==(const UTF8Iterator& other) const {
212 return it == other.it;
213 }
214
215 [[nodiscard]] constexpr bool operator==(const UTF8Sentinel&) const {
216 return it == end;
217 }
218
219 [[nodiscard]] constexpr reference operator*() const {
220 return codePoint;
221 }
222
223 [[nodiscard]] constexpr pointer operator->() const {
224 return &**this;
225 }
226
228 it = next;
229 const auto [newCodePoint, newNext] = decodeCodePointFromUTF8(next, end);
230 codePoint = newCodePoint;
231 next = newNext;
232 return *this;
233 }
234
235 constexpr UTF8Iterator operator++(int) {
236 UTF8Iterator old = *this;
237 ++*this;
238 return old;
239 }
240
241 [[nodiscard]] constexpr It base() const {
242 return it;
243 }
244
245private:
246 It it{};
247 It next{};
248 Sentinel end{};
249 char32_t codePoint{};
250};
251
258template <typename It, typename Sentinel>
259requires std::is_same_v<typename std::iterator_traits<It>::iterator_category, std::input_iterator_tag> //
260class UTF8Iterator<It, Sentinel> {
261public:
262 using difference_type = std::ptrdiff_t;
263 using value_type = char32_t;
264 using reference = const value_type&;
265 using pointer = const value_type*;
266 using iterator_category = std::input_iterator_tag;
268
269 constexpr UTF8Iterator() = default;
270
271 constexpr UTF8Iterator(It it, Sentinel end)
272 : it(it)
273 , end(end) {}
274
275 [[nodiscard]] constexpr bool operator==(const UTF8Iterator& other) const {
276 if (it == end || other.it == other.end) {
277 return it == other.it && static_cast<bool>(codePoint) == static_cast<bool>(other.codePoint);
278 }
279 ensureCodePoint();
280 other.ensureCodePoint();
281 return it == other.it;
282 }
283
284 [[nodiscard]] constexpr bool operator==(const UTF8Sentinel&) const {
285 return it == end && !codePoint;
286 }
287
288 [[nodiscard]] constexpr reference operator*() const {
289 ensureCodePoint();
290 return *codePoint;
291 }
292
293 [[nodiscard]] constexpr pointer operator->() const {
294 return &**this;
295 }
296
298 if (!codePoint) {
299 const auto [newCodePoint, newIt] = decodeCodePointFromUTF8(it, end);
300 it = newIt;
301 }
302 codePoint.reset();
303 return *this;
304 }
305
306 constexpr UTF8Iterator operator++(int) {
307 ensureCodePoint();
308 UTF8Iterator old = *this;
309 codePoint.reset();
310 return old;
311 }
312
313private:
314 void ensureCodePoint() const {
315 if (!codePoint) {
316 const auto [newCodePoint, newIt] = decodeCodePointFromUTF8(it, end);
317 codePoint = newCodePoint;
318 it = newIt;
319 }
320 }
321
322 mutable It it{};
323 Sentinel end{};
324 mutable std::optional<char32_t> codePoint{};
325};
326
331class UTF8View {
332public:
337 using pointer = typename iterator::pointer;
339 using sentinel = typename iterator::sentinel;
340
341 constexpr UTF8View() noexcept = default;
342
343 constexpr explicit UTF8View(std::u8string_view str) noexcept
344 : it(str.data(), str.data() + str.size()) {}
345
346 explicit UTF8View(std::string_view str) noexcept
347 : it(reinterpret_cast<const char8_t*>(str.data()), reinterpret_cast<const char8_t*>(str.data() + str.size())) {
348 static_assert(sizeof(char) == sizeof(char8_t));
349 static_assert(alignof(char) == alignof(char8_t));
350 }
351
352 [[nodiscard]] constexpr const iterator& begin() const noexcept {
353 return it;
354 }
355
356 [[nodiscard]] constexpr sentinel end() const noexcept { // NOLINT(readability-convert-member-functions-to-static)
357 return {};
358 }
359
360private:
362};
363
364} // namespace donut::unicode
365
366#endif
char32_t value_type
Definition unicode.hpp:263
std::input_iterator_tag iterator_category
Definition unicode.hpp:266
constexpr UTF8Iterator & operator++()
Definition unicode.hpp:297
constexpr reference operator*() const
Definition unicode.hpp:288
constexpr UTF8Iterator operator++(int)
Definition unicode.hpp:306
const value_type & reference
Definition unicode.hpp:264
constexpr pointer operator->() const
Definition unicode.hpp:293
constexpr UTF8Iterator(It it, Sentinel end)
Definition unicode.hpp:271
std::ptrdiff_t difference_type
Definition unicode.hpp:262
const value_type * pointer
Definition unicode.hpp:265
constexpr bool operator==(const UTF8Sentinel &) const
Definition unicode.hpp:284
constexpr bool operator==(const UTF8Iterator &other) const
Definition unicode.hpp:275
Iterator type for decoding Unicode code points from a UTF-8 string, wrapping an existing iterator for...
Definition unicode.hpp:193
UTF8Sentinel sentinel
Definition unicode.hpp:200
char32_t value_type
Definition unicode.hpp:196
constexpr UTF8Iterator & operator++()
Definition unicode.hpp:227
constexpr reference operator*() const
Definition unicode.hpp:219
constexpr UTF8Iterator operator++(int)
Definition unicode.hpp:235
constexpr It base() const
Definition unicode.hpp:241
constexpr UTF8Iterator()=default
const value_type & reference
Definition unicode.hpp:197
constexpr pointer operator->() const
Definition unicode.hpp:223
constexpr UTF8Iterator(It it, Sentinel end)
Definition unicode.hpp:204
std::ptrdiff_t difference_type
Definition unicode.hpp:195
const value_type * pointer
Definition unicode.hpp:198
constexpr bool operator==(const UTF8Sentinel &) const
Definition unicode.hpp:215
std::forward_iterator_tag iterator_category
Definition unicode.hpp:199
constexpr bool operator==(const UTF8Iterator &other) const
Definition unicode.hpp:211
Non-owning view type for decoding Unicode code points from a contiguous UTF-8 string.
Definition unicode.hpp:331
typename iterator::sentinel sentinel
Definition unicode.hpp:339
typename iterator::difference_type difference_type
Definition unicode.hpp:334
typename iterator::iterator_category iterator_category
Definition unicode.hpp:338
typename iterator::value_type value_type
Definition unicode.hpp:335
constexpr UTF8View() noexcept=default
constexpr sentinel end() const noexcept
Definition unicode.hpp:356
typename iterator::pointer pointer
Definition unicode.hpp:337
typename iterator::reference reference
Definition unicode.hpp:336
UTF8View(std::string_view str) noexcept
Definition unicode.hpp:346
constexpr const iterator & begin() const noexcept
Definition unicode.hpp:352
Definition utilities.hpp:165
constexpr char32_t CODE_POINT_ERROR
Invalid code point value, used as a return value in Unicode decoding algorithms for conveying encoding errors.
Definition unicode.hpp:30
constexpr std::pair< char32_t, InputIt > decodeCodePointFromUTF8(InputIt it, Sentinel end)
Decode a single Unicode code point from an iterator of UTF-8 code units in a UTF-8-encoded string.
Definition unicode.hpp:52
constexpr bool isValidCodePoint(char32_t codePoint) noexcept
Check if a 32-bit unsigned integer value falls within the valid ranges for a Unicode code point.
Definition unicode.hpp:22
constexpr EncodeUTF8FromCodePointResult encodeUTF8FromCodePoint(char32_t codePoint) noexcept
Encode a Unicode code point into a sequence of UTF-8 code units.
Definition unicode.hpp:141
Result of the encodeUTF8FromCodePoint() function.
Definition unicode.hpp:123
std::size_t size
The length of the encoded code unit sequence stored in the codeUnits array.
Definition unicode.hpp:125
std::array< char8_t, 4 > codeUnits
Array of UTF-8 code units that encode the given code point.
Definition unicode.hpp:124
Sentinel type for UTF8Iterator.
Definition unicode.hpp:183