1 #ifndef DONUT_UNICODE_HPP
2 #define DONUT_UNICODE_HPP
23 return codePoint <= 0x10FFFF && (codePoint < 0xD800 || codePoint > 0xDFFF);
51 template <
typename InputIt,
typename Sentinel>
57 const char8_t c0 =
static_cast<char8_t
>(*it++);
58 if ((c0 & 0b10000000u) == 0) {
59 [[likely]] codePoint = c0;
60 }
else if ((c0 & 0b11100000u) == 0b11000000u) {
64 const char8_t c1 =
static_cast<char8_t
>(*it++);
65 if ((c1 & 0b11000000u) != 0b10000000u) {
68 codePoint = ((c0 & 0b00011111u) << 6) | (c1 & 0b00111111u);
69 if (codePoint < 128) {
72 }
else if ((c0 & 0b11110000u) == 0b11100000u) {
76 const char8_t c1 =
static_cast<char8_t
>(*it++);
80 const char8_t c2 =
static_cast<char8_t
>(*it++);
81 if ((c1 & 0b11000000u) != 0b10000000u || (c2 & 0b11000000u) != 0b10000000u) {
84 codePoint = ((c0 & 0b00001111u) << 12) | ((c1 & 0b00111111u) << 6) | (c2 & 0b00111111u);
85 if (codePoint < 2048) {
88 if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
91 }
else if ((c0 & 0b11111000u) == 0b11110000u) {
95 const char8_t c1 =
static_cast<char8_t
>(*it++);
99 const char8_t c2 =
static_cast<char8_t
>(*it++);
103 const char8_t c3 =
static_cast<char8_t
>(*it++);
104 if ((c1 & 0b11000000u) != 0b10000000u || (c2 & 0b11000000u) != 0b10000000u || (c3 & 0b11000000u) != 0b10000000u) {
107 codePoint = ((c0 & 0b00000111u) << 18) | ((c1 & 0b00111111u) << 12) | ((c2 & 0b00111111u) << 6) | (c3 & 0b00111111u);
108 if (codePoint < 65536) {
111 if (codePoint > 1114111) {
117 return {codePoint, it};
142 if (codePoint <= 0x7F) {
145 static_cast<char8_t
>(codePoint),
150 if (codePoint <= 0x7FF) {
153 static_cast<char8_t
>((codePoint >> 6) + 192),
154 static_cast<char8_t
>((codePoint & 63) + 128),
159 if (codePoint <= 0xFFFF) {
162 static_cast<char8_t
>((codePoint >> 12) + 224),
163 static_cast<char8_t
>(((codePoint >> 6) & 63) + 128),
164 static_cast<char8_t
>((codePoint & 63) + 128),
171 static_cast<char8_t
>((codePoint >> 18) + 240),
172 static_cast<char8_t
>(((codePoint >> 12) & 63) + 128),
173 static_cast<char8_t
>(((codePoint >> 6) & 63) + 128),
174 static_cast<char8_t
>((codePoint & 63) + 128),
192 template <
typename It,
typename Sentinel = It>
212 return it == other.it;
230 codePoint = newCodePoint;
241 [[nodiscard]] constexpr It
base()
const {
249 char32_t codePoint{};
258 template <
typename It,
typename Sentinel>
259 requires std::is_same_v<typename std::iterator_traits<It>::iterator_category, std::input_iterator_tag>
276 if (it == end || other.it == other.end) {
277 return it == other.it &&
static_cast<bool>(codePoint) ==
static_cast<bool>(other.codePoint);
280 other.ensureCodePoint();
281 return it == other.it;
285 return it == end && !codePoint;
314 void ensureCodePoint()
const {
317 codePoint = newCodePoint;
324 mutable std::optional<char32_t> codePoint{};
343 constexpr explicit
UTF8View(std::u8string_view str) noexcept
344 : it(str.data(), str.data() + str.size()) {}
347 : it(
reinterpret_cast<const char8_t*
>(str.data()),
reinterpret_cast<const char8_t*
>(str.data() + str.size())) {
348 static_assert(
sizeof(
char) ==
sizeof(char8_t));
349 static_assert(
alignof(
char) ==
alignof(char8_t));
constexpr UTF8Iterator & operator++()
Definition: unicode.hpp:297
char32_t value_type
Definition: unicode.hpp:263
std::input_iterator_tag iterator_category
Definition: unicode.hpp:266
constexpr reference operator*() const
Definition: unicode.hpp:288
constexpr UTF8Iterator operator++(int)
Definition: unicode.hpp:306
constexpr UTF8Iterator()=default
const value_type & reference
Definition: unicode.hpp:264
constexpr pointer operator->() const
Definition: unicode.hpp:293
constexpr UTF8Iterator(It it, Sentinel end)
Definition: unicode.hpp:271
std::ptrdiff_t difference_type
Definition: unicode.hpp:262
const value_type * pointer
Definition: unicode.hpp:265
constexpr bool operator==(const UTF8Sentinel &) const
Definition: unicode.hpp:284
constexpr bool operator==(const UTF8Iterator &other) const
Definition: unicode.hpp:275
Iterator type for decoding Unicode code points from a UTF-8 string, wrapping an existing iterator over the underlying UTF-8 code units.
Definition: unicode.hpp:193
constexpr UTF8Iterator & operator++()
Definition: unicode.hpp:227
UTF8Sentinel sentinel
Definition: unicode.hpp:200
char32_t value_type
Definition: unicode.hpp:196
constexpr reference operator*() const
Definition: unicode.hpp:219
constexpr UTF8Iterator operator++(int)
Definition: unicode.hpp:235
constexpr It base() const
Definition: unicode.hpp:241
constexpr UTF8Iterator()=default
const value_type & reference
Definition: unicode.hpp:197
constexpr pointer operator->() const
Definition: unicode.hpp:223
constexpr UTF8Iterator(It it, Sentinel end)
Definition: unicode.hpp:204
std::ptrdiff_t difference_type
Definition: unicode.hpp:195
const value_type * pointer
Definition: unicode.hpp:198
constexpr bool operator==(const UTF8Sentinel &) const
Definition: unicode.hpp:215
std::forward_iterator_tag iterator_category
Definition: unicode.hpp:199
constexpr bool operator==(const UTF8Iterator &other) const
Definition: unicode.hpp:211
Non-owning view type for decoding Unicode code points from a contiguous UTF-8 string.
Definition: unicode.hpp:331
typename iterator::sentinel sentinel
Definition: unicode.hpp:339
typename iterator::difference_type difference_type
Definition: unicode.hpp:334
typename iterator::iterator_category iterator_category
Definition: unicode.hpp:338
typename iterator::value_type value_type
Definition: unicode.hpp:335
constexpr const iterator & begin() const noexcept
Definition: unicode.hpp:352
constexpr UTF8View() noexcept=default
constexpr sentinel end() const noexcept
Definition: unicode.hpp:356
typename iterator::pointer pointer
Definition: unicode.hpp:337
typename iterator::reference reference
Definition: unicode.hpp:336
UTF8View(std::string_view str) noexcept
Definition: unicode.hpp:346
Definition: utilities.hpp:165
constexpr char32_t CODE_POINT_ERROR
Invalid code point value, used as a return value in Unicode decoding algorithms for conveying encoding errors.
Definition: unicode.hpp:30
constexpr std::pair< char32_t, InputIt > decodeCodePointFromUTF8(InputIt it, Sentinel end)
Decode a single Unicode code point from an iterator of UTF-8 code units in a UTF-8-encoded string.
Definition: unicode.hpp:52
constexpr bool isValidCodePoint(char32_t codePoint) noexcept
Check if a 32-bit unsigned integer value falls within the valid ranges for a Unicode code point.
Definition: unicode.hpp:22
constexpr EncodeUTF8FromCodePointResult encodeUTF8FromCodePoint(char32_t codePoint) noexcept
Encode a Unicode code point into a sequence of UTF-8 code units.
Definition: unicode.hpp:141
Result of the encodeUTF8FromCodePoint() function.
Definition: unicode.hpp:123
std::size_t size
The length of the encoded code unit sequence stored in the codeUnits array.
Definition: unicode.hpp:125
std::array< char8_t, 4 > codeUnits
Array of UTF-8 code units that encode the given code point.
Definition: unicode.hpp:124
Sentinel type for UTF8Iterator.
Definition: unicode.hpp:183