Skip to content

Commit

Permalink
Use Bjoern Hoehrmann's DFA algorithm for UTF8 decoding
Browse files Browse the repository at this point in the history
Signed-off-by: Pavel Artsishevsky <polter.rnd@gmail.com>
  • Loading branch information
polter-rnd committed Dec 10, 2024
1 parent 772de71 commit dd046dc
Showing 1 changed file with 75 additions and 5 deletions.
80 changes: 75 additions & 5 deletions include/slimlog/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,75 @@ constexpr auto code_point_length(const Char* begin) -> int
* See https://emnudge.dev/blog/utf-8, https://github.com/fmtlib/fmt/pull/3333
*/

const auto chr = static_cast<unsigned char>(*begin);
const auto chr = static_cast<std::uint8_t>(*begin);
constexpr auto CodepointLengths = 0x3a55000000000000ULL;
return static_cast<int>((CodepointLengths >> (2U * (chr >> 3U))) & 0x3U) + 1;
}
}

/**
 * @brief Feeds one byte into a table-driven UTF-8 decoder (DFA).
 *
 * The decoder is driven one byte at a time. The caller owns the @a state and
 * @a codep variables and passes them back unchanged on every call:
 *
 * - state 0 (accept): a complete, valid code point has just been decoded and
 *   is available in @a codep. This is also the required initial state.
 * - state 1 (reject): the current byte is not allowed at this position.
 *   The state is sticky: every further byte keeps the decoder in state 1
 *   until the caller resets @a state to 0.
 * - any other value: a multi-byte sequence is in progress and more bytes are
 *   expected. If the input ends in such a state, it was truncated.
 *
 * @param[in,out] state Decoder state; must be 0 initially and afterwards only
 *                      a value previously produced by this function.
 * @param[in,out] codep Code point accumulator (meaningful only when the
 *                      resulting state is 0).
 * @param[in] byte Next byte to decode.
 * @return New state (same value that is stored into @a state).
 *
 * @note The function has been edited: a std::array is used.
 * @note Declared `inline` because it is defined in a header; without it every
 *       translation unit including the header would emit its own external
 *       definition.
 *
 * @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 * @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
 */
inline std::uint8_t
decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept
{
    // Layout: entries [0, 256) map a byte to its character class (0..11);
    // entries [256, 400) hold the transition table, 16 slots per state.
    static constexpr std::array<std::uint8_t, 400> UTF8d = {{
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF
        8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF
        0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
        0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
        0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
        1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
        1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
        1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
        1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
    }};

    // A std::uint8_t can only address the first 256 (class) entries, so the
    // class lookup needs no runtime bounds check; verify the layout once at
    // compile time instead.
    static_assert(UTF8d.size() == 256u + 9u * 16u, "unexpected UTF-8 DFA table size");

    const std::uint8_t type = UTF8d[byte];

    // Start of a sequence: mask off the length-marker bits of the lead byte.
    // Inside a sequence: append the six payload bits of the continuation byte.
    codep = (state != 0) ? (byte & 0x3fu) | (codep << 6u) : (0xFFu >> type) & (byte);

    const std::size_t index
        = 256u + (static_cast<std::size_t>(state) * 16u) + static_cast<std::size_t>(type);
    // Guards against a caller-supplied state that this function never produced.
    assert(index < UTF8d.size());
    state = UTF8d[index];
    return state;
}

/**
* @brief Counts the number of Unicode code points in a sequence.
*
Expand All @@ -74,12 +137,19 @@ constexpr auto count_codepoints(const Char* begin, std::size_t len) -> std::size
if constexpr (sizeof(Char) != 1) {
return len;
#ifdef __cpp_char8_t
} else if constexpr (std::is_same_v<Char, char8_t>) {
} else if constexpr (std::is_same_v<Char, char8_t> || std::is_same_v<Char, char>) {
std::uint8_t state = 0;
std::size_t codepoints = 0;
for (const auto* end = std::next(begin, len); begin != end; ++codepoints) {
std::advance(begin, Util::Unicode::code_point_length(begin));
std::uint32_t codepoint = 0;
for (const auto* end = std::next(begin, len); begin != end; std::advance(begin, 1)) {
if (decode(state, codepoint, static_cast<std::uint8_t>(*begin)) == 0) {
codepoints += (codepoint > 0xFFFF) ? 2 : 1;
}
}
return codepoints - 1;
if (state != 0) {
throw std::runtime_error("mbsrtowcs_s(): conversion error");
}
return codepoints;
#endif
} else {
std::mbstate_t state = std::mbstate_t();
Expand Down

0 comments on commit dd046dc

Please sign in to comment.