From dd046dc533ba56e5d06082e89d4c366bd4a525b5 Mon Sep 17 00:00:00 2001
From: Pavel Artsishevsky
Date: Wed, 11 Dec 2024 00:25:28 +0100
Subject: [PATCH] Use Bjoern Hoehrmann's DFA algorithm for UTF8 decoding

Signed-off-by: Pavel Artsishevsky
---
 include/slimlog/util/unicode.h | 80 ++++++++++++++++++++++++++++++++---
 1 file changed, 75 insertions(+), 5 deletions(-)

diff --git a/include/slimlog/util/unicode.h b/include/slimlog/util/unicode.h
index 329e643..d64c84a 100644
--- a/include/slimlog/util/unicode.h
+++ b/include/slimlog/util/unicode.h
@@ -51,12 +51,75 @@ constexpr auto code_point_length(const Char* begin) -> int
          * See https://emnudge.dev/blog/utf-8, https://github.com/fmtlib/fmt/pull/3333
          */

-        const auto chr = static_cast(*begin);
+        const auto chr = static_cast(*begin);
         constexpr auto CodepointLengths = 0x3a55000000000000ULL;
         return static_cast((CodepointLengths >> (2U * (chr >> 3U))) & 0x3U) + 1;
     }
 }

+/**
+ * @brief Check whether a string is UTF-8 encoded.
+ *
+ * The function checks each byte of a string whether it is UTF-8 encoded. The
+ * result of the check is stored in the @a state parameter. The function must
+ * be called initially with state 0 (accept). State 1 means the string must
+ * be rejected, because the current byte is not allowed. If the string is
+ * completely processed, but the state is non-zero, the string ended
+ * prematurely; that is, the last byte indicated more bytes should have
+ * followed.
+ *
+ * @param[in,out] state The state of the decoding.
+ * @param[in,out] codep Codepoint (valid only if the resulting state is 0, i.e. accept).
+ * @param[in] byte Next byte to decode.
+ * @return New state.
+ *
+ * @note The function has been edited: a std::array is used.
+ *
+ * @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann
+ * @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ */
+inline std::uint8_t decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept
+{
+    static constexpr std::array<std::uint8_t, 400> UTF8d = {{
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1F
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3F
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5F
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7F
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9F
+        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0..BF
+        8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0..DF
+        0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
+        0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
+        0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0..s0
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
+        1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
+        1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
+        1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
+        1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // s7..s8
+    }};
+
+    assert(byte < UTF8d.size());
+    const std::uint8_t type = UTF8d[byte];
+
+    codep = (state != 0) ? (byte & 0x3fu) | (codep << 6u) : (0xFFu >> type) & (byte);
+
+    const std::size_t index
+        = 256u + (static_cast<std::size_t>(state) * 16u) + static_cast<std::size_t>(type);
+    assert(index < UTF8d.size());
+    state = UTF8d[index];
+    return state;
+}
+
 /**
  * @brief Counts the number of Unicode code points in a sequence.
  *
@@ -74,12 +137,19 @@ constexpr auto count_codepoints(const Char* begin, std::size_t len) -> std::size
     if constexpr (sizeof(Char) != 1) {
         return len;
 #ifdef __cpp_char8_t
-    } else if constexpr (std::is_same_v) {
+    } else if constexpr (std::is_same_v || std::is_same_v) {
+        std::uint8_t state = 0;
         std::size_t codepoints = 0;
-        for (const auto* end = std::next(begin, len); begin != end; ++codepoints) {
-            std::advance(begin, Util::Unicode::code_point_length(begin));
+        std::uint32_t codepoint = 0;
+        for (const auto* end = std::next(begin, len); begin != end; std::advance(begin, 1)) {
+            if (decode(state, codepoint, static_cast(*begin)) == 0) {
+                codepoints += (codepoint > 0xFFFF) ? 2 : 1;
+            }
         }
-        return codepoints - 1;
+        if (state != 0) {
+            throw std::runtime_error("mbsrtowcs_s(): conversion error");
+        }
+        return codepoints;
 #endif
     } else {
         std::mbstate_t state = std::mbstate_t();