Use Bjoern Hoehrmann's DFA algorithm for UTF8 decoding

Signed-off-by: Pavel Artsishevsky <polter.rnd@gmail.com>
polter-rnd · Dec 10, 2024 · dd046dc · dd046dc
1 parent 772de71
commit dd046dc
Showing 1 changed file with 75 additions and 5 deletions.
diff --git a/include/slimlog/util/unicode.h b/include/slimlog/util/unicode.h
@@ -51,12 +51,75 @@ constexpr auto code_point_length(const Char* begin) -> int
          * See https://emnudge.dev/blog/utf-8, https://github.com/fmtlib/fmt/pull/3333
          */
 
-        const auto chr = static_cast<unsigned char>(*begin);
+        const auto chr = static_cast<std::uint8_t>(*begin);
         constexpr auto CodepointLengths = 0x3a55000000000000ULL;
         return static_cast<int>((CodepointLengths >> (2U * (chr >> 3U))) & 0x3U) + 1;
     }
 }
 
+/**
+ * @brief Advance the UTF-8 decoding DFA by a single input byte.
+ *
+ * The byte is classified through the first 256 table entries, its payload
+ * bits are folded into @a codep, and the next state is read from the
+ * transition rows that follow. The function must be called initially with
+ * state 0 (accept). State 1 means the input must be rejected; all of its
+ * transitions lead back to 1. If the input is completely processed but the
+ * state is non-zero, it ended prematurely; that is, the last byte indicated
+ * more bytes should have followed.
+ *
+ * @param[in,out] state  Current DFA state; overwritten with the new state.
+ * @param[in,out] codep  Code point accumulator (valid only if the new state is 0).
+ * @param[in] byte       Next byte to decode.
+ * @return               New state (the value also stored in @a state).
+ *
+ * @note Edited to use a std::array. NOTE(review): header function is not inline; check ODR.
+ *
+ * @copyright Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ * @sa http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
+ */
+std::uint8_t decode(std::uint8_t& state, std::uint32_t& codep, const std::uint8_t byte) noexcept
+{
+    static constexpr std::array<std::uint8_t, 400> UTF8d = {{ // 256 byte classes, then 9 states x 16
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 00..1F
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 20..3F
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 40..5F
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, // 60..7F
+        1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+        9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9,   9, // 80..9F
+        7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,
+        7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7,   7, // A0..BF
+        8,   8,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
+        2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2, // C0..DF
+        0xA, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, // E0..EF
+        0xB, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, // F0..FF
+        0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, // s0
+        1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
+        1,   0,   1,   1,   1,   1,   1,   0,   1,   0,   1,   1,   1,   1,   1,   1, // s1..s2
+        1,   2,   1,   1,   1,   1,   1,   2,   1,   2,   1,   1,   1,   1,   1,   1,
+        1,   1,   1,   1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,   1,   1, // s3..s4
+        1,   2,   1,   1,   1,   1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,
+        1,   1,   1,   1,   1,   1,   1,   3,   1,   3,   1,   1,   1,   1,   1,   1, // s5..s6
+        1,   3,   1,   1,   1,   1,   1,   3,   1,   3,   1,   1,   1,   1,   1,   1,
+        1,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1 // s7..s8
+    }};
+
+    assert(byte < UTF8d.size()); // always true: byte <= 255 < 400 entries
+    const std::uint8_t type = UTF8d[byte]; // character class of this byte
+    // At state 0 start a new code point from the class-masked byte; else append its low 6 bits.
+    codep = (state != 0) ? (byte & 0x3fu) | (codep << 6u) : (0xFFu >> type) & (byte);
+    // Transition rows begin at offset 256, one 16-entry row per state, indexed by class.
+    const std::size_t index
+        = 256u + (static_cast<std::size_t>(state) * 16u) + static_cast<std::size_t>(type);
+    assert(index < UTF8d.size()); // fails only for a state outside the table's 0..8 range
+    state = UTF8d[index];
+    return state;
+}
+
 /**
  * @brief Counts the number of Unicode code points in a sequence.
  *
@@ -74,12 +137,19 @@ constexpr auto count_codepoints(const Char* begin, std::size_t len) -> std::size
     if constexpr (sizeof(Char) != 1) {
         return len;
 #ifdef __cpp_char8_t
-    } else if constexpr (std::is_same_v<Char, char8_t>) {
+    } else if constexpr (std::is_same_v<Char, char8_t> || std::is_same_v<Char, char>) {
+        std::uint8_t state = 0;
         std::size_t codepoints = 0;
-        for (const auto* end = std::next(begin, len); begin != end; ++codepoints) {
-            std::advance(begin, Util::Unicode::code_point_length(begin));
+        std::uint32_t codepoint = 0;
+        for (const auto* end = std::next(begin, len); begin != end; std::advance(begin, 1)) {
+            if (decode(state, codepoint, static_cast<std::uint8_t>(*begin)) == 0) {
+                codepoints += (codepoint > 0xFFFF) ? 2 : 1;
+            }
         }
-        return codepoints - 1;
+        if (state != 0) {
+            throw std::runtime_error("mbsrtowcs_s(): conversion error");
+        }
+        return codepoints;
 #endif
     } else {
         std::mbstate_t state = std::mbstate_t();