From a89a04f7ce608e6e0688fe1b809142bd4c65579b Mon Sep 17 00:00:00 2001 From: Pavel Artsishevsky Date: Tue, 3 Dec 2024 23:20:15 +0100 Subject: [PATCH] Optimize multibyte string processing Signed-off-by: Pavel Artsishevsky --- include/slimlog/pattern-inl.h | 112 +++++++++++++++++++++++---------- include/slimlog/pattern.h | 30 ++++----- include/slimlog/util/unicode.h | 22 ++++++- 3 files changed, 116 insertions(+), 48 deletions(-) diff --git a/include/slimlog/pattern-inl.h b/include/slimlog/pattern-inl.h index 3f9d629..b678af8 100644 --- a/include/slimlog/pattern-inl.h +++ b/include/slimlog/pattern-inl.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -67,20 +68,20 @@ inline auto mbrtoc32(Args... /*unused*/) template struct FromMultibyte { - static auto get(Char* chr, const char* str, std::size_t len, mbstate_t* state) -> int + auto get(Char* chr, const char* str, std::size_t len) -> int { using namespace Fallback; if constexpr (std::is_same_v) { - return handle(mbrtowc(chr, str, len, state)); // NOLINT (concurrency-mt-unsafe) + return handle(mbrtowc(chr, str, len, &m_state)); // NOLINT (concurrency-mt-unsafe) #ifdef __cpp_char8_t } else if constexpr (std::is_same_v) { - return handle(mbrtoc8(chr, str, len, state)); // NOLINT (concurrency-mt-unsafe) + return handle(mbrtoc8(chr, str, len, &m_state)); // NOLINT (concurrency-mt-unsafe) #endif #ifdef __cpp_unicode_characters } else if constexpr (std::is_same_v) { - return handle(mbrtoc16(chr, str, len, state)); // NOLINT (concurrency-mt-unsafe) + return handle(mbrtoc16(chr, str, len, &m_state)); // NOLINT (concurrency-mt-unsafe) } else if constexpr (std::is_same_v) { - return handle(mbrtoc32(chr, str, len, state)); // NOLINT (concurrency-mt-unsafe) + return handle(mbrtoc32(chr, str, len, &m_state)); // NOLINT (concurrency-mt-unsafe) #endif } else { static_assert(Util::Types::AlwaysFalse{}, "Unsupported character type"); @@ -101,6 +102,9 @@ struct FromMultibyte { "C++ stdlib does not 
support conversion to given character type"); return -1; } + +private: + std::mbstate_t m_state = {}; }; template @@ -317,30 +321,27 @@ void Pattern::compile(StringViewType pattern) } } -template -void Pattern::from_multibyte(auto& out, std::string_view data) -{ - Char wchr; - auto state = std::mbstate_t{}; - const Detail::FromMultibyte dispatcher; - for (int ret{}; (ret = dispatcher.get(&wchr, data.data(), data.size(), &state)) > 0; - data = data.substr(ret)) { - out.push_back(wchr); - } -} - template template void Pattern::format_string(auto& out, const auto& item, StringView&& data) { + constexpr auto CountCodepoints = [](StringView& src) { + if constexpr (std::is_same_v) { + return Util::Unicode::count_codepoints(src.data(), src.size()); + } else { + return src.codepoints(); + } + }; + const auto codepoints = CountCodepoints(data); + if (auto& specs = std::get(item); specs.width > 0) [[unlikely]] { - write_padded(out, std::forward(data), specs); + write_padded(out, std::forward(data), specs, codepoints); } else { using DataChar = typename std::remove_cvref_t::value_type; if constexpr (std::is_same_v && !std::is_same_v) { // NOLINTNEXTLINE (cppcoreguidelines-slicing) - from_multibyte(out, std::forward(data)); + from_multibyte(out, std::forward(data), codepoints); } else { out.append(std::forward(data)); } @@ -502,20 +503,11 @@ auto Pattern::get_string_specs(StringViewType value) -> Placeholder::Strin template template -constexpr void -Pattern::write_padded(auto& dst, StringView&& src, const Placeholder::StringSpecs& specs) +constexpr void Pattern::write_padded( + auto& dst, StringView&& src, const Placeholder::StringSpecs& specs, std::size_t codepoints) { - constexpr auto CountCodepoints = [](StringView& src) { - if constexpr (std::is_same_v) { - return Util::Unicode::count_codepoints(src.data(), src.size()); - } else { - return src.codepoints(); - } - }; - const auto spec_width = Util::Types::to_unsigned(specs.width); - const auto width = CountCodepoints(src); 
- const auto padding = spec_width > width ? spec_width - width : 0; + const auto padding = spec_width > codepoints ? spec_width - codepoints : 0; // Shifts are encoded as string literals because constexpr is not // supported in constexpr functions. @@ -565,7 +557,7 @@ Pattern::write_padded(auto& dst, StringView&& src, const Placeholder::Stri using DataChar = typename std::remove_cvref_t::value_type; if constexpr (std::is_same_v && !std::is_same_v) { // NOLINTNEXTLINE (cppcoreguidelines-slicing) - from_multibyte(dst, std::forward(src)); + from_multibyte(dst, std::forward(src), codepoints); } else { dst.append(std::forward(src)); } @@ -577,4 +569,60 @@ Pattern::write_padded(auto& dst, StringView&& src, const Placeholder::Stri } } +template +void Pattern::from_multibyte(auto& out, std::string_view data, std::size_t codepoints) +{ + const auto buf_size = out.size(); +#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__) + out.reserve(buf_size + codepoints + 1); +#else + out.reserve(buf_size + codepoints); +#endif + + Char* dest = std::next(out.begin(), buf_size); + const char* source = data.data(); + + std::size_t written = 0; + if constexpr (std::is_same_v) { + std::mbstate_t state = {}; +#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__) + mbsrtowcs_s(&written, dest, codepoints + 1, &source, _TRUNCATE, &state); + written -= 1; // Don't take into account null terminator +#else + // NOLINTNEXTLINE (concurrency-mt-unsafe) + written = std::mbsrtowcs(dest, &source, codepoints, &state); +#endif + } else { + Char wchr; + const Detail::FromMultibyte dispatcher; + for (auto source_size = data.size(); source_size > 0;) { + int next = dispatcher.get(&wchr, source, source_size); + switch (next) { + case 0: + // Null character, finish processing + source_size = 0; + break; + case -1: + // Encoding error occurred + throw std::runtime_error("strlen_mb(): conversion error"); + break; + case -2: + // Incomplete but valid character, skip it + break; + case -3: + // Next 
character from surrogate pair was processed + std::next(dest, written++) = wchr; + break; + default: + // Successfully processed + std::next(dest, written++) = wchr; + std::advance(source, next); + source_size -= next; + break; + } + } + } + out.resize(buf_size + written); +} + } // namespace SlimLog diff --git a/include/slimlog/pattern.h b/include/slimlog/pattern.h index aa578bd..294a5fe 100644 --- a/include/slimlog/pattern.h +++ b/include/slimlog/pattern.h @@ -253,18 +253,6 @@ class Pattern { */ void compile(StringViewType pattern); - /** - * @brief Converts a multi-byte string to a single-byte string. - * - * This function converts a multi-byte string to a single-byte string and appends the result to - * the provided destination stream buffer. - * - * @tparam T Character type of the source string. - * @param out Destination stream buffer where the converted string will be appended. - * @param data Source multi-byte string to be converted. - */ - static void from_multibyte(auto& out, std::string_view data); - /** * @brief Formats a string according to the specifications. * @@ -355,10 +343,24 @@ class Pattern { * @param dst Destination buffer where the string will be written. * @param src Source string view to be written. * @param specs String specifications, including alignment and fill character. + * @param codepoints Number of codepoints the source string contains. */ template - constexpr static void - write_padded(auto& dst, StringView&& src, const Placeholder::StringSpecs& specs); + constexpr static void write_padded( + auto& dst, StringView&& src, const Placeholder::StringSpecs& specs, std::size_t codepoints); + + /** + * @brief Converts a multi-byte string to a string of the target character type. + * + * This function converts a multi-byte string to the target character type and appends the result to + * the provided destination stream buffer. + * + * @tparam T Character type of the source string. 
+ * @param out Destination stream buffer where the converted string will be appended. + * @param data Source multi-byte string to be converted. + * @param codepoints Number of codepoints the data string contains. + */ + static void from_multibyte(auto& out, std::string_view data, std::size_t codepoints); std::basic_string m_pattern; std::vector m_placeholders; diff --git a/include/slimlog/util/unicode.h b/include/slimlog/util/unicode.h index 8f1c372..90f0516 100644 --- a/include/slimlog/util/unicode.h +++ b/include/slimlog/util/unicode.h @@ -5,7 +5,7 @@ #pragma once -#include +#include #include #include @@ -72,11 +72,29 @@ constexpr auto count_codepoints(const Char* begin, std::size_t len) -> std::size { if constexpr (sizeof(Char) != 1) { return len; - } else { +#ifdef __cpp_char8_t + } else if constexpr (std::is_same_v) { std::size_t codepoints = 0; for (const auto* end = std::next(begin, len); begin != end; ++codepoints) { std::advance(begin, Util::Unicode::code_point_length(begin)); } + return codepoints - 1; +#endif + } else { + std::mbstate_t state = std::mbstate_t(); +#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__) + std::size_t codepoints = 0; + if (mbsrtowcs_s(&codepoints, nullptr, codepoints, &begin, 0, &state) != 0) { + return 0; + } + codepoints -= 1; +#else + // NOLINTNEXTLINE (concurrency-mt-unsafe) + const auto codepoints = std::mbsrtowcs(nullptr, &begin, 0, &state); + if (codepoints == static_cast(-1)) [[unlikely]] { + return 0; + } +#endif return codepoints; } }