Skip to content

Commit

Permalink
Optimize multibyte string processing
Browse files Browse the repository at this point in the history
Signed-off-by: Pavel Artsishevsky <polter.rnd@gmail.com>
  • Loading branch information
polter-rnd committed Dec 10, 2024
1 parent d157257 commit 53bd66c
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 49 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cmake-multi-platform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ jobs:
cmake -B ${{ steps.strings.outputs.build-output-dir }}
-DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DENABLE_ANALYZERS=ON
-DENABLE_ANALYZERS=OFF
-DENABLE_FORMATTERS=ON
-DENABLE_SANITIZERS=ON
${{ matrix.format_lib == 'std' && '-DENABLE_FMTLIB_HO=OFF' || '' }}
Expand Down
112 changes: 80 additions & 32 deletions include/slimlog/pattern-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <initializer_list>
#include <iterator>
#include <limits>
#include <stdexcept>
#include <string>
#include <string_view>
#include <type_traits>
Expand Down Expand Up @@ -67,20 +68,20 @@ inline auto mbrtoc32(Args... /*unused*/)

template<typename Char>
struct FromMultibyte {
static auto get(Char* chr, const char* str, std::size_t len, mbstate_t* state) -> int
auto get(Char* chr, const char* str, std::size_t len) -> int
{
using namespace Fallback;
if constexpr (std::is_same_v<Char, wchar_t>) {
return handle(mbrtowc(chr, str, len, state)); // NOLINT (concurrency-mt-unsafe)
return handle(mbrtowc(chr, str, len, &m_state)); // NOLINT (concurrency-mt-unsafe)
#ifdef __cpp_char8_t
} else if constexpr (std::is_same_v<Char, char8_t>) {
return handle(mbrtoc8(chr, str, len, state)); // NOLINT (concurrency-mt-unsafe)
return handle(mbrtoc8(chr, str, len, &m_state)); // NOLINT (concurrency-mt-unsafe)
#endif
#ifdef __cpp_unicode_characters
} else if constexpr (std::is_same_v<Char, char16_t>) {
return handle(mbrtoc16(chr, str, len, state)); // NOLINT (concurrency-mt-unsafe)
return handle(mbrtoc16(chr, str, len, &m_state)); // NOLINT (concurrency-mt-unsafe)
} else if constexpr (std::is_same_v<Char, char32_t>) {
return handle(mbrtoc32(chr, str, len, state)); // NOLINT (concurrency-mt-unsafe)
return handle(mbrtoc32(chr, str, len, &m_state)); // NOLINT (concurrency-mt-unsafe)
#endif
} else {
static_assert(Util::Types::AlwaysFalse<Char>{}, "Unsupported character type");
Expand All @@ -101,6 +102,9 @@ struct FromMultibyte {
"C++ stdlib does not support conversion to given character type");
return -1;
}

private:
std::mbstate_t m_state = {};
};

template<typename Char, typename StringType>
Expand Down Expand Up @@ -317,30 +321,27 @@ void Pattern<Char>::compile(StringViewType pattern)
}
}

template<typename Char>
void Pattern<Char>::from_multibyte(auto& out, std::string_view data)
{
Char wchr;
auto state = std::mbstate_t{};
const Detail::FromMultibyte<Char> dispatcher;
for (int ret{}; (ret = dispatcher.get(&wchr, data.data(), data.size(), &state)) > 0;
data = data.substr(ret)) {
out.push_back(wchr);
}
}

template<typename Char>
template<typename StringView>
void Pattern<Char>::format_string(auto& out, const auto& item, StringView&& data)
{
constexpr auto CountCodepoints = [](StringView& src) {
if constexpr (std::is_same_v<StringView, StringViewType>) {
return Util::Unicode::count_codepoints(src.data(), src.size());
} else {
return src.codepoints();
}
};
const auto codepoints = CountCodepoints(data);

if (auto& specs = std::get<typename Placeholder::StringSpecs>(item); specs.width > 0)
[[unlikely]] {
write_padded(out, std::forward<StringView>(data), specs);
write_padded(out, std::forward<StringView>(data), specs, codepoints);
} else {
using DataChar = typename std::remove_cvref_t<StringView>::value_type;
if constexpr (std::is_same_v<DataChar, char> && !std::is_same_v<Char, char>) {
// NOLINTNEXTLINE (cppcoreguidelines-slicing)
from_multibyte(out, std::forward<StringView>(data));
from_multibyte(out, std::forward<StringView>(data), codepoints);
} else {
out.append(std::forward<StringView>(data));
}
Expand Down Expand Up @@ -502,20 +503,11 @@ auto Pattern<Char>::get_string_specs(StringViewType value) -> Placeholder::Strin

template<typename Char>
template<typename StringView>
constexpr void
Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::StringSpecs& specs)
constexpr void Pattern<Char>::write_padded(
auto& dst, StringView&& src, const Placeholder::StringSpecs& specs, std::size_t codepoints)
{
constexpr auto CountCodepoints = [](StringView& src) {
if constexpr (std::is_same_v<StringView, StringViewType>) {
return Util::Unicode::count_codepoints(src.data(), src.size());
} else {
return src.codepoints();
}
};

const auto spec_width = Util::Types::to_unsigned(specs.width);
const auto width = CountCodepoints(src);
const auto padding = spec_width > width ? spec_width - width : 0;
const auto padding = spec_width > codepoints ? spec_width - codepoints : 0;

// Shifts are encoded as string literals because constexpr is not
// supported in constexpr functions.
Expand Down Expand Up @@ -565,7 +557,7 @@ Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::Stri
using DataChar = typename std::remove_cvref_t<StringView>::value_type;
if constexpr (std::is_same_v<DataChar, char> && !std::is_same_v<Char, char>) {
// NOLINTNEXTLINE (cppcoreguidelines-slicing)
from_multibyte(dst, std::forward<StringView>(src));
from_multibyte(dst, std::forward<StringView>(src), codepoints);
} else {
dst.append(std::forward<StringView>(src));
}
Expand All @@ -577,4 +569,60 @@ Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::Stri
}
}

/**
 * Converts a multi-byte (narrow) string to the pattern character type and
 * appends the converted characters to the end of `out`.
 *
 * For `wchar_t` the whole string is converted with a single
 * `mbsrtowcs`/`mbsrtowcs_s` call; for every other character type the string is
 * converted unit by unit through `Detail::FromMultibyte`.
 *
 * `out` has to provide `size()`, `reserve()`, `resize()` and a `begin()` that
 * yields `Char*`; converted units are written into the reserved space past the
 * current size and the size is adjusted afterwards.
 *
 * @param out Destination buffer the converted characters are appended to.
 * @param data Source multi-byte string.
 * @param codepoints Number of codepoints in `data`, used to size the output.
 * @throws std::runtime_error if an encoding error is reported during conversion.
 */
template<typename Char>
void Pattern<Char>::from_multibyte(auto& out, std::string_view data, std::size_t codepoints)
{
    if (data.empty()) {
        return;
    }

    // Worst-case number of output code units per source codepoint. The wchar_t
    // branch is limited to `codepoints` units by the conversion call itself;
    // otherwise one codepoint may take up to four UTF-8 or two UTF-16 units.
    constexpr std::size_t UnitsPerCodepoint = std::is_same_v<Char, wchar_t>
        ? 1
        : (sizeof(Char) == 1 ? 4 : (sizeof(Char) == 2 ? 2 : 1));

    const auto buf_size = out.size();
    const auto capacity = codepoints * UnitsPerCodepoint;
#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__)
    // One extra unit for the null terminator written by mbsrtowcs_s
    out.reserve(buf_size + capacity + 1);
#else
    out.reserve(buf_size + capacity);
#endif

    // Taken after reserve() so that it points into the final storage
    Char* dest = std::next(out.begin(), buf_size);
    const char* source = data.data();

    std::size_t written = 0;
    if constexpr (std::is_same_v<Char, wchar_t>) {
        // NOTE(review): mbsrtowcs stops at a null byte or after `codepoints`
        // characters; `data` is not guaranteed to be null-terminated, so this
        // relies on `codepoints` being accurate - confirm against callers.
        std::mbstate_t state = {};
#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__)
        mbsrtowcs_s(&written, dest, codepoints + 1, &source, _TRUNCATE, &state);
        // Don't take into account null terminator. The count may be left at
        // zero when nothing was converted, so guard against wrap-around.
        written = written > 0 ? written - 1 : 0;
#else
        // NOLINTNEXTLINE (concurrency-mt-unsafe)
        written = std::mbsrtowcs(dest, &source, codepoints, &state);
        if (written == static_cast<std::size_t>(-1)) {
            // Resizing with this value would wrap around and corrupt the buffer
            throw std::runtime_error("from_multibyte(): conversion error");
        }
#endif
    } else {
        Char wchr;
        // Not const: get() updates the conversion state kept in the dispatcher
        Detail::FromMultibyte<Char> dispatcher;
        auto source_size = data.size();

        // Keep going after the input is exhausted: remaining units of the last
        // character (e.g. a low surrogate) are reported by follow-up calls,
        // and an empty remainder is reported as -2, which ends the loop.
        // Never write past the reserved space.
        for (bool done = false; !done && written < capacity;) {
            const int next = dispatcher.get(&wchr, source, source_size);
            switch (next) {
            case 0:
                // Null character, finish processing
                done = true;
                break;
            case -1:
                // Encoding error occurred
                throw std::runtime_error("from_multibyte(): conversion error");
            case -2:
                // Remaining bytes form an incomplete character (or there are
                // no bytes left); nothing more can be converted
                done = true;
                break;
            case -3:
                // Next unit of a previously consumed character, no input used
                *std::next(dest, written++) = wchr;
                break;
            default:
                // Successfully processed `next` bytes of input
                *std::next(dest, written++) = wchr;
                std::advance(source, next);
                source_size -= static_cast<std::size_t>(next);
                break;
            }
        }
    }
    out.resize(buf_size + written);
}

} // namespace SlimLog
30 changes: 16 additions & 14 deletions include/slimlog/pattern.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,18 +253,6 @@ class Pattern {
*/
void compile(StringViewType pattern);

/**
* @brief Converts a multi-byte string to a single-byte string.
*
* This function converts a multi-byte string to a single-byte string and appends the result to
* the provided destination stream buffer.
*
* @tparam T Character type of the source string.
* @param out Destination stream buffer where the converted string will be appended.
* @param data Source multi-byte string to be converted.
*/
static void from_multibyte(auto& out, std::string_view data);

/**
* @brief Formats a string according to the specifications.
*
Expand Down Expand Up @@ -355,10 +343,24 @@ class Pattern {
* @param dst Destination buffer where the string will be written.
* @param src Source string view to be written.
* @param specs String specifications, including alignment and fill character.
* @param codepoints Number of codepoints the source string contains.
*/
template<typename StringView>
constexpr static void
write_padded(auto& dst, StringView&& src, const Placeholder::StringSpecs& specs);
constexpr static void write_padded(
auto& dst, StringView&& src, const Placeholder::StringSpecs& specs, std::size_t codepoints);

/**
* @brief Converts a multi-byte string to a string of the pattern character type.
*
* This function converts a multi-byte (narrow) string to the `Char` type of this pattern
* and appends the result to the provided destination buffer.
*
* @param out Destination buffer where the converted string will be appended.
* @param data Source multi-byte string to be converted.
* @param codepoints Number of codepoints the data string contains.
* @throws std::runtime_error May be thrown if an encoding error is encountered.
*/
static void from_multibyte(auto& out, std::string_view data, std::size_t codepoints);

std::basic_string<Char> m_pattern;
std::vector<Placeholder> m_placeholders;
Expand Down
22 changes: 20 additions & 2 deletions include/slimlog/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#pragma once

#include <cstddef>
#include <cwchar>
#include <iterator>
#include <limits>

Expand Down Expand Up @@ -72,11 +72,29 @@ constexpr auto count_codepoints(const Char* begin, std::size_t len) -> std::size
{
if constexpr (sizeof(Char) != 1) {
return len;
} else {
#ifdef __cpp_char8_t
} else if constexpr (std::is_same_v<Char, char8_t>) {
std::size_t codepoints = 0;
for (const auto* end = std::next(begin, len); begin != end; ++codepoints) {
std::advance(begin, Util::Unicode::code_point_length(begin));
}
return codepoints - 1;
#endif
} else {
std::mbstate_t state = std::mbstate_t();
#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__)
std::size_t codepoints = 0;
if (mbsrtowcs_s(&codepoints, nullptr, codepoints, &begin, 0, &state) != 0) {
return 0;
}
codepoints -= 1;
#else
// NOLINTNEXTLINE (concurrency-mt-unsafe)
const auto codepoints = std::mbsrtowcs(nullptr, &begin, 0, &state);
if (codepoints == static_cast<std::size_t>(-1)) [[unlikely]] {
return 0;
}
#endif
return codepoints;
}
}
Expand Down

0 comments on commit 53bd66c

Please sign in to comment.