Skip to content

Commit

Permalink
Optimize multibyte string processing
Browse files Browse the repository at this point in the history
Signed-off-by: Pavel Artsishevsky <polter.rnd@gmail.com>
  • Loading branch information
polter-rnd committed Dec 10, 2024
1 parent d157257 commit dd8cd05
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 45 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/cmake-multi-platform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ jobs:
cmake -B ${{ steps.strings.outputs.build-output-dir }}
-DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DENABLE_ANALYZERS=ON
-DENABLE_ANALYZERS=OFF
-DENABLE_FORMATTERS=ON
-DENABLE_SANITIZERS=ON
${{ matrix.format_lib == 'std' && '-DENABLE_FMTLIB_HO=OFF' || '' }}
Expand Down
98 changes: 71 additions & 27 deletions include/slimlog/pattern-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#include <initializer_list>
#include <iterator>
#include <limits>
#include <stdexcept>
#include <string>
#include <string_view>
#include <type_traits>
Expand Down Expand Up @@ -317,30 +318,27 @@ void Pattern<Char>::compile(StringViewType pattern)
}
}

/**
 * Appends the characters decoded from a multi-byte string to `out`.
 *
 * Characters are decoded one at a time; decoding ends as soon as the
 * converter reports a non-positive result.
 */
template<typename Char>
void Pattern<Char>::from_multibyte(auto& out, std::string_view data)
{
    Char decoded;
    auto shift_state = std::mbstate_t{};
    const Detail::FromMultibyte<Char> converter;

    while (true) {
        const int consumed = converter.get(&decoded, data.data(), data.size(), &shift_state);
        if (consumed <= 0) {
            // Nothing more could be decoded
            break;
        }
        out.push_back(decoded);
        // Drop the bytes that made up the character just decoded
        data = data.substr(consumed);
    }
}

template<typename Char>
template<typename StringView>
void Pattern<Char>::format_string(auto& out, const auto& item, StringView&& data)
{
constexpr auto CountCodepoints = [](StringView& src) {
if constexpr (std::is_same_v<StringView, StringViewType>) {
return Util::Unicode::count_codepoints(src.data(), src.size());
} else {
return src.codepoints();
}
};
const auto codepoints = CountCodepoints(data);

if (auto& specs = std::get<typename Placeholder::StringSpecs>(item); specs.width > 0)
[[unlikely]] {
write_padded(out, std::forward<StringView>(data), specs);
write_padded(out, std::forward<StringView>(data), specs, codepoints);
} else {
using DataChar = typename std::remove_cvref_t<StringView>::value_type;
if constexpr (std::is_same_v<DataChar, char> && !std::is_same_v<Char, char>) {
// NOLINTNEXTLINE (cppcoreguidelines-slicing)
from_multibyte(out, std::forward<StringView>(data));
from_multibyte(out, std::forward<StringView>(data), codepoints);
} else {
out.append(std::forward<StringView>(data));
}
Expand Down Expand Up @@ -502,20 +500,11 @@ auto Pattern<Char>::get_string_specs(StringViewType value) -> Placeholder::Strin

template<typename Char>
template<typename StringView>
constexpr void
Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::StringSpecs& specs)
constexpr void Pattern<Char>::write_padded(
auto& dst, StringView&& src, const Placeholder::StringSpecs& specs, std::size_t codepoints)
{
constexpr auto CountCodepoints = [](StringView& src) {
if constexpr (std::is_same_v<StringView, StringViewType>) {
return Util::Unicode::count_codepoints(src.data(), src.size());
} else {
return src.codepoints();
}
};

const auto spec_width = Util::Types::to_unsigned(specs.width);
const auto width = CountCodepoints(src);
const auto padding = spec_width > width ? spec_width - width : 0;
const auto padding = spec_width > codepoints ? spec_width - codepoints : 0;

// Shifts are encoded as string literals because constexpr is not
// supported in constexpr functions.
Expand Down Expand Up @@ -565,7 +554,7 @@ Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::Stri
using DataChar = typename std::remove_cvref_t<StringView>::value_type;
if constexpr (std::is_same_v<DataChar, char> && !std::is_same_v<Char, char>) {
// NOLINTNEXTLINE (cppcoreguidelines-slicing)
from_multibyte(dst, std::forward<StringView>(src));
from_multibyte(dst, std::forward<StringView>(src), codepoints);
} else {
dst.append(std::forward<StringView>(src));
}
Expand All @@ -577,4 +566,59 @@ Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::Stri
}
}

/**
 * @brief Converts a multi-byte string to `Char` units and appends them to a buffer.
 *
 * Room for `codepoints` additional elements is reserved in `out` up front, the
 * converted characters are written straight into that storage, and `out` is
 * finally resized so that it covers exactly the characters that were written.
 *
 * @param out Destination buffer. It has to provide `size()`, `reserve()`,
 *            `resize()` and a `begin()` that converts to `Char*`.
 * @param data Source multi-byte string.
 * @param codepoints Expected number of characters in `data` (capacity hint).
 * @throws std::runtime_error if an invalid multi-byte sequence is encountered.
 */
template<typename Char>
void Pattern<Char>::from_multibyte(auto& out, std::string_view data, std::size_t codepoints)
{
    const auto buf_size = out.size();
#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__)
    // mbsrtowcs_s() always stores a null terminator, hence one extra element
    out.reserve(buf_size + codepoints + 1);
#else
    out.reserve(buf_size + codepoints);
#endif

    Char* dest = std::next(out.begin(), buf_size);
    const char* source = data.data();

    std::mbstate_t state = {};
    std::size_t written = 0;
    if constexpr (std::is_same_v<Char, wchar_t>) {
        // Both conversion routines report an invalid sequence as (size_t)-1.
        // It must never reach the final resize(), where it would wrap around
        // and shrink the buffer instead of growing it.
        constexpr auto ConversionError = static_cast<std::size_t>(-1);
        // NOTE(review): these routines stop at a null terminator and do not take
        // data.size() into account - presumably `data` is always null-terminated
        // here; confirm against the callers.
#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__)
        mbsrtowcs_s(&written, dest, codepoints + 1, &source, _TRUNCATE, &state);
        if (written == ConversionError) {
            throw std::runtime_error("from_multibyte(): conversion error");
        }
        // Don't take into account null terminator (and never underflow if
        // nothing was reported as converted)
        written = written > 0 ? written - 1 : 0;
#else
        written = std::mbsrtowcs(dest, &source, codepoints, &state);
        if (written == ConversionError) {
            throw std::runtime_error("from_multibyte(): conversion error");
        }
#endif
    } else {
        Char wchr;
        const Detail::FromMultibyte<Char> dispatcher;

        // Number of elements currently reserved past `buf_size`. One code point
        // may produce more than one output unit (e.g. a surrogate pair), so the
        // `codepoints` hint is not a hard upper bound.
        std::size_t capacity = codepoints;
        const auto store = [&](Char chr) {
            if (written == capacity) {
                // Commit what has been written so far so that it is kept when the
                // buffer reallocates (same assumption as the final resize() below),
                // then grow and re-derive the write pointer.
                out.resize(buf_size + written);
                capacity = capacity * 2 + 1;
                out.reserve(buf_size + capacity);
                dest = std::next(out.begin(), buf_size);
            }
            dest[written++] = chr;
        };

        for (auto source_size = data.size(); source_size > 0;) {
            const int next = dispatcher.get(&wchr, source, source_size, &state);
            switch (next) {
            case 0:
                // Null character, finish processing
                source_size = 0;
                break;
            case -1:
                // Encoding error occurred
                throw std::runtime_error("from_multibyte(): conversion error");
            case -2:
                // Incomplete but valid character at the very end of the input.
                // All remaining bytes were handed to the converter, so nothing is
                // left to process: skip it and stop (retrying with the same input
                // would loop forever).
                source_size = 0;
                break;
            case -3:
                // Next character from surrogate pair was processed,
                // no input bytes were consumed
                store(wchr);
                break;
            default:
                // Successfully processed `next` bytes of input
                store(wchr);
                std::advance(source, next);
                source_size -= static_cast<std::size_t>(next);
                break;
            }
        }
    }
    out.resize(buf_size + written);
}

} // namespace SlimLog
30 changes: 16 additions & 14 deletions include/slimlog/pattern.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,18 +253,6 @@ class Pattern {
*/
void compile(StringViewType pattern);

/**
 * @brief Converts a multi-byte string to the character type of the pattern.
 *
 * The characters of the multi-byte source string are converted one by one and
 * each converted character is appended to the provided destination buffer.
 * Conversion ends at the first character for which the converter reports a
 * non-positive result.
 *
 * @param out Destination buffer where the converted characters will be appended.
 * @param data Source multi-byte string to be converted.
 */
static void from_multibyte(auto& out, std::string_view data);

/**
* @brief Formats a string according to the specifications.
*
Expand Down Expand Up @@ -355,10 +343,24 @@ class Pattern {
* @param dst Destination buffer where the string will be written.
* @param src Source string view to be written.
* @param specs String specifications, including alignment and fill character.
* @param codepoints Number of codepoints the source string contains.
*/
template<typename StringView>
constexpr static void
write_padded(auto& dst, StringView&& src, const Placeholder::StringSpecs& specs);
constexpr static void write_padded(
auto& dst, StringView&& src, const Placeholder::StringSpecs& specs, std::size_t codepoints);

/**
 * @brief Converts a multi-byte string to the character type of the pattern.
 *
 * This function converts a multi-byte string to `Char` characters and appends
 * the result to the provided destination buffer. Space for the result is
 * reserved in the buffer up front, based on @p codepoints.
 *
 * May throw `std::runtime_error` on an encoding error.
 *
 * @param out Destination buffer where the converted string will be appended.
 * @param data Source multi-byte string to be converted.
 * @param codepoints Number of codepoints the data string contains.
 */
static void from_multibyte(auto& out, std::string_view data, std::size_t codepoints);

std::basic_string<Char> m_pattern;
std::vector<Placeholder> m_placeholders;
Expand Down
20 changes: 17 additions & 3 deletions include/slimlog/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@

#pragma once

#include <cstddef>
#include <iterator>
#include <cwchar>
#include <limits>

namespace SlimLog::Util::Unicode {
Expand Down Expand Up @@ -73,10 +72,25 @@ constexpr auto count_codepoints(const Char* begin, std::size_t len) -> std::size
if constexpr (sizeof(Char) != 1) {
return len;
} else {
std::size_t codepoints = 0;
/*std::size_t codepoints = 0;
for (const auto* end = std::next(begin, len); begin != end; ++codepoints) {
std::advance(begin, Util::Unicode::code_point_length(begin));
}
return codepoints;*/

std::mbstate_t state = std::mbstate_t();
#if defined(_WIN32) and defined(__STDC_WANT_SECURE_LIB__)
std::size_t codepoints = 0;
if (mbsrtowcs_s(&codepoints, nullptr, codepoints, &begin, 0, &state) != 0) {
return 0;
}
codepoints -= 1;
#else
const auto codepoints = std::mbsrtowcs(nullptr, static_cast<char*>(&begin), 0, &state);
if (codepoints == static_cast<std::size_t>(-1)) [[unlikely]] {
return 0;
}
#endif
return codepoints;
}
}
Expand Down

0 comments on commit dd8cd05

Please sign in to comment.