Skip to content

Commit

Permalink
Optimize multibyte string processing
Browse files Browse the repository at this point in the history
Signed-off-by: Pavel Artsishevsky <polter.rnd@gmail.com>
  • Loading branch information
polter-rnd committed Dec 3, 2024
1 parent d157257 commit 2c5d7ff
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 42 deletions.
93 changes: 66 additions & 27 deletions include/slimlog/pattern-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,30 +317,27 @@ void Pattern<Char>::compile(StringViewType pattern)
}
}

/**
 * @brief Appends a multibyte string to `out`, converted to `Char`.
 *
 * The input is decoded one character at a time with
 * `Detail::FromMultibyte<Char>::get()`, and every decoded character is pushed
 * to the back of `out`. Decoding stops silently at the first call that yields
 * zero or a negative value.
 *
 * NOTE(review): the result of get() is used as the number of source bytes to
 * drop, so it presumably follows the mbrtowc()-style return convention -
 * confirm against Detail::FromMultibyte.
 *
 * @param out Destination buffer receiving the converted characters.
 * @param data Source multibyte string.
 */
template<typename Char>
void Pattern<Char>::from_multibyte(auto& out, std::string_view data)
{
    const Detail::FromMultibyte<Char> converter;
    auto shift_state = std::mbstate_t{};
    Char decoded;

    while (true) {
        const int consumed
            = converter.get(&decoded, data.data(), data.size(), &shift_state);
        if (consumed <= 0) {
            // End of input, null character or conversion failure.
            break;
        }
        out.push_back(decoded);
        data = data.substr(consumed);
    }
}

template<typename Char>
template<typename StringView>
void Pattern<Char>::format_string(auto& out, const auto& item, StringView&& data)
{
constexpr auto CountCodepoints = [](StringView& src) {
if constexpr (std::is_same_v<StringView, StringViewType>) {
return Util::Unicode::count_codepoints(src.data(), src.size());
} else {
return src.codepoints();
}
};
const auto codepoints = CountCodepoints(data);

if (auto& specs = std::get<typename Placeholder::StringSpecs>(item); specs.width > 0)
[[unlikely]] {
write_padded(out, std::forward<StringView>(data), specs);
write_padded(out, std::forward<StringView>(data), specs, codepoints);
} else {
using DataChar = typename std::remove_cvref_t<StringView>::value_type;
if constexpr (std::is_same_v<DataChar, char> && !std::is_same_v<Char, char>) {
// NOLINTNEXTLINE (cppcoreguidelines-slicing)
from_multibyte(out, std::forward<StringView>(data));
from_multibyte(out, std::forward<StringView>(data), codepoints);
} else {
out.append(std::forward<StringView>(data));
}
Expand Down Expand Up @@ -502,20 +499,11 @@ auto Pattern<Char>::get_string_specs(StringViewType value) -> Placeholder::Strin

template<typename Char>
template<typename StringView>
constexpr void
Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::StringSpecs& specs)
constexpr void Pattern<Char>::write_padded(
auto& dst, StringView&& src, const Placeholder::StringSpecs& specs, std::size_t codepoints)
{
constexpr auto CountCodepoints = [](StringView& src) {
if constexpr (std::is_same_v<StringView, StringViewType>) {
return Util::Unicode::count_codepoints(src.data(), src.size());
} else {
return src.codepoints();
}
};

const auto spec_width = Util::Types::to_unsigned(specs.width);
const auto width = CountCodepoints(src);
const auto padding = spec_width > width ? spec_width - width : 0;
const auto padding = spec_width > codepoints ? spec_width - codepoints : 0;

// Shifts are encoded as string literals because constexpr is not
// supported in constexpr functions.
Expand Down Expand Up @@ -565,7 +553,7 @@ Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::Stri
using DataChar = typename std::remove_cvref_t<StringView>::value_type;
if constexpr (std::is_same_v<DataChar, char> && !std::is_same_v<Char, char>) {
// NOLINTNEXTLINE (cppcoreguidelines-slicing)
from_multibyte(dst, std::forward<StringView>(src));
from_multibyte(dst, std::forward<StringView>(src), codepoints);
} else {
dst.append(std::forward<StringView>(src));
}
Expand All @@ -577,4 +565,55 @@ Pattern<Char>::write_padded(auto& dst, StringView&& src, const Placeholder::Stri
}
}

/**
 * @brief Converts a multibyte string to `Char` and appends it to `out`.
 *
 * The destination is grown once by `codepoints` elements and the converted
 * characters are written directly into that space; afterwards the buffer is
 * trimmed to the number of elements actually produced. If the conversion
 * yields more elements than `codepoints` (for example surrogate pairs, or an
 * underestimated count), the buffer is enlarged on demand instead of being
 * overrun.
 *
 * Conversion stops at an embedded null character or at an incomplete trailing
 * multibyte sequence.
 *
 * @param out Destination buffer the converted characters are appended to.
 * @param data Source multibyte string.
 * @param codepoints Expected number of codepoints in `data`, used as the
 *                   initial size estimate for the output.
 * @throws std::runtime_error If `data` contains an invalid multibyte sequence.
 *         `out` is restored to its original size before throwing.
 */
template<typename Char>
void Pattern<Char>::from_multibyte(auto& out, std::string_view data, std::size_t codepoints)
{
    const auto buf_size = out.size();
    out.resize(buf_size + codepoints);

    Char* dest = std::next(out.begin(), buf_size);
    const char* source = data.data();

    std::mbstate_t state = {};
    std::size_t written = 0;
    // Number of elements currently reserved in `out` past `buf_size`.
    std::size_t capacity = codepoints;

    if constexpr (std::is_same_v<Char, wchar_t>) {
        // NOTE(review): mbsrtowcs() is bounded only by `codepoints` and a null
        // terminator, not by data.size(); this relies on `codepoints` not
        // exceeding the number of characters actually present in `data`.
        if (codepoints != 0) {
            written = std::mbsrtowcs(dest, &source, codepoints, &state);
            if (written == static_cast<std::size_t>(-1)) [[unlikely]] {
                // Do not leave unconverted garbage in the destination.
                out.resize(buf_size);
                throw std::runtime_error("from_multibyte(): conversion error");
            }
        }
    } else {
        Char wchr;
        const Detail::FromMultibyte<Char> dispatcher;
        auto remaining = data.size();

        // Stores one converted element, enlarging the destination if needed.
        const auto emit = [&](Char chr) {
            if (written == capacity) [[unlikely]] {
                capacity = capacity == 0 ? 1 : capacity * 2;
                out.resize(buf_size + capacity);
                // Resizing may have moved the underlying storage.
                dest = std::next(out.begin(), buf_size);
            }
            dest[written++] = chr;
        };

        bool stopped_early = false;
        while (remaining > 0) {
            const int next = dispatcher.get(&wchr, source, remaining, &state);
            if (next > 0) {
                // Successfully processed `next` source bytes
                emit(wchr);
                std::advance(source, next);
                remaining -= static_cast<std::size_t>(next);
            } else if (next == -3) {
                // Next character from surrogate pair, no input consumed
                emit(wchr);
            } else if (next == -1) {
                // Encoding error occurred
                out.resize(buf_size);
                throw std::runtime_error("from_multibyte(): conversion error");
            } else {
                // 0: null character, -2: incomplete trailing sequence.
                // Neither can make further progress, so finish processing.
                stopped_early = true;
                break;
            }
        }

        // The second half of a surrogate pair belonging to the last character
        // is only reported by one more call made after the input is exhausted.
        // NOTE(review): assumes get() follows mbrtoc16() semantics for a
        // zero-length input - confirm against Detail::FromMultibyte.
        if (!stopped_early && !data.empty()
            && dispatcher.get(&wchr, source, 0, &state) == -3) {
            emit(wchr);
        }
    }

    if (written < capacity) [[unlikely]] {
        out.resize(buf_size + written);
    }
}

} // namespace SlimLog
30 changes: 16 additions & 14 deletions include/slimlog/pattern.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,18 +253,6 @@ class Pattern {
*/
void compile(StringViewType pattern);

/**
 * @brief Converts a multi-byte string to the pattern's character type.
 *
 * This function decodes the multi-byte source string character by character
 * and appends each converted character to the provided destination buffer.
 * Conversion stops at the first character that cannot be decoded.
 *
 * @param out Destination stream buffer where the converted string will be appended.
 * @param data Source multi-byte string to be converted.
 */
static void from_multibyte(auto& out, std::string_view data);

/**
* @brief Formats a string according to the specifications.
*
Expand Down Expand Up @@ -355,10 +343,24 @@ class Pattern {
* @param dst Destination buffer where the string will be written.
* @param src Source string view to be written.
* @param specs String specifications, including alignment and fill character.
* @param codepoints Number of codepoints the source string contains.
*/
template<typename StringView>
constexpr static void
write_padded(auto& dst, StringView&& src, const Placeholder::StringSpecs& specs);
constexpr static void write_padded(
auto& dst, StringView&& src, const Placeholder::StringSpecs& specs, std::size_t codepoints);

/**
 * @brief Converts a multi-byte string to the pattern's character type.
 *
 * This function converts a multi-byte (`char`) string to a string of `Char`
 * and appends the result to the provided destination stream buffer. The
 * destination is enlarged by `codepoints` elements before the conversion
 * and shrunk afterwards if fewer characters were produced.
 *
 * @param out Destination stream buffer where the converted string will be appended.
 * @param data Source multi-byte string to be converted.
 * @param codepoints Number of codepoints the data string contains.
 * @throws std::runtime_error If an encoding error is reported while converting
 *         to a character type other than `wchar_t`.
 */
static void from_multibyte(auto& out, std::string_view data, std::size_t codepoints);

std::basic_string<Char> m_pattern;
std::vector<Placeholder> m_placeholders;
Expand Down
9 changes: 8 additions & 1 deletion include/slimlog/util/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,17 @@ constexpr auto count_codepoints(const Char* begin, std::size_t len) -> std::size
if constexpr (sizeof(Char) != 1) {
return len;
} else {
std::size_t codepoints = 0;
/*std::size_t codepoints = 0;
for (const auto* end = std::next(begin, len); begin != end; ++codepoints) {
std::advance(begin, Util::Unicode::code_point_length(begin));
}
return codepoints;*/

std::mbstate_t state = std::mbstate_t();
const auto codepoints = std::mbsrtowcs(nullptr, &begin, 0, &state);
if (codepoints == static_cast<std::size_t>(-1)) [[unlikely]] {
return 0;
}
return codepoints;
}
}
Expand Down

0 comments on commit 2c5d7ff

Please sign in to comment.