From 10fb526a89ac5e32e8a7599260eaac5dbeee65d5 Mon Sep 17 00:00:00 2001
From: katsuhisa yuasa
Date: Thu, 3 Oct 2024 18:58:43 +0900
Subject: [PATCH] optimize ImageLoaderSVG::create_image_from_utf8_buffer

---
 modules/svg/image_loader_svg.cpp | 85 ++++++++++++++++++++++++----
 1 file changed, 75 insertions(+), 10 deletions(-)

diff --git a/modules/svg/image_loader_svg.cpp b/modules/svg/image_loader_svg.cpp
index 6eb7892a96b..99eacfeacb4 100644
--- a/modules/svg/image_loader_svg.cpp
+++ b/modules/svg/image_loader_svg.cpp
@@ -37,6 +37,16 @@
 
 #include <thorvg.h>
 
+#if defined(__SSSE3__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512BW__)
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+
 HashMap<Color, Color> ImageLoaderSVG::forced_color_map = HashMap<Color, Color>();
 
 void ImageLoaderSVG::set_forced_color_map(const HashMap<Color, Color> &p_color_map) {
@@ -79,6 +89,70 @@ Ref<Image> ImageLoaderSVG::load_mem_svg(const uint8_t *p_svg, int p_size, float
 	return img;
 }
 
+// Converts `width * height` 32-bit pixels from `buffer` into `image`, swapping bytes 0 and 2
+// of every pixel in memory (little-endian ARGB words become R, G, B, A bytes).
+// `image` must already be resized to hold at least `width * height * 4` bytes.
+static void RGBA_to_BGRA(uint32_t width, uint32_t height, Vector<uint8_t> &image, uint32_t *buffer) {
+	// Widen before multiplying so very large canvases cannot wrap around 32 bits.
+	const size_t pixel_count = static_cast<size_t>(width) * height;
+	uint8_t *dst = image.ptrw();
+	size_t i = 0;
+#if defined(__AVX512BW__)
+	// AVX-512BW: 16 pixels per iteration. The byte shuffle works within each 128-bit lane,
+	// so the 16-byte pattern is broadcast to all four lanes.
+	{
+		const __m512i mask = _mm512_broadcast_i32x4(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15));
+		for (; i + 16 <= pixel_count; i += 16) {
+			const __m512i src = _mm512_loadu_si512(buffer + i);
+			_mm512_storeu_si512(dst + i * 4, _mm512_shuffle_epi8(src, mask));
+		}
+	}
+#elif defined(__AVX2__)
+	// AVX2: 8 pixels per iteration. Plain unaligned si256 load/store is used because the
+	// `_mm256_loadu_epi8` / `_mm256_storeu_epi8` forms require AVX-512BW + AVX-512VL.
+	{
+		const __m256i mask = _mm256_setr_epi8(
+				2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
+				2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+		for (; i + 8 <= pixel_count; i += 8) {
+			const __m256i src = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buffer + i));
+			_mm256_storeu_si256(reinterpret_cast<__m256i *>(dst + i * 4), _mm256_shuffle_epi8(src, mask));
+		}
+	}
+#elif defined(__AVX__) || defined(__SSSE3__)
+	// _mm_shuffle_epi8 is available from SSSE3; 4 pixels per iteration.
+	// VC++ doesn't define __SSSE3__ when "/arch:AVX" is specified, so __AVX__ is checked too.
+	// GCC supports fine-grained options such as -mssse3.
+	{
+		const __m128i mask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+		for (; i + 4 <= pixel_count; i += 4) {
+			const __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buffer + i));
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + i * 4), _mm_shuffle_epi8(src, mask));
+		}
+	}
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64))
+	// vqtbl1q_u8 only exists on AArch64; ARMv7 NEON builds fall through to the scalar loop.
+	{
+		static const uint8_t mask_bytes[16] = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
+		const uint8x16_t mask = vld1q_u8(mask_bytes);
+		for (; i + 4 <= pixel_count; i += 4) {
+			const uint8x16_t src = vreinterpretq_u8_u32(vld1q_u32(buffer + i));
+			vst1q_u8(dst + i * 4, vqtbl1q_u8(src, mask));
+		}
+	}
+#endif
+	// Scalar path: finishes the tail left by the SIMD loops, or converts the whole image
+	// when no supported SIMD extension is enabled at compile time.
+	for (; i < pixel_count; ++i) {
+		const uint32_t n = buffer[i];
+		uint8_t *px = dst + i * 4;
+		px[0] = (n >> 16) & 0xff;
+		px[1] = (n >> 8) & 0xff;
+		px[2] = n & 0xff;
+		px[3] = (n >> 24) & 0xff;
+	}
+}
+
 Error ImageLoaderSVG::create_image_from_utf8_buffer(Ref<Image> p_image, const uint8_t *p_buffer, int p_buffer_size, float p_scale, bool p_upsample) {
 	ERR_FAIL_COND_V_MSG(Math::is_zero_approx(p_scale), ERR_INVALID_PARAMETER, "ImageLoaderSVG: Can't load SVG with a scale of 0.");
 
@@ -136,16 +210,7 @@ Error ImageLoaderSVG::create_image_from_utf8_buffer(Ref<Image> p_image, const ui
 	Vector<uint8_t> image;
 	image.resize(width * height * sizeof(uint32_t));
 
-	for (uint32_t y = 0; y < height; y++) {
-		for (uint32_t x = 0; x < width; x++) {
-			uint32_t n = buffer[y * width + x];
-			const size_t offset = sizeof(uint32_t) * width * y + sizeof(uint32_t) * x;
-			image.write[offset + 0] = (n >> 16) & 0xff;
-			image.write[offset + 1] = (n >> 8) & 0xff;
-			image.write[offset + 2] = n & 0xff;
-			image.write[offset + 3] = (n >> 24) & 0xff;
-		}
-	}
+	RGBA_to_BGRA(width, height, image, buffer);
 
 	res = sw_canvas->clear(true);
 	memfree(buffer);