diff --git a/include/streamvbyte.h b/include/streamvbyte.h index 2b913a1..e88ab08 100644 --- a/include/streamvbyte.h +++ b/include/streamvbyte.h @@ -43,21 +43,7 @@ static inline size_t streamvbyte_max_compressedbytes(const uint32_t length) { // Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond // the compressed data: the user needs to ensure that this region is allocated, and it // is not included by streamvbyte_compressedbytes. -static inline size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) { - // number of control bytes: - size_t cb = (length + 3) / 4; - // maximum number of control bytes: - size_t db = 0; - for (uint32_t c = 0; c < length; c++) { - uint32_t val = in[c]; - - if (val < (1 << 8)) db += 1; - else if (val < (1 << 16)) db += 2; - else if (val < (1 << 24)) db += 3; - else db += 4; - } - return cb + db; -} +size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length); /* now declared only; this patch adds the definition to src/streamvbyte_encode.c */ // return the exact number of compressed bytes given length input integers // runtime in O(n) wrt. in; use streamvbyte_max_compressedbyte if you @@ -65,21 +51,7 @@ static inline size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t le // Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond // the compressed data: the user needs to ensure that this region is allocated, and it // is not included by streamvbyte_compressedbytes. 
-static inline size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) { - // number of control bytes: - size_t cb = (length + 3) / 4; - // maximum number of control bytes: - size_t db = 0; - for (uint32_t c = 0; c < length; c++) { - uint32_t val = in[c]; - - if (val == 0) db += 0; - else if (val < (1 << 8)) db += 1; - else if (val < (1 << 16)) db += 2; - else db += 4; - } - return cb + db; -} +size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length); /* now declared only; this patch adds the definition to src/streamvbyte_encode.c */ // Read "length" 32-bit integers in varint format from in, storing the result in out. // Returns the number of bytes read. We may read up to STREAMVBYTE_PADDING extra bytes diff --git a/src/streamvbyte_encode.c b/src/streamvbyte_encode.c index 18e02ab..4441faa 100644 --- a/src/streamvbyte_encode.c +++ b/src/streamvbyte_encode.c @@ -63,6 +63,46 @@ static uint8_t *svb_encode_scalar(const uint32_t *in, #include "streamvbyte_arm_encode.c" #endif +static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length) { /* sums the data bytes for length values read from in: 1 to 4 per value; control bytes are not included */ + size_t db = 0; + for (uint32_t c = 0; c < length; c++) { + uint32_t val = in[c]; + + uint32_t bytes = 1 + (val > 0x000000FF) + (val > 0x0000FFFF) + (val > 0x00FFFFFF); /* each comparison contributes 0 or 1, so bytes is 1, 2, 3 or 4 */ + db += bytes; + } + return db; +} + +static size_t svb_data_bytes_0124_scalar(const uint32_t* in, uint32_t length) { /* same sum for the 0124 variant: 0, 1, 2 or 4 data bytes per value */ + size_t db = 0; + for (uint32_t c = 0; c < length; c++) { + uint32_t val = in[c]; + + uint32_t bytes = (val > 0x00000000) + (val > 0x000000FF) + (val > 0x0000FFFF) * 2; /* the last comparison is weighted by 2, so the sum goes from 2 straight to 4 */ + db += bytes; + } + return db; +} + +size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) { + // number of control bytes: + size_t cb = (length + 3) / 4; + +#ifdef STREAMVBYTE_X64 + if (streamvbyte_sse41()) { /* use the SSE4.1 counter when streamvbyte_sse41() returns nonzero; otherwise the scalar call below runs */ + return cb + svb_data_bytes_SSE41(in, length); + } +#endif + return cb + svb_data_bytes_scalar(in, length); +} + +size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) { /* scalar path only; no SIMD counter for this variant appears in this patch */ + // number of control bytes: + size_t cb = (length + 3) / 4; + + return cb + 
svb_data_bytes_0124_scalar(in, length); +} // Encode an array of a given length read from in to bout in streamvbyte format. diff --git a/src/streamvbyte_x64_encode.c b/src/streamvbyte_x64_encode.c index 6415926..1ee063f 100644 --- a/src/streamvbyte_x64_encode.c +++ b/src/streamvbyte_x64_encode.c @@ -1,15 +1,55 @@ #include "streamvbyte_isadetection.h" #ifdef STREAMVBYTE_X64 // contributed by aqrit + +static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length); /* forward declaration for the scalar tail call in svb_data_bytes_SSE41; NOTE(review): the static definition is added to src/streamvbyte_encode.c, so presumably this file is #included there - confirm */ + +STREAMVBYTE_TARGET_SSE41 +static inline size_t svb_control_SSE41 (__m128i lo, __m128i hi) { /* computes the key bits for the eight 32-bit values held in lo and hi, returned as the _mm_movemask_epi8 result; called by streamvbyte_encode_SSE41 and svb_data_bytes_SSE41 */ + const __m128i mask_01 = _mm_set1_epi8(0x01); + const __m128i mask_7F00 = _mm_set1_epi16(0x7F00); + + __m128i m0, m1; + size_t keys; + + m0 = _mm_min_epu8(mask_01, lo); + m1 = _mm_min_epu8(mask_01, hi); + m0 = _mm_packus_epi16(m0, m1); + m0 = _mm_min_epi16(m0, mask_01); // convert 0x01FF to 0x0101 + m0 = _mm_adds_epu16(m0, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF + keys = (size_t)_mm_movemask_epi8(m0); + return keys; +} +STREAMVBYTE_UNTARGET_REGION + +STREAMVBYTE_TARGET_SSE41 +size_t svb_data_bytes_SSE41 (const uint32_t* in, uint32_t count) { /* returns the data-byte total for count values: eight values per loop iteration via len_lut, then the remaining count & 7 values via svb_data_bytes_scalar; NOTE(review): unlike the other helpers added by this patch this one is not static - confirm that is intended */ + size_t dataLen = 0; + + for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8) /* count & ~7 rounds count down to a multiple of 8 */ + { + __m128i r0, r1; + size_t keys; + + r0 = _mm_loadu_si128((__m128i *) &in[0]); /* NOTE(review): the cast drops the const qualifier of in */ + r1 = _mm_loadu_si128((__m128i *) &in[4]); + + keys = svb_control_SSE41(r0, r1); + dataLen += len_lut[keys & 0xFF]; + dataLen += len_lut[keys >> 8]; /* one len_lut lookup per key byte: low byte above, high byte here */ + } + + dataLen += svb_data_bytes_scalar(in, count & 7); + return dataLen; +} +STREAMVBYTE_UNTARGET_REGION + STREAMVBYTE_TARGET_SSE41 size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* out) { uint32_t keyLen = (count >> 2) + (((count & 3) + 3) >> 2); // 2-bits per each rounded up to byte boundry uint8_t *restrict keyPtr = &out[0]; uint8_t *restrict dataPtr = &out[keyLen]; // variable length data after keys - const __m128i mask_01 = _mm_set1_epi8(0x01); - const __m128i mask_7F00 = 
_mm_set1_epi16(0x7F00); - for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8) { __m128i r0, r1, r2, r3; @@ -18,12 +58,7 @@ size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* ou r0 = _mm_loadu_si128((__m128i*)&in[0]); r1 = _mm_loadu_si128((__m128i*)&in[4]); - r2 = _mm_min_epu8(mask_01, r0); - r3 = _mm_min_epu8(mask_01, r1); - r2 = _mm_packus_epi16(r2, r3); - r2 = _mm_min_epi16(r2, mask_01); // convert 0x01FF to 0x0101 - r2 = _mm_adds_epu16(r2, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF - keys = (size_t)_mm_movemask_epi8(r2); + keys = svb_control_SSE41(r0, r1); /* key computation now lives in svb_control_SSE41 so svb_data_bytes_SSE41 can use it too */ r2 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys << 4) & 0x03F0]); r3 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys >> 4) & 0x03F0]);