diff --git a/include/streamvbyte.h b/include/streamvbyte.h
index 2b913a1..e88ab08 100644
--- a/include/streamvbyte.h
+++ b/include/streamvbyte.h
@@ -43,21 +43,7 @@ static inline size_t streamvbyte_max_compressedbytes(const uint32_t length) {
 // Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond
 // the compressed data: the user needs to ensure that this region is allocated, and it
 // is not included by streamvbyte_compressedbytes.
-static inline size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
-   // number of control bytes:
-   size_t cb = (length + 3) / 4;
-   // maximum number of control bytes:
-   size_t db = 0;
-   for (uint32_t c = 0; c < length; c++) {
-      uint32_t val = in[c];
-
-      if (val < (1 << 8)) db += 1;
-      else if (val < (1 << 16)) db += 2;
-      else if (val < (1 << 24)) db += 3;
-      else db += 4;
-   }
-   return cb + db;
-}
+size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length);
 
 // return the exact number of compressed bytes given length input integers
 // runtime in O(n) wrt. in; use streamvbyte_max_compressedbyte if you
@@ -65,21 +51,7 @@ static inline size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t le
 // Our decoding functions may read (but not use) STREAMVBYTE_PADDING extra bytes beyond
 // the compressed data: the user needs to ensure that this region is allocated, and it
 // is not included by streamvbyte_compressedbytes.
-static inline size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
-   // number of control bytes:
-   size_t cb = (length + 3) / 4;
-   // maximum number of control bytes:
-   size_t db = 0;
-   for (uint32_t c = 0; c < length; c++) {
-      uint32_t val = in[c];
-
-      if (val == 0) db += 0;
-      else if (val < (1 << 8)) db += 1;
-      else if (val < (1 << 16)) db += 2;
-      else db += 4;
-   }
-   return cb + db;
-}
+size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length);
 
 // Read "length" 32-bit integers in varint format from in, storing the result in out.
 // Returns the number of bytes read. We may read up to STREAMVBYTE_PADDING extra bytes
diff --git a/src/streamvbyte_encode.c b/src/streamvbyte_encode.c
index 18e02ab..4441faa 100644
--- a/src/streamvbyte_encode.c
+++ b/src/streamvbyte_encode.c
@@ -63,6 +63,46 @@ static uint8_t *svb_encode_scalar(const uint32_t *in,
 #include "streamvbyte_arm_encode.c"
 #endif
 
+static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length) {
+   // Data bytes only (no control bytes): each value needs one byte, plus one
+   // more for every byte boundary (8, 16, 24 bits) that it does not fit in.
+   size_t total = 0;
+   for (uint32_t i = 0; i != length; ++i) {
+      const uint32_t v = in[i];
+      total += 1u + ((v >> 8) != 0) + ((v >> 16) != 0) + ((v >> 24) != 0);
+   }
+   return total;
+}
+
+static size_t svb_data_bytes_0124_scalar(const uint32_t* in, uint32_t length) {
+   // Data bytes only (no control bytes) for the 0/1/2/4 layout: zero takes no
+   // bytes, values up to 0xFF one, up to 0xFFFF two, and anything larger four.
+   size_t total = 0;
+   for (uint32_t i = 0; i != length; ++i) {
+      const uint32_t v = in[i];
+      total += (v != 0) + ((v >> 8) != 0) + 2u * ((v >> 16) != 0);
+   }
+   return total;
+}
+
+size_t streamvbyte_compressedbytes(const uint32_t* in, uint32_t length) {
+   // control bytes: one per four integers, rounded up; written so it cannot wrap in 32 bits
+   size_t cb = (size_t)(length >> 2) + ((length & 3) != 0);
+   // data bytes: SSE4.1 counter when streamvbyte_sse41() reports support, scalar loop otherwise
+#ifdef STREAMVBYTE_X64
+   if (streamvbyte_sse41()) {
+      return cb + svb_data_bytes_SSE41(in, length);
+   }
+#endif
+   return cb + svb_data_bytes_scalar(in, length);
+}
+
+size_t streamvbyte_compressedbytes_0124(const uint32_t* in, uint32_t length) {
+   // control bytes: one per four integers, rounded up; written so it cannot wrap in 32 bits
+   size_t cb = (size_t)(length >> 2) + ((length & 3) != 0);
+   // data bytes for the 0/1/2/4 layout are always counted by the scalar loop
+   return cb + svb_data_bytes_0124_scalar(in, length);
+}
 
 
 // Encode an array of a given length read from in to bout in streamvbyte format.
diff --git a/src/streamvbyte_x64_encode.c b/src/streamvbyte_x64_encode.c
index 6415926..1ee063f 100644
--- a/src/streamvbyte_x64_encode.c
+++ b/src/streamvbyte_x64_encode.c
@@ -1,15 +1,55 @@
 #include "streamvbyte_isadetection.h"
 #ifdef STREAMVBYTE_X64
 // contributed by aqrit
+
+static size_t svb_data_bytes_scalar(const uint32_t* in, uint32_t length);
+
+STREAMVBYTE_TARGET_SSE41
+static inline size_t svb_control_SSE41 (__m128i lo, __m128i hi) {
+    const __m128i mask_01 = _mm_set1_epi8(0x01);
+    const __m128i mask_7F00 = _mm_set1_epi16(0x7F00);
+    // Returns a 16-bit mask: two bits for each of the eight 32-bit lanes in lo (low byte) and hi (high byte).
+    __m128i m0, m1;
+    size_t keys;
+
+    m0 = _mm_min_epu8(mask_01, lo); // each byte becomes 1 if it was non-zero, else 0
+    m1 = _mm_min_epu8(mask_01, hi);
+    m0 = _mm_packus_epi16(m0, m1); // saturate 16-bit lanes to bytes: lo fills the low half, hi the high half
+    m0 = _mm_min_epi16(m0, mask_01); // convert 0x01FF to 0x0101
+    m0 = _mm_adds_epu16(m0, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF
+    keys = (size_t)_mm_movemask_epi8(m0); // collect the top bit of each of the 16 bytes
+    return keys;
+}
+STREAMVBYTE_UNTARGET_REGION
+
+STREAMVBYTE_TARGET_SSE41
+size_t svb_data_bytes_SSE41 (const uint32_t* in, uint32_t count) {
+    size_t dataLen = 0;
+    // Sum len_lut entries for in[0..count), eight integers per iteration; (count & 7) leftovers follow the loop.
+    for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8)
+    {
+        __m128i r0, r1;
+        size_t keys;
+
+        r0 = _mm_loadu_si128((__m128i *) &in[0]); // unaligned load of in[0..3]
+        r1 = _mm_loadu_si128((__m128i *) &in[4]); // unaligned load of in[4..7]
+        // NOTE(review): len_lut is defined outside this diff; presumably data length per control byte - confirm
+        keys = svb_control_SSE41(r0, r1);
+        dataLen += len_lut[keys & 0xFF]; // low 8 bits of keys: derived from in[0..3]
+        dataLen += len_lut[keys >> 8]; // remaining bits of keys: derived from in[4..7]
+    }
+    // in was advanced by the loop, so it now points at the leftover integers.
+    dataLen += svb_data_bytes_scalar(in, count & 7);
+    return dataLen;
+}
+STREAMVBYTE_UNTARGET_REGION
+
 STREAMVBYTE_TARGET_SSE41
 size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* out) {
 	uint32_t keyLen = (count >> 2) + (((count & 3) + 3) >> 2); // 2-bits per each rounded up to byte boundry
 	uint8_t *restrict keyPtr = &out[0];
 	uint8_t *restrict dataPtr = &out[keyLen]; // variable length data after keys
 
-	const __m128i mask_01 = _mm_set1_epi8(0x01);
-	const __m128i mask_7F00 = _mm_set1_epi16(0x7F00);
-
 	for (const uint32_t* end = &in[(count & ~7)]; in != end; in += 8)
 	{
 		__m128i r0, r1, r2, r3;
@@ -18,12 +58,7 @@ size_t streamvbyte_encode_SSE41 (const uint32_t* in, uint32_t count, uint8_t* ou
 		r0 = _mm_loadu_si128((__m128i*)&in[0]);
 		r1 = _mm_loadu_si128((__m128i*)&in[4]);
 
-		r2 = _mm_min_epu8(mask_01, r0);
-		r3 = _mm_min_epu8(mask_01, r1);
-		r2 = _mm_packus_epi16(r2, r3);
-		r2 = _mm_min_epi16(r2, mask_01); // convert 0x01FF to 0x0101
-		r2 = _mm_adds_epu16(r2, mask_7F00); // convert: 0x0101 to 0x8001, 0xFF01 to 0xFFFF
-		keys = (size_t)_mm_movemask_epi8(r2);
+		keys = svb_control_SSE41(r0, r1);
 
 		r2 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys << 4) & 0x03F0]);
 		r3 = _mm_loadu_si128((__m128i*)&shuf_lut[(keys >> 4) & 0x03F0]);