Skip to content

Commit

Permalink
Add NEON implementation of validate
Browse files Browse the repository at this point in the history
Nothing fancy, just processing 4 keys per vector. In both micro- and
macro-benchmarks this performs at basically the same speed as the loop
added in the previous commit (tested with clang), both of which are
significantly faster than the original version.
  • Loading branch information
blawrence-ont committed Dec 6, 2024
1 parent 72d911f commit ac965c8
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 0 deletions.
28 changes: 28 additions & 0 deletions src/streamvbyte_arm_decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,32 @@ static const uint8_t *svb_decode_vector(uint32_t *out, const uint8_t *keyPtr, co

return dataPtr;
}

/**
 * NEON helper for stream validation: consumes key bytes two at a time and
 * returns the total of (code + 1) over every 2-bit code they contain.
 *
 * Each key byte packs four 2-bit codes (bits 0-1, 2-3, 4-5, 6-7). The
 * function handles the largest multiple of 8 entries that fits in *countPtr
 * and leaves the remainder (0..7 entries) for the caller.
 *
 * @param keyPtrPtr in/out: pointer to the current key byte; advanced past the
 *                  key bytes consumed here (one byte per 4 entries). The
 *                  caller must guarantee that many bytes are readable.
 * @param countPtr  in/out: number of entries still to process; on return it
 *                  holds the leftover count (the original value modulo 8).
 * @return sum of the 2-bit codes of the consumed entries plus one per entry.
 */
static uint64_t svb_validate_vector(const uint8_t **keyPtrPtr,
                                    uint32_t *countPtr) {
  // Reduce the count by how many we'll process
  const uint32_t count = *countPtr & ~7U;
  const uint8_t *keyPtr = *keyPtrPtr;
  *countPtr &= 7;
  *keyPtrPtr += count / 4;

  // Deal with each of the 4 keys in a separate lane. A negative amount makes
  // vshlq_u32 shift right. The amounts are loaded with vld1q_s32 rather than
  // brace-initialised, because static construction of NEON vector types is a
  // compiler extension and not portable.
  static const int32_t shiftAmounts[4] = {0, -2, -4, -6};
  const int32x4_t shifts = vld1q_s32(shiftAmounts);
  const uint32x4_t mask = vdupq_n_u32(3);

  // Two independent accumulators, one per key byte of an iteration. A lane
  // gains at most 3 per iteration and there are at most 2^29 iterations, so
  // the 32-bit lanes cannot overflow.
  uint32x4_t acc0 = vdupq_n_u32(0);
  uint32x4_t acc1 = vdupq_n_u32(0);

  // Unrolling more than twice doesn't seem to improve performance
  for (uint32_t c = 0; c < count; c += 8) {
    const uint32x4_t shifted0 = vshlq_u32(vdupq_n_u32(*keyPtr++), shifts);
    acc0 = vaddq_u32(acc0, vandq_u32(shifted0, mask));
    const uint32x4_t shifted1 = vshlq_u32(vdupq_n_u32(*keyPtr++), shifts);
    acc1 = vaddq_u32(acc1, vandq_u32(shifted1, mask));
  }

  // Widen to 64 bits before combining the accumulators, then read the lanes
  // through intrinsics instead of the non-standard [] operator.
  const uint64x2_t sums = vaddq_u64(vpaddlq_u32(acc0), vpaddlq_u32(acc1));

  // Add the +1 for each element (count)
  return vgetq_lane_u64(sums, 0) + vgetq_lane_u64(sums, 1) + count;
}
#endif
4 changes: 4 additions & 0 deletions src/streamvbyte_decode.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ bool streamvbyte_validate_stream(const uint8_t *in, size_t inCount,
const uint8_t *keyPtr = in;
uint64_t encodedSize = 0;

#if defined(__ARM_NEON__)
encodedSize = svb_validate_vector(&keyPtr, &outCount);
#endif

// Give the compiler a hint that it can avoid branches in the inner loop
for (uint32_t c = 0; c < outCount / 4; c++) {
uint32_t key = *keyPtr++;
Expand Down

0 comments on commit ac965c8

Please sign in to comment.