Skip to content

Commit

Permalink
simdize splitfields quote path
Browse files (browse the repository at this point in the history)
  • Loading branch information
ritchie46 committed Oct 5, 2024
1 parent cd63032 commit 92af4ae
Showing 1 changed file with 94 additions and 40 deletions.
134 changes: 94 additions & 40 deletions crates/polars-io/src/csv/read/splitfields.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -136,9 +136,9 @@ mod inner {

#[cfg(feature = "simd")]
mod inner {
use std::ops::BitOr;
use std::simd::prelude::*;

use polars_utils::clmul::prefix_xorsum_inclusive;
use polars_utils::slice::GetSaferUnchecked;
use polars_utils::unwrap::UnwrapUncheckedRelease;

Expand All @@ -156,6 +156,7 @@ mod inner {
eol_char: u8,
simd_separator: SimdVec,
simd_eol_char: SimdVec,
simd_quote_char: SimdVec,
}

impl<'a> SplitFields<'a> {
Expand All @@ -167,16 +168,20 @@ mod inner {
) -> Self {
let simd_separator = SimdVec::splat(separator);
let simd_eol_char = SimdVec::splat(eol_char);
let quoting = quote_char.is_some();
let quote_char = quote_char.unwrap_or(b'"');
let simd_quote_char = SimdVec::splat(quote_char);

Self {
v: slice,
separator,
finished: false,
quote_char: quote_char.unwrap_or(b'"'),
quoting: quote_char.is_some(),
quote_char,
quoting,
eol_char,
simd_separator,
simd_eol_char,
simd_quote_char,
}
}

Expand Down Expand Up @@ -219,46 +224,94 @@ mod inner {
// SAFETY:
// we have checked bounds
let pos = if self.quoting && unsafe { *self.v.get_unchecked(0) } == self.quote_char {
let mut total_idx = 0;
needs_escaping = true;
// There can be a pair of double-quotes within a string.
// Each of the embedded double-quote characters must be represented
// by a pair of double-quote characters:
// e.g. 1997,Ford,E350,"Super, ""luxurious"" truck",20020
let mut not_in_field_previous_iter = true;

// denotes if we are in a string field, started with a quote
let mut in_field = false;
loop {
let bytes = unsafe { self.v.get_unchecked_release(total_idx..) };

let mut idx = 0u32;
let mut current_idx = 0u32;
// micro optimizations
#[allow(clippy::explicit_counter_loop)]
for &c in self.v.iter() {
if c == self.quote_char {
// toggle between string field enclosure
// if we encounter a starting '"' -> in_field = true;
// if we encounter a closing '"' -> in_field = false;
in_field = !in_field;
}
if bytes.len() > SIMD_SIZE {
let lane: [u8; SIMD_SIZE] = unsafe {
bytes
.get_unchecked(0..SIMD_SIZE)
.try_into()
.unwrap_unchecked_release()
};
let simd_bytes = SimdVec::from(lane);
let eol_mask = simd_bytes.simd_eq(self.simd_eol_char).to_bitmask();
let sep_mask = simd_bytes.simd_eq(self.simd_separator).to_bitmask();
let quote_mask = simd_bytes.simd_eq(self.simd_quote_char).to_bitmask();
let mut end_mask = sep_mask | eol_mask;

if !in_field && self.eof_oel(c) {
if c == self.eol_char {
// SAFETY:
// we are in bounds
return unsafe {
self.finish_eol(needs_escaping, current_idx as usize)
};
let mut not_in_quote_field = prefix_xorsum_inclusive(quote_mask);

if not_in_field_previous_iter {
not_in_quote_field = !not_in_quote_field;
}
idx = current_idx;
not_in_field_previous_iter =
(not_in_quote_field & (1 << (SIMD_SIZE - 1))) > 0;
end_mask &= not_in_quote_field;

if end_mask != 0 {
total_idx += end_mask.trailing_zeros() as usize;
debug_assert!(
self.v[total_idx] == self.eol_char
|| self.v[total_idx] == self.separator
);
break;
} else {
total_idx += SIMD_SIZE;
}
} else {
// There can be a pair of double-quotes within a string.
// Each of the embedded double-quote characters must be represented
// by a pair of double-quote characters:
// e.g. 1997,Ford,E350,"Super, ""luxurious"" truck",20020

// denotes if we are in a string field, started with a quote
let mut in_field = !not_in_field_previous_iter;

// usize::MAX is a sentinel: `idx` keeps this value until an unquoted field terminator is found.
let mut idx = usize::MAX;
let mut current_idx = 0;
// micro optimizations
#[allow(clippy::explicit_counter_loop)]
for &c in bytes.iter() {
if c == self.quote_char {
// toggle between string field enclosure
// if we encounter a starting '"' -> in_field = true;
// if we encounter a closing '"' -> in_field = false;
in_field = !in_field;
}

if !in_field && self.eof_oel(c) {
if c == self.eol_char {
// SAFETY:
// we are in bounds
return unsafe {
self.finish_eol(needs_escaping, current_idx + total_idx)
};
}
idx = current_idx;
break;
}
current_idx += 1;
}

if idx == usize::MAX {
return self.finish(needs_escaping);
}

total_idx += idx;
debug_assert!(
self.v[total_idx] == self.eol_char
|| self.v[total_idx] == self.separator
);
break;
}
current_idx += 1;
}

if idx == 0 {
return self.finish(needs_escaping);
}

idx as usize
total_idx
} else {
let mut total_idx = 0;

Expand All @@ -273,11 +326,12 @@ mod inner {
.unwrap_unchecked_release()
};
let simd_bytes = SimdVec::from(lane);
let has_eol_char = simd_bytes.simd_eq(self.simd_eol_char);
let has_separator = simd_bytes.simd_eq(self.simd_separator);
let has_any = has_separator.bitor(has_eol_char);
if let Some(idx) = has_any.first_set() {
total_idx += idx;
let has_eol_char = simd_bytes.simd_eq(self.simd_eol_char).to_bitmask();
let has_separator = simd_bytes.simd_eq(self.simd_separator).to_bitmask();
let has_any = has_separator | has_eol_char;

if has_any != 0 {
total_idx += has_any.trailing_zeros() as usize;
break;
} else {
total_idx += SIMD_SIZE;
Expand Down

0 comments on commit 92af4ae

Please sign in to comment.