diff --git a/crates/polars-io/src/csv/read/splitfields.rs b/crates/polars-io/src/csv/read/splitfields.rs index 59d8d31a1c7f..bf28c6ad05c1 100644 --- a/crates/polars-io/src/csv/read/splitfields.rs +++ b/crates/polars-io/src/csv/read/splitfields.rs @@ -136,9 +136,9 @@ mod inner { #[cfg(feature = "simd")] mod inner { - use std::ops::BitOr; use std::simd::prelude::*; + use polars_utils::clmul::prefix_xorsum_inclusive; use polars_utils::slice::GetSaferUnchecked; use polars_utils::unwrap::UnwrapUncheckedRelease; @@ -156,6 +156,7 @@ mod inner { eol_char: u8, simd_separator: SimdVec, simd_eol_char: SimdVec, + simd_quote_char: SimdVec, } impl<'a> SplitFields<'a> { @@ -167,16 +168,20 @@ mod inner { ) -> Self { let simd_separator = SimdVec::splat(separator); let simd_eol_char = SimdVec::splat(eol_char); + let quoting = quote_char.is_some(); + let quote_char = quote_char.unwrap_or(b'"'); + let simd_quote_char = SimdVec::splat(quote_char); Self { v: slice, separator, finished: false, - quote_char: quote_char.unwrap_or(b'"'), - quoting: quote_char.is_some(), + quote_char, + quoting, eol_char, simd_separator, simd_eol_char, + simd_quote_char, } } @@ -219,46 +224,94 @@ mod inner { // SAFETY: // we have checked bounds let pos = if self.quoting && unsafe { *self.v.get_unchecked(0) } == self.quote_char { + let mut total_idx = 0; needs_escaping = true; - // There can be a pair of double-quotes within a string. - // Each of the embedded double-quote characters must be represented - // by a pair of double-quote characters: - // e.g. 1997,Ford,E350,"Super, ""luxurious"" truck",20020 + let mut not_in_field_previous_iter = true; - // denotes if we are in a string field, started with a quote - let mut in_field = false; + loop { + let bytes = unsafe { self.v.get_unchecked_release(total_idx..) }; - let mut idx = 0u32; - let mut current_idx = 0u32; - // micro optimizations - #[allow(clippy::explicit_counter_loop)] - for &c in self.v.iter() { - if c == self.quote_char { - // toggle between string field enclosure - // if we encounter a starting '"' -> in_field = true; - // if we encounter a closing '"' -> in_field = false; - in_field = !in_field; - } + if bytes.len() > SIMD_SIZE { + let lane: [u8; SIMD_SIZE] = unsafe { + bytes + .get_unchecked(0..SIMD_SIZE) + .try_into() + .unwrap_unchecked_release() + }; + let simd_bytes = SimdVec::from(lane); + let eol_mask = simd_bytes.simd_eq(self.simd_eol_char).to_bitmask(); + let sep_mask = simd_bytes.simd_eq(self.simd_separator).to_bitmask(); + let quote_mask = simd_bytes.simd_eq(self.simd_quote_char).to_bitmask(); + let mut end_mask = sep_mask | eol_mask; - if !in_field && self.eof_oel(c) { - if c == self.eol_char { - // SAFETY: - // we are in bounds - return unsafe { - self.finish_eol(needs_escaping, current_idx as usize) - }; + let mut not_in_quote_field = prefix_xorsum_inclusive(quote_mask); + + if not_in_field_previous_iter { + not_in_quote_field = !not_in_quote_field; } - idx = current_idx; + not_in_field_previous_iter = + (not_in_quote_field & (1 << (SIMD_SIZE - 1))) > 0; + end_mask &= not_in_quote_field; + + if end_mask != 0 { + total_idx += end_mask.trailing_zeros() as usize; + debug_assert!( + self.v[total_idx] == self.eol_char + || self.v[total_idx] == self.separator + ); + break; + } else { + total_idx += SIMD_SIZE; + } + } else { + // There can be a pair of double-quotes within a string. + // Each of the embedded double-quote characters must be represented + // by a pair of double-quote characters: + // e.g. 1997,Ford,E350,"Super, ""luxurious"" truck",20020 + + // denotes if we are in a string field, started with a quote + let mut in_field = !not_in_field_previous_iter; + + // usize::MAX is unset. + let mut idx = usize::MAX; + let mut current_idx = 0; + // micro optimizations + #[allow(clippy::explicit_counter_loop)] + for &c in bytes.iter() { + if c == self.quote_char { + // toggle between string field enclosure + // if we encounter a starting '"' -> in_field = true; + // if we encounter a closing '"' -> in_field = false; + in_field = !in_field; + } + + if !in_field && self.eof_oel(c) { + if c == self.eol_char { + // SAFETY: + // we are in bounds + return unsafe { + self.finish_eol(needs_escaping, current_idx + total_idx) + }; + } + idx = current_idx; + break; + } + current_idx += 1; + } + + if idx == usize::MAX { + return self.finish(needs_escaping); + } + + total_idx += idx; + debug_assert!( + self.v[total_idx] == self.eol_char + || self.v[total_idx] == self.separator + ); break; } - current_idx += 1; } - - if idx == 0 { - return self.finish(needs_escaping); - } - - idx as usize + total_idx } else { let mut total_idx = 0; @@ -273,11 +326,12 @@ mod inner { .unwrap_unchecked_release() }; let simd_bytes = SimdVec::from(lane); - let has_eol_char = simd_bytes.simd_eq(self.simd_eol_char); - let has_separator = simd_bytes.simd_eq(self.simd_separator); - let has_any = has_separator.bitor(has_eol_char); - if let Some(idx) = has_any.first_set() { - total_idx += idx; + let has_eol_char = simd_bytes.simd_eq(self.simd_eol_char).to_bitmask(); + let has_separator = simd_bytes.simd_eq(self.simd_separator).to_bitmask(); + let has_any = has_separator | has_eol_char; + + if has_any != 0 { + total_idx += has_any.trailing_zeros() as usize; break; } else { total_idx += SIMD_SIZE;