Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optimistic text parsing for 20% improvement #153

Merged
merged 1 commit into from
Jan 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 21 additions & 20 deletions src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,33 +44,34 @@ const fn create_windows_1252_table() -> [char; 256] {
}

/// 256-entry `char` table, initialized from `create_windows_1252_table()`.
pub(crate) static WINDOWS_1252: [char; 256] = create_windows_1252_table();
// Character class values stored in the table built by
// `create_character_class_table`. Each value is a distinct power of two.
// NOTE(review): presumably intended to be testable as bit flags — confirm
// against callers.
/// Class assigned to the bracket bytes `[`, `]`, `{` and `}`.
pub(crate) const BOUNDARY: u8 = 1;
/// Class assigned to tab, newline, vertical tab, form feed, carriage return and space.
pub(crate) const WHITESPACE: u8 = 2;
/// Class assigned to `!`, `<`, `=` and `>`.
pub(crate) const OPERATOR: u8 = 4;
/// Class assigned to `#`.
pub(crate) const COMMENT: u8 = 8;

/// Reports whether byte `b` has a non-zero entry in `CHARACTER_CLASS`.
// NOTE(review): this span is rendered from a diff, so the two body lines
// below appear to be alternative versions of the same expression (one removed
// and one added by the change). Only one of them can exist in the real file;
// both evaluate the same table lookup.
#[inline]
pub(crate) fn is_boundary(b: u8) -> bool {
CHARACTER_CLASS[usize::from(b)] != 0
boundary(b) != 0
}

/// Returns the `CHARACTER_CLASS` table entry for byte `b`.
///
/// Every `u8` value is a valid index into the 256-entry table, so the lookup
/// cannot go out of bounds.
#[inline]
pub(crate) fn boundary(b: u8) -> u8 {
    let index = usize::from(b);
    CHARACTER_CLASS[index]
}

/// Builds the 256-entry byte classification table.
///
/// Every byte starts at 0; the bytes listed below receive a non-zero entry.
// NOTE(review): this span is rendered from a diff and contains two runs of
// assignments for the same fifteen bytes — one using the named class
// constants and one using the literal 1. Presumably one run is the removed
// version and the other the added version; confirm against the real file.
const fn create_character_class_table() -> [u8; 256] {
let mut table = [0u8; 256];
// Run 1: entries expressed with the named class constants.
table[b'\t' as usize] = WHITESPACE;
table[b'\n' as usize] = WHITESPACE;
table[b'\x0b' as usize] = WHITESPACE; // \v
table[b'\x0c' as usize] = WHITESPACE; // \f
table[b'\r' as usize] = WHITESPACE;
table[b' ' as usize] = WHITESPACE;
table[b'!' as usize] = OPERATOR;
table[b'#' as usize] = COMMENT;
table[b'<' as usize] = OPERATOR;
table[b'=' as usize] = OPERATOR;
table[b'>' as usize] = OPERATOR;
table[b'[' as usize] = BOUNDARY;
table[b']' as usize] = BOUNDARY;
table[b'}' as usize] = BOUNDARY;
table[b'{' as usize] = BOUNDARY;
// Run 2: the same bytes, each marked with the single value 1. Read literally
// as written here, these later assignments overwrite the ones above.
table[b'\t' as usize] = 1;
table[b'\n' as usize] = 1;
table[b'\x0b' as usize] = 1; // \v
table[b'\x0c' as usize] = 1; // \f
table[b'\r' as usize] = 1;
table[b' ' as usize] = 1;
table[b'!' as usize] = 1;
table[b'#' as usize] = 1;
table[b'<' as usize] = 1;
table[b'=' as usize] = 1;
table[b'>' as usize] = 1;
table[b'[' as usize] = 1;
table[b']' as usize] = 1;
table[b'}' as usize] = 1;
table[b'{' as usize] = 1;
table
}

Expand Down
106 changes: 88 additions & 18 deletions src/text/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use super::Operator;
use crate::{
buffer::{BufferError, BufferWindow, BufferWindowBuilder},
data::is_boundary,
util::{contains_zero_byte, count_chunk, repeat_byte},
util::{contains_zero_byte, count_chunk, leading_whitespace, repeat_byte},
Scalar,
};
use std::io::Read;
Expand Down Expand Up @@ -134,8 +134,7 @@ where
self.buf.position()
}

#[inline]
unsafe fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
unsafe fn next_opt_fallback(&mut self) -> (Option<Token>, Option<ReaderError>) {
#[derive(Debug)]
enum ParseState {
None,
Expand All @@ -155,21 +154,7 @@ where

'inner: loop {
match *ptr {
c @ b' ' | c @ b'\t' => {
ptr = ptr.add(1);
loop {
if ptr == end {
break 'eof (0, 0);
}

if *ptr != c {
break;
}

ptr = ptr.add(1)
}
}
b'\n' | b'\r' | b';' => {
b' ' | b'\t' | b'\n' | b'\r' | b';' => {
ptr = ptr.add(1);
break 'inner;
}
Expand Down Expand Up @@ -425,6 +410,91 @@ where
}
}

/// Optimistic tokenizer: tries to recognize the next token straight from the
/// buffered bytes and defers to `next_opt_fallback` whenever it cannot.
///
/// After skipping the leading bytes counted by `leading_whitespace`, three
/// fast paths are attempted: a lone `{` or `}`, an unquoted scalar that starts
/// with an ASCII alphanumeric or `-`, and a quoted scalar whose closing quote
/// is found before any backslash has been seen. In every other case the
/// fallback is called; it works from the buffer's own state, not from the
/// local `ptr`.
///
/// # Safety
///
/// NOTE(review): this dereferences raw pointers between `self.buf.start` and
/// `self.buf.end`; presumably the caller must guarantee they delimit
/// initialized, readable bytes of one allocation — confirm against the buffer type.
#[inline]
unsafe fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
let mut ptr = self.buf.start;
let end = self.buf.end;

// At least 9 bytes are required: 8 are loaded as one word below, up to 8 may
// be skipped, and one more byte is then dereferenced.
if end.offset_from(ptr) < 9 {
return self.next_opt_fallback();
}

// Load eight bytes at once (`to_le` makes the first byte in memory the least
// significant one) and skip the leading whitespace counted in that word.
// 3.4 million newlines followed by an average of 3.3 tabs
let data = ptr.cast::<u64>().read_unaligned().to_le();
ptr = ptr.add(leading_whitespace(data) as usize);

// Eagerly check for brackets, there'll be millions of them
if *ptr == b'{' {
self.buf.advance_to(ptr.add(1));
return (Some(Token::Open), None);
} else if *ptr == b'}' {
self.buf.advance_to(ptr.add(1));
return (Some(Token::Close), None);
}
// unquoted values are the most frequent type of values in
// text so if we see something that is alphanumeric or a
// dash (for negative numbers) we eagerly attempt to match
// against it. Loop unrolling is used to minimize the number
// of accesses to the boundary lookup table.
else if matches!(*ptr, b'a'..=b'z' | b'0'..=b'9' | b'A'..=b'Z' | b'-') {
let start_ptr = ptr;
let mut opt_ptr = start_ptr.add(1);
// Only scan while more than eight bytes remain, so the fixed eight-iteration
// inner loop never reads at or beyond `end`.
while end.offset_from(opt_ptr) > 8 {
for _ in 0..8 {
if is_boundary(*opt_ptr) {
self.buf.advance_to(opt_ptr);

// for space delimited arrays, advance one
if *opt_ptr == b' ' {
self.buf.advance(1);
}

// The scalar is the byte range before the boundary byte.
let scalar = self.buf.get(start_ptr..opt_ptr);
return (Some(Token::Unquoted(scalar)), None);
}
opt_ptr = opt_ptr.add(1);
}
}

// optimization failed, fallback to inner parsing loop
} else if *ptr == b'\"' {
// The scalar content starts just after the opening quote.
let start_ptr = ptr.add(1);
let mut opt_ptr = start_ptr;
// Sticky flag: once a chunk is seen to contain a backslash, the fast path
// will not return a token.
let mut escaped = false;
while end.offset_from(opt_ptr) > 8 {
let data = opt_ptr.cast::<u64>().read_unaligned().to_le();
escaped |= contains_zero_byte(data ^ repeat_byte(b'\\'));

// Find the first `"` in the eight-byte chunk: after these steps `t2` has the
// high bit set in exactly those byte lanes that equal the quote byte.
// http://0x80.pl/notesen/2023-03-06-swar-find-any.html#faster-swar-procedure
let mask = repeat_byte(0x7f);
let lobits = data & mask;
let x0 = (lobits ^ repeat_byte(b'\"')) + mask;
let t0 = x0 | data;
let t1 = t0 & repeat_byte(0x80);
let t2 = t1 ^ repeat_byte(0x80);

if t2 != 0 {
// Byte index of the first quote within the chunk.
let quote_ind = t2.trailing_zeros() >> 3;

if !escaped {
opt_ptr = opt_ptr.add(quote_ind as usize);
// Advance past the closing quote; the scalar excludes both quotes.
self.buf.advance_to(opt_ptr.add(1));
let scalar = self.buf.get(start_ptr..opt_ptr);
return (Some(Token::Quoted(scalar)), None);
} else {
// A backslash was seen in this or an earlier chunk: let the fallback
// handle the value.
break;
}
} else {
opt_ptr = opt_ptr.add(8);
}
}

// optimization failed, fallback to inner parsing loop
}

self.next_opt_fallback()
}

/// Advance a given number of bytes and return them.
///
/// The internal buffer must be large enough to accommodate all bytes.
Expand Down
20 changes: 20 additions & 0 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,31 @@ pub(crate) const fn count_chunk(value: u64, byte: u8) -> u64 {
sum_usize(bytewise_equal(value, repeat_byte(byte)))
}

/// Counts how many of the leading bytes of `value` are a tab (`\t`) or a
/// newline (`\n`).
///
/// `value` is treated as eight byte lanes, with the least significant byte
/// being the first one (i.e. the word was built with little-endian byte
/// order). The result ranges from 0 to 8; 8 means all eight bytes matched.
///
/// Only `\t` and `\n` are counted. XOR-ing against each byte and AND-ing the
/// two results is not enough on its own: that expression is also zero for
/// 0x08 (backspace) and 0x0B (vertical tab), which would then be skipped as if
/// they were whitespace. Each lane is therefore reduced to an exact
/// "is non-zero" bit before the two comparisons are combined.
#[inline]
pub(crate) fn leading_whitespace(value: u64) -> u32 {
    const LOW_BITS: u64 = 0x7f7f_7f7f_7f7f_7f7f;
    const HIGH_BITS: u64 = 0x8080_8080_8080_8080;
    const TABS: u64 = 0x0909_0909_0909_0909;
    const NEWLINES: u64 = 0x0a0a_0a0a_0a0a_0a0a;

    /// Sets the high bit of every byte lane of `x` that is non-zero.
    ///
    /// Adding 0x7f to the low seven bits carries into the high bit exactly
    /// when any of them is set and never overflows into the next lane.
    #[inline]
    const fn nonzero_lanes(x: u64) -> u64 {
        (((x & LOW_BITS) + LOW_BITS) | x) & HIGH_BITS
    }

    let not_tab = nonzero_lanes(value ^ TABS);
    let not_newline = nonzero_lanes(value ^ NEWLINES);

    // A lane is marked only when the byte is neither a tab nor a newline; the
    // first marked lane is the first non-whitespace byte. With no marked lane
    // `trailing_zeros` is 64, which yields 8.
    (not_tab & not_newline).trailing_zeros() >> 3
}

#[cfg(test)]
mod tests {
use super::*;
use rstest::*;

// Each case is exactly eight bytes so that it fits the `[u8; 8]` parameter;
// inputs shorter than eight meaningful bytes are padded with spaces.
#[rstest]
#[case(*b"\t\t\t\t\t\t\t\t", 8)]
#[case(*b"a\t\t\t\t\t\t\t", 0)]
#[case(*b"\t       ", 1)]
#[case(*b"\n\na     ", 2)]
#[case(*b"\n\ta     ", 2)]
fn test_leading_whitespace(#[case] input: [u8; 8], #[case] expected: u32) {
    // Build the word so that the first input byte is the least significant one.
    let lhs = u64::from_le_bytes(input);
    assert_eq!(leading_whitespace(lhs), expected);
}

#[rstest]
#[case(*b" ", 0)]
#[case(*b" { ", 1)]
Expand Down