Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add integration tests using example files from apache/orc #65

Merged
merged 3 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,15 @@ tokio = { version = "1.28", features = [
zstd = "0.12"

[dev-dependencies]
arrow-json = "50.0.0"
async-trait = "0.1.77"
criterion = { version = "0.5", default-features = false, features = ["async_tokio"] }
datafusion = "36.0.0"
datafusion-expr = "36.0.0"
datafusion-physical-expr = "36.0.0"
object_store = "0.9.0"
pretty_assertions = "1.3.0"
serde_json = { version = "1.0", default-features = false, features = ["std"] }

[[bench]]
name = "arrow_reader"
Expand Down
1 change: 1 addition & 0 deletions tests/integration/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
These files are imported from [Apache ORC's examples](https://github.com/apache/orc/tree/207085de3722054485e685811f8e5f2e11aa4deb/examples)
10 changes: 10 additions & 0 deletions tests/integration/data/TestCSVFileImport.test10rows.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
0,a,0.0
1,b,1.1
2,c,2.2
3,d,
4,,4.4
,f,5.5
,,
7,h,7.7
8,i,8.8
9,j,9.9
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2021-12-27 00:00:00.000
Binary file not shown.
Binary file added tests/integration/data/TestOrcFile.emptyFile.orc
Binary file not shown.
Binary file not shown.
Binary file added tests/integration/data/TestOrcFile.test1.orc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tests/integration/data/TestOrcFile.testSeek.orc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tests/integration/data/TestOrcFile.testTimestamp.orc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tests/integration/data/bad_bloom_filter_1.6.0.orc
Binary file not shown.
Binary file added tests/integration/data/bad_bloom_filter_1.6.11.orc
Binary file not shown.
Binary file added tests/integration/data/complextypes_iceberg.orc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tests/integration/data/decimal.orc
Binary file not shown.
Binary file added tests/integration/data/decimal64_v2.orc
Binary file not shown.
Binary file added tests/integration/data/decimal64_v2_cplusplus.orc
Binary file not shown.
Binary file added tests/integration/data/demo-11-none.orc
Binary file not shown.
Binary file added tests/integration/data/demo-11-zlib.orc
Binary file not shown.
Binary file added tests/integration/data/demo-12-zlib.orc
Binary file not shown.
Binary file added tests/integration/data/encrypted/kms.keystore
Binary file not shown.
Binary file added tests/integration/data/encrypted/sample1.orc
Binary file not shown.
Binary file added tests/integration/data/encrypted/sample2.orc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tests/integration/data/expected/decimal.jsn.gz
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added tests/integration/data/nulls-at-end-snappy.orc
Binary file not shown.
Binary file added tests/integration/data/orc-file-11-format.orc
Binary file not shown.
Binary file added tests/integration/data/orc_index_int_string.orc
Binary file not shown.
Binary file added tests/integration/data/orc_no_format.orc
Binary file not shown.
Binary file added tests/integration/data/orc_split_elim.orc
Binary file not shown.
Binary file added tests/integration/data/orc_split_elim_cpp.orc
Binary file not shown.
Binary file added tests/integration/data/orc_split_elim_new.orc
Binary file not shown.
Binary file added tests/integration/data/over1k_bloom.orc
Binary file not shown.
Binary file added tests/integration/data/version1999.orc
Binary file not shown.
Empty file added tests/integration/data/zero.orc
Empty file.
197 changes: 197 additions & 0 deletions tests/integration/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
#![allow(non_snake_case)]

//! Tests against `.orc` and `.jsn.gz` files from the official test suite (`orc/examples/`)
use std::fs::File;
use std::io::Read;

use pretty_assertions::assert_eq;

use arrow::array::StructArray;
use arrow::record_batch::RecordBatch;
use datafusion_orc::arrow_reader::ArrowReaderBuilder;

/// Checks that parsing a `.orc` file produces the expected result stored in the matching
/// `.jsn.gz` file.
///
/// `name` is the file stem shared by `tests/integration/data/{name}.orc` and
/// `tests/integration/data/expected/{name}.jsn.gz` (both relative to `CARGO_MANIFEST_DIR`).
/// Every row read from the `.orc` file is converted to a JSON value, and every non-empty
/// line of the decompressed `.jsn.gz` file is parsed as a JSON value; both sides are then
/// pretty-printed with `serde_json` and compared as text.
///
/// # Panics
///
/// Panics if either file cannot be opened, read or parsed, if the two renderings differ,
/// or if the reader's `total_row_count()` differs from the number of rows actually read.
fn test_expected_file(name: &str) {
    /// Returns the longest prefix of `s` that is at most `max_bytes` bytes long and ends on
    /// a UTF-8 character boundary.
    fn bounded_prefix(s: &str, max_bytes: usize) -> &str {
        if s.len() <= max_bytes {
            return s;
        }
        let mut end = max_bytes;
        // Index 0 is always a character boundary, so this loop terminates.
        while !s.is_char_boundary(end) {
            end -= 1;
        }
        &s[..end]
    }

    let dir = env!("CARGO_MANIFEST_DIR");
    let orc_path = format!("{}/tests/integration/data/{}.orc", dir, name);
    let jsn_gz_path = format!("{}/tests/integration/data/expected/{}.jsn.gz", dir, name);
    let f = File::open(orc_path).expect("Could not open .orc");
    let builder = ArrowReaderBuilder::try_new(f).unwrap();
    let orc_reader = builder.build();
    // Taken before the reader is consumed by `collect` below.
    let total_row_count = orc_reader.total_row_count();

    // Read .orc into JSON objects
    let batches: Vec<RecordBatch> = orc_reader.collect::<Result<Vec<_>, _>>().unwrap();
    let objects: Vec<serde_json::Value> = batches
        .into_iter()
        .map(|batch| -> StructArray { batch.into() })
        .flat_map(|array| {
            arrow_json::writer::array_to_json_array(&array)
                .expect("Could not serialize convert row from .orc to JSON value")
        })
        .collect();

    // Read expected JSON objects
    let mut expected_json = String::new();
    flate2::read::GzDecoder::new(&File::open(jsn_gz_path).expect("Could not open .jsn.gz"))
        .read_to_string(&mut expected_json)
        .expect("Could not read .jsn.gz");

    let objects_count = objects.len();

    // Reencode the input to normalize it
    let expected_lines = expected_json
        .split('\n')
        .filter(|line| !line.is_empty())
        .map(|line| {
            serde_json::from_str::<serde_json::Value>(line)
                .expect("Could not parse line in .jsn.gz")
        })
        .map(|v| {
            serde_json::to_string_pretty(&v).expect("Could not re-serialize line from .jsn.gz")
        })
        .collect::<Vec<_>>()
        .join("\n");

    let lines = objects
        .into_iter()
        .map(|v| serde_json::to_string_pretty(&v).expect("Could not serialize row from .orc"))
        .collect::<Vec<_>>()
        .join("\n");

    if lines.len() < 1000 {
        assert_eq!(lines, expected_lines);
    } else {
        // pretty_assertions consumes too much RAM and CPU on large diffs,
        // and it's unreadable anyway
        //
        // Only a bounded prefix goes through the pretty diff. The prefix is clamped to each
        // string's length and to a character boundary, so a short `expected_lines` or a
        // multi-byte character at byte 1000 yields a diff instead of a slicing panic.
        assert_eq!(
            bounded_prefix(&lines, 1000),
            bounded_prefix(&expected_lines, 1000)
        );
        assert!(lines == expected_lines);
Comment on lines +65 to +68
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we can parse the expected JSON into Arrow first with arrow_json then compare on RecordBatches

Assuming the schema inference works in our favour 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would avoid the issue of different decimal representation.

However, it makes the tests a little unreliable, as they wouldn't detect data lost both by arrow_json and datafusion_orc. Probably not a big deal

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a fair point. I guess we're a bit handicapped by the expected data being in JSON form which can make it harder for us to be rigorous. I'll raise an issue for exploring ways around this 👍

    }

    assert_eq!(total_row_count, objects_count as u64);
}

/// Runs `test_expected_file` on the `TestOrcFile.columnProjection` fixture.
#[test]
fn columnProjection() {
    let fixture = "TestOrcFile.columnProjection";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.emptyFile` fixture.
#[test]
fn emptyFile() {
    let fixture = "TestOrcFile.emptyFile";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.metaData` fixture (currently ignored).
#[test]
#[ignore] // TODO: Why?
fn metaData() {
    let fixture = "TestOrcFile.metaData";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.test1` fixture (currently ignored).
#[test]
#[ignore] // TODO: Why?
fn test1() {
    let fixture = "TestOrcFile.test1";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testDate1900` fixture (currently ignored).
#[test]
#[ignore] // TODO: Incorrect timezone + representation differs
fn testDate1900() {
    let fixture = "TestOrcFile.testDate1900";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testDate2038` fixture (currently ignored).
#[test]
#[ignore] // TODO: Incorrect timezone + representation differs
fn testDate2038() {
    let fixture = "TestOrcFile.testDate2038";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testMemoryManagementV11` fixture.
#[test]
fn testMemoryManagementV11() {
    let fixture = "TestOrcFile.testMemoryManagementV11";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testMemoryManagementV12` fixture.
#[test]
fn testMemoryManagementV12() {
    let fixture = "TestOrcFile.testMemoryManagementV12";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testPredicatePushdown` fixture.
#[test]
fn testPredicatePushdown() {
    let fixture = "TestOrcFile.testPredicatePushdown";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testSeek` fixture (currently ignored).
#[test]
#[ignore] // TODO: Why?
fn testSeek() {
    let fixture = "TestOrcFile.testSeek";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testSnappy` fixture.
#[test]
fn testSnappy() {
    let fixture = "TestOrcFile.testSnappy";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testStringAndBinaryStatistics` fixture
/// (currently ignored).
#[test]
#[ignore] // TODO: arrow_json does not support binaries
fn testStringAndBinaryStatistics() {
    let fixture = "TestOrcFile.testStringAndBinaryStatistics";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testStripeLevelStats` fixture.
#[test]
fn testStripeLevelStats() {
    let fixture = "TestOrcFile.testStripeLevelStats";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testTimestamp` fixture (currently ignored).
#[test]
#[ignore] // TODO: Non-struct root type are not supported yet
fn testTimestamp() {
    let fixture = "TestOrcFile.testTimestamp";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testUnionAndTimestamp` fixture
/// (currently ignored).
#[test]
#[ignore] // TODO: Unions are not supported yet
fn testUnionAndTimestamp() {
    let fixture = "TestOrcFile.testUnionAndTimestamp";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestOrcFile.testWithoutIndex` fixture.
#[test]
fn testWithoutIndex() {
    let fixture = "TestOrcFile.testWithoutIndex";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestVectorOrcFile.testLz4` fixture.
#[test]
fn testLz4() {
    let fixture = "TestVectorOrcFile.testLz4";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `TestVectorOrcFile.testLzo` fixture.
#[test]
fn testLzo() {
    let fixture = "TestVectorOrcFile.testLzo";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `decimal` fixture (currently ignored).
#[test]
#[ignore] // TODO: Differs on representation of some Decimals
fn decimal() {
    let fixture = "decimal";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `demo-12-zlib` fixture (currently ignored).
#[test]
#[ignore] // TODO: Too slow
fn zlib() {
    let fixture = "demo-12-zlib";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `nulls-at-end-snappy` fixture (currently ignored).
#[test]
#[ignore] // TODO: Why?
fn nulls_at_end_snappy() {
    let fixture = "nulls-at-end-snappy";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `orc-file-11-format` fixture (currently ignored).
#[test]
#[ignore] // TODO: Why?
fn orc_11_format() {
    let fixture = "orc-file-11-format";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `orc_index_int_string` fixture.
#[test]
fn orc_index_int_string() {
    let fixture = "orc_index_int_string";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `orc_split_elim` fixture (currently ignored).
#[test]
#[ignore] // TODO: not yet implemented
fn orc_split_elim() {
    let fixture = "orc_split_elim";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `orc_split_elim_cpp` fixture (currently ignored).
#[test]
#[ignore] // TODO: not yet implemented
fn orc_split_elim_cpp() {
    let fixture = "orc_split_elim_cpp";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `orc_split_elim_new` fixture (currently ignored).
#[test]
#[ignore] // TODO: not yet implemented
fn orc_split_elim_new() {
    let fixture = "orc_split_elim_new";
    test_expected_file(fixture);
}
/// Runs `test_expected_file` on the `over1k_bloom` fixture (currently ignored).
#[test]
#[ignore] // TODO: not yet implemented
fn over1k_bloom() {
    let fixture = "over1k_bloom";
    test_expected_file(fixture);
}
Loading