diff --git a/Cargo.toml b/Cargo.toml index f5cd5d8e..a5b3344f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,12 +40,15 @@ tokio = { version = "1.28", features = [ zstd = "0.12" [dev-dependencies] +arrow-json = "50.0.0" async-trait = "0.1.77" criterion = { version = "0.5", default-features = false, features = ["async_tokio"] } datafusion = "36.0.0" datafusion-expr = "36.0.0" datafusion-physical-expr = "36.0.0" object_store = "0.9.0" +pretty_assertions = "1.3.0" +serde_json = { version = "1.0", default-features = false, features = ["std"] } [[bench]] name = "arrow_reader" diff --git a/tests/integration/data/README.md b/tests/integration/data/README.md new file mode 100644 index 00000000..97ab678b --- /dev/null +++ b/tests/integration/data/README.md @@ -0,0 +1 @@ +These files are imported from [Apache ORC's examples](https://github.com/apache/orc/tree/207085de3722054485e685811f8e5f2e11aa4deb/examples) diff --git a/tests/integration/data/TestCSVFileImport.test10rows.csv b/tests/integration/data/TestCSVFileImport.test10rows.csv new file mode 100644 index 00000000..5404dac1 --- /dev/null +++ b/tests/integration/data/TestCSVFileImport.test10rows.csv @@ -0,0 +1,10 @@ +0,a,0.0 +1,b,1.1 +2,c,2.2 +3,d, +4,,4.4 +,f,5.5 +,, +7,h,7.7 +8,i,8.8 +9,j,9.9 \ No newline at end of file diff --git a/tests/integration/data/TestCSVFileImport.testTimezoneOption.csv b/tests/integration/data/TestCSVFileImport.testTimezoneOption.csv new file mode 100644 index 00000000..9e87f23a --- /dev/null +++ b/tests/integration/data/TestCSVFileImport.testTimezoneOption.csv @@ -0,0 +1 @@ +2021-12-27 00:00:00.000 \ No newline at end of file diff --git a/tests/integration/data/TestOrcFile.columnProjection.orc b/tests/integration/data/TestOrcFile.columnProjection.orc new file mode 100644 index 00000000..b0f91f37 Binary files /dev/null and b/tests/integration/data/TestOrcFile.columnProjection.orc differ diff --git a/tests/integration/data/TestOrcFile.emptyFile.orc 
b/tests/integration/data/TestOrcFile.emptyFile.orc new file mode 100644 index 00000000..ecdadcbf Binary files /dev/null and b/tests/integration/data/TestOrcFile.emptyFile.orc differ diff --git a/tests/integration/data/TestOrcFile.metaData.orc b/tests/integration/data/TestOrcFile.metaData.orc new file mode 100644 index 00000000..d454581c Binary files /dev/null and b/tests/integration/data/TestOrcFile.metaData.orc differ diff --git a/tests/integration/data/TestOrcFile.test1.orc b/tests/integration/data/TestOrcFile.test1.orc new file mode 100644 index 00000000..4fb0beff Binary files /dev/null and b/tests/integration/data/TestOrcFile.test1.orc differ diff --git a/tests/integration/data/TestOrcFile.testDate1900.orc b/tests/integration/data/TestOrcFile.testDate1900.orc new file mode 100644 index 00000000..f51ffdbd Binary files /dev/null and b/tests/integration/data/TestOrcFile.testDate1900.orc differ diff --git a/tests/integration/data/TestOrcFile.testDate2038.orc b/tests/integration/data/TestOrcFile.testDate2038.orc new file mode 100644 index 00000000..cd11fa8a Binary files /dev/null and b/tests/integration/data/TestOrcFile.testDate2038.orc differ diff --git a/tests/integration/data/TestOrcFile.testMemoryManagementV11.orc b/tests/integration/data/TestOrcFile.testMemoryManagementV11.orc new file mode 100644 index 00000000..98852b6e Binary files /dev/null and b/tests/integration/data/TestOrcFile.testMemoryManagementV11.orc differ diff --git a/tests/integration/data/TestOrcFile.testMemoryManagementV12.orc b/tests/integration/data/TestOrcFile.testMemoryManagementV12.orc new file mode 100644 index 00000000..dd6fc1e6 Binary files /dev/null and b/tests/integration/data/TestOrcFile.testMemoryManagementV12.orc differ diff --git a/tests/integration/data/TestOrcFile.testPredicatePushdown.orc b/tests/integration/data/TestOrcFile.testPredicatePushdown.orc new file mode 100644 index 00000000..4865dd81 Binary files /dev/null and 
b/tests/integration/data/TestOrcFile.testPredicatePushdown.orc differ diff --git a/tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc b/tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc new file mode 100644 index 00000000..d32bb1f1 Binary files /dev/null and b/tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexCPlusPlus.orc differ diff --git a/tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc b/tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc new file mode 100644 index 00000000..c57c5b73 Binary files /dev/null and b/tests/integration/data/TestOrcFile.testSargSkipPickupGroupWithoutIndexJava.orc differ diff --git a/tests/integration/data/TestOrcFile.testSeek.orc b/tests/integration/data/TestOrcFile.testSeek.orc new file mode 100644 index 00000000..006b83f2 Binary files /dev/null and b/tests/integration/data/TestOrcFile.testSeek.orc differ diff --git a/tests/integration/data/TestOrcFile.testSnappy.orc b/tests/integration/data/TestOrcFile.testSnappy.orc new file mode 100644 index 00000000..aa6cc9c9 Binary files /dev/null and b/tests/integration/data/TestOrcFile.testSnappy.orc differ diff --git a/tests/integration/data/TestOrcFile.testStringAndBinaryStatistics.orc b/tests/integration/data/TestOrcFile.testStringAndBinaryStatistics.orc new file mode 100644 index 00000000..4282c2a1 Binary files /dev/null and b/tests/integration/data/TestOrcFile.testStringAndBinaryStatistics.orc differ diff --git a/tests/integration/data/TestOrcFile.testStripeLevelStats.orc b/tests/integration/data/TestOrcFile.testStripeLevelStats.orc new file mode 100644 index 00000000..7073bfad Binary files /dev/null and b/tests/integration/data/TestOrcFile.testStripeLevelStats.orc differ diff --git a/tests/integration/data/TestOrcFile.testTimestamp.orc b/tests/integration/data/TestOrcFile.testTimestamp.orc new file mode 100644 index 00000000..505d42cd Binary 
files /dev/null and b/tests/integration/data/TestOrcFile.testTimestamp.orc differ diff --git a/tests/integration/data/TestOrcFile.testUnionAndTimestamp.orc b/tests/integration/data/TestOrcFile.testUnionAndTimestamp.orc new file mode 100644 index 00000000..377862df Binary files /dev/null and b/tests/integration/data/TestOrcFile.testUnionAndTimestamp.orc differ diff --git a/tests/integration/data/TestOrcFile.testWithoutCompressionBlockSize.orc b/tests/integration/data/TestOrcFile.testWithoutCompressionBlockSize.orc new file mode 100644 index 00000000..552f4e77 Binary files /dev/null and b/tests/integration/data/TestOrcFile.testWithoutCompressionBlockSize.orc differ diff --git a/tests/integration/data/TestOrcFile.testWithoutIndex.orc b/tests/integration/data/TestOrcFile.testWithoutIndex.orc new file mode 100644 index 00000000..a150df7a Binary files /dev/null and b/tests/integration/data/TestOrcFile.testWithoutIndex.orc differ diff --git a/tests/integration/data/TestStringDictionary.testRowIndex.orc b/tests/integration/data/TestStringDictionary.testRowIndex.orc new file mode 100644 index 00000000..cba483d1 Binary files /dev/null and b/tests/integration/data/TestStringDictionary.testRowIndex.orc differ diff --git a/tests/integration/data/TestVectorOrcFile.testLz4.orc b/tests/integration/data/TestVectorOrcFile.testLz4.orc new file mode 100644 index 00000000..dacba8de Binary files /dev/null and b/tests/integration/data/TestVectorOrcFile.testLz4.orc differ diff --git a/tests/integration/data/TestVectorOrcFile.testLzo.orc b/tests/integration/data/TestVectorOrcFile.testLzo.orc new file mode 100644 index 00000000..2b01fb56 Binary files /dev/null and b/tests/integration/data/TestVectorOrcFile.testLzo.orc differ diff --git a/tests/integration/data/TestVectorOrcFile.testZstd.0.12.orc b/tests/integration/data/TestVectorOrcFile.testZstd.0.12.orc new file mode 100644 index 00000000..d2644194 Binary files /dev/null and b/tests/integration/data/TestVectorOrcFile.testZstd.0.12.orc 
differ diff --git a/tests/integration/data/bad_bloom_filter_1.6.0.orc b/tests/integration/data/bad_bloom_filter_1.6.0.orc new file mode 100644 index 00000000..6c52f4f0 Binary files /dev/null and b/tests/integration/data/bad_bloom_filter_1.6.0.orc differ diff --git a/tests/integration/data/bad_bloom_filter_1.6.11.orc b/tests/integration/data/bad_bloom_filter_1.6.11.orc new file mode 100644 index 00000000..fc3ffef8 Binary files /dev/null and b/tests/integration/data/bad_bloom_filter_1.6.11.orc differ diff --git a/tests/integration/data/complextypes_iceberg.orc b/tests/integration/data/complextypes_iceberg.orc new file mode 100644 index 00000000..2829e501 Binary files /dev/null and b/tests/integration/data/complextypes_iceberg.orc differ diff --git a/tests/integration/data/corrupt/missing_blob_stream_in_string_dict.orc b/tests/integration/data/corrupt/missing_blob_stream_in_string_dict.orc new file mode 100644 index 00000000..1c7f7420 Binary files /dev/null and b/tests/integration/data/corrupt/missing_blob_stream_in_string_dict.orc differ diff --git a/tests/integration/data/corrupt/missing_length_stream_in_string_dict.orc b/tests/integration/data/corrupt/missing_length_stream_in_string_dict.orc new file mode 100644 index 00000000..92912b0e Binary files /dev/null and b/tests/integration/data/corrupt/missing_length_stream_in_string_dict.orc differ diff --git a/tests/integration/data/corrupt/negative_dict_entry_lengths.orc b/tests/integration/data/corrupt/negative_dict_entry_lengths.orc new file mode 100644 index 00000000..171537db Binary files /dev/null and b/tests/integration/data/corrupt/negative_dict_entry_lengths.orc differ diff --git a/tests/integration/data/corrupt/stripe_footer_bad_column_encodings.orc b/tests/integration/data/corrupt/stripe_footer_bad_column_encodings.orc new file mode 100644 index 00000000..24466239 Binary files /dev/null and b/tests/integration/data/corrupt/stripe_footer_bad_column_encodings.orc differ diff --git 
a/tests/integration/data/decimal.orc b/tests/integration/data/decimal.orc new file mode 100644 index 00000000..cb0f7b9d Binary files /dev/null and b/tests/integration/data/decimal.orc differ diff --git a/tests/integration/data/decimal64_v2.orc b/tests/integration/data/decimal64_v2.orc new file mode 100644 index 00000000..196be7fe Binary files /dev/null and b/tests/integration/data/decimal64_v2.orc differ diff --git a/tests/integration/data/decimal64_v2_cplusplus.orc b/tests/integration/data/decimal64_v2_cplusplus.orc new file mode 100644 index 00000000..faf35247 Binary files /dev/null and b/tests/integration/data/decimal64_v2_cplusplus.orc differ diff --git a/tests/integration/data/demo-11-none.orc b/tests/integration/data/demo-11-none.orc new file mode 100644 index 00000000..1d1d714a Binary files /dev/null and b/tests/integration/data/demo-11-none.orc differ diff --git a/tests/integration/data/demo-11-zlib.orc b/tests/integration/data/demo-11-zlib.orc new file mode 100644 index 00000000..db0ff15e Binary files /dev/null and b/tests/integration/data/demo-11-zlib.orc differ diff --git a/tests/integration/data/demo-12-zlib.orc b/tests/integration/data/demo-12-zlib.orc new file mode 100644 index 00000000..862dd27a Binary files /dev/null and b/tests/integration/data/demo-12-zlib.orc differ diff --git a/tests/integration/data/encrypted/kms.keystore b/tests/integration/data/encrypted/kms.keystore new file mode 100644 index 00000000..cd9152b7 Binary files /dev/null and b/tests/integration/data/encrypted/kms.keystore differ diff --git a/tests/integration/data/encrypted/sample1.orc b/tests/integration/data/encrypted/sample1.orc new file mode 100644 index 00000000..aa0e502a Binary files /dev/null and b/tests/integration/data/encrypted/sample1.orc differ diff --git a/tests/integration/data/encrypted/sample2.orc b/tests/integration/data/encrypted/sample2.orc new file mode 100644 index 00000000..7f9b2146 Binary files /dev/null and b/tests/integration/data/encrypted/sample2.orc 
differ diff --git a/tests/integration/data/expected/TestOrcFile.columnProjection.jsn.gz b/tests/integration/data/expected/TestOrcFile.columnProjection.jsn.gz new file mode 100644 index 00000000..ab851520 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.columnProjection.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.emptyFile.jsn.gz b/tests/integration/data/expected/TestOrcFile.emptyFile.jsn.gz new file mode 100644 index 00000000..91c85cd7 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.emptyFile.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.metaData.jsn.gz b/tests/integration/data/expected/TestOrcFile.metaData.jsn.gz new file mode 100644 index 00000000..da9dffbe Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.metaData.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.test1.jsn.gz b/tests/integration/data/expected/TestOrcFile.test1.jsn.gz new file mode 100644 index 00000000..5eab19a4 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.test1.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testDate1900.jsn.gz b/tests/integration/data/expected/TestOrcFile.testDate1900.jsn.gz new file mode 100644 index 00000000..62dbaba4 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testDate1900.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testDate2038.jsn.gz b/tests/integration/data/expected/TestOrcFile.testDate2038.jsn.gz new file mode 100644 index 00000000..8a84655d Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testDate2038.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testMemoryManagementV11.jsn.gz b/tests/integration/data/expected/TestOrcFile.testMemoryManagementV11.jsn.gz new file mode 100644 index 00000000..591e3c79 Binary files /dev/null and 
b/tests/integration/data/expected/TestOrcFile.testMemoryManagementV11.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testMemoryManagementV12.jsn.gz b/tests/integration/data/expected/TestOrcFile.testMemoryManagementV12.jsn.gz new file mode 100644 index 00000000..14ece1d0 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testMemoryManagementV12.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testPredicatePushdown.jsn.gz b/tests/integration/data/expected/TestOrcFile.testPredicatePushdown.jsn.gz new file mode 100644 index 00000000..cf80773f Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testPredicatePushdown.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testSeek.jsn.gz b/tests/integration/data/expected/TestOrcFile.testSeek.jsn.gz new file mode 100644 index 00000000..d6afadde Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testSeek.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testSnappy.jsn.gz b/tests/integration/data/expected/TestOrcFile.testSnappy.jsn.gz new file mode 100644 index 00000000..1bd2ae96 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testSnappy.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testStringAndBinaryStatistics.jsn.gz b/tests/integration/data/expected/TestOrcFile.testStringAndBinaryStatistics.jsn.gz new file mode 100644 index 00000000..76d962e6 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testStringAndBinaryStatistics.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testStripeLevelStats.jsn.gz b/tests/integration/data/expected/TestOrcFile.testStripeLevelStats.jsn.gz new file mode 100644 index 00000000..f5437136 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testStripeLevelStats.jsn.gz differ diff --git 
a/tests/integration/data/expected/TestOrcFile.testTimestamp.jsn.gz b/tests/integration/data/expected/TestOrcFile.testTimestamp.jsn.gz new file mode 100644 index 00000000..375c9a01 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testTimestamp.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testUnionAndTimestamp.jsn.gz b/tests/integration/data/expected/TestOrcFile.testUnionAndTimestamp.jsn.gz new file mode 100644 index 00000000..bd0c5a6f Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testUnionAndTimestamp.jsn.gz differ diff --git a/tests/integration/data/expected/TestOrcFile.testWithoutIndex.jsn.gz b/tests/integration/data/expected/TestOrcFile.testWithoutIndex.jsn.gz new file mode 100644 index 00000000..f2128ca1 Binary files /dev/null and b/tests/integration/data/expected/TestOrcFile.testWithoutIndex.jsn.gz differ diff --git a/tests/integration/data/expected/TestStringDictionary.testRowIndex.jsn.gz b/tests/integration/data/expected/TestStringDictionary.testRowIndex.jsn.gz new file mode 100644 index 00000000..5fb46b6b Binary files /dev/null and b/tests/integration/data/expected/TestStringDictionary.testRowIndex.jsn.gz differ diff --git a/tests/integration/data/expected/TestVectorOrcFile.testLz4.jsn.gz b/tests/integration/data/expected/TestVectorOrcFile.testLz4.jsn.gz new file mode 100644 index 00000000..60a846e3 Binary files /dev/null and b/tests/integration/data/expected/TestVectorOrcFile.testLz4.jsn.gz differ diff --git a/tests/integration/data/expected/TestVectorOrcFile.testLzo.jsn.gz b/tests/integration/data/expected/TestVectorOrcFile.testLzo.jsn.gz new file mode 100644 index 00000000..e002379c Binary files /dev/null and b/tests/integration/data/expected/TestVectorOrcFile.testLzo.jsn.gz differ diff --git a/tests/integration/data/expected/decimal.jsn.gz b/tests/integration/data/expected/decimal.jsn.gz new file mode 100644 index 00000000..e634bd70 Binary files /dev/null and 
b/tests/integration/data/expected/decimal.jsn.gz differ diff --git a/tests/integration/data/expected/demo-12-zlib.jsn.gz b/tests/integration/data/expected/demo-12-zlib.jsn.gz new file mode 100644 index 00000000..ab532846 Binary files /dev/null and b/tests/integration/data/expected/demo-12-zlib.jsn.gz differ diff --git a/tests/integration/data/expected/nulls-at-end-snappy.jsn.gz b/tests/integration/data/expected/nulls-at-end-snappy.jsn.gz new file mode 100644 index 00000000..a51a95bf Binary files /dev/null and b/tests/integration/data/expected/nulls-at-end-snappy.jsn.gz differ diff --git a/tests/integration/data/expected/orc-file-11-format.jsn.gz b/tests/integration/data/expected/orc-file-11-format.jsn.gz new file mode 100644 index 00000000..819f4a22 Binary files /dev/null and b/tests/integration/data/expected/orc-file-11-format.jsn.gz differ diff --git a/tests/integration/data/expected/orc_index_int_string.jsn.gz b/tests/integration/data/expected/orc_index_int_string.jsn.gz new file mode 100644 index 00000000..4e6de3dc Binary files /dev/null and b/tests/integration/data/expected/orc_index_int_string.jsn.gz differ diff --git a/tests/integration/data/expected/orc_split_elim.jsn.gz b/tests/integration/data/expected/orc_split_elim.jsn.gz new file mode 100644 index 00000000..52636996 Binary files /dev/null and b/tests/integration/data/expected/orc_split_elim.jsn.gz differ diff --git a/tests/integration/data/expected/orc_split_elim_cpp.jsn.gz b/tests/integration/data/expected/orc_split_elim_cpp.jsn.gz new file mode 100644 index 00000000..52636996 Binary files /dev/null and b/tests/integration/data/expected/orc_split_elim_cpp.jsn.gz differ diff --git a/tests/integration/data/expected/orc_split_elim_new.jsn.gz b/tests/integration/data/expected/orc_split_elim_new.jsn.gz new file mode 100644 index 00000000..52636996 Binary files /dev/null and b/tests/integration/data/expected/orc_split_elim_new.jsn.gz differ diff --git a/tests/integration/data/expected/over1k_bloom.jsn.gz 
b/tests/integration/data/expected/over1k_bloom.jsn.gz new file mode 100644 index 00000000..b159f0c2 Binary files /dev/null and b/tests/integration/data/expected/over1k_bloom.jsn.gz differ diff --git a/tests/integration/data/nulls-at-end-snappy.orc b/tests/integration/data/nulls-at-end-snappy.orc new file mode 100644 index 00000000..2099c484 Binary files /dev/null and b/tests/integration/data/nulls-at-end-snappy.orc differ diff --git a/tests/integration/data/orc-file-11-format.orc b/tests/integration/data/orc-file-11-format.orc new file mode 100644 index 00000000..41653c84 Binary files /dev/null and b/tests/integration/data/orc-file-11-format.orc differ diff --git a/tests/integration/data/orc_index_int_string.orc b/tests/integration/data/orc_index_int_string.orc new file mode 100644 index 00000000..17f90729 Binary files /dev/null and b/tests/integration/data/orc_index_int_string.orc differ diff --git a/tests/integration/data/orc_no_format.orc b/tests/integration/data/orc_no_format.orc new file mode 100644 index 00000000..3efb93a4 Binary files /dev/null and b/tests/integration/data/orc_no_format.orc differ diff --git a/tests/integration/data/orc_split_elim.orc b/tests/integration/data/orc_split_elim.orc new file mode 100644 index 00000000..cd145d34 Binary files /dev/null and b/tests/integration/data/orc_split_elim.orc differ diff --git a/tests/integration/data/orc_split_elim_cpp.orc b/tests/integration/data/orc_split_elim_cpp.orc new file mode 100644 index 00000000..86921f34 Binary files /dev/null and b/tests/integration/data/orc_split_elim_cpp.orc differ diff --git a/tests/integration/data/orc_split_elim_new.orc b/tests/integration/data/orc_split_elim_new.orc new file mode 100644 index 00000000..24e58f13 Binary files /dev/null and b/tests/integration/data/orc_split_elim_new.orc differ diff --git a/tests/integration/data/over1k_bloom.orc b/tests/integration/data/over1k_bloom.orc new file mode 100755 index 00000000..245f3976 Binary files /dev/null and 
b/tests/integration/data/over1k_bloom.orc differ
diff --git a/tests/integration/data/version1999.orc b/tests/integration/data/version1999.orc
new file mode 100644
index 00000000..1748c703
Binary files /dev/null and b/tests/integration/data/version1999.orc differ
diff --git a/tests/integration/data/zero.orc b/tests/integration/data/zero.orc
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/integration/main.rs b/tests/integration/main.rs
new file mode 100644
index 00000000..261262b3
--- /dev/null
+++ b/tests/integration/main.rs
@@ -0,0 +1,232 @@
+#![allow(non_snake_case)]
+
+//! Tests against `.orc` and `.jsn.gz` in the official test suite (`orc/examples/`)
+
+use std::fs::File;
+use std::io::Read;
+
+use pretty_assertions::assert_eq;
+
+use arrow::array::StructArray;
+use arrow::record_batch::RecordBatch;
+use datafusion_orc::arrow_reader::ArrowReaderBuilder;
+
+/// Maximum number of bytes of each document that is handed to `pretty_assertions`.
+const MAX_PRETTY_DIFF_BYTES: usize = 1000;
+
+/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long and
+/// ends on a UTF-8 character boundary.
+///
+/// Unlike `&s[..max_bytes]`, this never panics: it accepts strings shorter than
+/// `max_bytes` and limits that fall inside a multi-byte character.
+fn bounded_prefix(s: &str, max_bytes: usize) -> &str {
+    let mut end = s.len().min(max_bytes);
+    // Index 0 is always a character boundary, so this loop terminates.
+    while !s.is_char_boundary(end) {
+        end -= 1;
+    }
+    &s[..end]
+}
+
+/// Checks parsing a `.orc` file produces the expected result in the `.jsn.gz` path
+///
+/// `name` is the file stem shared by `tests/integration/data/{name}.orc` and
+/// `tests/integration/data/expected/{name}.jsn.gz`.
+///
+/// # Panics
+///
+/// Panics if either file cannot be opened or decoded, if the rows read from the
+/// `.orc` file differ from the expected ones, or if the row count reported by
+/// the reader differs from the number of rows actually read.
+fn test_expected_file(name: &str) {
+    let dir = env!("CARGO_MANIFEST_DIR");
+    let orc_path = format!("{}/tests/integration/data/{}.orc", dir, name);
+    let jsn_gz_path = format!("{}/tests/integration/data/expected/{}.jsn.gz", dir, name);
+    let f = File::open(orc_path).expect("Could not open .orc");
+    let builder = ArrowReaderBuilder::try_new(f).unwrap();
+    let orc_reader = builder.build();
+    // Queried up front because `collect()` below consumes the reader.
+    let total_row_count = orc_reader.total_row_count();
+
+    // Read .orc into JSON objects
+    let batches: Vec<RecordBatch> = orc_reader.collect::<Result<Vec<_>, _>>().unwrap();
+    let objects: Vec<serde_json::Value> = batches
+        .into_iter()
+        .map(|batch| -> StructArray { batch.into() })
+        .flat_map(|array| {
+            arrow_json::writer::array_to_json_array(&array)
+                .expect("Could not serialize convert row from .orc to JSON value")
+        })
+        .collect();
+
+    // Read expected JSON objects
+    let mut expected_json = String::new();
+    flate2::read::GzDecoder::new(&File::open(jsn_gz_path).expect("Could not open .jsn.gz"))
+        .read_to_string(&mut expected_json)
+        .expect("Could not read .jsn.gz");
+
+    let objects_count = objects.len();
+
+    // Reencode the input to normalize it
+    let expected_lines = expected_json
+        .split('\n')
+        .filter(|line| !line.is_empty())
+        .map(|line| {
+            serde_json::from_str::<serde_json::Value>(line)
+                .expect("Could not parse line in .jsn.gz")
+        })
+        .map(|v| {
+            serde_json::to_string_pretty(&v).expect("Could not re-serialize line from .jsn.gz")
+        })
+        .collect::<Vec<_>>()
+        .join("\n");
+
+    let lines = objects
+        .into_iter()
+        .map(|v| serde_json::to_string_pretty(&v).expect("Could not serialize row from .orc"))
+        .collect::<Vec<_>>()
+        .join("\n");
+
+    if lines.len() < MAX_PRETTY_DIFF_BYTES && expected_lines.len() < MAX_PRETTY_DIFF_BYTES {
+        assert_eq!(lines, expected_lines);
+    } else {
+        // pretty_assertions consumes too much RAM and CPU on large diffs,
+        // and it's unreadable anyway, so only pretty-diff a bounded prefix
+        // and fall back to a plain comparison for the rest.
+        assert_eq!(
+            bounded_prefix(&lines, MAX_PRETTY_DIFF_BYTES),
+            bounded_prefix(&expected_lines, MAX_PRETTY_DIFF_BYTES)
+        );
+        assert!(
+            lines == expected_lines,
+            "Rows read from .orc differ from .jsn.gz (diff omitted: output too large)"
+        );
+    }
+
+    assert_eq!(total_row_count, objects_count as u64);
+}
+
+#[test]
+fn columnProjection() {
+    test_expected_file("TestOrcFile.columnProjection");
+}
+#[test]
+fn emptyFile() {
+    test_expected_file("TestOrcFile.emptyFile");
+}
+#[test]
+#[ignore] // TODO: Why?
+fn metaData() {
+    test_expected_file("TestOrcFile.metaData");
+}
+#[test]
+#[ignore] // TODO: Why?
+fn test1() {
+    test_expected_file("TestOrcFile.test1");
+}
+#[test]
+#[ignore] // TODO: Incorrect timezone + representation differs
+fn testDate1900() {
+    test_expected_file("TestOrcFile.testDate1900");
+}
+#[test]
+#[ignore] // TODO: Incorrect timezone + representation differs
+fn testDate2038() {
+    test_expected_file("TestOrcFile.testDate2038");
+}
+#[test]
+fn testMemoryManagementV11() {
+    test_expected_file("TestOrcFile.testMemoryManagementV11");
+}
+#[test]
+fn testMemoryManagementV12() {
+    test_expected_file("TestOrcFile.testMemoryManagementV12");
+}
+#[test]
+fn testPredicatePushdown() {
+    test_expected_file("TestOrcFile.testPredicatePushdown");
+}
+#[test]
+#[ignore] // TODO: Why?
+fn testSeek() {
+    test_expected_file("TestOrcFile.testSeek");
+}
+#[test]
+fn testSnappy() {
+    test_expected_file("TestOrcFile.testSnappy");
+}
+#[test]
+#[ignore] // TODO: arrow_json does not support binaries
+fn testStringAndBinaryStatistics() {
+    test_expected_file("TestOrcFile.testStringAndBinaryStatistics");
+}
+#[test]
+fn testStripeLevelStats() {
+    test_expected_file("TestOrcFile.testStripeLevelStats");
+}
+#[test]
+#[ignore] // TODO: Non-struct root type are not supported yet
+fn testTimestamp() {
+    test_expected_file("TestOrcFile.testTimestamp");
+}
+#[test]
+#[ignore] // TODO: Unions are not supported yet
+fn testUnionAndTimestamp() {
+    test_expected_file("TestOrcFile.testUnionAndTimestamp");
+}
+#[test]
+fn testWithoutIndex() {
+    test_expected_file("TestOrcFile.testWithoutIndex");
+}
+#[test]
+fn testLz4() {
+    test_expected_file("TestVectorOrcFile.testLz4");
+}
+#[test]
+fn testLzo() {
+    test_expected_file("TestVectorOrcFile.testLzo");
+}
+#[test]
+#[ignore] // TODO: Differs on representation of some Decimals
+fn decimal() {
+    test_expected_file("decimal");
+}
+#[test]
+#[ignore] // TODO: Too slow
+fn zlib() {
+    test_expected_file("demo-12-zlib");
+}
+#[test]
+#[ignore] // TODO: Why?
+fn nulls_at_end_snappy() {
+    test_expected_file("nulls-at-end-snappy");
+}
+#[test]
+#[ignore] // TODO: Why?
+fn orc_11_format() {
+    test_expected_file("orc-file-11-format");
+}
+#[test]
+fn orc_index_int_string() {
+    test_expected_file("orc_index_int_string");
+}
+#[test]
+#[ignore] // TODO: not yet implemented
+fn orc_split_elim() {
+    test_expected_file("orc_split_elim");
+}
+#[test]
+#[ignore] // TODO: not yet implemented
+fn orc_split_elim_cpp() {
+    test_expected_file("orc_split_elim_cpp");
+}
+#[test]
+#[ignore] // TODO: not yet implemented
+fn orc_split_elim_new() {
+    test_expected_file("orc_split_elim_new");
+}
+#[test]
+#[ignore] // TODO: not yet implemented
+fn over1k_bloom() {
+    test_expected_file("over1k_bloom");
+}