From ac5a8ab29e0fb43629ca93ffff1e722412418b36 Mon Sep 17 00:00:00 2001
From: Val Lorentz
Date: Fri, 24 May 2024 15:34:26 +0200
Subject: [PATCH] Avoid adding a NullBuffer when decoding timestamp offsets
 (#90)

Since 60288cdee57f72dec84c3fd9f6085561568aad49 we have applied a
`unary_opt` kernel to convert timestamps from the writer timezone to the
reader timezone while decoding. This kernel only returns `None` when the
date falls outside the 1677-2262 range supported by nanosecond
timestamps. Unfortunately, even though the kernel rarely returns `None`,
applying it always attaches a `NullBuffer` to the resulting array, even
when the source array did not have one.

To avoid adding an unnecessary `NullBuffer`, this commit first tries to
apply a non-nullable kernel (`try_unary`) and only falls back to
`unary_opt` in the rare case that it fails.

An alternative implementation that does not risk running the kernel
twice would be to check the `null_count` of the resulting `NullBuffer`
after running the kernel and strip the buffer when the count is zero;
but that would still require the unnecessary allocation of a
`NullBuffer`.
---
 src/array_decoder/timestamp.rs | 12 ++++++++++--
 tests/integration/main.rs      | 23 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/array_decoder/timestamp.rs b/src/array_decoder/timestamp.rs
index 783354eb..38e4c0b6 100644
--- a/src/array_decoder/timestamp.rs
+++ b/src/array_decoder/timestamp.rs
@@ -109,7 +109,8 @@ impl ArrayBatchDecoder for TimestampOffsetArrayDecoder {
         let array = self
             .inner
             .next_primitive_batch(batch_size, parent_present)?;
-        let array = array.unary_opt::<_, TimestampNanosecondType>(|ts| {
+
+        let convert_timezone = |ts| {
             // Convert from writer timezone to reader timezone (which we default to UTC)
             // TODO: more efficient way of doing this?
             self.writer_tz
@@ -117,7 +118,14 @@
                 .naive_local()
                 .and_utc()
                 .timestamp_nanos_opt()
-        });
+        };
+        let array = array
+            // first try a non-nullable kernel, so a non-nullable batch stays non-nullable
+            .try_unary::<_, TimestampNanosecondType, _>(|ts| convert_timezone(ts).ok_or(()))
+            // in the rare case one of the values was out of the 1677-2262 range (see
+            // ),
+            // try again by allowing a nullable batch as output
+            .unwrap_or_else(|()| array.unary_opt::<_, TimestampNanosecondType>(convert_timezone));
         let array = Arc::new(array) as ArrayRef;
         Ok(array)
     }
diff --git a/tests/integration/main.rs b/tests/integration/main.rs
index aecee913..b4838300 100644
--- a/tests/integration/main.rs
+++ b/tests/integration/main.rs
@@ -3,7 +3,9 @@
 use std::fs::File;
 
 use arrow::{
+    array::{Array, AsArray},
     compute::concat_batches,
+    datatypes::TimestampNanosecondType,
     ipc::reader::FileReader,
     record_batch::{RecordBatch, RecordBatchReader},
 };
@@ -97,6 +99,27 @@ fn test_date_1900() {
     test_expected_file("TestOrcFile.testDate1900");
 }
 
+#[test]
+fn test_date_1900_not_null() {
+    // Don't use read_orc_file() because it concatenates batches, which would detect
+    // that there are no nulls and remove the NullBuffer, making this test useless
+    let path = format!(
+        "{}/tests/integration/data/TestOrcFile.testDate1900.orc",
+        env!("CARGO_MANIFEST_DIR"),
+    );
+    let f = File::open(path).unwrap();
+    let reader = ArrowReaderBuilder::try_new(f).unwrap().build();
+    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
+
+    for batch in batches {
+        assert!(batch.columns()[0]
+            .as_primitive_opt::<TimestampNanosecondType>()
+            .unwrap()
+            .nulls()
+            .is_none());
+    }
+}
+
 #[test]
 #[ignore]
 // TODO: pending https://github.com/chronotope/chrono-tz/issues/155
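
The try_unary-then-unary_opt fallback used in timestamp.rs above generalizes to any
fallible element-wise conversion on a PrimitiveArray. The sketch below is not part of
this patch: the seconds_to_nanos helper and its overflow check are made-up examples,
assuming only the arrow crate; the two kernel calls mirror the ones added in the diff.

    use std::sync::Arc;

    use arrow::array::{Array, ArrayRef, Int64Array};
    use arrow::datatypes::TimestampNanosecondType;

    // Hypothetical helper: convert epoch seconds to nanoseconds, preferring a
    // non-nullable output. `try_unary` keeps the input's (absent) NullBuffer;
    // `unary_opt` is only used if some value overflows the nanosecond range.
    fn seconds_to_nanos(array: &Int64Array) -> ArrayRef {
        let convert = |secs: i64| secs.checked_mul(1_000_000_000);
        let converted = array
            // fast path: no NullBuffer is allocated when every value converts
            .try_unary::<_, TimestampNanosecondType, _>(|secs| convert(secs).ok_or(()))
            // slow path: rerun with unary_opt, turning failed conversions into nulls
            .unwrap_or_else(|()| array.unary_opt::<_, TimestampNanosecondType>(convert));
        Arc::new(converted) as ArrayRef
    }

    fn main() {
        let input = Int64Array::from(vec![0_i64, 1_716_558_866]);
        let output = seconds_to_nanos(&input);
        // Both values fit in nanoseconds, so the fast path succeeded and the
        // output carries no NullBuffer.
        assert!(output.nulls().is_none());
    }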