From ac5a8ab29e0fb43629ca93ffff1e722412418b36 Mon Sep 17 00:00:00 2001
From: Val Lorentz
Date: Fri, 24 May 2024 15:34:26 +0200
Subject: [PATCH] Avoid adding a NullBuffer when decoding timestamp offsets
 (#90)

Since 60288cdee57f72dec84c3fd9f6085561568aad49 we have applied a
`unary_opt` kernel to convert timestamps from the writer timezone to the
reader timezone while decoding. This kernel only returns `None` when the
date falls outside the 1677-2262 range supported by nanosecond
timestamps. Unfortunately, even though the kernel rarely returns `None`,
applying it always attaches a `NullBuffer` to the resulting array, even
when the source array did not have one.

To avoid adding an unnecessary `NullBuffer`, this commit first tries to
apply a non-nullable kernel (`try_unary`) and only falls back to
`unary_opt` in the rare case that it fails.

An alternative implementation that does not risk running the kernel
twice would be to check the `null_count` of the resulting `NullBuffer`
after running the kernel and strip the buffer when the count is zero;
but that would still require the unnecessary allocation of a
`NullBuffer`.
---
 src/array_decoder/timestamp.rs | 12 ++++++++++--
 tests/integration/main.rs      | 23 +++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/array_decoder/timestamp.rs b/src/array_decoder/timestamp.rs
index 783354eb..38e4c0b6 100644
--- a/src/array_decoder/timestamp.rs
+++ b/src/array_decoder/timestamp.rs
@@ -109,7 +109,8 @@ impl ArrayBatchDecoder for TimestampOffsetArrayDecoder {
         let array = self
             .inner
             .next_primitive_batch(batch_size, parent_present)?;
-        let array = array.unary_opt::<_, TimestampNanosecondType>(|ts| {
+
+        let convert_timezone = |ts| {
             // Convert from writer timezone to reader timezone (which we default to UTC)
             // TODO: more efficient way of doing this?
             self.writer_tz
@@ -117,7 +118,14 @@
                 .naive_local()
                 .and_utc()
                 .timestamp_nanos_opt()
-        });
+        };
+        let array = array
+            // first try a non-nullable kernel, so a non-nullable batch stays non-nullable
+            .try_unary::<_, TimestampNanosecondType, _>(|ts| convert_timezone(ts).ok_or(()))
+            // in the rare case one of the values was out of the 1677-2262 range (see
+            // ),
+            // try again by allowing a nullable batch as output
+            .unwrap_or_else(|()| array.unary_opt::<_, TimestampNanosecondType>(convert_timezone));
         let array = Arc::new(array) as ArrayRef;
         Ok(array)
     }
diff --git a/tests/integration/main.rs b/tests/integration/main.rs
index aecee913..b4838300 100644
--- a/tests/integration/main.rs
+++ b/tests/integration/main.rs
@@ -3,7 +3,9 @@
 use std::fs::File;
 
 use arrow::{
+    array::{Array, AsArray},
     compute::concat_batches,
+    datatypes::TimestampNanosecondType,
     ipc::reader::FileReader,
     record_batch::{RecordBatch, RecordBatchReader},
 };
@@ -97,6 +99,27 @@ fn test_date_1900() {
     test_expected_file("TestOrcFile.testDate1900");
 }
 
+#[test]
+fn test_date_1900_not_null() {
+    // Don't use read_orc_file() because it concatenates batches, which would detect
+    // that there are no nulls and remove the NullBuffer, making this test useless
+    let path = format!(
+        "{}/tests/integration/data/TestOrcFile.testDate1900.orc",
+        env!("CARGO_MANIFEST_DIR"),
+    );
+    let f = File::open(path).unwrap();
+    let reader = ArrowReaderBuilder::try_new(f).unwrap().build();
+    let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
+
+    for batch in batches {
+        assert!(batch.columns()[0]
+            .as_primitive_opt::<TimestampNanosecondType>()
+            .unwrap()
+            .nulls()
+            .is_none());
+    }
+}
+
 #[test]
 #[ignore]
 // TODO: pending https://github.com/chronotope/chrono-tz/issues/155
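
The try_unary-then-unary_opt fallback used in timestamp.rs above generalizes to any
fallible element-wise conversion on a PrimitiveArray. The sketch below is not part of
this patch: the seconds_to_nanos helper and its overflow check are made-up examples,
assuming only the arrow crate; the two kernel calls mirror the ones added in the diff.

    use std::sync::Arc;

    use arrow::array::{Array, ArrayRef, Int64Array};
    use arrow::datatypes::TimestampNanosecondType;

    // Hypothetical helper: convert epoch seconds to nanoseconds, preferring a
    // non-nullable output. `try_unary` keeps the input's (absent) NullBuffer;
    // `unary_opt` is only used if some value overflows the nanosecond range.
    fn seconds_to_nanos(array: &Int64Array) -> ArrayRef {
        let convert = |secs: i64| secs.checked_mul(1_000_000_000);
        let converted = array
            // fast path: no NullBuffer is allocated when every value converts
            .try_unary::<_, TimestampNanosecondType, _>(|secs| convert(secs).ok_or(()))
            // slow path: rerun with unary_opt, turning failed conversions into nulls
            .unwrap_or_else(|()| array.unary_opt::<_, TimestampNanosecondType>(convert));
        Arc::new(converted) as ArrayRef
    }

    fn main() {
        let input = Int64Array::from(vec![0_i64, 1_716_558_866]);
        let output = seconds_to_nanos(&input);
        // Both values fit in nanoseconds, so the fast path succeeded and the
        // output carries no NullBuffer.
        assert!(output.nulls().is_none());
    }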