Skip to content

Commit

Permalink
Error instead of panic on timestamp overflow (#91)
Browse files Browse the repository at this point in the history
I would have preferred either returning a null value or trying to encode
as a non-nanosecond timestamp (sometimes possible if the ORC data does
not use nanosecond-level precision), but this would require more significant
changes to the code.
  • Loading branch information
progval authored May 24, 2024
1 parent 4931852 commit f223788
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 24 deletions.
42 changes: 21 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,27 +53,27 @@ Versions will be released on an ad-hoc basis (with no fixed schedule).

The following table lists how ORC data types are read into Arrow data types:

| ORC Data Type | Arrow Data Type |
| ----------------- | -------------------------- |
| Boolean | Boolean |
| TinyInt | Int8 |
| SmallInt | Int16 |
| Int | Int32 |
| BigInt | Int64 |
| Float | Float32 |
| Double | Float64 |
| String | Utf8 |
| Char | Utf8 |
| VarChar | Utf8 |
| Binary | Binary |
| Decimal | Decimal128 |
| Date | Date32 |
| Timestamp | Timestamp(Nanosecond, None) |
| Timestamp instant | Timestamp(Nanosecond, UTC) |
| Struct | Struct |
| List | List |
| Map | Map |
| Union | Union(_, Sparse)* |
| ORC Data Type | Arrow Data Type | Notes |
| ----------------- | -------------------------- | ----- |
| Boolean | Boolean | |
| TinyInt | Int8 | |
| SmallInt | Int16 | |
| Int | Int32 | |
| BigInt | Int64 | |
| Float | Float32 | |
| Double | Float64 | |
| String | Utf8 | |
| Char | Utf8 | |
| VarChar | Utf8 | |
| Binary | Binary | |
| Decimal | Decimal128 | |
| Date | Date32 | |
| Timestamp         | Timestamp(Nanosecond, None) | Timestamps before 1677-09-21 or after 2262-04-11 return `OrcError` because they cannot be represented as an i64 of nanoseconds |
| Timestamp instant | Timestamp(Nanosecond, UTC) | Timestamps before 1677-09-21 or after 2262-04-11 return `OrcError` because they cannot be represented as an i64 of nanoseconds |
| Struct | Struct | |
| List | List | |
| Map | Map | |
| Union | Union(_, Sparse)* | |

*Currently only supports a maximum of 127 variants

Expand Down
11 changes: 11 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@ pub enum OrcError {
source: std::io::Error,
},

#[snafu(display(
"Overflow while decoding timestamp (seconds={}, nanoseconds={}) to nanoseconds",
seconds,
nanoseconds
))]
DecodeTimestamp {
location: Location,
seconds: i64,
nanoseconds: u64,
},

#[snafu(display("Failed to decode proto, source: {}", source))]
DecodeProto {
location: Location,
Expand Down
16 changes: 14 additions & 2 deletions src/reader/decode/timestamp.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::error::Result;
use crate::error::{DecodeTimestampSnafu, Result};

const NANOSECONDS_IN_SECOND: i64 = 1_000_000_000;

Expand Down Expand Up @@ -58,6 +58,18 @@ fn decode_timestamp(
};
// Convert into nanoseconds since epoch, which Arrow uses as native representation
// of timestamps
let nanoseconds_since_epoch = seconds * NANOSECONDS_IN_SECOND + (nanoseconds as i64);
// The timestamp may overflow as ORC encodes them as a pair of (seconds, nanoseconds)
// while we encode them as a single i64 of nanoseconds in Arrow.
let nanoseconds_since_epoch = seconds
.checked_mul(NANOSECONDS_IN_SECOND)
.and_then(|seconds_in_ns| seconds_in_ns.checked_add(nanoseconds as i64))
.ok_or(())
.or_else(|()| {
DecodeTimestampSnafu {
seconds,
nanoseconds,
}
.fail()
})?;
Ok(Some(nanoseconds_since_epoch))
}
25 changes: 24 additions & 1 deletion tests/basic/data/generate_orc_timestamps.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pyarrow as pa
from pyarrow import orc
from pyarrow import parquet
import pyorc

schema = pa.schema([
pa.field('timestamp_notz', pa.timestamp("ns")),
Expand All @@ -17,8 +18,30 @@
dttm(2262, 4, 11, 11, 47, 16),
dttm(2001, 4, 13, 2, 14, 0),
dttm(2000, 1, 1, 23, 10, 10),
dttm(1900, 1, 1, 14, 25, 14)
dttm(1900, 1, 1, 14, 25, 14),
])
table = pa.Table.from_arrays([arr, arr], schema=schema)
orc.write_table(table, "pyarrow_timestamps.orc")


# pyarrow overflows when trying to write this, so we have to use pyorc instead
class TimestampConverter:
    """Identity converter for pyorc timestamps.

    Passes values through unchanged in both directions, so raw
    (seconds, nanoseconds) tuples can be written directly to the ORC file.
    """

    @staticmethod
    def from_orc(obj, tz):
        # Reading: return the stored value as-is, ignoring the timezone.
        return obj

    @staticmethod
    def to_orc(obj, tz):
        # Writing: the caller already supplies the raw ORC representation.
        return obj
schema = pyorc.Struct(
    id=pyorc.Int(),
    timestamp=pyorc.Timestamp()
)

# Rows are (id, (seconds, nanoseconds)) tuples; the second row's seconds
# value (year 0001) cannot be represented as an i64 of nanoseconds, which
# is what the reader-side overflow test relies on.
rows = [
    (1, (12345678, 0)),
    (2, (-62135596800, 0)),
    (3, (12345678, 0)),
]
with open("overflowing_timestamps.orc", "wb") as f:
    with pyorc.Writer(
        f,
        schema,
        converters={pyorc.TypeKind.TIMESTAMP: TimestampConverter},
    ) as writer:
        for row in rows:
            writer.write(row)
Binary file added tests/basic/data/overflowing_timestamps.orc
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/basic/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,13 @@ pub fn timestamps_test() {
}
}

#[test]
pub fn overflowing_timestamps_test() {
    // Reading a file containing timestamps that do not fit in an i64 of
    // nanoseconds must surface an error instead of panicking.
    let file = basic_path("overflowing_timestamps.orc");
    let batches: Result<Vec<_>, _> = new_arrow_reader_root(&file).collect();
    assert!(batches.is_err());
}

// From https://github.com/apache/arrow-rs/blob/7705acad845e8b2a366a08640f7acb4033ed7049/arrow-flight/src/sql/metadata/mod.rs#L67-L75
pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
let formatted = pretty::pretty_format_batches(batches).unwrap().to_string();
Expand Down

0 comments on commit f223788

Please sign in to comment.