diff --git a/README.md b/README.md index 6d108b2d..c356e9dd 100644 --- a/README.md +++ b/README.md @@ -53,27 +53,27 @@ Versions will be released on an ad-hoc basis (with no fixed schedule). The following table lists how ORC data types are read into Arrow data types: -| ORC Data Type | Arrow Data Type | -| ----------------- | -------------------------- | -| Boolean | Boolean | -| TinyInt | Int8 | -| SmallInt | Int16 | -| Int | Int32 | -| BigInt | Int64 | -| Float | Float32 | -| Double | Float64 | -| String | Utf8 | -| Char | Utf8 | -| VarChar | Utf8 | -| Binary | Binary | -| Decimal | Decimal128 | -| Date | Date32 | -| Timestamp | Timestamp(Nanosecond, None) | -| Timestamp instant | Timestamp(Nanosecond, UTC) | -| Struct | Struct | -| List | List | -| Map | Map | -| Union | Union(_, Sparse)* | +| ORC Data Type | Arrow Data Type | Notes | +| ----------------- | -------------------------- | ----- | +| Boolean | Boolean | | +| TinyInt | Int8 | | +| SmallInt | Int16 | | +| Int | Int32 | | +| BigInt | Int64 | | +| Float | Float32 | | +| Double | Float64 | | +| String | Utf8 | | +| Char | Utf8 | | +| VarChar | Utf8 | | +| Binary | Binary | | +| Decimal | Decimal128 | | +| Date | Date32 | | +| Timestamp | Timestamp(Nanosecond, None) | Timestamps before 1677-09-21 or after 2261-04-12 return `OrcError` because they cannot be represented as an i64 of nanoseconds | +| Timestamp instant | Timestamp(Nanosecond, UTC) | Timestamps before 1677-09-21 or after 2261-04-12 return `OrcError` because they cannot be represented as an i64 of nanoseconds | +| Struct | Struct | | +| List | List | | +| Map | Map | | +| Union | Union(_, Sparse)* | | *Currently only supports a maximum of 127 variants diff --git a/src/error.rs b/src/error.rs index 328b1f00..b7ff01c9 100644 --- a/src/error.rs +++ b/src/error.rs @@ -50,6 +50,17 @@ pub enum OrcError { source: std::io::Error, }, + #[snafu(display( + "Overflow while decoding timestamp (seconds={}, nanoseconds={}) to nanoseconds", + seconds, + nanoseconds + ))] + DecodeTimestamp { + location: Location, + seconds: i64, + nanoseconds: u64, + }, + #[snafu(display("Failed to decode proto, source: {}", source))] DecodeProto { location: Location, diff --git a/src/reader/decode/timestamp.rs b/src/reader/decode/timestamp.rs index 1b6ec62e..fa69a7a9 100644 --- a/src/reader/decode/timestamp.rs +++ b/src/reader/decode/timestamp.rs @@ -1,4 +1,4 @@ -use crate::error::Result; +use crate::error::{DecodeTimestampSnafu, Result}; const NANOSECONDS_IN_SECOND: i64 = 1_000_000_000; @@ -58,6 +58,18 @@ fn decode_timestamp( }; // Convert into nanoseconds since epoch, which Arrow uses as native representation // of timestamps - let nanoseconds_since_epoch = seconds * NANOSECONDS_IN_SECOND + (nanoseconds as i64); + // The timestamp may overflow as ORC encodes them as a pair of (seconds, nanoseconds) + // while we encode them as a single i64 of nanoseconds in Arrow. + let nanoseconds_since_epoch = seconds + .checked_mul(NANOSECONDS_IN_SECOND) + .and_then(|seconds_in_ns| seconds_in_ns.checked_add(nanoseconds as i64)) + .ok_or(()) + .or_else(|()| { + DecodeTimestampSnafu { + seconds, + nanoseconds, + } + .fail() + })?; Ok(Some(nanoseconds_since_epoch)) } diff --git a/tests/basic/data/generate_orc_timestamps.py b/tests/basic/data/generate_orc_timestamps.py index 59848f2b..6b5acaf9 100644 --- a/tests/basic/data/generate_orc_timestamps.py +++ b/tests/basic/data/generate_orc_timestamps.py @@ -2,6 +2,7 @@ import pyarrow as pa from pyarrow import orc from pyarrow import parquet +import pyorc schema = pa.schema([ pa.field('timestamp_notz', pa.timestamp("ns")), @@ -17,8 +18,30 @@ dttm(2262, 4, 11, 11, 47, 16), dttm(2001, 4, 13, 2, 14, 0), dttm(2000, 1, 1, 23, 10, 10), - dttm(1900, 1, 1, 14, 25, 14) + dttm(1900, 1, 1, 14, 25, 14), ]) table = pa.Table.from_arrays([arr, arr], schema=schema) orc.write_table(table, "pyarrow_timestamps.orc") + +# pyarrow overflows when trying to write this, so we have to use pyorc instead +class TimestampConverter: + @staticmethod + def from_orc(obj, tz): + return obj + @staticmethod + def to_orc(obj, tz): + return obj +schema = pyorc.Struct( + id=pyorc.Int(), + timestamp=pyorc.Timestamp() +) +with open("overflowing_timestamps.orc", "wb") as f: + with pyorc.Writer( + f, + schema, + converters={pyorc.TypeKind.TIMESTAMP: TimestampConverter}, + ) as writer: + writer.write((1, (12345678, 0))) + writer.write((2, (-62135596800, 0))) + writer.write((3, (12345678, 0))) diff --git a/tests/basic/data/overflowing_timestamps.orc b/tests/basic/data/overflowing_timestamps.orc new file mode 100644 index 00000000..ca798f0a Binary files /dev/null and b/tests/basic/data/overflowing_timestamps.orc differ diff --git a/tests/basic/main.rs b/tests/basic/main.rs index 1860335d..e0620ccb 100644 --- a/tests/basic/main.rs +++ b/tests/basic/main.rs @@ -412,6 +412,13 @@ pub fn timestamps_test() { } } +#[test] +pub fn overflowing_timestamps_test() { + let path = basic_path("overflowing_timestamps.orc"); + let reader = new_arrow_reader_root(&path); + assert!(reader.collect::, _>>().is_err()); +} + // From https://github.com/apache/arrow-rs/blob/7705acad845e8b2a366a08640f7acb4033ed7049/arrow-flight/src/sql/metadata/mod.rs#L67-L75 pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) { let formatted = pretty::pretty_format_batches(batches).unwrap().to_string();