Skip to content

Commit

Permalink
Error instead of panic on timestamp overflow (#91)
Browse files Browse the repository at this point in the history
I would have preferred either returning a null value or trying to encode
as a non-nanosecond timestamp (sometimes possible if the ORC data does
not use nanosecond-level precision), but this would require more significant
changes to the code.
  • Loading branch information
progval authored May 24, 2024
1 parent 4931852 commit f223788
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 24 deletions.
42 changes: 21 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,27 +53,27 @@ Versions will be released on an ad-hoc basis (with no fixed schedule).

The following table lists how ORC data types are read into Arrow data types:

| ORC Data Type | Arrow Data Type |
| ----------------- | -------------------------- |
| Boolean | Boolean |
| TinyInt | Int8 |
| SmallInt | Int16 |
| Int | Int32 |
| BigInt | Int64 |
| Float | Float32 |
| Double | Float64 |
| String | Utf8 |
| Char | Utf8 |
| VarChar | Utf8 |
| Binary | Binary |
| Decimal | Decimal128 |
| Date | Date32 |
| Timestamp | Timestamp(Nanosecond, None) |
| Timestamp instant | Timestamp(Nanosecond, UTC) |
| Struct | Struct |
| List | List |
| Map | Map |
| Union | Union(_, Sparse)* |
| ORC Data Type | Arrow Data Type | Notes |
| ----------------- | -------------------------- | ----- |
| Boolean | Boolean | |
| TinyInt | Int8 | |
| SmallInt | Int16 | |
| Int | Int32 | |
| BigInt | Int64 | |
| Float | Float32 | |
| Double | Float64 | |
| String | Utf8 | |
| Char | Utf8 | |
| VarChar | Utf8 | |
| Binary | Binary | |
| Decimal | Decimal128 | |
| Date | Date32 | |
| Timestamp         | Timestamp(Nanosecond, None) | Timestamps before 1677-09-21 or after 2262-04-11 return `OrcError` because they cannot be represented as an i64 of nanoseconds |
| Timestamp instant | Timestamp(Nanosecond, UTC) | Timestamps before 1677-09-21 or after 2262-04-11 return `OrcError` because they cannot be represented as an i64 of nanoseconds |
| Struct | Struct | |
| List | List | |
| Map | Map | |
| Union | Union(_, Sparse)* | |

*Currently only supports a maximum of 127 variants

Expand Down
11 changes: 11 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@ pub enum OrcError {
source: std::io::Error,
},

#[snafu(display(
"Overflow while decoding timestamp (seconds={}, nanoseconds={}) to nanoseconds",
seconds,
nanoseconds
))]
DecodeTimestamp {
location: Location,
seconds: i64,
nanoseconds: u64,
},

#[snafu(display("Failed to decode proto, source: {}", source))]
DecodeProto {
location: Location,
Expand Down
16 changes: 14 additions & 2 deletions src/reader/decode/timestamp.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::error::Result;
use crate::error::{DecodeTimestampSnafu, Result};

const NANOSECONDS_IN_SECOND: i64 = 1_000_000_000;

Expand Down Expand Up @@ -58,6 +58,18 @@ fn decode_timestamp(
};
// Convert into nanoseconds since epoch, which Arrow uses as native representation
// of timestamps
let nanoseconds_since_epoch = seconds * NANOSECONDS_IN_SECOND + (nanoseconds as i64);
// The timestamp may overflow as ORC encodes them as a pair of (seconds, nanoseconds)
// while we encode them as a single i64 of nanoseconds in Arrow.
let nanoseconds_since_epoch = seconds
.checked_mul(NANOSECONDS_IN_SECOND)
.and_then(|seconds_in_ns| seconds_in_ns.checked_add(nanoseconds as i64))
.ok_or(())
.or_else(|()| {
DecodeTimestampSnafu {
seconds,
nanoseconds,
}
.fail()
})?;
Ok(Some(nanoseconds_since_epoch))
}
25 changes: 24 additions & 1 deletion tests/basic/data/generate_orc_timestamps.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pyarrow as pa
from pyarrow import orc
from pyarrow import parquet
import pyorc

schema = pa.schema([
pa.field('timestamp_notz', pa.timestamp("ns")),
Expand All @@ -17,8 +18,30 @@
dttm(2262, 4, 11, 11, 47, 16),
dttm(2001, 4, 13, 2, 14, 0),
dttm(2000, 1, 1, 23, 10, 10),
dttm(1900, 1, 1, 14, 25, 14)
dttm(1900, 1, 1, 14, 25, 14),
])
table = pa.Table.from_arrays([arr, arr], schema=schema)
orc.write_table(table, "pyarrow_timestamps.orc")


# pyarrow overflows when trying to write this, so we have to use pyorc instead
class TimestampConverter:
    """Identity converter for pyorc timestamps.

    Passes values through unchanged in both directions, so raw
    (seconds, nanoseconds) tuples can be written directly to the ORC file.
    """

    @staticmethod
    def from_orc(obj, tz):
        # Reading: return the stored value as-is, ignoring the timezone.
        return obj

    @staticmethod
    def to_orc(obj, tz):
        # Writing: the caller already supplies the raw ORC representation.
        return obj
schema = pyorc.Struct(
    id=pyorc.Int(),
    timestamp=pyorc.Timestamp()
)

# Rows are (id, (seconds, nanoseconds)) tuples; the second row's seconds
# value (year 0001) cannot be represented as an i64 of nanoseconds, which
# is what the reader-side overflow test relies on.
rows = [
    (1, (12345678, 0)),
    (2, (-62135596800, 0)),
    (3, (12345678, 0)),
]
with open("overflowing_timestamps.orc", "wb") as f:
    with pyorc.Writer(
        f,
        schema,
        converters={pyorc.TypeKind.TIMESTAMP: TimestampConverter},
    ) as writer:
        for row in rows:
            writer.write(row)
Binary file added tests/basic/data/overflowing_timestamps.orc
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/basic/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,13 @@ pub fn timestamps_test() {
}
}

#[test]
pub fn overflowing_timestamps_test() {
    // Reading a file containing timestamps that do not fit in an i64 of
    // nanoseconds must surface an error instead of panicking.
    let file = basic_path("overflowing_timestamps.orc");
    let batches: Result<Vec<_>, _> = new_arrow_reader_root(&file).collect();
    assert!(batches.is_err());
}

// From https://github.com/apache/arrow-rs/blob/7705acad845e8b2a366a08640f7acb4033ed7049/arrow-flight/src/sql/metadata/mod.rs#L67-L75
pub fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
let formatted = pretty::pretty_format_batches(batches).unwrap().to_string();
Expand Down

0 comments on commit f223788

Please sign in to comment.