Skip to content

Commit

Permalink
Update Spark test data and add PyArrow timestamp data generator
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey committed Mar 26, 2024
1 parent 24bd57e commit 26d73bd
Show file tree
Hide file tree
Showing 10 changed files with 49 additions and 31 deletions.
Binary file modified tests/basic/data/alltypes.lz4.orc
Binary file not shown.
Binary file modified tests/basic/data/alltypes.lzo.orc
Binary file not shown.
Binary file modified tests/basic/data/alltypes.none.orc
Binary file not shown.
Binary file modified tests/basic/data/alltypes.snappy.orc
Binary file not shown.
Binary file modified tests/basic/data/alltypes.zlib.orc
Binary file not shown.
Binary file modified tests/basic/data/alltypes.zstd.orc
Binary file not shown.
37 changes: 21 additions & 16 deletions tests/basic/data/generate_orc.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,44 @@
import shutil
import glob
from datetime import date
from decimal import Decimal
from datetime import date as dt
from decimal import Decimal as Dec
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# We're using Spark because it supports lzo compression writing
# (PyArrow supports all except lzo writing)

spark = SparkSession.builder.getOrCreate()

# TODO: int8, char, varchar, decimal, timestamp, struct, list, map, union
# TODO: how to do char and varchar?
# TODO: struct, list, map, union
df = spark.createDataFrame(
[ # bool, int16, int32, int64, float32, float64, binary, utf8, date32, decimal
( None, None, None, None, None, None, None, None, None, None),
( True, 0, 0, 0, 0.0, 0.0, "".encode(), "", date(1970, 1, 1), Decimal(0)),
(False, 1, 1, 1, 1.0, 1.0, "a".encode(), "a", date(1970, 1, 2), Decimal(1)),
(False, -1, -1, -1, -1.0, -1.0, " ".encode(), " ", date(1969, 12, 31), Decimal(-1)),
( True, (1 << 15) - 1, (1 << 31) - 1, (1 << 63) - 1, float("inf"), float("inf"), "encode".encode(), "encode", date(9999, 12, 31), Decimal(123456789.12345)),
( True, -(1 << 15), -(1 << 31), -(1 << 63), float("-inf"), float("-inf"), "decode".encode(), "decode", date(1582, 10, 15), Decimal(-999999999.99999)),
( True, 50, 50, 50, 3.1415927, 3.14159265359, "大熊和奏".encode(), "大熊和奏", date(1582, 10, 16), Decimal(-31256.123)),
( True, 51, 51, 51, -3.1415927, -3.14159265359, "斉藤朱夏".encode(), "斉藤朱夏", date(2000, 1, 1), Decimal(1241000)),
( True, 52, 52, 52, 1.1, 1.1, "鈴原希実".encode(), "鈴原希実", date(3000, 12, 31), Decimal(1.1)),
(False, 53, 53, 53, -1.1, -1.1, "🤔".encode(), "🤔", date(1900, 1, 1), Decimal(0.99999)),
( None, None, None, None, None, None, None, None, None, None),
[ # bool, int8, int16, int32, int64, float32, float64, decimal, binary, utf8, date32
( None, None, None, None, None, None, None, None, None, None, None),
( True, 0, 0, 0, 0, 0.0, 0.0, Dec(0), "".encode(), "", dt(1970, 1, 1)),
(False, 1, 1, 1, 1, 1.0, 1.0, Dec(1), "a".encode(), "a", dt(1970, 1, 2)),
(False, -1, -1, -1, -1, -1.0, -1.0, Dec(-1), " ".encode(), " ", dt(1969, 12, 31)),
( True, 127, (1 << 15) - 1, (1 << 31) - 1, (1 << 63) - 1, float("inf"), float("inf"), Dec(123456789.12345), "encode".encode(), "encode", dt(9999, 12, 31)),
( True, -128, -(1 << 15), -(1 << 31), -(1 << 63), float("-inf"), float("-inf"), Dec(-999999999.99999), "decode".encode(), "decode", dt(1582, 10, 15)),
( True, 50, 50, 50, 50, 3.1415927, 3.14159265359, Dec(-31256.123), "大熊和奏".encode(), "大熊和奏", dt(1582, 10, 16)),
( True, 51, 51, 51, 51, -3.1415927, -3.14159265359, Dec(1241000), "斉藤朱夏".encode(), "斉藤朱夏", dt(2000, 1, 1)),
( True, 52, 52, 52, 52, 1.1, 1.1, Dec(1.1), "鈴原希実".encode(), "鈴原希実", dt(3000, 12, 31)),
(False, 53, 53, 53, 53, -1.1, -1.1, Dec(0.99999), "🤔".encode(), "🤔", dt(1900, 1, 1)),
( None, None, None, None, None, None, None, None, None, None, None),
],
StructType(
[
StructField("boolean", BooleanType()),
StructField( "int8", ByteType()),
StructField( "int16", ShortType()),
StructField( "int32", IntegerType()),
StructField( "int64", LongType()),
StructField("float32", FloatType()),
StructField("float64", DoubleType()),
StructField("decimal", DecimalType(15, 5)),
StructField( "binary", BinaryType()),
StructField( "utf8", StringType()),
StructField( "date32", DateType()),
StructField("decimal", DecimalType(15, 5)),
]
),
).coalesce(1)
Expand Down
13 changes: 13 additions & 0 deletions tests/basic/data/generate_orc_timestamps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from datetime import datetime as dttm
import pyarrow as pa
from pyarrow import orc

# Generates "pyarrow_timestamps.orc": a two-column ORC file of nanosecond
# timestamps, one column without a timezone and one tagged as UTC.

# Both columns carry the same sample values: a null, the Unix epoch, values
# on either side of the epoch, a far-future value (presumably picked to sit
# near the upper end of the nanosecond timestamp range -- confirm), and a few
# ordinary dates.
sample_values = [
    None,
    dttm(1970, 1, 1),
    dttm(1970, 1, 2, 23, 59, 59),
    dttm(1969, 12, 31, 23, 59, 59),
    dttm(2262, 4, 11, 11, 47, 16),
    dttm(2001, 4, 13),
    dttm(2000, 1, 1, 23, 10, 10),
    dttm(1900, 1, 1),
]

# The arrays are built without an explicit type; the column types are the
# ones declared in the schema handed to Table.from_arrays below.
naive_column = pa.array(sample_values)
utc_column = pa.array(sample_values)

naive_field = pa.field('timestamp1', pa.timestamp("ns"))
utc_field = pa.field('timestamp2', pa.timestamp("ns", tz="UTC"))
timestamp_schema = pa.schema([naive_field, utc_field])

timestamp_table = pa.Table.from_arrays(
    [naive_column, utc_column],
    schema=timestamp_schema,
)

# Written relative to the current working directory.
orc.write_table(timestamp_table, "pyarrow_timestamps.orc")

Binary file added tests/basic/data/pyarrow_timestamps.orc
Binary file not shown.
30 changes: 15 additions & 15 deletions tests/basic/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -349,21 +349,21 @@ pub fn alltypes_test() {
let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();

let expected = [
"+---------+--------+-------------+----------------------+------------+----------------+--------------------------+----------+------------+------------------+",
"| boolean | int16 | int32 | int64 | float32 | float64 | binary | utf8 | date32 | decimal |",
"+---------+--------+-------------+----------------------+------------+----------------+--------------------------+----------+------------+------------------+",
"| | | | | | | | | | |",
"| true | 0 | 0 | 0 | 0.0 | 0.0 | | | 1970-01-01 | 0.00000 |",
"| false | 1 | 1 | 1 | 1.0 | 1.0 | 61 | a | 1970-01-02 | 1.00000 |",
"| false | -1 | -1 | -1 | -1.0 | -1.0 | 20 | | 1969-12-31 | -1.00000 |",
"| true | 32767 | 2147483647 | 9223372036854775807 | inf | inf | 656e636f6465 | encode | 9999-12-31 | 123456789.12345 |",
"| true | -32768 | -2147483648 | -9223372036854775808 | -inf | -inf | 6465636f6465 | decode | 1582-10-15 | -999999999.99999 |",
"| true | 50 | 50 | 50 | 3.1415927 | 3.14159265359 | e5a4a7e7868ae5928ce5a58f | 大熊和奏 | 1582-10-16 | -31256.12300 |",
"| true | 51 | 51 | 51 | -3.1415927 | -3.14159265359 | e69689e897a4e69cb1e5a48f | 斉藤朱夏 | 2000-01-01 | 1241000.00000 |",
"| true | 52 | 52 | 52 | 1.1 | 1.1 | e988b4e58e9fe5b88ce5ae9f | 鈴原希実 | 3000-12-31 | 1.10000 |",
"| false | 53 | 53 | 53 | -1.1 | -1.1 | f09fa494 | 🤔 | 1900-01-01 | 0.99999 |",
"| | | | | | | | | | |",
"+---------+--------+-------------+----------------------+------------+----------------+--------------------------+----------+------------+------------------+"
"+---------+------+--------+-------------+----------------------+------------+----------------+------------------+--------------------------+----------+------------+",
"| boolean | int8 | int16 | int32 | int64 | float32 | float64 | decimal | binary | utf8 | date32 |",
"+---------+------+--------+-------------+----------------------+------------+----------------+------------------+--------------------------+----------+------------+",
"| | | | | | | | | | | |",
"| true | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.00000 | | | 1970-01-01 |",
"| false | 1 | 1 | 1 | 1 | 1.0 | 1.0 | 1.00000 | 61 | a | 1970-01-02 |",
"| false | -1 | -1 | -1 | -1 | -1.0 | -1.0 | -1.00000 | 20 | | 1969-12-31 |",
"| true | 127 | 32767 | 2147483647 | 9223372036854775807 | inf | inf | 123456789.12345 | 656e636f6465 | encode | 9999-12-31 |",
"| true | -128 | -32768 | -2147483648 | -9223372036854775808 | -inf | -inf | -999999999.99999 | 6465636f6465 | decode | 1582-10-15 |",
"| true | 50 | 50 | 50 | 50 | 3.1415927 | 3.14159265359 | -31256.12300 | e5a4a7e7868ae5928ce5a58f | 大熊和奏 | 1582-10-16 |",
"| true | 51 | 51 | 51 | 51 | -3.1415927 | -3.14159265359 | 1241000.00000 | e69689e897a4e69cb1e5a48f | 斉藤朱夏 | 2000-01-01 |",
"| true | 52 | 52 | 52 | 52 | 1.1 | 1.1 | 1.10000 | e988b4e58e9fe5b88ce5ae9f | 鈴原希実 | 3000-12-31 |",
"| false | 53 | 53 | 53 | 53 | -1.1 | -1.1 | 0.99999 | f09fa494 | 🤔 | 1900-01-01 |",
"| | | | | | | | | | | |",
"+---------+------+--------+-------------+----------------------+------------+----------------+------------------+--------------------------+----------+------------+",
];
assert_batches_eq(&batches, &expected);
}
Expand Down

0 comments on commit 26d73bd

Please sign in to comment.