Skip to content

Commit

Permalink
More mixed dict + non-dict column chunks tests
Browse files Browse the repository at this point in the history
  • Loading branch information
gaborcsardi committed Feb 8, 2025
1 parent 11da9d4 commit cfe713f
Show file tree
Hide file tree
Showing 5 changed files with 164 additions and 1 deletion.
40 changes: 40 additions & 0 deletions tests/testthat/_snaps/read-parquet-5.md
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,43 @@
11 DATA_PAGE 1024 RLE_DICTIONARY
12 DATA_PAGE 176 PLAIN

# mixing RLE_DICTIONARY and PLAIN, BYTE_ARRAY

Code
as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")])
Output
type repetition_type
1 <NA> REQUIRED
2 BYTE_ARRAY REQUIRED
3 BYTE_ARRAY OPTIONAL
Code
as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")])
Output
page_type num_values encoding
1 DICTIONARY_PAGE 400 PLAIN
2 DATA_PAGE 1024 RLE_DICTIONARY
3 DATA_PAGE 176 PLAIN
4 DICTIONARY_PAGE 400 PLAIN
5 DATA_PAGE 1024 RLE_DICTIONARY
6 DATA_PAGE 176 PLAIN

# mixing RLE_DICTIONARY and PLAIN, FLOAT16

Code
as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")])
Output
type repetition_type
1 <NA> REQUIRED
2 FIXED_LEN_BYTE_ARRAY REQUIRED
3 FIXED_LEN_BYTE_ARRAY OPTIONAL
Code
as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")])
Output
page_type num_values encoding
1 DICTIONARY_PAGE 400 PLAIN
2 DATA_PAGE 1024 RLE_DICTIONARY
3 DATA_PAGE 176 PLAIN
4 DICTIONARY_PAGE 401 PLAIN
5 DATA_PAGE 1024 RLE_DICTIONARY
6 DATA_PAGE 176 PLAIN

Binary file added tests/testthat/data/binary.parquet
Binary file not shown.
90 changes: 90 additions & 0 deletions tests/testthat/data/create-data.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,14 +129,104 @@ def do_decimal():
dictionary_pagesize_limit = 400
)

def do_binary():
    """Write ``binary.parquet`` with two binary (BYTE_ARRAY) columns.

    Column ``ba`` is non-nullable, column ``bam`` is nullable. Both hold the
    decimal strings ``'0'`` .. ``'399'`` repeated three times (1200 rows);
    in ``bam`` ten randomly drawn positions (repeats are possible) are then
    overwritten with ``None``. The random generator is seeded so the drawn
    positions are the same on every run.

    The file is written with small ``data_page_size`` and
    ``dictionary_pagesize_limit`` settings; the accompanying test snapshot
    shows the resulting column chunks mixing RLE_DICTIONARY and PLAIN pages.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    import random

    random.seed(10)

    n_distinct, n_repeats, n_missing = 400, 3, 10
    n_rows = n_distinct * n_repeats

    schema = pa.schema(fields=[
        pa.field(name='ba', type=pa.binary(), nullable=False),
        pa.field(name='bam', type=pa.binary()),
    ])

    required = [str(value) for value in range(n_distinct)] * n_repeats
    optional = list(required)
    # Knock out entries only in the nullable column.
    for _ in range(n_missing):
        optional[random.randint(0, n_rows - 1)] = None

    pq.write_table(
        pa.table(data=[required, optional], schema=schema),
        'tests/testthat/data/binary.parquet',
        data_page_size=400,
        dictionary_pagesize_limit=400,
    )

def do_uuid():
    """Write ``uuid.parquet`` with two UUID columns, reproducibly.

    Column ``ba`` is non-nullable, column ``bam`` is nullable. Each column
    holds 400 version-4 UUIDs (as 16-byte strings) repeated three times
    (1200 rows); in ``bam`` ten randomly drawn positions (repeats are
    possible) are overwritten with ``None``.

    The UUIDs are built from the seeded ``random`` generator instead of
    ``uuid.uuid4()``: ``uuid4()`` takes its bits from the operating system,
    so seeding ``random`` would not make its output repeatable and the
    generated file would differ on every run.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    import random
    import uuid

    random.seed(10)
    fields = [
        pa.field(name='ba', type=pa.uuid(), nullable=False),
        pa.field(name='bam', type=pa.uuid()),
    ]
    schema = pa.schema(fields=fields)

    # Draw the null positions immediately after seeding, before any other
    # random draws, so they are the first ten randint() results for seed 10
    # (the same draws do_binary() and do_float16() make).
    null_positions = [random.randint(0, 1200 - 1) for _ in range(10)]

    def seeded_uuids(count):
        # version=4 makes UUID() overwrite the version/variant bits, so the
        # result is a well-formed random UUID despite the arbitrary integer.
        return [
            uuid.UUID(int=random.getrandbits(128), version=4).bytes
            for _ in range(count)
        ]

    data = [
        seeded_uuids(400) * 3,
        seeded_uuids(400) * 3,
    ]
    for position in null_positions:
        data[1][position] = None

    table = pa.table(data=data, schema=schema)
    pq.write_table(
        table,
        'tests/testthat/data/uuid.parquet',
        version='2.6',
        data_page_size=400,
        dictionary_pagesize_limit=400,
    )

def do_float16():
    """Write ``float16.parquet`` with two half-precision float columns.

    Column ``dba`` is non-nullable, column ``dbam`` is nullable. Both start
    as the values 0 .. 399 repeated three times (1200 rows) stored as
    ``numpy.float16``. Ten randomly drawn positions (repeats are possible)
    of ``dbam`` are then assigned ``None``; each drawn position is printed
    to stdout. The random generator is seeded so the positions are the same
    on every run.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq
    import random
    import numpy as np

    random.seed(10)

    schema = pa.schema(fields=[
        pa.field(name='dba', type=pa.float16(), nullable=False),
        pa.field(name='dbam', type=pa.float16()),
    ])

    base_values = list(range(400)) * 3
    required = np.array(base_values, dtype=np.float16)
    optional = np.array(base_values, dtype=np.float16)

    for _ in range(10):
        position = random.randint(0, len(base_values) - 1)
        # Echo the index; presumably so it can be copied into the test
        # expectations that hard-code the missing positions.
        print(position)
        # NOTE(review): `optional` is a float16 NumPy array, so this None is
        # presumably stored as NaN rather than as a null -- confirm the
        # written file contains what the tests expect.
        optional[position] = None

    pq.write_table(
        pa.table(data=[required, optional], schema=schema),
        'tests/testthat/data/float16.parquet',
        data_page_size=400,
        dictionary_pagesize_limit=400,
    )

if __name__ == "__main__":
    # Usage: create-data.py [DATASET]
    # With no argument every data file is regenerated; with one argument
    # only the named data file is regenerated.
    import sys

    # Maps the command-line name to its generator. Insertion order is the
    # order used when everything is regenerated.
    generators = {
        'float': do_float,
        'mixed': do_mixed,
        'decimal': do_decimal,
        'binary': do_binary,
        'uuid': do_uuid,
        'float16': do_float16,
    }

    if len(sys.argv) == 1:
        for generate in generators.values():
            generate()
    elif sys.argv[1] in generators:
        generators[sys.argv[1]]()
    else:
        # Fail loudly instead of silently doing nothing on a mistyped name.
        sys.exit(
            "unknown data set {!r}; expected one of: {}".format(
                sys.argv[1], ", ".join(generators)
            )
        )
Binary file added tests/testthat/data/float16.parquet
Binary file not shown.
35 changes: 34 additions & 1 deletion tests/testthat/test-read-parquet-5.R
Original file line number Diff line number Diff line change
Expand Up @@ -188,4 +188,37 @@ test_that("mixing RLE_DICTIONARY and PLAIN, DECIMAL", {
expect_equal(t1[,2], t2[,2])
expect_equal(t1[,3], t2[,3])
expect_equal(t1[,4], t2[,4])
})
})

test_that("mixing RLE_DICTIONARY and PLAIN, BYTE_ARRAY", {
  skip_on_cran()
  pf <- test_path("data/binary.parquet")

  # Record the column types and the per-page encodings of the test file.
  # The code inside expect_snapshot() is echoed into the snapshot file, so
  # it must stay exactly as written.
  expect_snapshot({
    as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")])
    as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")])
  })

  # Both columns must match what arrow reads from the same file.
  # unclass() drops the class attribute of arrow's columns (presumably a
  # classed list for binary data) so that only the values are compared.
  ours <- as.data.frame(read_parquet(pf))
  reference <- as.data.frame(arrow::read_parquet(pf))
  expect_equal(ours[, 1], unclass(reference[, 1]))
  expect_equal(ours[, 2], unclass(reference[, 2]))
})

test_that("mixing RLE_DICTIONARY and PLAIN, FLOAT16", {
  skip_on_cran()
  pf <- test_path("data/float16.parquet")

  # Record the column types and the per-page encodings of the test file.
  # The code inside expect_snapshot() is echoed into the snapshot file, so
  # it must stay exactly as written.
  expect_snapshot({
    as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")])
    as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")])
  })

  t1 <- as.data.frame(read_parquet(pf))

  # The file is deliberately not read with arrow here: arrow is buggy for
  # this file, even the missingness pattern is wrong :( The expected values
  # are spelled out instead.
  expect_equal(t1[,1], rep(0:399, 3))

  # Expected missing positions; the `+ 1` presumably converts the 0-based
  # indices chosen by the data generator to R's 1-based indexing.
  expect_equal(
    which(is.na(t1[,2])),
    c(30, 66, 422, 568, 878, 947, 988, 1006, 1170, 1183) + 1
  )

  # Apart from the missing entries, the second column repeats 0..399.
  bs2 <- rep(0:399, 3)
  bs2[is.na(t1[,2])] <- NA
  expect_equal(t1[,2], bs2)
})

0 comments on commit cfe713f

Please sign in to comment.