More mixed dict + non-dict column chunks tests

r-lib · Feb 8, 2025 · cfe713f
1 parent 11da9d4
commit cfe713f
Show file tree

Hide file tree

Showing 5 changed files with 164 additions and 1 deletion.
diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md
@@ -226,3 +226,43 @@
       11       DATA_PAGE       1024 RLE_DICTIONARY
       12       DATA_PAGE        176          PLAIN
 
+# mixing RLE_DICTIONARY and PLAIN, BYTE_ARRAY
+
+    Code
+      as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")])
+    Output
+              type repetition_type
+      1       <NA>        REQUIRED
+      2 BYTE_ARRAY        REQUIRED
+      3 BYTE_ARRAY        OPTIONAL
+    Code
+      as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")])
+    Output
+              page_type num_values       encoding
+      1 DICTIONARY_PAGE        400          PLAIN
+      2       DATA_PAGE       1024 RLE_DICTIONARY
+      3       DATA_PAGE        176          PLAIN
+      4 DICTIONARY_PAGE        400          PLAIN
+      5       DATA_PAGE       1024 RLE_DICTIONARY
+      6       DATA_PAGE        176          PLAIN
+
+# mixing RLE_DICTIONARY and PLAIN, FLOAT16
+
+    Code
+      as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")])
+    Output
+                        type repetition_type
+      1                 <NA>        REQUIRED
+      2 FIXED_LEN_BYTE_ARRAY        REQUIRED
+      3 FIXED_LEN_BYTE_ARRAY        OPTIONAL
+    Code
+      as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")])
+    Output
+              page_type num_values       encoding
+      1 DICTIONARY_PAGE        400          PLAIN
+      2       DATA_PAGE       1024 RLE_DICTIONARY
+      3       DATA_PAGE        176          PLAIN
+      4 DICTIONARY_PAGE        401          PLAIN
+      5       DATA_PAGE       1024 RLE_DICTIONARY
+      6       DATA_PAGE        176          PLAIN
+
diff --git a/tests/testthat/data/binary.parquet b/tests/testthat/data/binary.parquet
diff --git a/tests/testthat/data/create-data.py b/tests/testthat/data/create-data.py
@@ -129,14 +129,104 @@ def do_decimal():
     dictionary_pagesize_limit = 400
   )
 
+def do_binary():
+  import pyarrow as pa
+  import pyarrow.parquet as pq
+  import random
+  random.seed(10)
+  fields = [
+      pa.field(name = 'ba', type = pa.binary(), nullable = False),
+      pa.field(name = 'bam', type = pa.binary()),
+  ]
+  schema = pa.schema(fields = fields)
+  data = [
+    [ str(x) for x in range(400) ] * 3,
+    [ str(x) for x in range(400) ] * 3,
+  ]
+  for i in range(10):
+    data[1][random.randint(0, 1200-1)] = None
+
+  table = pa.table(data = data, schema = schema)
+  pq.write_table(
+    table,
+    'tests/testthat/data/binary.parquet',
+    data_page_size = 400,
+    dictionary_pagesize_limit = 400
+  )
+
+def do_uuid():
+  import pyarrow as pa
+  import pyarrow.parquet as pq
+  import random
+  import uuid
+  random.seed(10)
+  fields = [
+      pa.field(name = 'ba', type = pa.uuid(), nullable = False),
+      pa.field(name = 'bam', type = pa.uuid()),
+  ]
+  schema = pa.schema(fields = fields)
+  data = [
+    [ uuid.uuid4().bytes for x in range(400) ] * 3,
+    [ uuid.uuid4().bytes for x in range(400) ] * 3,
+  ]
+  for i in range(10):
+    data[1][random.randint(0, 1200-1)] = None
+
+  table = pa.table(data = data, schema = schema)
+  pq.write_table(
+    table,
+    'tests/testthat/data/uuid.parquet',
+    version='2.6',
+    data_page_size = 400,
+    dictionary_pagesize_limit = 400
+  )
+
+def do_float16():
+  import pyarrow as pa
+  import pyarrow.parquet as pq
+  import random
+  import numpy as np
+  random.seed(10)
+  fields = [
+      pa.field(name = 'dba', type = pa.float16(), nullable = False),
+      pa.field(name = 'dbam', type = pa.float16()),
+  ]
+  schema = pa.schema(fields = fields)
+  data = [
+    np.array(list(range(400)) * 3, dtype=np.float16),
+    np.array(list(range(400)) * 3, dtype=np.float16)
+  ]
+  for i in range(10):
+    p = random.randint(0, 1200-1)
+    print(p)
+    data[1][p] = None
+
+  table = pa.table(data = data, schema = schema)
+  pq.write_table(
+    table,
+    'tests/testthat/data/float16.parquet',
+    data_page_size = 400,
+    dictionary_pagesize_limit = 400
+  )
+
 if __name__ == "__main__":
   import sys
   if len(sys.argv) == 1:
     do_float()
     do_mixed()
+    do_decimal()
+    do_binary()
+    do_uuid()
+    do_float16()
   elif sys.argv[1] == 'float':
     do_float()
   elif sys.argv[1] == 'mixed':
     do_mixed()
   elif sys.argv[1] == 'decimal':
     do_decimal()
+  elif sys.argv[1] == 'binary':
+    do_binary()
+  elif sys.argv[1] == 'uuid':
+    do_uuid()
+  elif sys.argv[1] == 'float16':
+    do_float16()
diff --git a/tests/testthat/data/float16.parquet b/tests/testthat/data/float16.parquet
diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R
@@ -188,4 +188,37 @@ test_that("mixing RLE_DICTIONARY and PLAIN, DECIMAL", {
   expect_equal(t1[,2], t2[,2])
   expect_equal(t1[,3], t2[,3])
   expect_equal(t1[,4], t2[,4])
-})
+})
+
+test_that("mixing RLE_DICTIONARY and PLAIN, BYTE_ARRAY", {
+  skip_on_cran()
+  pf <- test_path("data/binary.parquet")
+  expect_snapshot({
+    as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")])
+    as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")])
+  })
+  t1 <- as.data.frame(read_parquet(pf))
+  t2 <- as.data.frame(arrow::read_parquet(pf))
+  expect_equal(t1[,1], unclass(t2[,1]))
+  expect_equal(t1[,2], unclass(t2[,2]))
+})
+
+test_that("mixing RLE_DICTIONARY and PLAIN, FLOAT16", {
+  skip_on_cran()
+  pf <- test_path("data/float16.parquet")
+  expect_snapshot({
+    as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")])
+    as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")])
+  })
+  t1 <- as.data.frame(read_parquet(pf))
+  t2 <- as.data.frame(arrow::read_parquet(pf))
+  # arrow is buggy, even the missingness pattern is wrong :(
+  expect_equal(t1[,1], rep(0:399, 3))
+  expect_equal(
+    which(is.na(t1[,2])),
+    c(30, 66, 422, 568, 878, 947, 988, 1006, 1170, 1183) + 1
+  )
+  bs2 <- rep(0:399, 3)
+  bs2[is.na(t1[,2])] <- NA
+  expect_equal(t1[,2], bs2)
+})