Tests for temporal types in arrow metadata

When writing.
r-lib · Feb 17, 2025 · 4f04cae · 4f04cae
1 parent c06e892
commit 4f04cae
Show file tree

Hide file tree

Showing 3 changed files with 187 additions and 0 deletions.
diff --git a/R/arrow-schema.R b/R/arrow-schema.R
@@ -1,3 +1,12 @@
+read_arrow_schema <- function(file) {
+  mtd <- read_parquet_metadata(file)
+  kvm <- mtd[["file_meta_data"]][["key_value_metadata"]][[1]]
+  if ("ARROW:schema" %in% kvm[["key"]]) {
+    as <- kvm[["value"]][match("ARROW:schema", kvm[["key"]])]
+    parse_arrow_schema(as)
+  }
+}
+
 apply_arrow_schema <- function(tab, file, arrow_schema, dicts, types,
                                col_select) {
   if (is.na(arrow_schema)) {

diff --git a/tests/testthat/_snaps/arrow-schema.md b/tests/testthat/_snaps/arrow-schema.md
@@ -22,3 +22,118 @@
       Error in `base64_decode()`:
       ! Base64 decoding error at position 6
 
+# temporal types
+
+    Code
+      np[["columns"]]
+    Output
+      # A data frame: 1 x 6
+        name  type_type type             nullable dictionary custom_metadata 
+        <chr> <chr>     <I<list>>        <lgl>    <I<list>>  <I<list>>       
+      1 x     Date      <named list [1]> TRUE     <NULL>     <named list [2]>
+    Code
+      np[["columns"]][["type"]]
+    Output
+      [[1]]
+      [[1]]$date_unit
+      [1] "DAY"
+      
+      
+
+---
+
+    Code
+      np[["columns"]]
+    Output
+      # A data frame: 1 x 6
+        name  type_type type             nullable dictionary custom_metadata 
+        <chr> <chr>     <I<list>>        <lgl>    <I<list>>  <I<list>>       
+      1 x     Time      <named list [2]> TRUE     <NULL>     <named list [2]>
+    Code
+      np[["columns"]][["type"]]
+    Output
+      [[1]]
+      [[1]]$time_unit
+      [1] "SECOND"
+      
+      [[1]]$bit_width
+      [1] 32
+      
+      
+
+---
+
+    Code
+      np[["columns"]]
+    Output
+      # A data frame: 1 x 6
+        name  type_type type             nullable dictionary custom_metadata 
+        <chr> <chr>     <I<list>>        <lgl>    <I<list>>  <I<list>>       
+      1 x     Time      <named list [2]> TRUE     <NULL>     <named list [2]>
+    Code
+      np[["columns"]][["type"]]
+    Output
+      [[1]]
+      [[1]]$time_unit
+      [1] "SECOND"
+      
+      [[1]]$bit_width
+      [1] 32
+      
+      
+
+---
+
+    Code
+      np[["columns"]]
+    Output
+      # A data frame: 1 x 6
+        name  type_type type             nullable dictionary custom_metadata 
+        <chr> <chr>     <I<list>>        <lgl>    <I<list>>  <I<list>>       
+      1 x     Duration  <named list [1]> TRUE     <NULL>     <named list [2]>
+    Code
+      np[["columns"]][["type"]]
+    Output
+      [[1]]
+      [[1]]$unit
+      [1] "NANOSECOND"
+      
+      
+
+---
+
+    Code
+      np[["columns"]]
+    Output
+      # A data frame: 1 x 6
+        name  type_type type             nullable dictionary custom_metadata 
+        <chr> <chr>     <I<list>>        <lgl>    <I<list>>  <I<list>>       
+      1 x     Timestamp <named list [2]> TRUE     <NULL>     <named list [2]>
+    Code
+      np[["columns"]][["type"]]
+    Output
+      [[1]]
+      [[1]]$unit
+      [1] "MICROSECOND"
+      
+      [[1]]$timezone
+      [1] "UTC"
+      
+      
+
+---
+
+    Code
+      np[["columns"]]
+    Output
+      # A data frame: 1 x 6
+        name  type_type type      nullable dictionary       custom_metadata 
+        <chr> <chr>     <I<list>> <lgl>    <I<list>>        <I<list>>       
+      1 x     Utf8      <NULL>    TRUE     <named list [4]> <named list [2]>
+    Code
+      np[["columns"]][["type"]]
+    Output
+      [[1]]
+      NULL
+      
+
diff --git a/tests/testthat/test-arrow-schema.R b/tests/testthat/test-arrow-schema.R
@@ -43,3 +43,66 @@ test_that("factor_bits", {
   mockery::stub(factor_bits, "length", 2147483648)
   expect_equal(factor_bits(fct), 64L)
 })
+
+test_that("temporal types", {
+  tmp <- tempfile(fileext = ".parquet")
+  on.exit(unlink(tmp), add = TRUE)
+  withr::local_options(nanoparquet.write_arrow_metadata = TRUE)
+
+  # Date
+  df_date <- data.frame(x = Sys.Date())
+  write_parquet(df_date, tmp)
+  np <- read_arrow_schema(tmp)
+  expect_snapshot({
+    np[["columns"]]
+    np[["columns"]][["type"]]
+  })
+
+  # hms, integer
+  # it is unclear if this ever comes up in practice
+  df_hmsi <- data.frame(
+    x = structure(0L, units = "secs", class = c("hms", "difftime"))
+  )
+  write_parquet(df_hmsi, tmp)
+  np <- read_arrow_schema(tmp)
+  expect_snapshot({
+    np[["columns"]]
+    np[["columns"]][["type"]]
+  })
+
+  # hms, double
+  df_hmsd <- data.frame(x = hms::hms(0))
+  write_parquet(df_hmsd, tmp)
+  np <- read_arrow_schema(tmp)
+  expect_snapshot({
+    np[["columns"]]
+    np[["columns"]][["type"]]
+  })
+
+  # difftime
+  df_difftime <- data.frame(x = as.difftime(1, units = "secs"))
+  write_parquet(df_difftime, tmp)
+  np <- read_arrow_schema(tmp)
+  expect_snapshot({
+    np[["columns"]]
+    np[["columns"]][["type"]]
+  })
+
+  # POSIXct
+  df_posixct <- data.frame(x = Sys.time())
+  write_parquet(df_posixct, tmp)
+  np <- read_arrow_schema(tmp)
+  expect_snapshot({
+    np[["columns"]]
+    np[["columns"]][["type"]]
+  })
+
+  # factor
+  df_factor <- data.frame(x = as.factor(c("a", "a")))
+  write_parquet(df_factor, tmp)
+  np <- read_arrow_schema(tmp)
+  expect_snapshot({
+    np[["columns"]]
+    np[["columns"]][["type"]]
+  })
+})