Skip to content

Commit

Permalink
Tests for temporal types in arrow metadata
Browse files Browse the repository at this point in the history
When writing.
  • Loading branch information
gaborcsardi committed Feb 17, 2025
1 parent c06e892 commit 4f04cae
Show file tree
Hide file tree
Showing 3 changed files with 187 additions and 0 deletions.
9 changes: 9 additions & 0 deletions R/arrow-schema.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
read_arrow_schema <- function(file) {
mtd <- read_parquet_metadata(file)
kvm <- mtd[["file_meta_data"]][["key_value_metadata"]][[1]]
if ("ARROW:schema" %in% kvm[["key"]]) {
as <- kvm[["value"]][match("ARROW:schema", kvm[["key"]])]
parse_arrow_schema(as)
}
}

apply_arrow_schema <- function(tab, file, arrow_schema, dicts, types,
col_select) {
if (is.na(arrow_schema)) {
Expand Down
115 changes: 115 additions & 0 deletions tests/testthat/_snaps/arrow-schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,118 @@
Error in `base64_decode()`:
! Base64 decoding error at position 6

# temporal types

Code
np[["columns"]]
Output
# A data frame: 1 x 6
name type_type type nullable dictionary custom_metadata
<chr> <chr> <I<list>> <lgl> <I<list>> <I<list>>
1 x Date <named list [1]> TRUE <NULL> <named list [2]>
Code
np[["columns"]][["type"]]
Output
[[1]]
[[1]]$date_unit
[1] "DAY"

---

Code
np[["columns"]]
Output
# A data frame: 1 x 6
name type_type type nullable dictionary custom_metadata
<chr> <chr> <I<list>> <lgl> <I<list>> <I<list>>
1 x Time <named list [2]> TRUE <NULL> <named list [2]>
Code
np[["columns"]][["type"]]
Output
[[1]]
[[1]]$time_unit
[1] "SECOND"
[[1]]$bit_width
[1] 32

---

Code
np[["columns"]]
Output
# A data frame: 1 x 6
name type_type type nullable dictionary custom_metadata
<chr> <chr> <I<list>> <lgl> <I<list>> <I<list>>
1 x Time <named list [2]> TRUE <NULL> <named list [2]>
Code
np[["columns"]][["type"]]
Output
[[1]]
[[1]]$time_unit
[1] "SECOND"
[[1]]$bit_width
[1] 32

---

Code
np[["columns"]]
Output
# A data frame: 1 x 6
name type_type type nullable dictionary custom_metadata
<chr> <chr> <I<list>> <lgl> <I<list>> <I<list>>
1 x Duration <named list [1]> TRUE <NULL> <named list [2]>
Code
np[["columns"]][["type"]]
Output
[[1]]
[[1]]$unit
[1] "NANOSECOND"

---

Code
np[["columns"]]
Output
# A data frame: 1 x 6
name type_type type nullable dictionary custom_metadata
<chr> <chr> <I<list>> <lgl> <I<list>> <I<list>>
1 x Timestamp <named list [2]> TRUE <NULL> <named list [2]>
Code
np[["columns"]][["type"]]
Output
[[1]]
[[1]]$unit
[1] "MICROSECOND"
[[1]]$timezone
[1] "UTC"

---

Code
np[["columns"]]
Output
# A data frame: 1 x 6
name type_type type nullable dictionary custom_metadata
<chr> <chr> <I<list>> <lgl> <I<list>> <I<list>>
1 x Utf8 <NULL> TRUE <named list [4]> <named list [2]>
Code
np[["columns"]][["type"]]
Output
[[1]]
NULL

63 changes: 63 additions & 0 deletions tests/testthat/test-arrow-schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,66 @@ test_that("factor_bits", {
mockery::stub(factor_bits, "length", 2147483648)
expect_equal(factor_bits(fct), 64L)
})

test_that("temporal types", {
tmp <- tempfile(fileext = ".parquet")
on.exit(unlink(tmp), add = TRUE)
withr::local_options(nanoparquet.write_arrow_metadata = TRUE)

# Date
df_date <- data.frame(x = Sys.Date())
write_parquet(df_date, tmp)
np <- read_arrow_schema(tmp)
expect_snapshot({
np[["columns"]]
np[["columns"]][["type"]]
})

# hms, integer
# it is unclear if this ever comes up in practice
df_hmsi <- data.frame(
x = structure(0L, units = "secs", class = c("hms", "difftime"))
)
write_parquet(df_hmsi, tmp)
np <- read_arrow_schema(tmp)
expect_snapshot({
np[["columns"]]
np[["columns"]][["type"]]
})

# hms, double
df_hmsd <- data.frame(x = hms::hms(0))
write_parquet(df_hmsd, tmp)
np <- read_arrow_schema(tmp)
expect_snapshot({
np[["columns"]]
np[["columns"]][["type"]]
})

# difftime
df_difftime <- data.frame(x = as.difftime(1, units = "secs"))
write_parquet(df_difftime, tmp)
np <- read_arrow_schema(tmp)
expect_snapshot({
np[["columns"]]
np[["columns"]][["type"]]
})

# POSIXct
df_posixct <- data.frame(x = Sys.time())
write_parquet(df_posixct, tmp)
np <- read_arrow_schema(tmp)
expect_snapshot({
np[["columns"]]
np[["columns"]][["type"]]
})

# factor
df_factor <- data.frame(x = as.factor(c("a", "a")))
write_parquet(df_factor, tmp)
np <- read_arrow_schema(tmp)
expect_snapshot({
np[["columns"]]
np[["columns"]][["type"]]
})
})

0 comments on commit 4f04cae

Please sign in to comment.