From 7c2b7254601d1bd00c7945814d2aae764e26782c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Thu, 6 Feb 2025 08:09:30 +0100 Subject: [PATCH 01/18] Tests for mixed encodings in a column chunk --- tests/testthat/_snaps/read-parquet-5.md | 16 ++++++++++++++++ tests/testthat/data/mixed-int32.parquet | Bin 0 -> 9802 bytes tests/testthat/test-read-parquet-5.R | 14 ++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 tests/testthat/data/mixed-int32.parquet diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md index 2dd4726..84a3aae 100644 --- a/tests/testthat/_snaps/read-parquet-5.md +++ b/tests/testthat/_snaps/read-parquet-5.md @@ -52,3 +52,19 @@ Error in `read_parquet()`: ! Columns cyl, disp selected multiple times in `read_parquet()`. +# mixing RLE_DICTIONARY and PLAIN + + Code + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + Output + type repetition_type + 1 REQUIRED + 2 INT32 OPTIONAL + Code + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + Output + page_type num_values encoding + 1 DICTIONARY_PAGE 1024 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 976 PLAIN + diff --git a/tests/testthat/data/mixed-int32.parquet b/tests/testthat/data/mixed-int32.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5d9933685ff3c4a79f00473a7965f723471425c0 GIT binary patch literal 9802 zcmYk?cU;YH|G@F0g%CO^LTI6dB7{mMgix}z2Q9ShP|03NcG5y+Wv_N23N3{8UT7iQ zufE^=_xRn{kESQ%>@2Z0-=FILjDS^zr}?}!svmX z5P>Mf&^;)DUg(WJkVIefLw`tN09a3G5Tqf136n3;(1!sGF$+d8h6!fF6mu{a z^I(SgSb&8v$097o5?H_zOJRj&SdJA~iB(vQHCPL4*uWNcu!jR2;RI*6U>(-O6>e~c z2RyL>8?gyq@P-e3;fKxGg00ww?bv~x@W(Ff#vTM95JA|BVC=(w9Kb;w!eJc2Q5?f@ zoInUpA{3_(hSNBMvp9$IxPXfY$0bDIG9nR$Xv82ES8x^A5Qlgq;5u&LCT`(25^)E2 zaS!+L01xp9kMRUg@eD~wMha4qhIC{g6IpnU7kG(myuxebAQx}&7VnUUd=#J%MR<=7 z_=sYB!e@Lz2})6ha#WxaRj5V{YEg%JG@ubpXhsXZ;v2rB6>Vro2RiWsKk*A)_>DjK zi|)-Yj2`F-5r{$z;*dZu^hO^@qA&WPKcp}K12G8F5I_clAqzPS!BEI!7=~j66fhE_ zFdB+bf-+Q~iZK|AaZrOgG@uDBjK>5_gf?_A36r6VDVU0B(8F}hz)a}F0EU){GFxWfaU*no}L1TT2Q2fpybW^BP$Y{Pc!z)tvM7j|P00uYEG>_ss4VLuMwAP(U$ zj^HSc;W$np1Sb)SQwYOpoWWU~!+Bi5MTFxLB5)azh(a`C5Q{6giff2NJQ8pnH*gcT za2tuZgS)tg`*?tdc!bAzf~R3SD#|L~wF+SllzMur9C__0aP>Cv3qXxC8Lp>VMh$b|n1z+(E-_eRTw4(!^_<^7J zg)aQYAN&=r{n7eMT(}3@A3Y%gQHVht66l5A=mSagML+b16b4`*20#j|Er=b1cGQEP(|qu@qKVhUHj+l~{$a7e){Cga||-260HB7kZ-)B+(cB&>vD5fPokUX$T;L z!H|Uy`u?xGg z2LT8~5cVP%`>-Dea1e)Z7)Njv$8a1c5Q38k#VLg0G|u2G&fz>R;3C3t2@$xANJJqT zF^I(#T*WoSAsz|1jvKg%Teyux+`(Pk!+ku!Lp;J`Ji${uLlTmaf>fj-9T~_(7M|k; zULqT>@ESSD#T&fEJLDlB1t>%j-s1y4q8Oj>8DCI>Qk0<_6{tiNs!@Yl)S(^?Xhaj5 z(Son|hVN)a8`{x9?_5{L=Jr)OVb9K8;wktwy>~C7c8^ClrFQ%t2BMEb->6BS^E&13_0h>OBq94 z6H7COdZvxcl=sTB$sFcaaVc~7w${?j5&j~hvJ?WOZL>xOD@J6EI;dTiHTtO0sOO3y z7Pilo!t5iSE1&Z!d#(~5FzSVBWQgsHF|m;mFUH0tmc1BvBW=`6wZuHzm+JQ`B3^1d zZY_JMnItkgTPsc4E_-~I;^piK+1lmV6LXD5ztYaLuzRIbWPkb9q++k~SCdNuM!(jr z2(f!Tr6%(7>!}Ti<*%o;q>aweYt6IEnci7(IcG*!YkAI0AyLI#eG!3uu7QM7WUis4 zPDSo4DPzSqM$(q{Z;WLfBHx(Edsn=ftq`dA)>QGN{o6UJQIT)ws@o2NYWE~)I zEV2n!iY~G}s8dyBchp$G;7VHY)nVy0|-4 zAJ*SUSN`alnD6+}?S5tSNB75VRUbW)L{*AC(*#b%8?uyQiZ^EKR2Of`HCFlLm1pVn z$-BrQ=95pccl9UVl0cQueibL3K5wpxiut^y;ZF7Etu5&)U$(X8JAK){v$Hbh%Z{$L z>MuKm#8gZCMP!^yc1bA5mh6_CR8z7?%0#s^KzgZjX`rlQY-y0ZPfh7wg&@_kV8u}9 zvVE%2v1R+!?$(qY(8^FPKd4>cTz*KmDz^NvUVBaX5q+^S6-SL^Tq=&4DqpEMZZ@g5 z;)J=$n92}~r7o2xmpNXk3|-|@TY1VlXiQa@eW**-Y3JxGRcBo9)>fVM%otOB&a1$s z`n+G&mFf%I+H0#X`iqUN2@jB2S92*?`D#tX!AW&Bmyeo^t&I#>x~?`V%<*b%^f{lp z+L-X5v30SLq3i0d#71AOyBc@5uI}26jIs4`i3RKGbp(JJ)1fV`ARY^^JFm9IrLrE%vEzyjK!5uIYY7==!DyHPP3a9yZ*q zZ+g^{F|PS>Yr*>FC!JN-nxA&H*Ec^C5?5 zuj#T*abGj!eH*@JD(qGJmZf;g_1kmRn7D5*)b2HWd#RPF_B~s>(DnN(-RijSuk|__ zzUSzRtGDJF4R&jNW2zG0`qpf6W9vKf+3Ib17FKR;`OBQ*+X`0sHntU7?^SOvvOne4 z{@ytzzWsyiy~g&Bo|)<$#a@MO9iRNF<2ycY>uBuw;xDezSrRbVy|Xk}C84wI;N+&x z@}sjgepG~5x&NpPb4vJ8bOQ4tc$zX^t1j(rpB*^#6tI9 zjrXe)eluZ+E^{#K(lbgG~=g!vr-I{0R@w=_a>H6>XV&CT9 z9VL4;|8!QI^7!+kCg%E|pAGk#|NIhd$<+MY)mrHB_jhOY^}m0*I-39fC4L^BE(n{h zCJ66O{onYRBmPhPB!YIw&;M@`HZzB}n)L6EpW*$5g_t0W9_R@Xh(ZkFkU%f=MjuF` zFZ!WBq%Z&jF$mHSKn8=+9m3@>1VbT@VHl1PP{2rx!e}T$3Cd7`D#ly5!%qfBus`breG?jK@Zb012dry0~lf!j9?5C%!VoEU@qpt4D+!73t^5$Sd1mG zfF+i~3d^t@E3gu)uo`Qy7S^zVE$m37+B^ zl8}rPq#_OJ$Ur8t@EkAj64`i#*T_LG-rz0XArJW|Kp~3o9v|=##rTBJ_<|CYq73Dz zKqabBjT+RV4)th2Bbv~R7JS7wd`BzV(2fpt;s<`>7rO8pfAAMVqU?Y4Ku?H36k-sE z1bU%2`alwW(GUG0g#j3dL6C+3G8hb5$YBVELLS2~93!BBkr;*1P=pecp#oKm!B~ug z8q}cyO=w{}CSW48p@T`73|&mYR7`^®+XLLUYH%W7$%qvQ_R6!%!3)`V*wVz z9E-3ROJD&@EQJ-8VL4V{C01cI)?h8HVFO#(!5$89gcF?Mf^}FASGd6)9`M8lY{Vvb z!5cpCg&#I!3$|h#wqpl&!XLY^8+#CdKm=hgg0T<#aR3K#2#0Y5M{x|taRMPYiBOzE z7*691&f*--;{q-s9G4J*%ZNl2q7j2wT)|abLmc9ffa|z{o4AGBNW>l7#Xa1|13bhd zJjN3|#WN%!87W9b8q$%0Ol09XUf?CN@d~ezgIv79Tf9Ra@=<_76yZHS;3JCh37_!= zB`8H1%29zzRG}I*s6`#>(SSxYp&2dsif{OiRMfAPx!iLT~heB>JKs`a=o>Fc5&n1ZR820cv249tW+3}A>^FoH2mFdL?rgSnUo zGt9>VEQC20VKJ7#0+v_`D=foutiVdF!fLF+T3Evdwy=Xe9N-8iIKu_&upX{(gF8In zi4E9@P4I#@eBcW|Y{nLB#Wrlm4(x?rv zLU0nHIE65r#u=Q&Ih@A@Ttql7Ap(~Xi6}%P2C=w;tGI?Z#3KRMaRWDT3%8MoJGhH` zxQ_>Th(~ygCwPiyNJ27Fkcu>NJBa@kclij#|ykfHeTU1av%_qQ`6yJZj+s} zgPV<@2M->VddLd<3CVG8T$_mfPr{}xiTnAg2R_y}c*?q#HzC@(8%uz2x8tEvBf4cRzh z&F(bO$$+$|J0s0AFz_`nbTTk8FmN)M_0N!jp`n4n{O-Y71_pDwOWOZe(lYMuvh2>N zQ95-WZ`!>MX;Js&f0Oa7e`Xr~)BjKPzx@=3+TC^5=W{sT_uuvZ8DH{GK3zyw!P92r u25$#1bvsW_b(@VFJ$w~>95!xp@o-ldud6X$LrYFxp5N`p^I~(}BKr?qi98(u literal 0 HcmV?d00001 diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R index 8cdc88c..5d9b8bc 100644 --- a/tests/testthat/test-read-parquet-5.R +++ b/tests/testthat/test-read-parquet-5.R @@ -118,3 +118,17 @@ test_that("class", { withr::local_options(nanoparquet.class = "foobar") expect_equal(class(read_parquet(tmp)), c("foobar", "data.frame")) }) + +test_that("mixing RLE_DICTIONARY and PLAIN", { + # https://github.com/r-lib/nanoparquet/issues/110 + # import pyarrow as pa + # import pyarrow.parquet as pq + # table = pa.table({'x': pa.array(range(2000), type=pa.int32())}) + # pq.write_table(table, 'mixed-int32.parquet', dictionary_pagesize_limit = 400) + pf <- test_path("data/mixed-int32.parquet") + expect_snapshot({ + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + }) + expect_equal(read_parquet(pf)$x, 0:1999) +}) From ddc9d8f700ebd4382c63d5ca28941b357085a48f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 7 Feb 2025 10:31:50 +0100 Subject: [PATCH 02/18] Start of supporting dict + non-dict mix --- src/RParquetReader.cpp | 596 ++++++++++-------- src/RParquetReader.h | 20 + tests/testthat/_snaps/read-parquet-5.md | 52 +- tests/testthat/data/create-data.py | 41 ++ tests/testthat/data/mixed-miss.parquet | Bin 0 -> 22803 bytes .../{mixed-int32.parquet => mixed.parquet} | Bin 9802 -> 11631 bytes tests/testthat/data/mixed2.parquet | Bin 0 -> 6938 bytes tests/testthat/test-read-parquet-5.R | 28 +- 8 files changed, 470 insertions(+), 267 deletions(-) create mode 100644 tests/testthat/data/create-data.py create mode 100644 tests/testthat/data/mixed-miss.parquet rename tests/testthat/data/{mixed-int32.parquet => mixed.parquet} (54%) create mode 100644 tests/testthat/data/mixed2.parquet diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 2fd7e8a..27958d0 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -463,8 +463,20 @@ void RParquetReader::alloc_data_page(DataPage &data) { data.present = present[cl][rg].map.data() + data.from; } - if (data.cc.has_dictionary) { - data.data = (uint8_t*) (dicts[cl][rg].indices.data() + page_off); + bool has_dict = data.cc.has_dictionary; + bool is_index = has_dict && + (data.encoding == parquet::Encoding::RLE_DICTIONARY || + data.encoding == parquet::Encoding::PLAIN_DICTIONARY); + + // A non-dict-index page in a column chunk that has a + // dictionary page. Should be rare, but arrow does write + // these: https://github.com/r-lib/nanoparquet/issues/110 + if (has_dict && !is_index) { + notdicts.push_back({ cl, rg, page_off, data.num_values, data.num_present }); + } + + if (is_index) { + data.data = (uint8_t*) (dicts[cl][rg].indices.data() + page_off); } else if (!rt.byte_array) { int64_t off = metadata.row_group_offsets[rg]; @@ -499,6 +511,7 @@ struct postprocess { rmetadata &metadata; std::vector> &tmpdata; std::vector> &dicts; + std::vector>> &dict_steps; std::vector>> &byte_arrays; std::vector> &present; }; @@ -506,125 +519,12 @@ struct postprocess { void convert_column_to_r_dicts(postprocess *pp, uint32_t cl) { if (pp->dicts[cl].size() == 0) return; for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - // In theory some row groups might be non dictionary encoded - if (pp->dicts[cl][rg].dict_len == 0) { - continue; - } - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - int64_t from = pp->metadata.row_group_offsets[rg]; - SEXP x = VECTOR_ELT(pp->columns, cl); - switch (TYPEOF(x)) { - case INTSXP: { - int *beg = INTEGER(x) + from; - int *end = beg + num_values; - int *dict = (int*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*idx++]; - } - break; - } - case REALSXP: { - double *beg = REAL(x) + from; - double *end = beg + num_values; - double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*idx++]; - } - break; - } - case LGLSXP: { // # nocov start - int *beg = LOGICAL(x) + from; - int *end = beg + num_values; - int *dict = (int*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*idx++]; - } - break; // # nocov end - } - } - } -} - -void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { - bool hasdict0 = pp->dicts[cl].size() > 0; - for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - // in theory some row groups might be dict encoded, some not - bool hasdict = hasdict0 && pp->dicts[cl][rg].dict_len > 0; - uint32_t num_present = pp->present[cl][rg].num_present; - bool hasmiss = num_present != num_values; - if (!hasdict && !hasmiss) { - continue; - } else if (!hasdict && hasmiss) { - // missing values in place - int64_t from = pp->metadata.row_group_offsets[rg]; - SEXP x = VECTOR_ELT(pp->columns, cl); - switch (TYPEOF(x)) { - case INTSXP: { - int *beg = INTEGER(x) + from; - int *endm1 = beg + num_values - 1; - int *pendm1 = beg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - uint32_t num_miss = num_values - num_present; - while (num_miss > 0) { - if (*presm1 != 0) { - *endm1-- = *pendm1--; - presm1--; - } else { - *endm1-- = NA_INTEGER; - presm1--; - num_miss--; - } - } - break; - } - case REALSXP: { - double *beg = REAL(x) + from; - double *endm1 = beg + num_values - 1; - double *pendm1 = beg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - uint32_t num_miss = num_values - num_present; - while (num_miss > 0) { - if (*presm1) { - *endm1-- = *pendm1--; - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; - num_miss--; - } - } - break; - } - case LGLSXP: { - int *beg = LOGICAL(x) + from; - int *endm1 = beg + num_values - 1; - int *pendm1 = beg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - uint32_t num_miss = num_values - num_present; - while (num_miss > 0) { - if (*presm1) { - *endm1-- = *pendm1--; - presm1--; - } else { - *endm1-- = NA_LOGICAL; - presm1--; - num_miss--; - } - } - break; - } - default: - throw std::runtime_error("Unknown type when processing dictionaries"); // # nocov - } - } else if (hasdict && !hasmiss) { - // only dict - int64_t from = pp->metadata.row_group_offsets[rg]; + if (pp->dicts[cl][rg].dict_len == 0) continue; + std::vector &dss = pp->dict_steps[cl][rg]; + for (uint32_t dsi = 0; dsi < dss.size(); dsi++) { + if (!dss[dsi].dict) continue; + int64_t from = dss[dsi].start; + int64_t num_values = dss[dsi].num_values; SEXP x = VECTOR_ELT(pp->columns, cl); switch (TYPEOF(x)) { case INTSXP: { @@ -647,8 +547,7 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { } break; } - case LGLSXP: { // # nocov start - // BOOLEAN dictionaries are not really possible... + case LGLSXP: { // # nocov start int *beg = LOGICAL(x) + from; int *end = beg + num_values; int *dict = (int*) pp->dicts[cl][rg].buffer.data(); @@ -656,73 +555,186 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { while (beg < end) { *beg++ = dict[*idx++]; } - break; // # nocov end + break; // # nocov end } - default: - throw std::runtime_error("Unknown type when processing dictionaries"); // # nocov } - } else if (hasdict && hasmiss) { - // dict + missing values - int64_t from = pp->metadata.row_group_offsets[rg]; - SEXP x = VECTOR_ELT(pp->columns, cl); - switch (TYPEOF(x)) { - case INTSXP: { - int *beg = INTEGER(x) + from; - int *endm1 = beg + num_values - 1; - int *dict = (int*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idxm1 = - (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (endm1 >= beg) { - if (*presm1) { - *endm1-- = dict[*idxm1--]; - presm1--; - } else { - *endm1-- = NA_INTEGER; - presm1--; + } + } +} + +void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { + for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { + std::vector &dss = pp->dict_steps[cl][rg]; + for (uint32_t dsi = 0; dsi < dss.size(); dsi++) { + int64_t from = dss[dsi].start; + uint32_t num_values = dss[dsi].num_values; + int64_t num_present = dss[dsi].num_present; + bool hasmiss = num_present != num_values; + bool hasdict = dss[dsi].dict; + if (!hasdict && !hasmiss) { + continue; + } else if (!hasdict && hasmiss) { + // missing values in place + SEXP x = VECTOR_ELT(pp->columns, cl); + switch (TYPEOF(x)) { + case INTSXP: { + int *beg = INTEGER(x) + from; + int *endm1 = beg + num_values - 1; + int *pendm1 = beg + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + uint32_t num_miss = num_values - num_present; + while (num_miss > 0) { + if (*presm1 != 0) { + *endm1-- = *pendm1--; + presm1--; + } else { + *endm1-- = NA_INTEGER; + presm1--; + num_miss--; + } } + break; } - break; - } - case REALSXP: { - double *beg = REAL(x) + from; - double *endm1 = beg + num_values - 1; - double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idxm1 = - (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (endm1 >= beg) { - if (*presm1) { - *endm1-- = dict[*idxm1--]; - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; + case REALSXP: { + double *beg = REAL(x) + from; + double *endm1 = beg + num_values - 1; + double *pendm1 = beg + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + uint32_t num_miss = num_values - num_present; + while (num_miss > 0) { + if (*presm1) { + *endm1-- = *pendm1--; + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + num_miss--; + } } + break; } - break; - } - case LGLSXP: { - // BOOLEAN dictionaries are not really possible... // # nocov start - int *beg = LOGICAL(x) + from; - int *endm1 = beg + num_values - 1; - int *dict = (int*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idxm1 = - (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (endm1 >= beg) { - if (*presm1) { - *endm1-- = dict[*idxm1--]; - presm1--; - } else { - *endm1-- = NA_LOGICAL; - presm1--; + case LGLSXP: { + int *beg = LOGICAL(x) + from; + int *endm1 = beg + num_values - 1; + int *pendm1 = beg + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + uint32_t num_miss = num_values - num_present; + while (num_miss > 0) { + if (*presm1) { + *endm1-- = *pendm1--; + presm1--; + } else { + *endm1-- = NA_LOGICAL; + presm1--; + num_miss--; + } } + break; + } + default: + throw std::runtime_error("Unknown type when processing dictionaries"); // # nocov + } + } else if (hasdict && !hasmiss) { + // only dict + SEXP x = VECTOR_ELT(pp->columns, cl); + switch (TYPEOF(x)) { + case INTSXP: { + int *beg = INTEGER(x) + from; + int *end = beg + num_values; + int *dict = (int*) pp->dicts[cl][rg].buffer.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*idx++]; + } + break; + } + case REALSXP: { + double *beg = REAL(x) + from; + double *end = beg + num_values; + double *dict = (double*) pp->dicts[cl][rg].buffer.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*idx++]; + } + break; + } + case LGLSXP: { // # nocov start + // BOOLEAN dictionaries are not really possible... + int *beg = LOGICAL(x) + from; + int *end = beg + num_values; + int *dict = (int*) pp->dicts[cl][rg].buffer.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*idx++]; + } + break; // # nocov end + } + default: + throw std::runtime_error("Unknown type when processing dictionaries"); // # nocov + } + } else if (hasdict && hasmiss) { + // dict + missing values + int64_t from = pp->metadata.row_group_offsets[rg]; + SEXP x = VECTOR_ELT(pp->columns, cl); + switch (TYPEOF(x)) { + case INTSXP: { + int *beg = INTEGER(x) + from; + int *endm1 = beg + num_values - 1; + int *dict = (int*) pp->dicts[cl][rg].buffer.data(); + uint32_t *idxm1 = + (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (endm1 >= beg) { + if (*presm1) { + *endm1-- = dict[*idxm1--]; + presm1--; + } else { + *endm1-- = NA_INTEGER; + presm1--; + } + } + break; + } + case REALSXP: { + double *beg = REAL(x) + from; + double *endm1 = beg + num_values - 1; + double *dict = (double*) pp->dicts[cl][rg].buffer.data(); + uint32_t *idxm1 = + (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (endm1 >= beg) { + if (*presm1) { + *endm1-- = dict[*idxm1--]; + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + } + } + break; + } + case LGLSXP: { + // BOOLEAN dictionaries are not really possible... // # nocov start + int *beg = LOGICAL(x) + from; + int *endm1 = beg + num_values - 1; + int *dict = (int*) pp->dicts[cl][rg].buffer.data(); + uint32_t *idxm1 = + (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (endm1 >= beg) { + if (*presm1) { + *endm1-- = dict[*idxm1--]; + presm1--; + } else { + *endm1-- = NA_LOGICAL; + presm1--; + } + } + break; // # nocov end + } + default: + throw std::runtime_error("Unknown type when processing dictionaries"); // # nocov } - break; // # nocov end - } - default: - throw std::runtime_error("Unknown type when processing dictionaries"); // # nocov } } } @@ -743,30 +755,31 @@ void convert_column_to_r_int64_nodict_nomiss(postprocess *pp, uint32_t cl) { void convert_column_to_r_int64_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - int64_t from = pp->metadata.row_group_offsets[rg]; - // in theory some row groups might be dict encoded, some not - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - double *beg = REAL(x) + from; - double *end = beg + num_values; - if (!hasdict) { - int64_t *ibeg = (int64_t*) beg; - while (beg < end) { - *beg++ = static_cast(*ibeg++); - } - } else { - // first convert tbe dict values - double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + pp->dicts[cl][rg].dict_len; - int64_t *idbeg = (int64_t *) dbeg; - while (dbeg < dend) { - *dbeg++ = static_cast(*idbeg++); - } - double *dict = (double*) pp -> dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*didx++]; + std::vector &dss = pp->dict_steps[cl][rg]; + for (uint32_t dsi = 0; dsi < dss.size(); dsi++) { + int64_t from = dss[dsi].start; + uint32_t num_values = dss[dsi].num_values; + bool hasdict = dss[dsi].dict; + double *beg = REAL(x) + from; + double *end = beg + num_values; + if (!hasdict) { + int64_t *ibeg = (int64_t*) beg; + while (beg < end) { + *beg++ = static_cast(*ibeg++); + } + } else { + // first convert tbe dict values + double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); + double *dend = dbeg + pp->dicts[cl][rg].dict_len; + int64_t *idbeg = (int64_t *) dbeg; + while (dbeg < dend) { + *dbeg++ = static_cast(*idbeg++); + } + double *dict = (double*) pp -> dicts[cl][rg].buffer.data(); + uint32_t *didx = pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*didx++]; + } } } } @@ -808,64 +821,67 @@ void convert_column_to_r_int64_nodict_miss(postprocess *pp, uint32_t cl) { void convert_column_to_r_int64_dict_miss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - double *beg = REAL(x) + pp->metadata.row_group_offsets[rg]; - // In theory this happen - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { - int64_t *ibeg = (int64_t *)beg; - uint32_t num_present = pp->present[cl][rg].num_present; + std::vector &dss = pp->dict_steps[cl][rg]; + bool rg_dict_converted = false; + for (uint32_t dsi = 0; dsi < dss.size(); dsi++) { + int64_t from = dss[dsi].start; + uint32_t num_values = dss[dsi].num_values; + uint32_t num_present = dss[dsi].num_present; + bool hasdict = dss[dsi].dict; bool hasmiss = num_present != num_values; - if (!hasmiss) { - double *end = beg + num_values; - while (beg < end) { - *beg++ = static_cast(*ibeg++); - } - } else { - double *endm1 = beg + num_values - 1; - int64_t *pendm1 = ibeg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { - *endm1-- = static_cast(*pendm1--); - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; + double *beg = REAL(x) + from; + // In theory this happen + if (!hasdict) { + int64_t *ibeg = (int64_t *)beg; + if (!hasmiss) { + double *end = beg + num_values; + while (beg < end) { + *beg++ = static_cast(*ibeg++); + } + } else { + double *endm1 = beg + num_values - 1; + int64_t *pendm1 = ibeg + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (beg <= endm1) { + if (*presm1) { + *endm1-- = static_cast(*pendm1--); + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + } } } - } - } else { - // convert dict values first - double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + pp->dicts[cl][rg].dict_len; - int64_t *idbeg = (int64_t *)dbeg; - while (dbeg < dend) { - *dbeg++ = static_cast(*idbeg++); - } - double *dict = (double *)pp->dicts[cl][rg].buffer.data(); - - uint32_t num_present = pp->present[cl][rg].num_present; - bool hasmiss = num_present != num_values; - if (!hasmiss) { - double *end = beg + num_values; - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*didx++]; - } } else { - double *endm1 = beg + num_values - 1; - uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { - *endm1-- = dict[*dendm1--]; - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; + // convert dict values first, if not yet done + if (!rg_dict_converted) { + double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); + double *dend = dbeg + pp->dicts[cl][rg].dict_len; + int64_t *idbeg = (int64_t *)dbeg; + while (dbeg < dend) { + *dbeg++ = static_cast(*idbeg++); + } + } + double *dict = (double *)pp->dicts[cl][rg].buffer.data(); + if (!hasmiss) { + double *end = beg + num_values; + uint32_t *didx = pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*didx++]; + } + } else { + double *endm1 = beg + num_values - 1; + uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (beg <= endm1) { + if (*presm1) { + *endm1-- = dict[*dendm1--]; + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + } } } } @@ -2028,8 +2044,67 @@ void convert_columns_to_r_(postprocess *pp) { } } +void RParquetReader::calculate_dict_steps() { + if (notdicts.size() == 0) { + calculate_dict_steps_simple(); + } else { + calculate_dict_steps_bad(); + } +} + +void RParquetReader::calculate_dict_steps_simple() { + dict_steps.resize(metadata.num_cols_to_read); + for (uint32_t cl = 0; cl < metadata.num_cols_to_read; cl++) { + dict_steps[cl].resize(metadata.num_row_groups); + bool dict0 = dicts[cl].size() != 0; + for (uint32_t rg = 0; rg < metadata.num_row_groups; rg++) { + int64_t rgo = metadata.row_group_offsets[rg]; + int64_t num_values = metadata.row_group_num_rows[rg]; + uint32_t num_present = present[cl].size() == 0 ? + num_values : present[cl][rg].num_present; + bool dict = dict0 && dicts[cl][rg].dict_len > 0; + dict_step ds = { rgo, num_values, num_present, dict }; + dict_steps[cl][rg].push_back(ds); + } + } +} + +void RParquetReader::calculate_dict_steps_bad() { + // start with assuming no bad pages (bad = not dict encoded in dict col) + calculate_dict_steps_simple(); + // now post-process this + for (page_range &bad: notdicts) { + int64_t bad_end = bad.start + bad.num_values; + std::vector &dss = dict_steps[bad.column][bad.row_group]; + // find the dict step it applies to + for (auto ds = dss.begin(); ds != dss.end(); ++ds) { + int64_t ds_end = ds->start + ds->num_values; + if (bad.start >= ds->start && bad.start < ds_end) { + if (bad_end > ds_end) { + Rf_error("Internal error, impossible mix of dict and non-dict pages"); + } + if (bad_end == ds_end) { + ds->num_values -= bad.num_values; + ds->num_present -= bad.num_present; + dss.insert(++ds, { bad.start, bad.num_values, bad.num_present, false }); + } else { + int64_t num_miss = ds->num_values - ds->num_present; + dict_step newsteps[2] = { + { bad.start, bad_end - bad.start, bad_end - bad.start, false }, + { bad_end, ds_end - bad_end, ds_end - bad_end - num_miss, ds->dict } + }; + ds->num_values = bad.start - ds->start; + ds->num_present = bad.start - ds->start; + dss.insert(++ds, newsteps, newsteps + 2); + } + break; + } + } + } +} + void RParquetReader::convert_columns_to_r() { - std::vector col_select; + calculate_dict_steps(); postprocess pp = { columns, facdicts, @@ -2037,6 +2112,7 @@ void RParquetReader::convert_columns_to_r() { metadata, tmpdata, dicts, + dict_steps, byte_arrays, present }; diff --git a/src/RParquetReader.h b/src/RParquetReader.h index 2a98397..eeecf6d 100644 --- a/src/RParquetReader.h +++ b/src/RParquetReader.h @@ -89,6 +89,21 @@ class RParquetFilter { std::vector columns; }; +struct dict_step { + int64_t start; + int64_t num_values; + int64_t num_present; + bool dict; +}; + +struct page_range { + uint32_t column; + uint32_t row_group; + int64_t start; + int64_t num_values; + int64_t num_present; +}; + class RParquetReader : public ParquetReader { public: RParquetReader(std::string filename, bool readwrite = false); @@ -116,6 +131,8 @@ class RParquetReader : public ParquetReader { std::vector> tmpdata; std::vector> dicts; + std::vector notdicts; + std::vector>> dict_steps; std::vector>> byte_arrays; std::vector> present; rmetadata metadata; @@ -124,4 +141,7 @@ class RParquetReader : public ParquetReader { RParquetFilter filter; void init(RParquetFilter &filter); std::vector colmap; + void calculate_dict_steps(); + void calculate_dict_steps_simple(); + void calculate_dict_steps_bad(); }; diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md index 84a3aae..65de50f 100644 --- a/tests/testthat/_snaps/read-parquet-5.md +++ b/tests/testthat/_snaps/read-parquet-5.md @@ -54,17 +54,67 @@ # mixing RLE_DICTIONARY and PLAIN + Code + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + Output + type repetition_type + 1 REQUIRED + 2 INT32 REQUIRED + 3 INT64 REQUIRED + Code + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + Output + page_type num_values encoding + 1 DICTIONARY_PAGE 400 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 1024 PLAIN + 4 DATA_PAGE 352 PLAIN + 5 DICTIONARY_PAGE 400 PLAIN + 6 DATA_PAGE 1024 RLE_DICTIONARY + 7 DATA_PAGE 1024 PLAIN + 8 DATA_PAGE 352 PLAIN + +--- + + Code + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + Output + type repetition_type + 1 REQUIRED + 2 INT32 REQUIRED + 3 INT64 REQUIRED + Code + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + Output + page_type num_values encoding + 1 DICTIONARY_PAGE 400 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 1024 RLE_DICTIONARY + 4 DATA_PAGE 352 RLE_DICTIONARY + 5 DICTIONARY_PAGE 400 PLAIN + 6 DATA_PAGE 1024 RLE_DICTIONARY + 7 DATA_PAGE 1024 RLE_DICTIONARY + 8 DATA_PAGE 352 RLE_DICTIONARY + +--- + Code as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) Output type repetition_type 1 REQUIRED 2 INT32 OPTIONAL + 3 INT64 OPTIONAL Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output page_type num_values encoding 1 DICTIONARY_PAGE 1024 PLAIN 2 DATA_PAGE 1024 RLE_DICTIONARY - 3 DATA_PAGE 976 PLAIN + 3 DATA_PAGE 1024 PLAIN + 4 DATA_PAGE 352 PLAIN + 5 DICTIONARY_PAGE 1024 PLAIN + 6 DATA_PAGE 1024 RLE_DICTIONARY + 7 DATA_PAGE 1024 PLAIN + 8 DATA_PAGE 352 PLAIN diff --git a/tests/testthat/data/create-data.py b/tests/testthat/data/create-data.py new file mode 100644 index 0000000..61149ef --- /dev/null +++ b/tests/testthat/data/create-data.py @@ -0,0 +1,41 @@ +import pyarrow as pa +import pyarrow.parquet as pq +schema = pa.schema(fields=[ + pa.field(name = 'x', type = pa.int32(), nullable = False), + pa.field(name = 'y', type = pa.int64(), nullable = False) +]) +data = [ list(range(400)) * 6, list(range(400)) * 6 ] +table = pa.table(data = data, schema = schema) +pq.write_table( + table, + 'tests/testthat/data/mixed.parquet', + data_page_size = 400, + dictionary_pagesize_limit = 400 +) + +import pyarrow as pa +import pyarrow.parquet as pq +schema = pa.schema(fields=[ + pa.field(name = 'x', type = pa.int32(), nullable = False), + pa.field(name = 'y', type = pa.int64(), nullable = False) +]) +data = [ list(range(400)) * 6, list(range(400)) * 6 ] +table = pa.table(data = data, schema = schema) +pq.write_table( + table, + 'tests/testthat/data/mixed2.parquet', + data_page_size = 400 +) + +import pyarrow as pa +import pyarrow.parquet as pq +table = pa.table({ + 'x': pa.array(range(2400), type=pa.int32()), + 'y': pa.array(range(2400), type=pa.int64()) +}) +pq.write_table( + table, + 'tests/testthat/data/mixed-miss.parquet', + data_page_size = 400, + dictionary_pagesize_limit = 400 +) diff --git a/tests/testthat/data/mixed-miss.parquet b/tests/testthat/data/mixed-miss.parquet new file mode 100644 index 0000000000000000000000000000000000000000..315f5044ccf503fa73bac5248b6a3eb736b36183 GIT binary patch literal 22803 zcmeI)Wpo|Kx*$;7kui4cB$g>oVrEQCNu0#Y%*@Qp%*@Qp%*@QpY?;}XnPw~Z-nsX^ zH*0>)pIIxl);_1Js#V=;slIbg=~EZWULt*9Kw!6Qf&H=-3hWjwFw{4pLUoH5{4sne zB{X69f&ju2j*!j>L?jYl5}7D`MO32kHO33|Ezt=i1~G|6Y~t`8arvGf_>p+TCjkjb zL}HSVlw>3)1u02IYSNIFbfo7eekKDM$wX$d@C(248(I0CKlqbuWG4qX$whARke7Vq zrvL>hL}7|hlwuU81SKg&Y06NRa+Ie66{$pJs!)|`RHp_tsYPw-P?vhtrvVLVL}QxJ zlx8%i1ubbsYueD3cC@Dh9qB}8y3mzwbf*VB=|yk)(3gJnX8;2k#9)Rnlwk~K1S1*6 zXvQ#>ag1jI6Pd(hrZAOhOlJl&nZ<18Fqe7EX8{XY#A24Plw~Yu1uI#_YSyrpb*yIt z8`;EWwy>3LY-a~M*~M=5u$O)8=Ku#e#9@wblw%y{1SdJgY0hw#bDZY_7rDe`u5guW zT;~Qi3E~#Fxx-!Vai0e~0SQS&Vv>-QWF#jA zDM>|Y(vX&Pq~|AoCIcDCL}s$^3%~LkS^1qm_>*j8CkHvnMQ-wtmwe=>00k*TVTw?c zViczYB`HN|%21Yal&1m}sYGR}P?c&_rv^2tMQ!R(mwMEv0S#$HW17&EW;CY-Eont- z+R&DEw5J0d=|pF`(3Ngp~IN|d_e$V2}gJ$ z5RpiHNo1n%6;X-C*L=gbL?@6K#3UB6iNklq<$HeMN8%Bm1SBL8iAh3Il98Mgq$Cxo zNkdxFk)EIUnG9qk6Pd}vFZ{}HWaW4M;7_uVogCyO7rDtpUhr zl%y1;DMMMxQJxA^q!N{>LRG3!of_1n7PYBEUFuPv1~jA*jcGztn$esVw4@cSX+vAu z(Vh-;q!XR#LRY%cogVb07rp62U;5FX0SsgigBik5hB2HGjARs}8N*n{F`fxbWD=8^ z!c?X)of*tz7PFbdT;?&K1uSF{i&?@_ma&`_tYj6dS;Jb^v7QZVWD}d&!dAAiogM6C z7rWWRUiPt{103WKhdIJgj&Yn5oa7XzIm21bah?lYY)UG^8aR>G_GD z$v{Rjk(n&~!ms>BR(|IX{v;dO$w5wXk()f^B_H`IKtT#om?9LV7{w_;NlH=yOIp#IHngQ3?dd>AI?r62tnz(58um>~>h7{eLCNJcT5F^pv#;I&HLPVF>)F6YHnEv4Y-JnU*}+bBv70^YWgq)F zz(Edim?IqJ7{@umNltN^Go0ld=efW|E^(PFT;&?qxxr0>xW#SmaF=`B=K&9S#ABZD zlxIBW1;M=J6|Z^2Ti)@W4}9bkp9u^VI3{}FwCJfq76qaOh6@ZAH&#GMN34L54+k+q zCHs$>hJlfSBZbu+s#~*#hG4ik2&S^<7`*OEi7gH-$1>-}P;=T=92DFIB4g?!a=jrtgkXsnzw}F{|~8 zzb96$QPuavt~YD?o;ZzGUElLvvt9A`#%*<~`rhx`1x?@kL#N>Ddw=W}kYHcDUeRmp zi{CHNjC~0PWw@~~;jml@_9q%us>c4r<7&;=pJY<28~c+^>y_X@vRR{Q97sNI){Fxw z7OlE*Amy@M2@a-Ob*jd})a!y~989w*_{PDs+X50EO1CR|%|q$;C7OBYr$ZTT9{TxM zu7rm(oGMlGaK>}BW**LTsnyNHnXmOqcqB{EsG3K9xjSp-kzXIKx_RWcr@Im!%^G~F z=F#8Z1kF79$A{pXNB;~JHqo(c0fDuSWsi_}*0CIsGX@>Y87+6B>iscHdeX4lA#IsM8D3tNmsglKVCq7-ORO#BMOP8xX`*fK~t#6$!TdjBEGv#WH zu6?F_z1g$RRA{vN)|raUb|*etsnzM)XDhe6HT!IpPA_krt=cVYl5^F11=cxNy7$uQjT4o%6NF)t+;{&ZO42&)1#SJIRH5vqsmsP=DU+ITso%T7COM z!)3dZTx_)Jbe)Th*WH?PvB{>Fw=Xu`7B=anX1fCGUTVHC@!U%-4rRP^spYZUNiVlL zRl4rw*5_)^z1-$f>pPd*UhAFoO1q%Zb+5F)JA3Yx4i8t~xzh3J?xa^c1)r{awR6`u zx8`2$^5Ny3t6f8dOLnbWK#Y3Vx<^Pd?^=(@neJZe87)t;>%F3vsdv41tUB|q_leu) z?)AR$`Xsy2FVUEKH~J@=Gw;TLRBP_u7?^HPvYUf4oT+znaF*NiZVt)%>h8^<*}^3c z8kQ?Y{h;CblFSbpQ7F^BppnJ$B)>JPRGIp>MwhEI|JImFZSLI~Tdhy>+v94DsegNX zy*cx5PiVB}-tCFa_9VYEsnwbKcP6*HJ^#*>POt9Wnc6K}io4T##b|JMdcPzK?#>vL z>HgiB!}6rKH)~Xx2KQ!InFbFQue-hA!IDj{?mt+%EnLcn%XY}N?wo~>H#}VP;PJ+%ds05x6nv)Plg)2#FMP7) z!>b2RwuTCy>gl$Cn2nxpkC1fH(;bmBKYY3~THaL8c117S=-KXAbr(I`6SwWdXM5xI zP4#?VqOpyh?@u;&(eneT);@fGFx}o%FAilm+vvsNEO!>YIFj}C!xu-hg-;!PELY6N z!N>C@T^xL(Q07O$CyV7x{qj_)vW;J!E?0N)%QKbQK6-h!THn;K&ea;*_|^G(a~Hq5 z&}i+WR~MV@P5t^(tFw(?Uv772@#`y{UO#$$wOjZ!Z?5%<+2qajeo2?SxiKj7<2N^l z-rZlc_VK$1%l4*u|8UjW zChs4uyR+o|<4vz0zkjkVeA*9Bcg1Y_;n}{VOFuk6l=;br7sv9Z{TO_zY}1b~&(&S} z@ztfaPd>iB);H~^H$h{YetLU%?$S^193!Izd((dY5PY`j=Z|mhEdBiH!|NxX zKZ~D3vIUOI_E-E2^{@CD{;&8cg8mgh|94H;>>DpQT(p12&ro&hwq5X_x!+*#3MclNJt_QlZ2!sBRMHZNh(s4 zhP0$3JwNd?8OTT`GLwa0_?6$t%J2NapJXFDImk&aa+8O=lxi$tXrMhOvxeJQJA6BqlS3 zsZ3)!GnmONW;2Jm%ws+aSjZw4vxKEAV>v5W$tqT}hPA9?Jsa4_CN{H$t!!gEJJ`uC zcC&}Q>|;L%ILILmbA+QD<2WZc$tg~AhO?aGJQujgB`$M?t6bwcH@Ha zAS0Q`Ocs9OSAHWazw-xwl8x--ASb!VO&;=+kNgy%AcZJQ5sFfb;*_8yr6^4q%2JN< zRG=c2s7w{AQjO}=peD7bO&#h|kNPyAA&qEE6PnVD=Cq(Ct!Paf+R~2pbf6=h=u8*7 z(v9x)peMcPO&|KwkNyl`AcGjp5QZ|0;f!D;qZrK?#xjoaOkg6Dn9LNWGL7lXU?#Je z%^c=3kNGTMA&Xed5|*-z<*Z;Ot60q%*0PTEY+xgs*vuBTvW@NRU?;oS%^vo$kNq6r zAcr{25sq?<1<%RTP%fQLNdF;95P zGoJH;U|#Zy*Sz5^?|9D#KJtmrgbHW=6NWDcAS~esPXr76<6rwOiC`vJkQ-YF|qBLbFOF7C@fr?b3GF7Nb zHL6pCn$)5;b*M`{>eGORG@>z0Xi77h(}I??qBU)3OFP=rfsS;dGhOIPH@ee zEMhTBSjsY%vx1eZVl``6%R1JxfsJfpGh5ioHny{ao$O*ad)Ui9_H%%P9O5uXILa}O zbApqc;xuPC%Q?<-fs0(?GFQ0DHLi1mn*?!-+uY$U_qfjk9`cCCJmD$Nc+LxgdC4nY z^M<#)<2@hv$R|D%$_Incgy9PU2unD^6M=|C;!7eEg|CQ8G`{8=z9l+=#2_ZIh)o>6 zBQD?b13wav_#_}9iAYQml9G(%q#z}!NKG2jl8*HJ#Lr|PBbmrd7JlJZej_Wt^9O&D zjqKzgC%MQ?9`cfp{1l)dg(yrBic*Z?l%OP~C`}p4QjYRepdyv1Ockn9jq22(Cbg(d z9qLk#`ZS;+jc800n$nEsw4f!eXiXd1(vJ3Ypd+2=Oc%P+jqdcIC%x!RANtad{tRFs zgBZ*ZhBA!dj9?_A7|j^QGLG>~U?P*4%oL_Fjp@u_CbO8$9Og2Q`7B@|i&)GOma>fH ztY9UpSj`&NvX1p^U?ZE@%oet?jqU7UC%f3q9`>@2{T$#Rhd9g;j&h9SoZuvhdkmjPk72Rp7VlWUh<09yx}eHc+Uqu z@`=xctR2J%92X;1;I^=~|FQO3HR4|%+#{IAK7jwb7P5BmWpJOr)?T-T4IPq~6`C-7 zK>%S1M|dI-kw|<=WTNmDQHjRae8aayCy*G#Bo?uW!*|5xdw$?Y;t`(&BqR}uNkUSR zk(?ByBo(PiLt4_2o}c)c3}hq|naRR0{K{`+<#+zzPqLAn9ONVyxyeIb@{ykc6r>P^ zDMC?-QJfN#q!gtoLs`mEo(fc?5|yb!RjN^)8q}l~wW&j0>QSEtG^7!YX+l$)(VP~v zq!q1cLtEO>o(^=R6P@WoSGv)i9`vLaz3D?=`q7^O3}g_48NyJ8F`N;MWE7(r!&t^K zo(W835|f$2RHiYV8O&rBvzfzO<}sfIEMyUjS;A75v78mGWEHDf!&=s{o(*hd6Pww> zR<^O79qeQmyV=8D_OYJ>9OMv(Il@tnahwyJAS0Q`Ocs9OSAHWazw-xwl8x--ASb!VO&;=+kNgy%AcZJQ5sFfb;*_8y zr6^4q%2JNYqy8hCHk&IPI#lQ=5kiIG3jzpBIKmTwh(zK` zA`^wLh)OiR<{Q2xI)TI>Cb5W39KIti-}3`M5|8*KAR&oJOcIikjO3&sC8HNAm8eV=s#1;W)SxD{s7)Q}QjhvHpdpQDOcR>YjOMhUC9P;p8`{#2_H>{l zo#;##y3&pA^q?ob=uIE`(vSWOU?77S%n*h$jNy!6B%>J37{)S=@l0SMlbFmDrZSD` z%wQ(7n9UsKGLQKzU?GcG%o3KejODCgC97D?8rHIo^=x1xo7l`2wz7@w>|iIm*v%gH zvXA{7;2?)M%n^=qjN_c(B&Rsd8P0N!^IYH}m$=Lou5yj*+~6ib+~PKOxXV56^MHpu z;xSKn$}^txf?!_qir2j1E$?{G2R`zN&ru>gdm9isbi}X!p8hUJPI8f(Jme)G`6)m_ z3Q?FM6r~u&DM3j}QJON8r5xp{Kt(E1nJQGJ8r7*mO=?k_I@F~e^=Uvu8qt_0G^H8M zX+cX`(V8~2r5)|*Ku0>!nJ#pt8{O$aPkPatKJ=v@{TaYO1~Hf+3}qO@8Nos7?)PQj6Nu zp)U2PPXij#h{iObDa~k33tG~O*0iB5?PyO2I?{>GbfGKV=uQuM(u>~op)dXD&j1E8 zh`|hDD8m@e2u3oB(Trg%;~38bCNhc1Okpb1n9dAlGK<;FVJ`ES&jJ>*h{Y^nDa%;S z3Rbd;)vRGH>sZeQHnNG$Y+)*>T;VF$xXul362vWTbBDX!<30~~$Ri%}gr_{?IWGw2C9inR8{YDc_k7?Z zpZIM4--~JfKa2VI{C7Qal8fBrAusvJPXP*2h{6=1D8(pF2})9m(v+brs7?)PQj6Nup)U2PPXij#h{iObDa~k33tG~O*0iB5?PyO2I?{>GbfGKV=uQuM z(u>~op)dXD&j1E8h`|hDD8m@e2u3oB(Trg%;~38bCNhc1Okpb1n9dAlGK<;FVJ`ES z&jJ>*h{Y^nDa%;S3Rbd;)vRGH>sZeQHnNG$Y+)*>T;VF$xXul362vWTbBDX!<30~~$Ri%}gr_{?IWGw2 zC9inR8{YDc_k7@Eln7Zva%n?KNZM>j`9I~DP6JlYEp~Z)S)i*s80hL(ul@1p()L1P77Mn ziq^EDE$wJe2RhP;&UB$G-RMpadeV#D^r0{P=+6KKGKj$pVJO2G&Im>_iqVW=EaMo@ z1ST?x$xLA?)0oZ-W-^P}%waC`n9l+hvWUejVJXX4&I(qtiq))PE$dj%1~#&Z&1_*S z+t|(ycCw4z>|rna*v|nDa)`qm;V8#A&IwL(iqo9oEay1S1uk-l%Ut0q*SO9NZW6>T zZgYpb+~YnEc*r9j^Mt27<2f$~<|VIq%^TkGj`w`vBcJ#jB|<=`u(#yihE@y17X%QN zaD*oU5sAc?L?#Mf5tV3s%{P2YbOMP%Okxq6IDAK3zUK#iBp&feKtd9cm?R`68OcdO zN>Y)UG^8aR>G_GD$v{Rjk(n&~!ms>BR(|IX{v;dO$w5wXk()f^B_H`IKtT#om?9LV z7{w_;NlH=yOIp#I zHngQ3?dd>AI?r62tnz(58um>~>h7{eLCNJcT5F^pv#;I&HLPVF>)F6YHnEv4Y-JnU z*}+bBv70^YWgq)Fz(Edim?IqJ7{@umNltN^Go0ld=efW|E^(PFT;&?qxxr0>xW#Sm zaF=`B=K&9S#ABZDlxIBW1;M=J6|Z^2Ti)@W4}9bkpF<)*xHx~!@i6LN5I|VM5uOM{ zBobc|nJ9ckRHE@U-|#Kb2_yzFiA8MU@Evjao*(#;c*G|G2}wj^l8}^SBqs$aNkwYX zkd}0$=O=z90~yIgX0q@Lzw#Sd`JF%blWb%s2RX?_Zt{?qeB`G91t~;ficpkd6sH6w zDMe|@P?mC(rveqJL}jW_m1+=(3WeG#AU8UG8z8 z2R!5vk9opVp7ER)1oM(tyygvWdB=M`@R3h^4w?Vq|DONZU5gy#Bp12KLtgTcp8^!5 z5QQm1QHoKV5|pGAr71&M%2A#QRHPD>sX|q%QJospq!zWQLtW}op9VCf5shg=Q<~A7 z7PO=lt!YDB+R>g4bfgoV=|We!(VZUjq!+#ELtpyQp8*VH5Q7=QP=+y_5sYLMqZz|k z#xb4=Ok@(1nZi`2F`XIAWEQiT!(8Sup9L&r5sO*EQkJot6|7_xt69TZ*0G)qY-AIg z*}_(~v7H_4WEZ>H!(R5Wp937^5QjO!QI2t(6P)A}r#Zt}&T*a#T;vj$xx!Vhah)67 zB#2ww<_>qc$9*2~kVib`2~T;(b6yb4OJ4DsH@xK?@A<&TkY(C`E)D!^;UFaaHdWw~ z|B`+iAtbjeq*zt>=R$4Bs>1(lwM*X^f2}I~BN_MaRfWIbQ}{jmJEr_Ui75f6BJ7?T z@c%z{y{upgHNA zm8dMNhh*f2lz+wOz<}6UGy31hmNo0uZd^Su!0qS~hlv%sos`{>=k}puY)c$FZtPS^ zLd6XX6FXz5aM5Dx4Gg0>M)0E0F(xbyon%F}BoW*PCuGQ2e=RQm$Nyr494mHK`M)9e zx$_Wm%LNb7)TBQfG~i!@w*SYV=XQpPv1&=^Bo`iqPBLS0=(w@|^W`LDH08g&$o!-6 z|8krdhn{}ngp)*wapYo{=yr@7D{}S{C5o2)^&j7MtcRh~hLlWovn$0z9+ZOFv$xHj zqi*&**|XQpp7XD+>^X8|&t5pBGiUbf`9q%7|MjdEvY8|IKijisFH*{&SyXf9{72(o zL;tlIvX6FhWDMDASIn+$L$(}F(Nj9)xQ?GM0|VRYXcoLJ7S~?;=fl@1b{4Q0dZpZ-SjFO{GZ( z0kHsfLG1Xx12R|d_{z+CbML${YtDDh+H0Mxy|eT0y-IeqV2uKSh(K~|V03Jy!0fbv zuxw#rn=@W1l{QQYCl#rQAPu34bVL$GdNPoaOx!|dvXB++g=Obf0z`8gF~pLCoWzlf z+qr|>+{sLRG3!of_1n7PWbjI@F~e^=Uvu8qt_0G^H8MX~9#p-EM^HyS;lf!u##1*W({ju z$9gvKG8@^%E4<2Uyv`fE$y;n@3tQR7c6RVKJK4o<_OO?Cc$fFs$9@iQkVCxBVUBQ= zV;tuMCppDwKHv;zImd^b=OaGm6F%hv7rDe|e9jkq$ya>MH+;)wuJ9dK`JNy6k)QaP zU-*@4{6^^F4<{9=i69MWNk=46q$dLz$;2&WCJR}~Ms{u`Ks2`zLo7MSNgTPjojb_Q zo!rIU+{3-(;Xd+mKM(LA`FMzjd4&8Fpdf`POc5UCF^cjy#dw0^l%OP~h$lz_iIk=c zWhqAz<*7hLDp8p#RHYi#sX!nJ#pt8{O$aPkPatKJ=v@{TaYOk{QHchA@<23}*x*8O3PEFqUzQX99t+ zz{qTYFVp1YeQfibAtN&9P8b;zoI7+>_PjL~*WEIA*qGb$H{UfS zIB)p4oJG4_IJRZphzWPZ4~~pYm_PElyOSp7t+#dls7d!#o0~K>VZrDZ9;~~z_3^C> z#!P;s>DH04i3`V0E!6tp;(FT_j+_2irw?~cONWnq6xD~C8vtYQF`&S za}q~pyTAVS#m~(v_v~Her2n1zGTv(s&Q$ZTQ8~*lpR%UKu8H?IczgNOb*=W#EkC{NifJ3# z9b4Px>!_UNR?c{(>xF|$8thy-^R=E|f4F;kxm7Q{(f5aMFP_}FYSvo= z!&0Y@OIkg9%aF+I4>a7hdd{{HS?;bdBWcat9b;k&wLP_K&Agoxawm?CE5COB?n!xT zE^WAb?Sj3N^S9VDqx`ys?@lk;_0p-`>lW>s6+a|>t_tfHADEXk>48Rj)-O4= zGb?OZdU#pgb!|`Y*|6;Bs;1jU=c@Sf^5g4TA6nXI@5?JrZtQe^&&-M&SDt>Y*X2v6 z_ikKu=B?z^8E&t%Y4y2nBeOr)_?=B_&hHGnwl(~{t>GC5I;+>Vh5ys*Cna0M4{i;Q zOtXJ($>FJmNpgH(ZQS)R3H$HEBsuq$Icd*|mvf}t8&mF${~zy-LO0BvFlB7*93)bj zGL)qpNtCAo6{$pJs!)|`RHp_tsYPv`qz-kdM|~R5kVZ772~BB6b6W5eEqR(&w5APh zX~#3Prvn}7L}$9tm2PyW2R-RUZ~D-ee)MMm14(8OgBik5hB2HGjARs}8N*n{F`fxL z%X3U*63_DjFEW`aOl2C=nZZn6VivQR!(8Sup9L&r5sO*EQkJot6|7_xt69TZ*0G)q zyv#;6@d~f<8n5#PZ}JwK*}_(~v7H^f%}#c)n?3C19p2?V_OYJ>9OMx1bC@F>QSEtG^7!YX+l$)(VP}M zMN6Kh6|HGQTiWpq?dd>AI?r62tnz(A52#9)Rnlwk~K1S1*6 zXvQ#>ag1jI&+;4-nZ)zFz>7>~3R9WJbY?J}C&pd53p-kA3Xt00%k5 z`yA#7M>)oEPH>V_oaO`0aF%m?$ay~EV?NYjah31+ zfgkyapZSGfxyEmVdhu{lk(vn7kd|~r5=DA4kdaK>LS|AX$$#@nvZdF8f#?6R^qL%d zJwU=UMTVs!H4&sCE$N6Piu7b4Bbm5`%w!=e*~reV1c>G~Vu&RNIf)||w{r)%xs$uN zn|rvIJlscK?&kp>Bp(m)FprR*0u-bWg(5b*M`{>eGORG@>z0Xi77h(}JgH$J37{)S=@l4=Zo?{}D zc%Bz{k;zP9D$|(G3}*5YvzW~s<}#1@EMOsvSj-ZZvW(@dU?r|rnO@GkGMkNq6rAcuIL!yMr#$2iUjPI8LV ze83sba*hu<&qsXBCw$5UE^>*__?$2JlCSuhZ}^tWT;V&e@;yKBBR}yozwj&9_$^c9 zH$O&%heu^ix%{1uL`qYJvXmo<@>HNAm8eV=s#1;W)SxD{sLhkqp)U2PPXij#h{iOb zDa~k33!b7SPt%Ilw4p8Sc!u_Lpd+2=Oc%P+jqdcIC%x!RANtad{tRFs$qZsJLm0|1 zhBJbZjAArn7|Xa!k;OynuAvlKVhyFgT7QkaSs4>~<4f1)1%FYhHf25bZ@V6w93D9G zrv=%+h!FG1zbF+q$?*|qLu1}^QRyQ7UH8z4u<)p;uy9vKDpC_c8q$)ENTNtj1~QU~ zTgXfnvXYJL+)99GZXY?*6^&E#ZK0g z*bx&cvGYIbfp5l6$^cx<9N-{_c%Q=@;V8#A&IwL(iqm|+nM{$_k~98IUZEx1e1RE% zTC)8^r<(uwG1{NI+MA`Jt~NR%f2gZ15bA1EdWwJPcsu24ceUE*6CfpIR#fyZ+r0v?Gb<6h60uf<>RQGzytyfqeBJ|liEPC_3v2n5a@`S|&QpFYt zOPeKDZ6KB6=siQiqp#M^vpzOYS{MMxz}vyEsL$tL$Pszh?{NpzR~9L z%i+;e>f~A9G0(NyaWQ}BSAOqSLjMx{i_)}W3Wbz zs&$Ip_?2Tm4}T<-9&Htrs)ydBiosydU_z^4*kESQ%>@2Z0-=FILjDS^zr}?}!svmX z5P>Mf&^;)DUg(WJkVIefLw`tN09a3G5Tqf136n3;(1!sGF$+d8h6!fF6mu{a z^I(SgSb&8v$097o5?H_zOJRj&SdJA~iB(vQHCPL4*uWNcu!jR2;RI*6U>(-O6>e~c z2RyL>8?gyq@P-e3;fKxGg00ww?bv~x@W(Ff#vTM95JA|BVC=(w9Kb;w!eJc2Q5?f@ zoInUpA{3_(hSNBMvp9$IxPXfY$0bDIG9nR$Xv82ES8x^A5Qlgq;5u&LCT`(25^)E2 zaS!+L01xp9kMRUg@eD~wMha4qhIC{g6IpnU7kG(myuxebAQx}&7VnUUd=#J%MR<=7 z_=sYB!e@Lz2})6ha#WxaRj5V{YEg%JG@ubpXhsXZ;v2rB6>Vro2RiWsKk*A)_>DjK zi|)-Yj2`F-5r{$z;*dZu^hO^@qA&WPKcp}K12G8F5I_clAqzPS!BEI!7=~j66fhE_ zFdB+bf-+Q~iZK|AaZrOgG@uDBjK>5_gf?_A36r6VDVU0B(8F}hz)a}F0EU){GFxWfaU*no}L1TT2Q2fpybW^BP$Y{Pc!z)tvM7j|P00uYEG>_ss4VLuMwAP(U$ zj^HSc;W$np1Sb)SQwYOpoWWU~!+Bi5MTFxLB5)azh(a`C5Q{6giff2NJQ8pnH*gcT za2tuZgS)tg`*?tdc!bAzf~R3SD#|L~wF+SllzMur9C__0aP>Cv3qXxC8Lp>VMh$b|n1z+(E-_eRTw4(!^_<^7J zg)aQYAN&=r{n7eMT(}3@A3Y%gQHVht66l5A=mSagML+b16b4`*20#j|Er=b1cGQEP(|qu@qKVhUHj+l~{$a7e){Cga||-260HB7kZ-)B+(cB&>vD5fPokUX$T;L z!H|Uy`u?xGg z2LT8~5cVP%`>-Dea1e)Z7)Njv$8a1c5Q38k#VLg0G|u2G&fz>R;3C3t2@$xANJJqT zF^I(#T*WoSAsz|1jvKg%Teyux+`(Pk!+ku!Lp;J`Ji${uLlTmaf>fj-9T~_(7M|k; zULqT>@ESSD#T&fEJLDlB1t>%j-s1y4q8Oj>8DCI>Qk0<_6{tiNs!@Yl)S(^?Xhaj5 z(Son|hVN)a8`{x9?_5{L=Jr)OVb9K8;wktwy>~C7c8^ClrFQ%t2BMEb->6BS^E&13_0h>OBq94 z6H7COdZvxcl=sTB$sFcaaVc~7w${?j5&j~hvJ?WOZL>xOD@J6EI;dTiHTtO0sOO3y z7Pilo!t5iSE1&Z!d#(~5FzSVBWQgsHF|m;mFUH0tmc1BvBW=`6wZuHzm+JQ`B3^1d zZY_JMnItkgTPsc4E_-~I;^piK+1lmV6LXD5ztYaLuzRIbWPkb9q++k~SCdNuM!(jr z2(f!Tr6%(7>!}Ti<*%o;q>aweYt6IEnci7(IcG*!YkAI0AyLI#eG!3uu7QM7WUis4 zPDSo4DPzSqM$(q{Z;WLfBHx(Edsn=ftq`dA)>QGN{o6UJQIT)ws@o2NYWE~)I zEV2n!iY~G}s8dyBchp$G;7VHY)nVy0|-4 zAJ*SUSN`alnD6+}?S5tSNB75VRUbW)L{*AC(*#b%8?uyQiZ^EKR2Of`HCFlLm1pVn z$-BrQ=95pccl9UVl0cQueibL3K5wpxiut^y;ZF7Etu5&)U$(X8JAK){v$Hbh%Z{$L z>MuKm#8gZCMP!^yc1bA5mh6_CR8z7?%0#s^KzgZjX`rlQY-y0ZPfh7wg&@_kV8u}9 zvVE%2v1R+!?$(qY(8^FPKd4>cTz*KmDz^NvUVBaX5q+^S6-SL^Tq=&4DqpEMZZ@g5 z;)J=$n92}~r7o2xmpNXk3|-|@TY1VlXiQa@eW**-Y3JxGRcBo9)>fVM%otOB&a1$s z`n+G&mFf%I+H0#X`iqUN2@jB2S92*?`D#tX!AW&Bmyeo^t&I#>x~?`V%<*b%^f{lp z+L-X5v30SLq3i0d#71AOyBc@5uI}26jIs4`i3RKGbp(JJ)1fV`ARY^^JFm9IrLrE%vEzyjK!5uIYY7==!DyHPP3a9yZ*q zZ+g^{F|PS>Yr*>FC!JN-nxA&H*Ec^C5?5 zuj#T*abGj!eH*@JD(qGJmZf;g_1kmRn7D5*)b2HWd#RPF_B~s>(DnN(-RijSuk|__ zzUSzRtGDJF4R&jNW2zG0`qpf6W9vKf+3Ib17FKR;`OBQ*+X`0sHntU7?^SOvvOne4 z{@ytzzWsyiy~g&Bo|)<$#a@MO9iRNF<2ycY>uBuw;xDezSrRbVy|Xk}C84wI;N+&x z@}sjgepG~5x&NpPb4vJ8bOQ4tc$zX^t1j(rpB*^#6tI9 zjrXe)eluZ+E^{#K(lbgG~=g!vr-I{0R@w=_a>H6>XV&CT9 z9VL4;|8!QI^7!+kCg%E|pAGk#|NIhd$<+MY)mrHB_jhOY^}m0*I-39fC4L^BE(n{h zCJ66O{onYRBmPhPB!YIw&;M@`HZzB}n)L6EpW*$5g_t0W9_R@Xh(ZkFkU%f=MjuF` zFZ!WBq%Z&jF$mHSKn8=+9m3@>1VbT@VHl1PP{2rx!e}T$3Cd7`D#ly5!%qfBus`breG?jK@Zb012dry0~lf!j9?5C%!VoEU@qpt4D+!73t^5$Sd1mG zfF+i~3d^t@E3gu)uo`Qy7S^zVE$m37+B^ zl8}rPq#_OJ$Ur8t@EkAj64`i#*T_LG-rz0XArJW|Kp~3o9v|=##rTBJ_<|CYq73Dz zKqabBjT+RV4)th2Bbv~R7JS7wd`BzV(2fpt;s<`>7rO8pfAAMVqU?Y4Ku?H36k-sE z1bU%2`alwW(GUG0g#j3dL6C+3G8hb5$YBVELLS2~93!BBkr;*1P=pecp#oKm!B~ug z8q}cyO=w{}CSW48p@T`73|&mYR7`^®+XLLUYH%W7$%qvQ_R6!%!3)`V*wVz z9E-3ROJD&@EQJ-8VL4V{C01cI)?h8HVFO#(!5$89gcF?Mf^}FASGd6)9`M8lY{Vvb z!5cpCg&#I!3$|h#wqpl&!XLY^8+#CdKm=hgg0T<#aR3K#2#0Y5M{x|taRMPYiBOzE z7*691&f*--;{q-s9G4J*%ZNl2q7j2wT)|abLmc9ffa|z{o4AGBNW>l7#Xa1|13bhd zJjN3|#WN%!87W9b8q$%0Ol09XUf?CN@d~ezgIv79Tf9Ra@=<_76yZHS;3JCh37_!= zB`8H1%29zzRG}I*s6`#>(SSxYp&2dsif{OiRMfAPx!iLT~heB>JKs`a=o>Fc5&n1ZR820cv249tW+3}A>^FoH2mFdL?rgSnUo zGt9>VEQC20VKJ7#0+v_`D=foutiVdF!fLF+T3Evdwy=Xe9N-8iIKu_&upX{(gF8In zi4E9@P4I#@eBcW|Y{nLB#Wrlm4(x?rv zLU0nHIE65r#u=Q&Ih@A@Ttql7Ap(~Xi6}%P2C=w;tGI?Z#3KRMaRWDT3%8MoJGhH` zxQ_>Th(~ygCwPiyNJ27Fkcu>NJBa@kclij#|ykfHeTU1av%_qQ`6yJZj+s} zgPV<@2M->VddLd<3CVG8T$_mfPr{}xiTnAg2R_y}c*?q#HzC@(8%uz2x8tEvBf4cRzh z&F(bO$$+$|J0s0AFz_`nbTTk8FmN)M_0N!jp`n4n{O-Y71_pDwOWOZe(lYMuvh2>N zQ95-WZ`!>MX;Js&f0Oa7e`Xr~)BjKPzx@=3+TC^5=W{sT_uuvZ8DH{GK3zyw!P92r u25$#1bvsW_b(@VFJ$w~>95!xp@o-ldud6X$LrYFxp5N`p^I~(}BKr?qi98(u diff --git a/tests/testthat/data/mixed2.parquet b/tests/testthat/data/mixed2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f7356833c9c369da4e072ee2bf4fba0f82935030 GIT binary patch literal 6938 zcmeI%d0b6-AHeZ@E4iVOa&xajwuDd-**iDwQOZ&ZEmGM^p{%W1W#5veMcNQT8H^Mv z`!aUMHVeiW+st5Q%=0>36^3RmSY80Vii_n4b~z9>yU}{*no}Lge+tu2f5gcJZwQe z3Q&k56k{v4VLM8&13R$`rPz%<*o!ihV;?@pejLC-e1Su#z+qIP3P*4h$54$L)Z#c! z;3Q7rG|u2G&fz>R;7eS@C0xc8T*Wn9#|_-XSGa}SxP!a6hp+JszQuRAj|X^&@9_vf z;4yy06FkK;JjV2qs}NreG?jArxVlj&MX^24-RwA`ykzn1i{P zhxu55Xv82Eafn9(79tTzNJa`$k%mP`$6_pjTp~}_l|OE1Dv!~T>&P|bnuZDuaX_IV zW|zLiRh30g0XNVTFRYBGg&R!pqAOxMJ-A}cn&p-wx8wbt2j3mYpeg0EC$$0s#4 zQY9;doP-oT6ZZno!#NAnTA2CQtQB$-(+#Y`?_W8bo3x~jb(~D8+MK+sy~A=7ugc9S zD>}L5IAo~uQdf2L+!cJaGB0gS58ujUrOlQ_89jr~7kE`|Nzd#PcDE+OCV%mU{!!2F zU#-etvT1OPnpVpp1xvGSl65WvH?c4Qrnm+5u!so9ZTm88?&X}!bFoTF)qo|A6W?98oRVq}wQ=(uZh?Fti* zBA?n_c_-GG1sq@RSi0p@rd7nl8?~kRXEs^KH)`dyyWm`|!-_6rkMAzLknfi3w83dl z(ZyoV(n&Xu?~e(>{}o=WNYN~x^D*_=U%>i=e% z{EytK|L0_XmnJ0O2RTCXy)-dW)u+jNP5Bm0Q+Y*$51uAwe|egSEn>0yi*%`$*Qe*g z-{-j?mKhfEm4B8QG0M763{q{GL=EcDKm%x^A!N`(BQ!=6XyY?9g$|lQ7kX$8IrPy2 z3MkPM1~5b`v_=~kp)J~>JvyKxjL``u=!`Dt3R84Lcl3Z6%wd6^=mkskMju$AFZ!WB z24EltVKA&AKm{8Nfi3J{4+l8H3C?iAPz-}B+%O#O@W2R+#3+o0C%iBQ-tfU#jKg^N z!Vmrkzyt(hB7zW%5KO{kOuj>TAlrC5gLSb>#Th1FPtwaCCaWMVxwU?Vmm3)#p)E;b_%Tab?e z6ru>l*otk~juPy^PV7P{c4H6rq73EOhtIJe2XGKy;1DWs7?r5P5gf%aRHFv9IF1uI ziBmX@GdPQLIFAeX5*Kj^mvIGGaShjT12^#%Zs9iW;4bdrYkY%m@g45t0UqLeJi-rn zj34m?Pw@=T@d7{LXZ(Vfc!k&a6~EzkyulyZvZt>!q*ASBpWgo54jT-CE$m`V}3OhDuXo ziJ@FgX(`dvQ8Fr5qg(&6t^R3qy`ZTiBYsqd_tx^7qO6a;ZsWhw4Oxp8QuxX0-TMn;O~B^<=-Acl)4dmC@(tLy)8zFy)n zTp!mNB&Z_mudnX+JnCKtN)$$6e&N$+2F~af5Ej Date: Fri, 7 Feb 2025 12:17:41 +0100 Subject: [PATCH 03/18] Simplify dict-step recording I'll also rename dict-step, because it is not always a dictionary. --- src/RParquetReader.cpp | 82 +++++++++++------------------------------- src/RParquetReader.h | 9 ----- 2 files changed, 21 insertions(+), 70 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 27958d0..491ef47 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -93,6 +93,7 @@ void RParquetReader::init(RParquetFilter &filter) { tmpdata.resize(metadata.num_cols_to_read); dicts.resize(metadata.num_cols_to_read); + dict_steps.resize(metadata.num_cols_to_read); byte_arrays.resize(metadata.num_cols_to_read); present.resize(metadata.num_cols_to_read); @@ -407,6 +408,15 @@ rtype::rtype(parquet::SchemaElement &sel) { void RParquetReader::alloc_column_chunk(ColumnChunk &cc) { uint32_t cl = colmap[cc.column] - 1; + uint32_t rg = cc.row_group; + if (dict_steps[cl].size() == 0) { + // first row group of this column + dict_steps[cl].resize(metadata.num_row_groups); + } + dict_steps[cl][rg].push_back( + { metadata.row_group_offsets[rg], 0, 0, cc.has_dictionary } + ); + if (metadata.r_types[cl].byte_array) { if (byte_arrays[cl].size() == 0) { byte_arrays[cl].resize(metadata.num_row_groups); @@ -471,8 +481,18 @@ void RParquetReader::alloc_data_page(DataPage &data) { // A non-dict-index page in a column chunk that has a // dictionary page. Should be rare, but arrow does write // these: https://github.com/r-lib/nanoparquet/issues/110 + std::vector &dss = dict_steps[cl][rg]; + dict_step &last = dss.back(); if (has_dict && !is_index) { - notdicts.push_back({ cl, rg, page_off, data.num_values, data.num_present }); + dss.push_back({ page_off, data.num_values, data.num_present, false }); + } else { + // do we need to add a new dict step? + if (last.dict) { + last.num_values += data.num_values; + last.num_present += data.num_present; + } else { + dss.push_back({ page_off, data.num_values, data.num_present, is_index }); + } } if (is_index) { @@ -2044,67 +2064,7 @@ void convert_columns_to_r_(postprocess *pp) { } } -void RParquetReader::calculate_dict_steps() { - if (notdicts.size() == 0) { - calculate_dict_steps_simple(); - } else { - calculate_dict_steps_bad(); - } -} - -void RParquetReader::calculate_dict_steps_simple() { - dict_steps.resize(metadata.num_cols_to_read); - for (uint32_t cl = 0; cl < metadata.num_cols_to_read; cl++) { - dict_steps[cl].resize(metadata.num_row_groups); - bool dict0 = dicts[cl].size() != 0; - for (uint32_t rg = 0; rg < metadata.num_row_groups; rg++) { - int64_t rgo = metadata.row_group_offsets[rg]; - int64_t num_values = metadata.row_group_num_rows[rg]; - uint32_t num_present = present[cl].size() == 0 ? - num_values : present[cl][rg].num_present; - bool dict = dict0 && dicts[cl][rg].dict_len > 0; - dict_step ds = { rgo, num_values, num_present, dict }; - dict_steps[cl][rg].push_back(ds); - } - } -} - -void RParquetReader::calculate_dict_steps_bad() { - // start with assuming no bad pages (bad = not dict encoded in dict col) - calculate_dict_steps_simple(); - // now post-process this - for (page_range &bad: notdicts) { - int64_t bad_end = bad.start + bad.num_values; - std::vector &dss = dict_steps[bad.column][bad.row_group]; - // find the dict step it applies to - for (auto ds = dss.begin(); ds != dss.end(); ++ds) { - int64_t ds_end = ds->start + ds->num_values; - if (bad.start >= ds->start && bad.start < ds_end) { - if (bad_end > ds_end) { - Rf_error("Internal error, impossible mix of dict and non-dict pages"); - } - if (bad_end == ds_end) { - ds->num_values -= bad.num_values; - ds->num_present -= bad.num_present; - dss.insert(++ds, { bad.start, bad.num_values, bad.num_present, false }); - } else { - int64_t num_miss = ds->num_values - ds->num_present; - dict_step newsteps[2] = { - { bad.start, bad_end - bad.start, bad_end - bad.start, false }, - { bad_end, ds_end - bad_end, ds_end - bad_end - num_miss, ds->dict } - }; - ds->num_values = bad.start - ds->start; - ds->num_present = bad.start - ds->start; - dss.insert(++ds, newsteps, newsteps + 2); - } - break; - } - } - } -} - void RParquetReader::convert_columns_to_r() { - calculate_dict_steps(); postprocess pp = { columns, facdicts, diff --git a/src/RParquetReader.h b/src/RParquetReader.h index eeecf6d..36fac78 100644 --- a/src/RParquetReader.h +++ b/src/RParquetReader.h @@ -96,14 +96,6 @@ struct dict_step { bool dict; }; -struct page_range { - uint32_t column; - uint32_t row_group; - int64_t start; - int64_t num_values; - int64_t num_present; -}; - class RParquetReader : public ParquetReader { public: RParquetReader(std::string filename, bool readwrite = false); @@ -131,7 +123,6 @@ class RParquetReader : public ParquetReader { std::vector> tmpdata; std::vector> dicts; - std::vector notdicts; std::vector>> dict_steps; std::vector>> byte_arrays; std::vector> present; From 0a91e5bc9c143658941316b28f5662fdbe3eba88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 7 Feb 2025 12:21:06 +0100 Subject: [PATCH 04/18] Rename dict_step to chunk_part --- src/RParquetReader.cpp | 64 +++++++++++++++++++++--------------------- src/RParquetReader.h | 4 +-- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 491ef47..4fa7445 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -93,7 +93,7 @@ void RParquetReader::init(RParquetFilter &filter) { tmpdata.resize(metadata.num_cols_to_read); dicts.resize(metadata.num_cols_to_read); - dict_steps.resize(metadata.num_cols_to_read); + chunk_parts.resize(metadata.num_cols_to_read); byte_arrays.resize(metadata.num_cols_to_read); present.resize(metadata.num_cols_to_read); @@ -409,11 +409,11 @@ rtype::rtype(parquet::SchemaElement &sel) { void RParquetReader::alloc_column_chunk(ColumnChunk &cc) { uint32_t cl = colmap[cc.column] - 1; uint32_t rg = cc.row_group; - if (dict_steps[cl].size() == 0) { + if (chunk_parts[cl].size() == 0) { // first row group of this column - dict_steps[cl].resize(metadata.num_row_groups); + chunk_parts[cl].resize(metadata.num_row_groups); } - dict_steps[cl][rg].push_back( + chunk_parts[cl][rg].push_back( { metadata.row_group_offsets[rg], 0, 0, cc.has_dictionary } ); @@ -481,17 +481,17 @@ void RParquetReader::alloc_data_page(DataPage &data) { // A non-dict-index page in a column chunk that has a // dictionary page. Should be rare, but arrow does write // these: https://github.com/r-lib/nanoparquet/issues/110 - std::vector &dss = dict_steps[cl][rg]; - dict_step &last = dss.back(); + std::vector &cps = chunk_parts[cl][rg]; + chunk_part &last = cps.back(); if (has_dict && !is_index) { - dss.push_back({ page_off, data.num_values, data.num_present, false }); + cps.push_back({ page_off, data.num_values, data.num_present, false }); } else { // do we need to add a new dict step? if (last.dict) { last.num_values += data.num_values; last.num_present += data.num_present; } else { - dss.push_back({ page_off, data.num_values, data.num_present, is_index }); + cps.push_back({ page_off, data.num_values, data.num_present, is_index }); } } @@ -531,7 +531,7 @@ struct postprocess { rmetadata &metadata; std::vector> &tmpdata; std::vector> &dicts; - std::vector>> &dict_steps; + std::vector>> &chunk_parts; std::vector>> &byte_arrays; std::vector> &present; }; @@ -540,11 +540,11 @@ void convert_column_to_r_dicts(postprocess *pp, uint32_t cl) { if (pp->dicts[cl].size() == 0) return; for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { if (pp->dicts[cl][rg].dict_len == 0) continue; - std::vector &dss = pp->dict_steps[cl][rg]; - for (uint32_t dsi = 0; dsi < dss.size(); dsi++) { - if (!dss[dsi].dict) continue; - int64_t from = dss[dsi].start; - int64_t num_values = dss[dsi].num_values; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + if (!cps[cpi].dict) continue; + int64_t from = cps[cpi].start; + int64_t num_values = cps[cpi].num_values; SEXP x = VECTOR_ELT(pp->columns, cl); switch (TYPEOF(x)) { case INTSXP: { @@ -584,13 +584,13 @@ void convert_column_to_r_dicts(postprocess *pp, uint32_t cl) { void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - std::vector &dss = pp->dict_steps[cl][rg]; - for (uint32_t dsi = 0; dsi < dss.size(); dsi++) { - int64_t from = dss[dsi].start; - uint32_t num_values = dss[dsi].num_values; - int64_t num_present = dss[dsi].num_present; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t from = cps[cpi].start; + uint32_t num_values = cps[cpi].num_values; + int64_t num_present = cps[cpi].num_present; bool hasmiss = num_present != num_values; - bool hasdict = dss[dsi].dict; + bool hasdict = cps[cpi].dict; if (!hasdict && !hasmiss) { continue; } else if (!hasdict && hasmiss) { @@ -775,11 +775,11 @@ void convert_column_to_r_int64_nodict_nomiss(postprocess *pp, uint32_t cl) { void convert_column_to_r_int64_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - std::vector &dss = pp->dict_steps[cl][rg]; - for (uint32_t dsi = 0; dsi < dss.size(); dsi++) { - int64_t from = dss[dsi].start; - uint32_t num_values = dss[dsi].num_values; - bool hasdict = dss[dsi].dict; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t from = cps[cpi].start; + uint32_t num_values = cps[cpi].num_values; + bool hasdict = cps[cpi].dict; double *beg = REAL(x) + from; double *end = beg + num_values; if (!hasdict) { @@ -841,13 +841,13 @@ void convert_column_to_r_int64_nodict_miss(postprocess *pp, uint32_t cl) { void convert_column_to_r_int64_dict_miss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - std::vector &dss = pp->dict_steps[cl][rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; - for (uint32_t dsi = 0; dsi < dss.size(); dsi++) { - int64_t from = dss[dsi].start; - uint32_t num_values = dss[dsi].num_values; - uint32_t num_present = dss[dsi].num_present; - bool hasdict = dss[dsi].dict; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t from = cps[cpi].start; + uint32_t num_values = cps[cpi].num_values; + uint32_t num_present = cps[cpi].num_present; + bool hasdict = cps[cpi].dict; bool hasmiss = num_present != num_values; double *beg = REAL(x) + from; // In theory this happen @@ -2072,7 +2072,7 @@ void RParquetReader::convert_columns_to_r() { metadata, tmpdata, dicts, - dict_steps, + chunk_parts, byte_arrays, present }; diff --git a/src/RParquetReader.h b/src/RParquetReader.h index 36fac78..5220fc4 100644 --- a/src/RParquetReader.h +++ b/src/RParquetReader.h @@ -89,7 +89,7 @@ class RParquetFilter { std::vector columns; }; -struct dict_step { +struct chunk_part { int64_t start; int64_t num_values; int64_t num_present; @@ -123,7 +123,7 @@ class RParquetReader : public ParquetReader { std::vector> tmpdata; std::vector> dicts; - std::vector>> dict_steps; + std::vector>> chunk_parts; std::vector>> byte_arrays; std::vector> present; rmetadata metadata; From 9130c450db16205083978739be4393d320d219bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 7 Feb 2025 12:42:06 +0100 Subject: [PATCH 05/18] Fix chunk part recording --- src/RParquetReader.cpp | 32 +++++++++++++++++--------------- src/lib/ParquetReader.h | 2 +- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 4fa7445..da84c34 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -462,17 +462,6 @@ void RParquetReader::alloc_data_page(DataPage &data) { auto rg = data.cc.row_group; rtype rt = metadata.r_types[cl]; - // If there are missing values, then the page offset is defined by - // the number of present values. I.e. within each column chunk we put the - // present values at the beginning of the memory allocated for that - // column chunk. - uint32_t page_off = data.from; - if (data.cc.optional) { - page_off = present[cl][rg].num_present; - present[cl][rg].num_present += data.num_present; - data.present = present[cl][rg].map.data() + data.from; - } - bool has_dict = data.cc.has_dictionary; bool is_index = has_dict && (data.encoding == parquet::Encoding::RLE_DICTIONARY || @@ -481,20 +470,33 @@ void RParquetReader::alloc_data_page(DataPage &data) { // A non-dict-index page in a column chunk that has a // dictionary page. Should be rare, but arrow does write // these: https://github.com/r-lib/nanoparquet/issues/110 + uint32_t page_off = data.from; std::vector &cps = chunk_parts[cl][rg]; chunk_part &last = cps.back(); if (has_dict && !is_index) { - cps.push_back({ page_off, data.num_values, data.num_present, false }); + cps.push_back({ data.from, data.num_values, data.num_present, false }); } else { - // do we need to add a new dict step? - if (last.dict) { + // do we need to add a new dict step? not if no dicts or last is dict + if (!has_dict || last.dict) { + if (data.cc.optional) { + // If there are missing values, then the page offset is defined by + // the number of present values. I.e. within each column chunk we put the + // present values at the beginning of the memory allocated for that + // column chunk. + page_off = present[cl][rg].num_present; + } last.num_values += data.num_values; last.num_present += data.num_present; } else { - cps.push_back({ page_off, data.num_values, data.num_present, is_index }); + cps.push_back({ data.from, data.num_values, data.num_present, is_index }); } } + if (data.cc.optional) { + present[cl][rg].num_present += data.num_present; + data.present = present[cl][rg].map.data() + data.from; + } + if (is_index) { data.data = (uint8_t*) (dicts[cl][rg].indices.data() + page_off); diff --git a/src/lib/ParquetReader.h b/src/lib/ParquetReader.h index 35e8eb9..dad1f4a 100644 --- a/src/lib/ParquetReader.h +++ b/src/lib/ParquetReader.h @@ -98,7 +98,7 @@ struct DataPage { uint8_t *present; uint32_t num_values; uint32_t num_present; - uint64_t from; + int64_t from; parquet::Encoding::type encoding; StringSet strs; // these are for DELTA_BYTE_ARRAY pages, these need a bit more From 8ca10b3f59ff49e5d4d1c8a041dc71e53720d097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 7 Feb 2025 13:03:36 +0100 Subject: [PATCH 06/18] More mixed dict column chunks support --- src/RParquetReader.cpp | 340 ++++++++++++++++++++++------------------- 1 file changed, 180 insertions(+), 160 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index da84c34..7edb3ab 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -878,6 +878,7 @@ void convert_column_to_r_int64_dict_miss(postprocess *pp, uint32_t cl) { } else { // convert dict values first, if not yet done if (!rg_dict_converted) { + rg_dict_converted = true; double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); double *dend = dbeg + pp->dicts[cl][rg].dict_len; int64_t *idbeg = (int64_t *)dbeg; @@ -947,34 +948,40 @@ void convert_column_to_r_float_nodict_nomiss(postprocess *pp, uint32_t cl) { void convert_column_to_r_float_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - int64_t off = pp->metadata.row_group_offsets[rg]; - double *beg = REAL(x) + off; - // In theory we might dictionary encode a subset of the columns only - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { - double *end = beg + num_values - 1; - float *fend = ((float*) beg) + num_values - 1; - while (beg <= end) { - *end-- = static_cast(*fend--); - } - } else { - // Convert the dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + dict_len - 1; - float *fdend = ((float*) dbeg) + dict_len - 1; - while (dbeg <= dend) { - *dend-- = static_cast(*fdend--); - } + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t from = cps[cpi].start; + uint32_t num_values = cps[cpi].num_values; + bool hasdict = cps[cpi].dict; + double *beg = REAL(x) + from; + // In theory we might dictionary encode a subset of the columns only + if (!hasdict) { + double *end = beg + num_values - 1; + float *fend = ((float*) beg) + num_values - 1; + while (beg <= end) { + *end-- = static_cast(*fend--); + } + } else { + // Convert the dictionary first + if (!rg_dict_converted) { + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + rg_dict_converted = true; + double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); + double *dend = dbeg + dict_len - 1; + float *fdend = ((float*) dbeg) + dict_len - 1; + while (dbeg <= dend) { + *dend-- = static_cast(*fdend--); + } + } - // fill in the dict - double *end = beg + num_values; - double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*didx++]; + // fill in the dict + double *end = beg + num_values; + double *dict = (double*) pp->dicts[cl][rg].buffer.data(); + uint32_t *didx = pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*didx++]; + } } } } @@ -1014,66 +1021,70 @@ void convert_column_to_r_float_nodict_miss(postprocess *pp, uint32_t cl) { void convert_column_to_r_float_dict_miss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - double *beg = REAL(x) + pp->metadata.row_group_offsets[rg]; - // In theory this happen - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { - uint32_t num_present = pp->present[cl][rg].num_present; + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t from = cps[cpi].start; + uint32_t num_values = cps[cpi].num_values; + uint32_t num_present = cps[cpi].num_present; + bool hasdict = cps[cpi].dict; bool hasmiss = num_present != num_values; - if (!hasmiss) { - double *endm1 = beg + num_values - 1; - float *fendm1 = ((float*) beg) + num_values - 1; - while (beg <= endm1) { - *endm1-- = static_cast(*fendm1--); - } - } else { - // nodict, miss - double *endm1 = beg + num_values - 1; - float *fendm1 = ((float*) beg) + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { + double *beg = REAL(x) + from; + if (!hasdict) { + if (!hasmiss) { + double *endm1 = beg + num_values - 1; + float *fendm1 = ((float*) beg) + num_values - 1; + while (beg <= endm1) { *endm1-- = static_cast(*fendm1--); - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; + } + } else { + // nodict, miss + double *endm1 = beg + num_values - 1; + float *fendm1 = ((float*) beg) + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (beg <= endm1) { + if (*presm1) { + *endm1-- = static_cast(*fendm1--); + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + } } } - } - } else { - // convert dict values first - double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - double *dendm1 = dbeg + dict_len - 1; - float *fdendm1 = ((float*) dbeg) + dict_len - 1; - while (dbeg <= dendm1) { - *dendm1-- = static_cast(*fdendm1--); - } - // fill in values - double *dict = (double *)pp->dicts[cl][rg].buffer.data(); - uint32_t num_present = pp->present[cl][rg].num_present; - bool hasmiss = num_present != num_values; - if (!hasmiss) { - double *end = beg + num_values; - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*didx++]; - } } else { - double *endm1 = beg + num_values - 1; - uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { - *endm1-- = dict[*dendm1--]; - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; + // convert dict values first + if (!rg_dict_converted) { + rg_dict_converted = true; + double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + double *dendm1 = dbeg + dict_len - 1; + float *fdendm1 = ((float*) dbeg) + dict_len - 1; + while (dbeg <= dendm1) { + *dendm1-- = static_cast(*fdendm1--); + } + } + // fill in values + double *dict = (double *)pp->dicts[cl][rg].buffer.data(); + if (!hasmiss) { + double *end = beg + num_values; + uint32_t *didx = pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*didx++]; + } + } else { + double *endm1 = beg + num_values - 1; + uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (beg <= endm1) { + if (*presm1) { + *endm1-- = dict[*dendm1--]; + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + } } } } @@ -1111,31 +1122,36 @@ void convert_column_to_r_int96_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); int96_t *src0 = (int96_t*) pp->tmpdata[cl].data(); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - int64_t from = pp->metadata.row_group_offsets[rg]; - // in theory some row groups might be dict encoded, some not - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - double *beg = REAL(x) + from; - double *end = beg + num_values; - if (!hasdict) { - int96_t *src = src0 + from; - while (beg < end) { - *beg++ = impala_timestamp_to_milliseconds(*src++); - } - } else { - // convert dict values in place - double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - double *dend = dbeg + dict_len; - int96_t *idbeg = (int96_t*) dbeg; - while (dbeg < dend) { - *dbeg++ = impala_timestamp_to_milliseconds(*idbeg++); - } - double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*didx++]; + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t from = cps[cpi].start; + uint32_t num_values = cps[cpi].num_values; + bool hasdict = cps[cpi].dict; + double *beg = REAL(x) + from; + double *end = beg + num_values; + if (!hasdict) { + int96_t *src = src0 + from; + while (beg < end) { + *beg++ = impala_timestamp_to_milliseconds(*src++); + } + } else { + // convert dict values in place + if (!rg_dict_converted) { + rg_dict_converted = true; + double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + double *dend = dbeg + dict_len; + int96_t *idbeg = (int96_t*) dbeg; + while (dbeg < dend) { + *dbeg++ = impala_timestamp_to_milliseconds(*idbeg++); + } + } + double *dict = (double*) pp->dicts[cl][rg].buffer.data(); + uint32_t *didx = pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*didx++]; + } } } } @@ -1181,65 +1197,69 @@ void convert_column_to_r_int96_dict_miss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); int96_t *src0 = (int96_t*) pp->tmpdata[cl].data(); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - int64_t from = pp->metadata.row_group_offsets[rg]; - double *beg = REAL(x) + from; - // In theory this happen - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { - int96_t *ibeg = src0 + from; - uint32_t num_present = pp->present[cl][rg].num_present; + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t from = cps[cpi].start; + uint32_t num_values = cps[cpi].num_values; + uint32_t num_present = cps[cpi].num_present; + bool hasdict = cps[cpi].dict; bool hasmiss = num_present != num_values; - if (!hasmiss) { - double *end = beg + num_values; - while (beg < end) { - *beg++ = impala_timestamp_to_milliseconds(*ibeg++); - } - } else { - double *endm1 = beg + num_values - 1; - int96_t *pendm1 = ibeg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { - *endm1-- = impala_timestamp_to_milliseconds(*pendm1--); - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; + double *beg = REAL(x) + from; + if (!hasdict) { + int96_t *ibeg = src0 + from; + uint32_t num_present = pp->present[cl][rg].num_present; + bool hasmiss = num_present != num_values; + if (!hasmiss) { + double *end = beg + num_values; + while (beg < end) { + *beg++ = impala_timestamp_to_milliseconds(*ibeg++); + } + } else { + double *endm1 = beg + num_values - 1; + int96_t *pendm1 = ibeg + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (beg <= endm1) { + if (*presm1) { + *endm1-- = impala_timestamp_to_milliseconds(*pendm1--); + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + } } } - } - } else { - // convert dict values first - double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + pp->dicts[cl][rg].dict_len; - int96_t *idbeg = (int96_t *) dbeg; - while (dbeg < dend) { - *dbeg++ = impala_timestamp_to_milliseconds(*idbeg++); - } - double *dict = (double *)pp->dicts[cl][rg].buffer.data(); - - uint32_t num_present = pp->present[cl][rg].num_present; - bool hasmiss = num_present != num_values; - if (!hasmiss) { - double *end = beg + num_values; - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*didx++]; - } } else { - double *endm1 = beg + num_values - 1; - uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { - *endm1-- = dict[*dendm1--]; - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; + // convert dict values first + if (!rg_dict_converted) { + rg_dict_converted = true; + double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); + double *dend = dbeg + pp->dicts[cl][rg].dict_len; + int96_t *idbeg = (int96_t *) dbeg; + while (dbeg < dend) { + *dbeg++ = impala_timestamp_to_milliseconds(*idbeg++); + } + } + double *dict = (double *)pp->dicts[cl][rg].buffer.data(); + if (!hasmiss) { + double *end = beg + num_values; + uint32_t *didx = pp->dicts[cl][rg].indices.data(); + while (beg < end) { + *beg++ = dict[*didx++]; + } + } else { + double *endm1 = beg + num_values - 1; + uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + while (beg <= endm1) { + if (*presm1) { + *endm1-- = dict[*dendm1--]; + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + } } } } From d8f6a4d7fe776ccd70bd655aa798ea0f9e105d8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 7 Feb 2025 16:45:07 +0100 Subject: [PATCH 07/18] Rewrite chunk parts in reader So it actually works. --- src/RParquetReader.cpp | 299 +++++++++--------- src/RParquetReader.h | 2 +- .../testthat/test-write-parquet-statistics.R | 2 +- 3 files changed, 153 insertions(+), 150 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 7edb3ab..836157a 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -408,14 +408,10 @@ rtype::rtype(parquet::SchemaElement &sel) { void RParquetReader::alloc_column_chunk(ColumnChunk &cc) { uint32_t cl = colmap[cc.column] - 1; - uint32_t rg = cc.row_group; if (chunk_parts[cl].size() == 0) { // first row group of this column chunk_parts[cl].resize(metadata.num_row_groups); } - chunk_parts[cl][rg].push_back( - { metadata.row_group_offsets[rg], 0, 0, cc.has_dictionary } - ); if (metadata.r_types[cl].byte_array) { if (byte_arrays[cl].size() == 0) { @@ -472,23 +468,20 @@ void RParquetReader::alloc_data_page(DataPage &data) { // these: https://github.com/r-lib/nanoparquet/issues/110 uint32_t page_off = data.from; std::vector &cps = chunk_parts[cl][rg]; - chunk_part &last = cps.back(); - if (has_dict && !is_index) { - cps.push_back({ data.from, data.num_values, data.num_present, false }); + if (cps.size() == 0) { + cps.push_back({ data.from, data.num_values, data.num_present, is_index }); } else { - // do we need to add a new dict step? not if no dicts or last is dict - if (!has_dict || last.dict) { + chunk_part &last = cps.back(); + if (is_index == last.dict) { + // same as last, extend chunk part if (data.cc.optional) { - // If there are missing values, then the page offset is defined by - // the number of present values. I.e. within each column chunk we put the - // present values at the beginning of the memory allocated for that - // column chunk. - page_off = present[cl][rg].num_present; + page_off = last.offset + last.num_present; } last.num_values += data.num_values; last.num_present += data.num_present; } else { - cps.push_back({ data.from, data.num_values, data.num_present, is_index }); + // new chunk part + cps.push_back({ data.from, data.num_values, data.num_present, is_index}); } } @@ -543,37 +536,38 @@ void convert_column_to_r_dicts(postprocess *pp, uint32_t cl) { for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { if (pp->dicts[cl][rg].dict_len == 0) continue; std::vector &cps = pp->chunk_parts[cl][rg]; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { if (!cps[cpi].dict) continue; - int64_t from = cps[cpi].start; - int64_t num_values = cps[cpi].num_values; + int64_t cp_offset = cps[cpi].offset; + int64_t cp_num_values = cps[cpi].num_values; SEXP x = VECTOR_ELT(pp->columns, cl); switch (TYPEOF(x)) { case INTSXP: { - int *beg = INTEGER(x) + from; - int *end = beg + num_values; + int *beg = INTEGER(x) + rg_offset + cp_offset; + int *end = beg + cp_num_values; int *dict = (int*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*idx++]; } break; } case REALSXP: { - double *beg = REAL(x) + from; - double *end = beg + num_values; + double *beg = REAL(x) + rg_offset + cp_offset; + double *end = beg + cp_num_values; double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*idx++]; } break; } case LGLSXP: { // # nocov start - int *beg = LOGICAL(x) + from; - int *end = beg + num_values; + int *beg = LOGICAL(x) + rg_offset + cp_offset; + int *end = beg + cp_num_values; int *dict = (int*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*idx++]; } @@ -585,26 +579,27 @@ void convert_column_to_r_dicts(postprocess *pp, uint32_t cl) { } void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { + SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { std::vector &cps = pp->chunk_parts[cl][rg]; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t from = cps[cpi].start; - uint32_t num_values = cps[cpi].num_values; - int64_t num_present = cps[cpi].num_present; - bool hasmiss = num_present != num_values; + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; + int64_t cp_num_present = cps[cpi].num_present; + bool hasmiss = cp_num_present != cp_num_values; bool hasdict = cps[cpi].dict; if (!hasdict && !hasmiss) { continue; } else if (!hasdict && hasmiss) { // missing values in place - SEXP x = VECTOR_ELT(pp->columns, cl); switch (TYPEOF(x)) { case INTSXP: { - int *beg = INTEGER(x) + from; - int *endm1 = beg + num_values - 1; - int *pendm1 = beg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - uint32_t num_miss = num_values - num_present; + int *beg = INTEGER(x) + rg_offset + cp_offset; + int *endm1 = beg + cp_num_values - 1; + int *pendm1 = beg + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; + uint32_t num_miss = cp_num_values - cp_num_present; while (num_miss > 0) { if (*presm1 != 0) { *endm1-- = *pendm1--; @@ -618,11 +613,11 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { break; } case REALSXP: { - double *beg = REAL(x) + from; - double *endm1 = beg + num_values - 1; - double *pendm1 = beg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - uint32_t num_miss = num_values - num_present; + double *beg = REAL(x) + rg_offset + cp_offset; + double *endm1 = beg + cp_num_values - 1; + double *pendm1 = beg + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; + uint32_t num_miss = cp_num_values - cp_num_present; while (num_miss > 0) { if (*presm1) { *endm1-- = *pendm1--; @@ -636,11 +631,11 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { break; } case LGLSXP: { - int *beg = LOGICAL(x) + from; - int *endm1 = beg + num_values - 1; - int *pendm1 = beg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - uint32_t num_miss = num_values - num_present; + int *beg = LOGICAL(x) + rg_offset + cp_offset; + int *endm1 = beg + cp_num_values - 1; + int *pendm1 = beg + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; + uint32_t num_miss = cp_num_values - cp_num_present; while (num_miss > 0) { if (*presm1) { *endm1-- = *pendm1--; @@ -661,20 +656,20 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); switch (TYPEOF(x)) { case INTSXP: { - int *beg = INTEGER(x) + from; - int *end = beg + num_values; + int *beg = INTEGER(x) + rg_offset + cp_offset; + int *end = beg + cp_num_values; int *dict = (int*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*idx++]; } break; } case REALSXP: { - double *beg = REAL(x) + from; - double *end = beg + num_values; + double *beg = REAL(x) + rg_offset + cp_offset; + double *end = beg + cp_num_values; double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*idx++]; } @@ -682,10 +677,10 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { } case LGLSXP: { // # nocov start // BOOLEAN dictionaries are not really possible... - int *beg = LOGICAL(x) + from; - int *end = beg + num_values; + int *beg = LOGICAL(x) + rg_offset + cp_offset; + int *end = beg + cp_num_values; int *dict = (int*) pp->dicts[cl][rg].buffer.data(); - uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data(); + uint32_t *idx = (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*idx++]; } @@ -696,16 +691,14 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { } } else if (hasdict && hasmiss) { // dict + missing values - int64_t from = pp->metadata.row_group_offsets[rg]; - SEXP x = VECTOR_ELT(pp->columns, cl); switch (TYPEOF(x)) { case INTSXP: { - int *beg = INTEGER(x) + from; - int *endm1 = beg + num_values - 1; + int *beg = INTEGER(x) + rg_offset + cp_offset; + int *endm1 = beg + cp_num_values - 1; int *dict = (int*) pp->dicts[cl][rg].buffer.data(); uint32_t *idxm1 = - (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (endm1 >= beg) { if (*presm1) { *endm1-- = dict[*idxm1--]; @@ -718,12 +711,12 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { break; } case REALSXP: { - double *beg = REAL(x) + from; - double *endm1 = beg + num_values - 1; + double *beg = REAL(x) + rg_offset + cp_offset; + double *endm1 = beg + cp_num_values - 1; double *dict = (double*) pp->dicts[cl][rg].buffer.data(); uint32_t *idxm1 = - (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (endm1 >= beg) { if (*presm1) { *endm1-- = dict[*idxm1--]; @@ -737,12 +730,12 @@ void convert_column_to_r_dicts_na(postprocess *pp, uint32_t cl) { } case LGLSXP: { // BOOLEAN dictionaries are not really possible... // # nocov start - int *beg = LOGICAL(x) + from; - int *endm1 = beg + num_values - 1; + int *beg = LOGICAL(x) + rg_offset + cp_offset; + int *endm1 = beg + cp_num_values - 1; int *dict = (int*) pp->dicts[cl][rg].buffer.data(); uint32_t *idxm1 = - (uint32_t*) pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + (uint32_t*) pp->dicts[cl][rg].indices.data() + cp_offset + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (endm1 >= beg) { if (*presm1) { *endm1-- = dict[*idxm1--]; @@ -778,12 +771,14 @@ void convert_column_to_r_int64_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t from = cps[cpi].start; - uint32_t num_values = cps[cpi].num_values; + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; bool hasdict = cps[cpi].dict; - double *beg = REAL(x) + from; - double *end = beg + num_values; + double *beg = REAL(x) + rg_offset + cp_offset; + double *end = beg + cp_num_values; if (!hasdict) { int64_t *ibeg = (int64_t*) beg; while (beg < end) { @@ -791,14 +786,18 @@ void convert_column_to_r_int64_dict_nomiss(postprocess *pp, uint32_t cl) { } } else { // first convert tbe dict values - double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + pp->dicts[cl][rg].dict_len; - int64_t *idbeg = (int64_t *) dbeg; - while (dbeg < dend) { - *dbeg++ = static_cast(*idbeg++); + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { + rg_dict_converted = true; + double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); + double *dend = dbeg + dict_len; + int64_t *idbeg = (int64_t *) dbeg; + while (dbeg < dend) { + *dbeg++ = static_cast(*idbeg++); + } } double *dict = (double*) pp -> dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data(); + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*didx++]; } @@ -845,25 +844,25 @@ void convert_column_to_r_int64_dict_miss(postprocess *pp, uint32_t cl) { for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t from = cps[cpi].start; - uint32_t num_values = cps[cpi].num_values; - uint32_t num_present = cps[cpi].num_present; + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; + uint32_t cp_num_present = cps[cpi].num_present; bool hasdict = cps[cpi].dict; - bool hasmiss = num_present != num_values; - double *beg = REAL(x) + from; - // In theory this happen + bool hasmiss = cp_num_present != cp_num_values; + double *beg = REAL(x) + rg_offset + cp_offset; if (!hasdict) { int64_t *ibeg = (int64_t *)beg; if (!hasmiss) { - double *end = beg + num_values; + double *end = beg + cp_num_values; while (beg < end) { *beg++ = static_cast(*ibeg++); } } else { - double *endm1 = beg + num_values - 1; - int64_t *pendm1 = ibeg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + double *endm1 = beg + cp_num_values - 1; + int64_t *pendm1 = ibeg + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (beg <= endm1) { if (*presm1) { *endm1-- = static_cast(*pendm1--); @@ -877,10 +876,11 @@ void convert_column_to_r_int64_dict_miss(postprocess *pp, uint32_t cl) { } else { // convert dict values first, if not yet done - if (!rg_dict_converted) { + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { rg_dict_converted = true; double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + pp->dicts[cl][rg].dict_len; + double *dend = dbeg + dict_len; int64_t *idbeg = (int64_t *)dbeg; while (dbeg < dend) { *dbeg++ = static_cast(*idbeg++); @@ -888,15 +888,15 @@ void convert_column_to_r_int64_dict_miss(postprocess *pp, uint32_t cl) { } double *dict = (double *)pp->dicts[cl][rg].buffer.data(); if (!hasmiss) { - double *end = beg + num_values; - uint32_t *didx = pp->dicts[cl][rg].indices.data(); + double *end = beg + cp_num_values; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*didx++]; } } else { - double *endm1 = beg + num_values - 1; - uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + double *endm1 = beg + cp_num_values - 1; + uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + cp_offset + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (beg <= endm1) { if (*presm1) { *endm1-- = dict[*dendm1--]; @@ -950,22 +950,23 @@ void convert_column_to_r_float_dict_nomiss(postprocess *pp, uint32_t cl) { for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t from = cps[cpi].start; - uint32_t num_values = cps[cpi].num_values; + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; bool hasdict = cps[cpi].dict; - double *beg = REAL(x) + from; + double *beg = REAL(x) + rg_offset + cp_offset; // In theory we might dictionary encode a subset of the columns only if (!hasdict) { - double *end = beg + num_values - 1; - float *fend = ((float*) beg) + num_values - 1; + double *end = beg + cp_num_values - 1; + float *fend = ((float*) beg) + cp_num_values - 1; while (beg <= end) { *end-- = static_cast(*fend--); } } else { // Convert the dictionary first - if (!rg_dict_converted) { uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { rg_dict_converted = true; double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); double *dend = dbeg + dict_len - 1; @@ -976,9 +977,9 @@ void convert_column_to_r_float_dict_nomiss(postprocess *pp, uint32_t cl) { } // fill in the dict - double *end = beg + num_values; + double *end = beg + cp_num_values; double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data(); + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*didx++]; } @@ -1023,25 +1024,26 @@ void convert_column_to_r_float_dict_miss(postprocess *pp, uint32_t cl) { for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t from = cps[cpi].start; - uint32_t num_values = cps[cpi].num_values; - uint32_t num_present = cps[cpi].num_present; + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; + uint32_t cp_num_present = cps[cpi].num_present; bool hasdict = cps[cpi].dict; - bool hasmiss = num_present != num_values; - double *beg = REAL(x) + from; + bool hasmiss = cp_num_present != cp_num_values; + double *beg = REAL(x) + rg_offset + cp_offset; if (!hasdict) { if (!hasmiss) { - double *endm1 = beg + num_values - 1; - float *fendm1 = ((float*) beg) + num_values - 1; + double *endm1 = beg + cp_num_values - 1; + float *fendm1 = ((float*) beg) + cp_num_values - 1; while (beg <= endm1) { *endm1-- = static_cast(*fendm1--); } } else { // nodict, miss - double *endm1 = beg + num_values - 1; - float *fendm1 = ((float*) beg) + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + double *endm1 = beg + cp_num_values - 1; + float *fendm1 = ((float*) beg) + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (beg <= endm1) { if (*presm1) { *endm1-- = static_cast(*fendm1--); @@ -1055,10 +1057,10 @@ void convert_column_to_r_float_dict_miss(postprocess *pp, uint32_t cl) { } else { // convert dict values first - if (!rg_dict_converted) { + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { rg_dict_converted = true; double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); - uint32_t dict_len = pp->dicts[cl][rg].dict_len; double *dendm1 = dbeg + dict_len - 1; float *fdendm1 = ((float*) dbeg) + dict_len - 1; while (dbeg <= dendm1) { @@ -1068,15 +1070,15 @@ void convert_column_to_r_float_dict_miss(postprocess *pp, uint32_t cl) { // fill in values double *dict = (double *)pp->dicts[cl][rg].buffer.data(); if (!hasmiss) { - double *end = beg + num_values; - uint32_t *didx = pp->dicts[cl][rg].indices.data(); + double *end = beg + cp_num_values; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*didx++]; } } else { - double *endm1 = beg + num_values - 1; - uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + double *endm1 = beg + cp_num_values - 1; + uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + cp_offset + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (beg <= endm1) { if (*presm1) { *endm1-- = dict[*dendm1--]; @@ -1124,23 +1126,24 @@ void convert_column_to_r_int96_dict_nomiss(postprocess *pp, uint32_t cl) { for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t from = cps[cpi].start; - uint32_t num_values = cps[cpi].num_values; + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; bool hasdict = cps[cpi].dict; - double *beg = REAL(x) + from; - double *end = beg + num_values; + double *beg = REAL(x) + rg_offset + cp_offset; + double *end = beg + cp_num_values; if (!hasdict) { - int96_t *src = src0 + from; + int96_t *src = src0 + rg_offset + cp_offset; while (beg < end) { *beg++ = impala_timestamp_to_milliseconds(*src++); } } else { // convert dict values in place - if (!rg_dict_converted) { + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { rg_dict_converted = true; double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t dict_len = pp->dicts[cl][rg].dict_len; double *dend = dbeg + dict_len; int96_t *idbeg = (int96_t*) dbeg; while (dbeg < dend) { @@ -1148,7 +1151,7 @@ void convert_column_to_r_int96_dict_nomiss(postprocess *pp, uint32_t cl) { } } double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data(); + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*didx++]; } @@ -1199,26 +1202,25 @@ void convert_column_to_r_int96_dict_miss(postprocess *pp, uint32_t cl) { for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t from = cps[cpi].start; - uint32_t num_values = cps[cpi].num_values; - uint32_t num_present = cps[cpi].num_present; + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; + uint32_t cp_num_present = cps[cpi].num_present; bool hasdict = cps[cpi].dict; - bool hasmiss = num_present != num_values; - double *beg = REAL(x) + from; + bool hasmiss = cp_num_present != cp_num_values; + double *beg = REAL(x) + rg_offset + cp_offset; if (!hasdict) { - int96_t *ibeg = src0 + from; - uint32_t num_present = pp->present[cl][rg].num_present; - bool hasmiss = num_present != num_values; + int96_t *ibeg = src0 + rg_offset + cp_offset; if (!hasmiss) { - double *end = beg + num_values; + double *end = beg + cp_num_values; while (beg < end) { *beg++ = impala_timestamp_to_milliseconds(*ibeg++); } } else { - double *endm1 = beg + num_values - 1; - int96_t *pendm1 = ibeg + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + double *endm1 = beg + cp_num_values - 1; + int96_t *pendm1 = ibeg + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (beg <= endm1) { if (*presm1) { *endm1-- = impala_timestamp_to_milliseconds(*pendm1--); @@ -1232,10 +1234,11 @@ void convert_column_to_r_int96_dict_miss(postprocess *pp, uint32_t cl) { } else { // convert dict values first - if (!rg_dict_converted) { + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { rg_dict_converted = true; double *dbeg = (double *)pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + pp->dicts[cl][rg].dict_len; + double *dend = dbeg + dict_len; int96_t *idbeg = (int96_t *) dbeg; while (dbeg < dend) { *dbeg++ = impala_timestamp_to_milliseconds(*idbeg++); @@ -1243,15 +1246,15 @@ void convert_column_to_r_int96_dict_miss(postprocess *pp, uint32_t cl) { } double *dict = (double *)pp->dicts[cl][rg].buffer.data(); if (!hasmiss) { - double *end = beg + num_values; - uint32_t *didx = pp->dicts[cl][rg].indices.data(); + double *end = beg + cp_num_values; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; while (beg < end) { *beg++ = dict[*didx++]; } } else { - double *endm1 = beg + num_values - 1; - uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + num_present - 1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; + double *endm1 = beg + cp_num_values - 1; + uint32_t *dendm1 = pp->dicts[cl][rg].indices.data() + cp_offset + cp_num_present - 1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (beg <= endm1) { if (*presm1) { *endm1-- = dict[*dendm1--]; diff --git a/src/RParquetReader.h b/src/RParquetReader.h index 5220fc4..59cbb12 100644 --- a/src/RParquetReader.h +++ b/src/RParquetReader.h @@ -90,7 +90,7 @@ class RParquetFilter { }; struct chunk_part { - int64_t start; + int64_t offset; // within the row group int64_t num_values; int64_t num_present; bool dict; diff --git a/tests/testthat/test-write-parquet-statistics.R b/tests/testthat/test-write-parquet-statistics.R index 55719ee..9230654 100644 --- a/tests/testthat/test-write-parquet-statistics.R +++ b/tests/testthat/test-write-parquet-statistics.R @@ -260,7 +260,7 @@ test_that("min/max for FLOAT", { options = parquet_options(num_rows_per_row_group = 5), ... ) - expect_equal(as.data.frame(df), as.data.frame(read_parquet(tmp))) + expect_equal(as.data.frame(read_parquet(tmp)), as.data.frame(df)) mtd <- as.data.frame(read_parquet_metadata(tmp)[["column_chunks"]]) list( as_flt(mtd[["min_value"]]), From 9608cb577cec12395aa57510b4613ff6acd7c018 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 7 Feb 2025 18:51:53 +0100 Subject: [PATCH 08/18] Mixed dict + non-dict pages support for strings --- src/RParquetReader.cpp | 69 ++++++++++-------- tests/testthat/_snaps/read-parquet-5.md | 93 ++++++++++++++---------- tests/testthat/data/create-data.py | 20 +++-- tests/testthat/data/mixed-miss.parquet | Bin 22803 -> 34125 bytes tests/testthat/data/mixed.parquet | Bin 11631 -> 17247 bytes tests/testthat/data/mixed2.parquet | Bin 6938 -> 10211 bytes tests/testthat/test-read-parquet-5.R | 3 + 7 files changed, 105 insertions(+), 80 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 836157a..5c21b50 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -1312,49 +1312,58 @@ void convert_column_to_r_ba_string_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, lcl); SET_VECTOR_ELT(pp->facdicts, lcl, Rf_allocVector(VECSXP, pp->metadata.num_row_groups)); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - int64_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { - std::vector rgba = pp->byte_arrays[cl][rg]; - for (auto it = rgba.begin(); it != rgba.end(); ++it) { - int64_t from = it->from; - for (auto i = 0; i < it->offsets.size(); i++) { + // first the non-dict parts, if any + std::vector rgba = pp->byte_arrays[cl][rg]; + for (auto it = rgba.begin(); it != rgba.end(); ++it) { + int64_t from = it->from; + for (auto i = 0; i < it->offsets.size(); i++) { + SEXP xi = Rf_mkCharLenCE( + (char*) it->buffer.data() + it->offsets[i], + it->lengths[i], + CE_UTF8 + ); + SET_STRING_ELT(x, from, xi); + from++; + } + } + + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + SEXP tmp = R_NilValue; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_present = cps[cpi].num_present; + bool hasdict = cps[cpi].dict; + if (!hasdict) continue; + // convert dictionary first + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { + rg_dict_converted = true; + tmp = PROTECT(Rf_allocVector(STRSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { SEXP xi = Rf_mkCharLenCE( - (char*) it->buffer.data() + it->offsets[i], - it->lengths[i], + (char*) ba.buffer.data() + ba.offsets[i], + ba.lengths[i], CE_UTF8 ); - SET_STRING_ELT(x, from, xi); - from++; + SET_STRING_ELT(tmp, i, xi); } + SET_VECTOR_ELT(VECTOR_ELT(pp->facdicts, lcl), rg, tmp); } - } else { - // convert dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - SEXP tmp = PROTECT(Rf_allocVector(STRSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - SEXP xi = Rf_mkCharLenCE( - (char*) ba.buffer.data() + ba.offsets[i], - ba.lengths[i], - CE_UTF8 - ); - SET_STRING_ELT(tmp, i, xi); - } - SET_VECTOR_ELT(VECTOR_ELT(pp->facdicts, lcl), rg, tmp); // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - uint32_t *end = didx + pp->dicts[cl][rg].indices.size(); - int64_t from = pp->metadata.row_group_offsets[rg]; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + uint32_t *end = didx + cp_num_present; + int64_t from = rg_offset + cp_offset; while (didx < end) { SET_STRING_ELT(x, from, STRING_ELT(tmp, *didx)); from++; didx++; } - UNPROTECT(1); } + if (!Rf_isNull(tmp)) UNPROTECT(1); } } diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md index 65de50f..ac8e099 100644 --- a/tests/testthat/_snaps/read-parquet-5.md +++ b/tests/testthat/_snaps/read-parquet-5.md @@ -57,64 +57,79 @@ Code as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) Output - type repetition_type - 1 REQUIRED - 2 INT32 REQUIRED - 3 INT64 REQUIRED + type repetition_type + 1 REQUIRED + 2 INT32 REQUIRED + 3 INT64 REQUIRED + 4 BYTE_ARRAY REQUIRED Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output - page_type num_values encoding - 1 DICTIONARY_PAGE 400 PLAIN - 2 DATA_PAGE 1024 RLE_DICTIONARY - 3 DATA_PAGE 1024 PLAIN - 4 DATA_PAGE 352 PLAIN - 5 DICTIONARY_PAGE 400 PLAIN - 6 DATA_PAGE 1024 RLE_DICTIONARY - 7 DATA_PAGE 1024 PLAIN - 8 DATA_PAGE 352 PLAIN + page_type num_values encoding + 1 DICTIONARY_PAGE 400 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 1024 PLAIN + 4 DATA_PAGE 352 PLAIN + 5 DICTIONARY_PAGE 400 PLAIN + 6 DATA_PAGE 1024 RLE_DICTIONARY + 7 DATA_PAGE 1024 PLAIN + 8 DATA_PAGE 352 PLAIN + 9 DICTIONARY_PAGE 400 PLAIN + 10 DATA_PAGE 1024 RLE_DICTIONARY + 11 DATA_PAGE 1024 PLAIN + 12 DATA_PAGE 352 PLAIN --- Code as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) Output - type repetition_type - 1 REQUIRED - 2 INT32 REQUIRED - 3 INT64 REQUIRED + type repetition_type + 1 REQUIRED + 2 INT32 REQUIRED + 3 INT64 REQUIRED + 4 BYTE_ARRAY REQUIRED Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output - page_type num_values encoding - 1 DICTIONARY_PAGE 400 PLAIN - 2 DATA_PAGE 1024 RLE_DICTIONARY - 3 DATA_PAGE 1024 RLE_DICTIONARY - 4 DATA_PAGE 352 RLE_DICTIONARY - 5 DICTIONARY_PAGE 400 PLAIN - 6 DATA_PAGE 1024 RLE_DICTIONARY - 7 DATA_PAGE 1024 RLE_DICTIONARY - 8 DATA_PAGE 352 RLE_DICTIONARY + page_type num_values encoding + 1 DICTIONARY_PAGE 400 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 1024 RLE_DICTIONARY + 4 DATA_PAGE 352 RLE_DICTIONARY + 5 DICTIONARY_PAGE 400 PLAIN + 6 DATA_PAGE 1024 RLE_DICTIONARY + 7 DATA_PAGE 1024 RLE_DICTIONARY + 8 DATA_PAGE 352 RLE_DICTIONARY + 9 DICTIONARY_PAGE 400 PLAIN + 10 DATA_PAGE 1024 RLE_DICTIONARY + 11 DATA_PAGE 1024 RLE_DICTIONARY + 12 DATA_PAGE 352 RLE_DICTIONARY --- Code as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) Output - type repetition_type - 1 REQUIRED - 2 INT32 OPTIONAL - 3 INT64 OPTIONAL + type repetition_type + 1 REQUIRED + 2 INT32 OPTIONAL + 3 INT64 OPTIONAL + 4 BYTE_ARRAY OPTIONAL Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output - page_type num_values encoding - 1 DICTIONARY_PAGE 1024 PLAIN - 2 DATA_PAGE 1024 RLE_DICTIONARY - 3 DATA_PAGE 1024 PLAIN - 4 DATA_PAGE 352 PLAIN - 5 DICTIONARY_PAGE 1024 PLAIN - 6 DATA_PAGE 1024 RLE_DICTIONARY - 7 DATA_PAGE 1024 PLAIN - 8 DATA_PAGE 352 PLAIN + page_type num_values encoding + 1 DICTIONARY_PAGE 1024 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 1024 PLAIN + 4 DATA_PAGE 352 PLAIN + 5 DICTIONARY_PAGE 1024 PLAIN + 6 DATA_PAGE 1024 RLE_DICTIONARY + 7 DATA_PAGE 1024 PLAIN + 8 DATA_PAGE 352 PLAIN + 9 DICTIONARY_PAGE 1024 PLAIN + 10 DATA_PAGE 1024 RLE_DICTIONARY + 11 DATA_PAGE 1024 PLAIN + 12 DATA_PAGE 352 PLAIN diff --git a/tests/testthat/data/create-data.py b/tests/testthat/data/create-data.py index 61149ef..2894adf 100644 --- a/tests/testthat/data/create-data.py +++ b/tests/testthat/data/create-data.py @@ -2,9 +2,14 @@ import pyarrow.parquet as pq schema = pa.schema(fields=[ pa.field(name = 'x', type = pa.int32(), nullable = False), - pa.field(name = 'y', type = pa.int64(), nullable = False) + pa.field(name = 'y', type = pa.int64(), nullable = False), + pa.field(name = "s", type = pa.utf8(), nullable = False), ]) -data = [ list(range(400)) * 6, list(range(400)) * 6 ] +data = [ + list(range(400)) * 6, + list(range(400)) * 6, + [ str(x) for x in range(400) ] * 6 +] table = pa.table(data = data, schema = schema) pq.write_table( table, @@ -13,14 +18,6 @@ dictionary_pagesize_limit = 400 ) -import pyarrow as pa -import pyarrow.parquet as pq -schema = pa.schema(fields=[ - pa.field(name = 'x', type = pa.int32(), nullable = False), - pa.field(name = 'y', type = pa.int64(), nullable = False) -]) -data = [ list(range(400)) * 6, list(range(400)) * 6 ] -table = pa.table(data = data, schema = schema) pq.write_table( table, 'tests/testthat/data/mixed2.parquet', @@ -31,7 +28,8 @@ import pyarrow.parquet as pq table = pa.table({ 'x': pa.array(range(2400), type=pa.int32()), - 'y': pa.array(range(2400), type=pa.int64()) + 'y': pa.array(range(2400), type=pa.int64()), + 's': pa.array([ str(x) for x in range(2400) ], type=pa.utf8()) }) pq.write_table( table, diff --git a/tests/testthat/data/mixed-miss.parquet b/tests/testthat/data/mixed-miss.parquet index 315f5044ccf503fa73bac5248b6a3eb736b36183..f71948af3b4decaf367e081dd2b60576e1586b04 100644 GIT binary patch literal 34125 zcmeI*2bdLAx+rYGZfFn`ktT^s&N-u~G&$#-bIv&mC`mF%5>axFB1uKbIp>^n&N=;W zqu!Y_<9UX;|9I}4bEt>6_O7b6SM6HWwYvAmS8JCDDV07rAh>(Z;J!JF1b2@b9413)1u02IYSNIFbfhN(8OcOuvXGT*Wak4uBnKbyF`tl=5OR^5 zJme)G`6)m_3Q?FM6r~u&DM3j}QJON8r5xp{Kt(E1nJQGJ8r7*mO=?k_I@F~e^=Uvu z8qt_0G^H8MX+cX`(V8~2r5)|*Ku0>!nJ#qYQ@YWe9(=~<^rRQP=|f+>pdbAiz(58u zm>~>h7+*4+5sYLMqxp(4e9c(C;akQro(W835|f$2RHiYV8O&rBvzfzO<}sh|Situz zWD$#5!cuS|UJKW_S_j$lW9`TqbJmneB z3BCBk5{}mhAdv7xAc%-W;&mbug*SMUsJulq-X=Q1#2_ZIh)o>cAujLo9`6&6_#_}9 ziAYQml9G(%q#z}!NKG2jl8*FbAS0Q`Oct_|jqH5DhveWRKIRj05<)I=lZU+IBR>Tw zNFfSSgrXFqI3*}aDN0j@vXrAd6{tuhDpQ53RHHgIs7WnqQ-`|LqdpC2NFy54gr+p3 zIW1^OD_YZrwzQ)?9q33WI@5)&d`dUE(}U0WoSyWeH+|^K7xbe)0~p941~Y`A4C70N zGlG$fVl-bdhOZgRH+;)D#xsG5Oky%qn94M!GlQATVm5P_%RJ`u9SiuLg)Cw*OIXSe z{K!u%<7a+hIV)JnDps?GwX9=38`#JuHnWATY-2k+*vT$-vxmLxV?PHt$RQ4Mgrgke zI43yCDNb{Svz+5R7r4kJE^~#eT;n=7xXCSUbBDX!<30~~$Ri%}gr_{?dDx2&?>vqW zHk`|!*9ahx@I)Yph(zLbA`^u-c$28SMKsE#sCb5W39Nr-=@A4k+6OZ^LAR&oJ zOcIikjO3&sC8@% z3R8rl6r(sLC`l-V?7(#$R;+kg{^F3J3H9PE_Snrz3gK@2RO(f4s(Q~9OF1A zILRqabB42=<2)C*$R#dwg{xfSIybnqI6BZ}28jd5dVgO>}~ZK}=#1n>f5fT;An9-X|XMNkBppk(eYTB^k*{ zK}u4Qnlz*(9qGwHMlz9^EMz4c+4+DE$-zf_%qQd|gk0n%4|&N)ehN^KLKLP5MJYyc zN>Gwgl%@=2DMxuKP?1VhrV3T5Ms;dXlUmfK4t1$VeHze^Ml_}gO=(7RTF{bKw5APh zX-9iH(2-7brVCy9lx}pV2cPjdJ?TYn`p}mz=tqABFpxnEW(Y$W#+M9d1S1*6Xue_$ zUo)0(_?B^uX95$M#AK#0m1#_81~Zw(Z00bRdCccK7Vtd_S;S(Nu#_M8k)K$`&-}u2 zR)oEPH>V_oaPK? zImdY}aFI(~<_cH2#&vFRlUv;84tKf7eID?TM?B^UPkF|3g2M!lj2=8bdaBT%K-A#y z!Qtb^3JBd0DYMyv21EQCyu`)1Lo$)Kf$8B?BWrTP=d2Y))L0!Q+nMpJ-^g+2#OgP52E<NzH z;0kpntc_8n&B?VfYxIo2E>@l4HP^*%FnPkdI87FwT=!0kHSyQSZL_E5`gc2=nXvx7 zE_Y6@f4_S`f(`L{Mz6IYe&0kBHzXL4>C}dVL-Hipm}q#JS{oCOt}}6ClCf=0ZA?18 zXM#=1CJ(Q*Df#rt6E~%pz3|kgl=Igl*qmzNo?4qzFFiAHbDCv$PHj%RA|T!sJQY zKDe^*^tKOgtVy^%$DKX3w}15D%%trfKe=;y`zK)n6YaqqMKYh=Q@TXn#Cyw>DO-1M*$Q>1 z>@8QN?b*HMYxGLIuR@&>b@x?lFlEZVN=+7>-B-EA+Qj>-wAovCf7K3Wr|hrR5KIZ&f#aJ>UH`zD@xpw@uQ=MK~!k~hi0I>XD>J6Lyg-Khubjct4GVEysEk{oI< zc|^TK4X00;dZ^LtMduDRp1(H9;U)|B);rvE>Dj4=n=QM0?r`%Jfk}_FSQA|TNXre0 zryXgvCG+_st#{;2dbG`+vh|O)Jy3Vr(RN4LoGp3bY;=` zW1VlTO?te`oxSysckTAz?6l*bKDm4Tc(*X&lbz@u5Tn6~9zjW_pZF|tmJ267kD4#p z$)3^6H8|NTR=w#bd&g~e;bfn9y_22ln`mT%Q(q*TI{j3?REsa1>Yr|1veN@H?Q3v) zV77D9PY=p*@51T9Im0JEGbB%phG&KrNHXKhup(J5p82vwzT{_zmnqlq?1&2WW}F>a zrQOA|qiXa{er|M~kqys%)nMw3b7Pt;zIg8I7VDCqAKPYM!}H&CI5*?`w_WaCJU^~` z_!Jk$_l(i#!i2s_W?q;$Aj_o-lZNC=adGnSa*Zxd8C`GY#i?W4UAj1JeD4&OrcWN( z=+cboQ)gb9IeYP?OS9&$OL2Mj!hMY{&slnI=HXH+^Q(jwoW@O`QKU|nP>)MZ37GJ*h z(~Wg0uP?i^ukrPtADo+Y{g)^AE?-|BCPJzkD*|FRxv?@R>FgV;B4@pFV|CR0scx=` zUcSlAwXy2YzPT=L`ztrs$Lo{o)`mo*n%vr$Y})Kwn^G;ga%*$C^{H-e$+W-8?XB6) z&%V7a$Nej}x95zI`p%9#F`M4mSs>}0JG+Wxy?SSNiTtVW?kQ8g>D|2*>d(2muS)x? zclX!mlltC)I-{E2JJ?{_oO_3wEV+8`aEtY+?;mNizv=y>9nR0Wf2_;>tM`w0kC5iU ziJmc=JviAn>D&jW24ubV;PjCEX&#;#UcTAGv!md$+6?@0UWPw$`TllIwzGozY6dw5~myl0QDEV=&dao~;h zX`es2v%mTCrw`80d;aXn{p-)4>putO3?7~HMgJM*_x>}&i~ds&`lA2*KTEpJKJo5^ zkNR8x8AgGygyS^=2qZia2qGepc%8^Z;SJs-DsK^uw~0@0trU*qTMsZ3|l2VkW3}q=tc`8tmN>ru_RjEdGYEY9})TRz~sYiVp(2zzn zrU^}HMsr%wl2){)4Q**hdpgjOPIRUVUHO!5bf*WO@i{%|MQ{4hmoMl?e+Dp+K@4UH zLm9@G3}*x*8O3P6Vhmq1mT&l$ag1jI6Pd(hrZAOhOlJl&nZ<18Fqe7E=Q|egJquaH zVwSL!ANY}AZhTiM2TcCeFO>}C&p*~fkkaF9bB z<_JeQ#&J$?l2e@K3}-pVc`k5~OI+p(SGmS@Zg7)Z+~y8rl%y1;DMMMxQJxA^q!N{>LRG3!of_1n7PYBEUFuPv1~jA*jcGztn$esV zw4@cSX+vAu(Vh-;q!XR#LRUVe8{O%_XM9dideNIc^yLfs(VqbfWDtWH!cd0sCBqrP zNJcT5uNcGEjO81?WgO#~z(gi7nJG+V8q=AUG8z82R!5vk9opVp7ES8;a&fP<23>Z zBs>uaA|jD^oybJt4c;UwZxM~RiB2#vh)FDB6Nh(*%e%bC`@|zY2}npH5|f0aBqKQ~ zNJ%PElZLdUBRv_&NG39qg{)*FJ0I{NIrxZ=`GlN=kc-^pAusvJPXP*2h{6=1D8(pF z2})9m(v+brs7?)PQj6Nup)U2PPXij#h{iObDa~k33tG~O*0iB5 z?PyO2I?{>GbfGJs(v9x);4?m_C%x!RANukI{pimC1~Q1j3}Gn4_>$p_U?ig$%~y=! zYsT^o-!hKzOkg6Dn9LNWGL7lXU?#Je%^c=3kNJGZ0={PsZeQHnNG$Y+)*>T;VF$xXul3a*NyC;V$>M&jTLvh{rtPDbILL7#|G65{}mhAdv7xAc%-W z;&mbug*SMUsJulq-X=Q1#2_ZIh)o>cAujLo9`6&6_#_}9iAYQml9G(%q#z}!NKG2j zl8*FbAS0Q`Oct_|jqH5DhveWRKIRj05<)I=lZU+IBR>TwNFfSSgrXFqI3*}aDN0j@ zvXrAd6{tuhDpQ53RHHgIs7WnqQ-`|LqdpC2NFy54gr+p3IW1^OD_YZrwzQ)?9q33W zI@5)&d`dUE(}U0WoSyWeH+|^K7xbe)0~p941~Y`A4C70NGlG$fVl-bdhOZgRH+;)D z#xsG5Oky%qn94M!GlQATVm5P_%RJ`u9SiuLg)Cw*OIXSe{K!u%<7a+hIV)JnDps?G zwX9=38`#JuHnWATY-2k+*vT$-vxmLxV?PHt$RQ4MgrgkeI43yCDNb{Svz+5R7r4kJ zE^~#eT;n=7xXCSUbBDX!<30~~$Ri%}gr_{?IiazG7{Q}sqzYaU==ra)*Xj{pd~gqP z8T$bKm!;6y!RvQ=zlgoA2n-vVmlc+9yhZ?lgeL+)L?jZg6PYNy!J9-QWF#jADM>|Y(vX&Pq$dLz$wX$dkd-nMQr5Vj>K}%ZEnl`kh9qs8rM>^4&E_CHny3w5;e8%VWq!+#E zLtnn2AN?7?Kn5|GAq-_0UoxB#jARs}`HC@o%~-zSTgEY-2~1=XlbOO)rZJrv%w!g` znZsP>F`w^P!1pX<5sO*EQhwk^eqtFv^9##a!Ae%Knl-Ft9qZY^MmDjTEo@~Q+u6ZR zcCnj1>}4POIlw^fMJ{ofD_rFo*SWz>ZgHDC+~pqk zdB8&+@t7w(3)1u02IYSNIFbfhN(8OcOuvXGT*Wak4uBnKbyF`tl=5OR^5 zJme)G`6)m_3Q?FM6r~u&DM3j}QJON8r5xp{Kt(E1nJQGJ8r7*mO=?k_I@F~e^=Uvu z8qt_0G^H8MX+cX`(V8~2r5)|*Ku0>!nJ#qYQ@YWe9(=~<^rRQP=|f+>pdbAiz(58u zm>~>h7+*4+5sYLMqxp(4e9c(C;akQro(W835|f$2RHiYV8O&rBvzfzO<}sh|Situz zWD$#5!cuS|UJKW_S_j$lW9`TqbJmneB zqXgZ27!WpW#J65u{+c576<6rwOiC`vJkQ-YF|qBLbFOF7C@fr?b3GF7Nb zHL6pCn$)5;b*M`{>eGORG@>z0Xi77h(}I??qBU)3OFP=rfsS;dGhOJ)r*xw`J@|~z z=}9ko(}%u%K|lI4fPoBRFhdy1Fur6sBN)jjM)MV8_?oeN!?%oMJQJA6BqlS3sZ3)! zGnmONW;2Jm%ws;^v4HPc$RZZAgr)qzkNm_ke&!dJvx1eZVl``6%R1JxfsJfpGh5io zHny{ao$O*ad)Ui9_H%%P9O5uXILa}ObApqc;xuPC%Q?<-fs0(?GFQ0DHLi1mo800y zceu+v?(=|$JmN7=c*--LM+tiG=9^){#(DVWtLtBT@0trU*qTMsZ3| zl2VkW3}q=tc`8tmN>ru_RjEdGYEY9})TRz~sYiVp(2zznrU^}HMsr%wl2){)4Q**h zdpgjOPIRUVUHO!5bf*WO@i{%|MQ{4hmoMl?e+Dp+K@4UHLm9@G3}*x*8O3P6Vhmq1 zmT&l$ag1jI6Pd(hrZAOhOlJl&nZ<18Fqe7E=Q|egJquaHVwSL!ANY}AZhTiM2TcCeFO>}C&p*~fkkaF9bB<_JeQ#&J$?l2e@K3}-pV zc`k5~OI+p(SGmS@Zg7)Z+~y876<6rwOiC`vJkQ-YF|qBLbFOF7C@fr?b3GF7NbHL6pCn$)5;b*M`{>eGORG@>z0 zXi77h(}I??qBU)3OFP=rfsS;dGhOJ)r*xw`J@|~z=}9ko(}%u%K|lI4fPoBRFhdy1 zFur6sBN)jjM)MV8_?oeN!?%oMJQJA6BqlS3sZ3)!GnmONW;2Jm%ws;^v4HPc$RZZA zgr)qzkNm_ke&!dJvx1eZVl``6%R1JxfsJfpGh5ioHny{ao$O*ad)Ui9_H%%P9O5uX zILa}ObApqc;xuPC%Q?<-fs0(?GFQ0DHLi1mo800yceu+v?(=|$JmN7=cp4=rM`$i> z=oFeZ8#?__PHoWd8)JnAZ(dB|yMM6OrdK((uX1c(<=DQ;v3-?e`}JmQmpgd`#{Nk~dEl9Pgzq#`wGNJ~1>lYxw6A~RXY zN;b0d0UwfskNB8R$Vmvf$W0#dl8^ippdf`POc9DwjN+7_B&8@#8OlHNAm8eV= zs#1;W)SxD{s7)Q}QjhvHpdpQDOcR>YjOMhUC9P;p8`{#2_H>{lo#;##y7DRA=uQtl z<8yk_i{A91FJI7){tRFsgBZ*ZhBAyV8O{hsGK$fB#TdS3EZ^`g;~38bCNhc1Okpb1 zn9dAlGK<;FVJ`ES&vz`~dls^Y#VlbdKky?zv5cSjh2^YZC97D?8rHIo^=x1xo7l`2 zwz7@w>|iIm*v%gHvXA{7;2?)M%n^=qjN_c(B&Rsd8P0N!^IYH}m$=Lou5yj*+~6j+ zxXm5za*z8w;31EA%oCpSjOS5;0>T8Im3tf3QaD~CfIz|%fgmChiPwot6yD%XqVg8e zc$?@16N8wfO&;=+kNgy%AcZJQ5sFfb;*_8yr6^4q%2JNK3}7IG7|alcGK?=7&Im>_iqU+<7`|pK-|#Kt7|#SIGKtAdVJg#@ z&J1QUi`mR!F7uercP!w07P5%NEMX}>@FPF5jGy_1<*Z;Ot60q%*0PTEY+xgs*vuBT zvW@NRU?;oS%^vo$kNq6rAcr{25sq?<>6Q1&n=b=47_&6`F@o<)3BY;4{6M-Ni5{cJ|OcdVWO``G^(RiEa z1QUap#3D9vc!#*W%X_>}JmQmpgd`#{Nk~dEl9Pgzq#`wGNJ~1>lYxw6A~RXYN;b0d z0UwfskNB8R$Vmvf$W0#dl8^ippdf`POc9DwjN+7_B&8@#8OlHNAm8eV=s#1;W z)SxD{s7)Q}QjhvHpdpQDOcR>YjOMhUC9P;p8`{#2_H>{lo#;##y7DRA=uQtl<8yk_ zi{A91FJI7){tRFsgBZ*ZhBAyV8O{hsGK$fB#TdS3EZ^`g;~38bCNhc1Okpb1n9dAl zGK<;FVJ`ES&vz`~dls^Y#VlbdKky?zv5cSjh2^YZC97D?8rHIo^=x1xo7l`2wz7@w z>|iIm*v%gHvXA{7;2?)M%n^=qjN_c(B&Rsd8P0N!^IYH}m$=Lou5yj*+~6j+xXm5z za*z8w;31EA%oCpSjOU@(e}q@pe~7loMQ-wtmwe=>00k*TVTw?cViczYB`HN|%21Ya zl&1m}sYGR}P?c&_rv^2tMQ!R(mwMEv0S#$HW17&EW;CY-Eont-+R&DEw5J0d=|pF` z(3MZ=Mt6Gf8K2XWUi799efffZ^k)DA8N^_QFqC0@$#6z6l2MH2E5`6OWBG<}8OL}g zFp)`2W(rf8#&l*dlUdAW4s)5ue7<7=-?NZKEM^Hy`GFt#iDmrEFDz#TD_O;A*07d! ztY-ro*~DhHu$66WX9qjk#cuYnmwoK#00%k5VUBQ=V;tuMCppDw&Ty7q#cl3zmwVjj0S|e^W1jFdG)((jXyEt2L1_AIs^Gc*A^kQeG`A~s5>@yu zP#YRm`1`er^oj8zs_<(v?yIQ6tG_AyoW1H({v~}%z@DJB;{yIW^eTV(t0H3Z8n40% zE`eO+CJ%YZM}7)WkU|uu2t_GIaY|5

Zrms8b zUsc)v+IliMA+P*h8w6y^dUpE%lZMIcguL>HZ4i(-YqUC%{>xh@i!<^+KtPtvy{`Nj z2NIAaYqWS#{*0Rc(>k*{CI3SNWDPwnf5O28WX&4w$eVvg&3{;DHmBr&kbrEVr{zyL zpnz;yqm7RCXVm<6b!K-;{vQa)p3(2lKWWE7*)wN+3qmfPX;oQiOggnE7_t1MJb`g%gLY|i85pe%@|9cE;*(yVYR^WJs4OYpei0Y9^g#pPbo#OSk>4 zoBe}cTSzJ>AmiV3@4DxVe;|`}FC(*ZFC&X`FC(jRFC&|BFC)8h|4_sdn(?2Ssr8@y z1sM8EFjUicVbZ_y(ru7ITm2Uc$dI97i~s7K{U8%9G;8J?iYv$iK0G?3rea z|1;iTK=v##a>n^HYX1LMXTx|tZT}n2!eoe(^Zozyt-aX#m)VPQFBXG@4yKI%%kp2U zNo7lx&Lmyiw z4pStqUOmQ&#IfVX&XFWc+~9DrGlvNuHMX+B;jE5vXLi^aW9EcSvLI)YAYWS>QGUL%){VEc6@F%%WN{TCqw4ey_CSuazEH9WKVgxnYwWx*9gg#5rN( z#`^2KKSFCW{mnfgzpnfT`-%Mfep>&!pGWh<#aLxO^T)oHWZ&U%asTT79skq!8)Mys z@D4mlP>dn{0;1bt+*pxAN|h>J{)1l|GuG^IX+x*%^+QZ0LuaPKAtCKUa@7yX7ZOrG zB=?I=A-QsegcJ?km^&n-VCbXe-ybc7Zpzg@bp4A$FRIGjI<)H04Y^8%ZV9OvI`g{U zrx#Cazj#&2(DivkH>5A&MWO3T7JTu3you0-7xVw~v_NA|^W^=l{*aKezir7?GW7lB z4+*Iqk|%W4@0Er=|E<0Pq0j%Wz844Sc-n`)UXfo9TESc|4)VooLOu!;D_)D5ty;FN y+a`7G7A;cOY}KlH`*`i@wrbt5d9!%wv!zL&CSB|teg=R2_u_k`ON5k4|NjB7_}fST delta 281 zcmX@x#WZ;n&)O$KQIy$)m2}dUo z;THmwu>dk$ogspOAW<+A#DFlJOhK%Q0HD%x5X%ijfE0x~I;J~1r8{~!I)WtvA%=ow yq2__b5#}j^&2};YDRFjm^aUGG?g(-f1O!hOXw?$Xkzrr}A_fH}28IB~AVUB$_eqZc diff --git a/tests/testthat/data/mixed.parquet b/tests/testthat/data/mixed.parquet index a5b90dce0a8bdf43a9d12cdc462bef63ef2eb402..9fbb6eaf2037d0a0b92e098bcd5543d8a88935b5 100644 GIT binary patch literal 17247 zcmeI)2Urv7y1?-S5(tQjB^VTywIFsvM^OpAsfeN=c0{m(pn{bMB1Ocm*cC)6R)O_Jn|uF*%d@WQIpm(R=Xm$ZJo#lNGw*zp3EzB;N&Hm)O1VU?HkPLv z`^s18$VD|pq9TJAopeM@VrW4d5=bFz(1i?o&_@**pem}NIt)<*q$jF{+K|Htb)bMT z>Y^S@P#+D@5T^J9jnEiP&=h8926Hq=3$#Qlv_>1Wg$0zbgcYn|16$a^9u8=S_HaZ8 zbVMgOK?P^Hz!h$AhX*|21#kGk7oE`sUEzmr@JDy_KmdB87XlH4-Uvn?^hH1P#{dk( zAcSBrLNNqk2**$i!*GniNQ^=RA~70Kh{hO<#W=(u7UMAi6EO*sF$Hl@V=Cg2fN4m? zbR;1eDM-Z(q+uq~F$=RX2XiqG^RWO6u?QJhj3ro#Wmt|CScz3wjZ9==4YIKo>yU$7 zl*g37fG61t`Q;Y{Pc!z)tMKr`U}>*o)7w4@KCIVwB(j4&o3F;|Px87>?rv zPT~|!;|$KC6z6ar7jO}ma2Z!{71wYbH*gcTa2t1U7x(ZvzQC9G3it5<5Aih~;Tt^0 zw|Ii5c!uYAf$#7V-{Td2z>oL|KjRm?#;*`AelfJ54GE;sfi7gwgFdRj098>9)nSMl zsEJyr4LOWZ2MQRYF6zMq_0a$gVTw=C2#wJMO<{&+Fh_H=KufejYqUXISU?F&Siu@L zu!SA$;ed8%4@Y!BM|6S{RB(n1T;T?Hc)$~0@P-e3(HULP6@KUje{@F=1fVB+ArL|6 zjbQXaU-UzN48TAPLI?&U6hjb(a16yT495tJ#3)1{5~C4?XpF&Fj6)1!F&+~z5tA?( zQxFF=rXn5*n1)15M-q~ef>g{v8fGFLvmh7AlWWQ!>$Z}s4dsS%9l4H)LLzKXNQB&F zBr<>N6)UFcy*MD&ixX+d6i%zuQ?+F^%>#>8O-;}>Y~-2Yv^qYqio!Dda?$F9r0S;5 z$qH5Gw3J%r{yBmBGZSaju^4nDLzOjsWJ4ups@8B(j5`Ei&N@(Y+SZ0`hIC{@ZpWicgH-rdvvMCrWJc9zIyiH z+Tl$r_r-~{^-VlCuiBp=tJR{COvS%MU;eO+%gTCm8{uqFP7=#cEMkt0L4B;4xVHl1P z7>QAcKqN*Z3egyYu^5LK#9}-qU?L`AGNvF7YD`5u5-<&kn2sbQBL%6Lfi%oSI%Z)u z=3p-7VLldMAr>J6i?IYtu?)+x0xPi!tC5K;tU)%`VjXgji#+6GJvLw?HeoZipa6x~ zif!1A9oUIo_!PUb2Yc}u_Mr&-QH&BCz(E|sVI09x9K&&(z)76KX`I1Xl;RxD;{q`<1Z$8F3(Q}upjpV6Dt>i^gmfvn~S@4#AZ*K{&nABBX#7Tu^B`jeDYuLaRcCd#7 z+Mzuh(E%ON2~JSK87^>z8{FXmPk6x_KJZ0nbU|16p&R_s9X$|$p6G=@1fe&A(Fc9e z5B)I!12G687>rO1K^Vd@6vHqaBQO%95P?XHMiink24gV}F^I)@Ou$4;!emTA9MqVK zcqCvN5-}Y~NJa`$F#~CsiFC}uY|O!2%)@*vz(OoS1{PxpmSP!}V+B@X6;>k?Sy+Q? zti?LyAQySa$9inQMr^`nY(W7Eu@&2}9XqfSyYMM?V-NP?GweeV_M;djIDmsVgu^(3 zqd11+IDwNmh0{2Lvna(moW})R#3fwD6hCBDLaJitSI zjYs$fkMS*@;3=NrIbPs9yu|l-g&*)Ee!|cA1+Vcdgj!q-EoegmDRiI<8T6o!DlkA* zR6})Cc9K8xPBKHUmh$<(55205%QA?#s!XH>ZAc)64s;=d9`sQK2B?Z^s18HaKuy#_ zZOCDSI#9qEbx{u{sE-C{2vdB5Mre#CXbLklgE^X`1zMsNTB8lx!U9TI!V1=~fi3J{ z4+peEdpM#4I-(Ptpn@}8;0iam!vmi1f;W8Ni_YkRuJA)Q_@g^|AOJnl3xNniZv>+c z`l28DV*mzX5JE5*p%{WNgkva%VK_!$Bt{_ukr<6AL}LubVjN-+i}9F%iI{}Rn1VQ{ zF%|Jhz%(RcI+Bo#6r^GX(l8V0n1$JxgSnW8`B;F3ScD8L#u6;WGAzdmti&p;Mkcbb z2H9APb;v<3@{o`9*no}Lgw5E30u*8^wqZMVU?+CrQ|!hb?8Rr;ha&7pF-mX%2XP38 zaRf(k499T-Cvgg=aRz5migP%R3%H0&xQr{fifg!z8@P#ExQ#owi+lJSU*Jo8h5LAb zhxi(g@C_d0TRg#2Ji~Lmz;}3w@9_#h;79y~pYaP`ne3tZs_ zcX+@PUhswwe9;+Q&=r2@27h!%4+NkmdLa-&=#60XL0|Mke+=JEzt_C(FSc{0VOP91#8&A7Iv_Q1KOcI z9MJ(C(Fsma!5J=ag&W-A0Z(|r8$R$wXME(PXn7;5hPXy`@=lG)yhEO-%sU#@z~yIg3)9!+Bi5MO?yVT)|ab!*$%iP28#~d#yJ3NFKtFZ7cb*KMdKvRjFG1)<*kd zRa?GRsA`QQ7D8346sp?Fn&Q9O-mbjbxmr)-4E~L){X=)RE1QayO~t>c!!40dYw`A2 zZoFI=-seU%DL0}?xe-mujc8JCM3ZtOnv@&Sq}+%mSq=Y4`U=17C z!VdOQ9jPN*HDg7bCaj6mgbi_;uq93tcEoAIo;Xc75LW@2gM+r4iO51vSEw^Zmau{~ zY+wsJ*z4))xf7uoOX4(PMVuzAiPMA)ahk9tP7`*-X~Ld3O*jx&0k1tG%Ss>WGmuz1 zxc&JLV8tKgFCwv0hNk~jiG3{TNUR*PV}+A#V&!13O|@X<;P9TYTw9U3mV<*rEOZAd zhiX68q1yAqM27zgLL) z02iz&eP7Oq)1!x5+53IB74iXE8~^!>^+WY6{|?8k?EU^wGk(% zM-G(hh~!#L`TB6ING=iP<3(}b;Ok^V0GPV)v7#b5R*J8EN zu>`Tv%K)>j#%8jA_GMIKM}884qX_p+W?Li7 zUI&;c-um|)@nzE=|9w}!R`gE~Vo-jN=r;$+I4w3R2rxTwNNl!ngV^NvZr(pk#VGxd zgi|(?85PBA)#j_#CJF+j2+2+ZyusTa*H27t?@ta{sf@_ delta 280 zcmccL#`r#JgRi!lq>U+q1Y2=(Mrv-NC<}uqlPZG*V+DgK3y7{{5IdkMDI=*f`If^@ zO|fs^n1FmW4za$qj3NvSk}?uJjzK~G;Z_js5^oswLBMP>laoWer=z2zlPi#LbOI56 zAwU@m5X%`N7zkp6nIHy)>0}CGRRjQ)mV;PsAOfT))X^~=#Po1<1WN=$3eP^=So%N|XTB6L5pn;pr)Nt=>Dq)kA@V~IgQJV*|$2U!%s zTM^SsJ$dmUhzPy*;K8GRgy8?+MGrnRF{MP(t}#n~@4g?;JUcr%?3?KO)+f_W*o#$m zV7J*@gV9Bz=WIzMB3H|i2Ye8K3J5_2svy>K7S|#4+?+uk@Ie47AOsPpf;ea7YM4$T znHU79(@=pf4Iy-Ak&K5CglVWkmxdU+6ePqUJcm5znTA8Ixj-K9K>#Ws1QDo$*jz9- zp-6`hoo^;GoK1jv8Yhe@4UOY{1{`$;DR! z9%t~02sj@;%P*S|UDY|aH23JGv8bOaKjs(p3xB2=&vlO;Up~}=c1V^n=soB^+F=GU zy+oF_M=UGvPly%deNdvdgGUrUmu$zb4cF}V{W3YMV25NZ*$^-tv*N4U#UF0@%hU4y zJ?CT_WM5-h24_N8!ZYCW^`{1XIbZyBpj{%e9jhR^-K{&VXzt2(mZkZo@pi&U>AO~j@Q%1T(U|JNnTf KKKR?B>-`4p`NW|B delta 278 zcmaFtKg(SX+t+1B0ZD1dn4-kbk%pM7P8nMtu-4o7|@6Q19vJ=;-7MBpjVUgkK0y z#sb80h6o0N*kC4z0bx3sf>;#+K&9m%mK%rwDGGITOb0PN938l3 diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R index 6bb4220..d81026b 100644 --- a/tests/testthat/test-read-parquet-5.R +++ b/tests/testthat/test-read-parquet-5.R @@ -129,6 +129,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { tab <- read_parquet(pf) expect_equal(tab$x, rep(0:399, 6)) expect_equal(tab$y, rep(0:399, 6)) + expect_equal(tab$s, as.character(rep(0:399, 6))) pf <- test_path("data/mixed2.parquet") expect_snapshot({ @@ -138,6 +139,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { tab <- read_parquet(pf) expect_equal(tab$x, rep(0:399, 6)) expect_equal(tab$y, rep(0:399, 6)) + expect_equal(tab$s, as.character(rep(0:399, 6))) pf <- test_path("data/mixed-miss.parquet") expect_snapshot({ @@ -147,4 +149,5 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { tab <- read_parquet(pf) expect_equal(tab$x, 0:2399) expect_equal(tab$y, 0:2399) + expect_equal(tab$s, as.character(0:2399)) }) From 38b9cebc1e315835da882675900593a8ce4d5b0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Fri, 7 Feb 2025 22:24:22 +0100 Subject: [PATCH 09/18] Support all mixed dict + non-dict types Still needs testing, plus working around an rchk issue. --- src/RParquetReader.cpp | 309 ++++++++++++++++++++++++----------------- 1 file changed, 184 insertions(+), 125 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 5c21b50..dcc3b2d 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -1312,18 +1312,20 @@ void convert_column_to_r_ba_string_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, lcl); SET_VECTOR_ELT(pp->facdicts, lcl, Rf_allocVector(VECSXP, pp->metadata.num_row_groups)); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - // first the non-dict parts, if any - std::vector rgba = pp->byte_arrays[cl][rg]; - for (auto it = rgba.begin(); it != rgba.end(); ++it) { - int64_t from = it->from; - for (auto i = 0; i < it->offsets.size(); i++) { - SEXP xi = Rf_mkCharLenCE( - (char*) it->buffer.data() + it->offsets[i], - it->lengths[i], - CE_UTF8 - ); - SET_STRING_ELT(x, from, xi); - from++; + if (pp->byte_arrays[cl].size() > 0) { + // first the non-dict parts, if any + std::vector rgba = pp->byte_arrays[cl][rg]; + for (auto it = rgba.begin(); it != rgba.end(); ++it) { + int64_t from = it->from; + for (auto i = 0; i < it->offsets.size(); i++) { + SEXP xi = Rf_mkCharLenCE( + (char*) it->buffer.data() + it->offsets[i], + it->lengths[i], + CE_UTF8 + ); + SET_STRING_ELT(x, from, xi); + from++; + } } } @@ -1487,10 +1489,8 @@ void convert_column_to_r_ba_decimal_dict_nomiss(postprocess *pp, uint32_t cl) { int32_t scale = pp->metadata.r_types[cl].scale; double fct = std::pow(10.0, scale); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { + if (pp->byte_arrays[cl].size() > 0) { + // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1499,24 +1499,37 @@ void convert_column_to_r_ba_decimal_dict_nomiss(postprocess *pp, uint32_t cl) { beg[i] = parse_decimal(it->buffer.data() + it->offsets[i], it->lengths[i]) / fct; } } - } else { + } + + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + SEXP tmp = R_NilValue; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_present = cps[cpi].num_present; + bool hasdict = cps[cpi].dict; + if (!hasdict) continue; // convert dictionary first uint32_t dict_len = pp->dicts[cl][rg].dict_len; - SEXP tmp = PROTECT(Rf_allocVector(REALSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - REAL(tmp)[i] = parse_decimal(ba.buffer.data() + ba.offsets[i], ba.lengths[i]) / fct; + if (!rg_dict_converted && dict_len > 0) { + rg_dict_converted = true; + tmp = PROTECT(Rf_allocVector(REALSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + REAL(tmp)[i] = parse_decimal(ba.buffer.data() + ba.offsets[i], ba.lengths[i]) / fct; + } } // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - uint32_t *end = didx + pp->dicts[cl][rg].indices.size(); - int64_t from = pp->metadata.row_group_offsets[rg]; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + uint32_t *end = didx + cp_num_present; + int64_t from = rg_offset + cp_offset; while (didx < end) { REAL(x)[from++] = REAL(tmp)[*didx++]; } - UNPROTECT(1); } + if (!Rf_isNull(tmp)) UNPROTECT(1); } } @@ -1567,10 +1580,8 @@ void convert_column_to_r_ba_raw_nodict_nomiss(postprocess *pp, uint32_t cl) { void convert_column_to_r_ba_raw_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { + if (pp->byte_arrays[cl].size() > 0) { + // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1581,28 +1592,41 @@ void convert_column_to_r_ba_raw_dict_nomiss(postprocess *pp, uint32_t cl) { from++; } } - } else { + } + + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + SEXP tmp = R_NilValue; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_present = cps[cpi].num_present; + bool hasdict = cps[cpi].dict; + if (!hasdict) continue; // convert dictionary first uint32_t dict_len = pp->dicts[cl][rg].dict_len; - SEXP tmp = PROTECT(Rf_allocVector(VECSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - SEXP xi = Rf_allocVector(RAWSXP, ba.lengths[i]); - memcpy(RAW(xi), ba.buffer.data() + ba.offsets[i], ba.lengths[i]); - SET_VECTOR_ELT(tmp, i, xi); + if (!rg_dict_converted && dict_len > 0) { + rg_dict_converted = true; + tmp = PROTECT(Rf_allocVector(VECSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + SEXP xi = Rf_allocVector(RAWSXP, ba.lengths[i]); + memcpy(RAW(xi), ba.buffer.data() + ba.offsets[i], ba.lengths[i]); + SET_VECTOR_ELT(tmp, i, xi); + } } // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - uint32_t *end = didx + pp->dicts[cl][rg].indices.size(); - int64_t from = pp->metadata.row_group_offsets[rg]; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + uint32_t *end = didx + cp_num_present; + int64_t from = rg_offset + cp_offset; while (didx < end) { SET_VECTOR_ELT(x, from, VECTOR_ELT(tmp, *didx)); from++; didx++; } - UNPROTECT(1); } + if (!Rf_isNull(tmp)) UNPROTECT(1); } } @@ -1686,10 +1710,8 @@ void convert_column_to_r_ba_uuid_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); char uuid[37]; for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { + if (pp->byte_arrays[cl].size() > 0) { + // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1705,33 +1727,46 @@ void convert_column_to_r_ba_uuid_dict_nomiss(postprocess *pp, uint32_t cl) { from++; } } - } else { + } + + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + SEXP tmp = R_NilValue; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_present = cps[cpi].num_present; + bool hasdict = cps[cpi].dict; + if (!hasdict) continue; // convert dictionary first uint32_t dict_len = pp->dicts[cl][rg].dict_len; - SEXP tmp = PROTECT(Rf_allocVector(STRSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - unsigned char *s = (unsigned char*) ba.buffer.data() + ba.offsets[i]; - snprintf( - uuid, 37, - "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[8], s[9], - s[10], s[11], s[12], s[13], s[14], s[15] - ); - SET_STRING_ELT(tmp, i, Rf_mkCharLenCE(uuid, 36, CE_UTF8)); + if (!rg_dict_converted && dict_len > 0) { + rg_dict_converted = true; + tmp = PROTECT(Rf_allocVector(STRSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + unsigned char *s = (unsigned char*) ba.buffer.data() + ba.offsets[i]; + snprintf( + uuid, 37, + "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[8], s[9], + s[10], s[11], s[12], s[13], s[14], s[15] + ); + SET_STRING_ELT(tmp, i, Rf_mkCharLenCE(uuid, 36, CE_UTF8)); + } } // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - uint32_t *end = didx + pp->dicts[cl][rg].indices.size(); - int64_t from = pp->metadata.row_group_offsets[rg]; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + uint32_t *end = didx + cp_num_present; + int64_t from = rg_offset + cp_offset; while (didx < end) { SET_STRING_ELT(x, from, STRING_ELT(tmp, *didx)); from++; didx++; } - UNPROTECT(1); } + if (!Rf_isNull(tmp)) UNPROTECT(1); } } @@ -1781,10 +1816,8 @@ void convert_column_to_r_ba_float16_nodict_nomiss(postprocess *pp, uint32_t cl) void convert_column_to_r_ba_float16_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { + if (pp->byte_arrays[cl].size() > 0) { + // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1794,27 +1827,40 @@ void convert_column_to_r_ba_float16_dict_nomiss(postprocess *pp, uint32_t cl) { from++; } } - } else { + } + + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + SEXP tmp = R_NilValue; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_present = cps[cpi].num_present; + bool hasdict = cps[cpi].dict; + if (!hasdict) continue; // convert dictionary first uint32_t dict_len = pp->dicts[cl][rg].dict_len; - SEXP tmp = PROTECT(Rf_allocVector(REALSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - uint16_t *f = (uint16_t*) (ba.buffer.data() + ba.offsets[i]); - REAL(tmp)[i] = float16_to_double(*f); + if (!rg_dict_converted && dict_len > 0) { + rg_dict_converted = true; + tmp = PROTECT(Rf_allocVector(REALSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + uint16_t *f = (uint16_t*) (ba.buffer.data() + ba.offsets[i]); + REAL(tmp)[i] = float16_to_double(*f); + } } // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - uint32_t *end = didx + pp->dicts[cl][rg].indices.size(); - int64_t from = pp->metadata.row_group_offsets[rg]; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + uint32_t *end = didx + cp_num_present; + int64_t from = rg_offset + cp_offset; while (didx < end) { REAL(x)[from] = REAL(tmp)[*didx]; from++; didx++; } - UNPROTECT(1); } + if (!Rf_isNull(tmp)) UNPROTECT(1); } } @@ -1868,34 +1914,40 @@ void convert_column_to_r_int32_decimal_dict_nomiss(postprocess *pp, uint32_t cl) int32_t scale = pp->metadata.r_types[cl].scale; double fct = std::pow(10.0, scale); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - int64_t off = pp->metadata.row_group_offsets[rg]; - double *beg = REAL(x) + off; - // In theory we might dictionary encode a subset of the columns only - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - if (!hasdict) { - double *end = beg + num_values - 1; - int32_t *fend = ((int32_t*) beg) + num_values - 1; - while (beg <= end) { - *end-- = static_cast(*fend--) / fct; - } - } else { - // Convert the dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + dict_len - 1; - int32_t *fdend = ((int32_t*) dbeg) + dict_len - 1; - while (dbeg <= dend) { - *dend-- = static_cast(*fdend--) / fct; - } + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; + bool hasdict = cps[cpi].dict; + double *beg = REAL(x) + rg_offset + cp_offset; + if (!hasdict) { + double *end = beg + cp_num_values - 1; + int32_t *fend = ((int32_t*) beg) + cp_num_values - 1; + while (beg <= end) { + *end-- = static_cast(*fend--) / fct; + } + } else { + // Convert the dictionary first + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { + rg_dict_converted = true; + double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); + double *dend = dbeg + dict_len - 1; + int32_t *fdend = ((int32_t*) dbeg) + dict_len - 1; + while (dbeg <= dend) { + *dend-- = static_cast(*fdend--) / fct; + } + } - // fill in the dict - double *end = beg + num_values; - double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*didx++]; + // fill in the dict + double *end = beg + cp_num_values; + double *dict = (double*) pp->dicts[cl][rg].buffer.data(); + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + while (beg < end) { + *beg++ = dict[*didx++]; + } } } } @@ -1944,30 +1996,37 @@ void convert_column_to_r_int64_decimal_dict_nomiss(postprocess *pp, uint32_t cl) int32_t scale = pp->metadata.r_types[cl].scale; double fct = std::pow(10.0, scale); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - int64_t from = pp->metadata.row_group_offsets[rg]; - // in theory some row groups might be dict encoded, some not - bool hasdict = pp->dicts[cl][rg].dict_len > 0; - double *beg = REAL(x) + from; - double *end = beg + num_values; - if (!hasdict) { - int64_t *ibeg = (int64_t*) beg; - while (beg < end) { - *beg++ = static_cast(*ibeg++) / fct; - } - } else { - // convert dictionary first - double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + pp->dicts[cl][rg].dict_len; - int64_t *idbeg = (int64_t *) dbeg; - while (dbeg < dend) { - *dbeg++ = static_cast(*idbeg++) / fct; - } - double *dict = (double*) pp -> dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data(); - while (beg < end) { - *beg++ = dict[*didx++]; + std::vector &cps = pp->chunk_parts[cl][rg]; + bool rg_dict_converted = false; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { + int64_t cp_offset = cps[cpi].offset; + uint32_t cp_num_values = cps[cpi].num_values; + bool hasdict = cps[cpi].dict; + double *beg = REAL(x) + rg_offset + cp_offset; + double *end = beg + cp_num_values; + if (!hasdict) { + int64_t *ibeg = (int64_t*) beg; + while (beg < end) { + *beg++ = static_cast(*ibeg++) / fct; + } + } else { + // convert dictionary first + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (!rg_dict_converted && dict_len > 0) { + rg_dict_converted = true; + double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); + double *dend = dbeg + dict_len; + int64_t *idbeg = (int64_t *) dbeg; + while (dbeg < dend) { + *dbeg++ = static_cast(*idbeg++) / fct; + } + } + double *dict = (double*) pp -> dicts[cl][rg].buffer.data(); + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + while (beg < end) { + *beg++ = dict[*didx++]; + } } } } From f2469d87c0c3b1b5be4b8839f9619801b391c3de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 8 Feb 2025 01:31:14 +0100 Subject: [PATCH 10/18] Fix reading FLOAT from mixed dict + non-dict col chunks --- src/RParquetReader.cpp | 39 +++++++++++------------- tests/testthat/_snaps/read-parquet-5.md | 15 +++++++++ tests/testthat/data/create-data.py | 24 +++++++++++++-- tests/testthat/data/mixed-miss.parquet | Bin 34125 -> 45372 bytes tests/testthat/data/mixed.parquet | Bin 17247 -> 22906 bytes tests/testthat/data/mixed2.parquet | Bin 10211 -> 13538 bytes tests/testthat/test-read-parquet-5.R | 3 ++ 7 files changed, 58 insertions(+), 23 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 0ed71ab..4d965fe 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -957,15 +957,12 @@ void convert_column_to_r_float_dict_nomiss(postprocess *pp, uint32_t cl) { std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_values = cps[cpi].num_values; - bool hasdict = cps[cpi].dict; - double *beg = REAL(x) + rg_offset + cp_offset; + for (auto cp = cps.rbegin(); cp != cps.rend(); ++cp) { + double *beg = REAL(x) + rg_offset + cp->offset; // In theory we might dictionary encode a subset of the columns only - if (!hasdict) { - double *end = beg + cp_num_values - 1; - float *fend = ((float*) beg) + cp_num_values - 1; + if (!cp->dict) { + double *end = beg + cp->num_values - 1; + float *fend = ((float*) (REAL(x) + rg_offset)) + cp->offset + cp->num_values - 1; while (beg <= end) { *end-- = static_cast(*fend--); } @@ -983,9 +980,9 @@ void convert_column_to_r_float_dict_nomiss(postprocess *pp, uint32_t cl) { } // fill in the dict - double *end = beg + cp_num_values; + double *end = beg + cp->num_values; double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp->offset; while (beg < end) { *beg++ = dict[*didx++]; } @@ -1031,24 +1028,24 @@ void convert_column_to_r_float_dict_miss(postprocess *pp, uint32_t cl) { std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_values = cps[cpi].num_values; - uint32_t cp_num_present = cps[cpi].num_present; - bool hasdict = cps[cpi].dict; + for (auto cp = cps.rbegin(); cp != cps.rend(); ++cp) { + int64_t cp_offset = cp->offset; + uint32_t cp_num_values = cp->num_values; + uint32_t cp_num_present = cp->num_present; + bool hasdict = cp->dict; bool hasmiss = cp_num_present != cp_num_values; double *beg = REAL(x) + rg_offset + cp_offset; if (!hasdict) { if (!hasmiss) { double *endm1 = beg + cp_num_values - 1; - float *fendm1 = ((float*) beg) + cp_num_values - 1; + float *fendm1 = ((float*) (REAL(x) + rg_offset)) + cp_offset + cp_num_values - 1; while (beg <= endm1) { *endm1-- = static_cast(*fendm1--); } } else { // nodict, miss double *endm1 = beg + cp_num_values - 1; - float *fendm1 = ((float*) beg) + cp_num_present - 1; + float *fendm1 = ((float*) (REAL(x) + rg_offset)) + cp_offset + cp_num_present - 1; uint8_t *presm1 = pp->present[cl][rg].map.data() + cp_offset + cp_num_values - 1; while (beg <= endm1) { if (*presm1) { @@ -2005,10 +2002,10 @@ void convert_column_to_r_int64_decimal_dict_nomiss(postprocess *pp, uint32_t cl) std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_values = cps[cpi].num_values; - bool hasdict = cps[cpi].dict; + for (auto cp = cps.begin(); cp != cps.end(); ++cp) { + int64_t cp_offset = cp->offset; + uint32_t cp_num_values = cp->num_values; + bool hasdict = cp->dict; double *beg = REAL(x) + rg_offset + cp_offset; double *end = beg + cp_num_values; if (!hasdict) { diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md index ac8e099..4502c0a 100644 --- a/tests/testthat/_snaps/read-parquet-5.md +++ b/tests/testthat/_snaps/read-parquet-5.md @@ -62,6 +62,7 @@ 2 INT32 REQUIRED 3 INT64 REQUIRED 4 BYTE_ARRAY REQUIRED + 5 FLOAT REQUIRED Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -78,6 +79,10 @@ 10 DATA_PAGE 1024 RLE_DICTIONARY 11 DATA_PAGE 1024 PLAIN 12 DATA_PAGE 352 PLAIN + 13 DICTIONARY_PAGE 400 PLAIN + 14 DATA_PAGE 1024 RLE_DICTIONARY + 15 DATA_PAGE 1024 PLAIN + 16 DATA_PAGE 352 PLAIN --- @@ -89,6 +94,7 @@ 2 INT32 REQUIRED 3 INT64 REQUIRED 4 BYTE_ARRAY REQUIRED + 5 FLOAT REQUIRED Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -105,6 +111,10 @@ 10 DATA_PAGE 1024 RLE_DICTIONARY 11 DATA_PAGE 1024 RLE_DICTIONARY 12 DATA_PAGE 352 RLE_DICTIONARY + 13 DICTIONARY_PAGE 400 PLAIN + 14 DATA_PAGE 1024 RLE_DICTIONARY + 15 DATA_PAGE 1024 RLE_DICTIONARY + 16 DATA_PAGE 352 RLE_DICTIONARY --- @@ -116,6 +126,7 @@ 2 INT32 OPTIONAL 3 INT64 OPTIONAL 4 BYTE_ARRAY OPTIONAL + 5 FLOAT OPTIONAL Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -132,4 +143,8 @@ 10 DATA_PAGE 1024 RLE_DICTIONARY 11 DATA_PAGE 1024 PLAIN 12 DATA_PAGE 352 PLAIN + 13 DICTIONARY_PAGE 1024 PLAIN + 14 DATA_PAGE 1024 RLE_DICTIONARY + 15 DATA_PAGE 1024 PLAIN + 16 DATA_PAGE 352 PLAIN diff --git a/tests/testthat/data/create-data.py b/tests/testthat/data/create-data.py index 2894adf..669286f 100644 --- a/tests/testthat/data/create-data.py +++ b/tests/testthat/data/create-data.py @@ -1,14 +1,33 @@ +import pyarrow as pa +import pyarrow.parquet as pq +schema = pa.schema(fields=[ + pa.field(name = "f", type = pa.float32(), nullable = False), +]) +data = [ + list(range(400)) * 10, +] +table = pa.table(data = data, schema = schema) +pq.write_table( + table, + 'float.parquet', + row_group_size = 1500, + data_page_size = 400, + use_dictionary = False +) + import pyarrow as pa import pyarrow.parquet as pq schema = pa.schema(fields=[ pa.field(name = 'x', type = pa.int32(), nullable = False), pa.field(name = 'y', type = pa.int64(), nullable = False), pa.field(name = "s", type = pa.utf8(), nullable = False), + pa.field(name = 'f', type = pa.float32(), nullable = False), ]) data = [ list(range(400)) * 6, list(range(400)) * 6, - [ str(x) for x in range(400) ] * 6 + [ str(x) for x in range(400) ] * 6, + list(range(400)) * 6, ] table = pa.table(data = data, schema = schema) pq.write_table( @@ -29,7 +48,8 @@ table = pa.table({ 'x': pa.array(range(2400), type=pa.int32()), 'y': pa.array(range(2400), type=pa.int64()), - 's': pa.array([ str(x) for x in range(2400) ], type=pa.utf8()) + 's': pa.array([ str(x) for x in range(2400) ], type=pa.utf8()), + 'f': pa.array(range(2400), type=pa.float32()), }) pq.write_table( table, diff --git a/tests/testthat/data/mixed-miss.parquet b/tests/testthat/data/mixed-miss.parquet index f71948af3b4decaf367e081dd2b60576e1586b04..d97fa2ac0e9cdd961fdf45198bf316bb427ce4e0 100644 GIT binary patch delta 10125 zcmYk>3$&)!cn9!jVkQYfCgh9>X?=AGCF&?aQT8MgaS4vpwT;@Y(U$hKw6s;<(;zg} z^fV!YV7oRY^wj-UeZ-}zF2|*+sBZUbIS56O{!Xe^XU%W__y2wN`_8O+*O{~CoA;f4 z$Xh;h$mTg`E?2UxoYhWU$-OIawTN6q?Adkc_m9jwL`3tGWKc}vK*U}atkPJcvrh6< zHn@PnMQrj0ig&YhAeQklo5`2h=6l465xY5@BiPH)?B`e(T#p-aJgeM-TXQ=a2e}ic zvc|o*AE(oKD33UC8SzM)bsodxcs$9OJc+Y-DrfUF&S8Udc{=BF0ng?_p2y&YyqK4A z5ijFnUdbk};x$Xl>v#ii;w`+D;u7A$yLdP6;k~?{Ek49Y_$VLa<9w1&G5HLi;|qL= zukba#&Nkm-d8y^Qe2*XSLw-!06tRm#xB|P`!=W6;;arg`ab=F+DqNK#*~?K}jjMAs zE9~PM?B|%1y#H(3T#IA*8Ge>)v*72r4%g*+T%Y5(0XO95xe+(!cy7Wka8p*f8NbNQ zxdkV1OK!!j`K6P*|J&HymfP{m+@3qoIKYXV#6eExj-0}sxHEU*uAIuiS&Ex*dIaXJs+*LfhF2k~GY!b5o&59bUX!Ef-J{1%VoQT#T)!#cmq z@9}6J!|(G4JeJ3?{6oth@y9%#Kj8`N{KoR6Na9Z=ai%1mD2XRY;>nUYOA=3!#8V~l zXOcKu5`QjA&K)P@k~ivAc<#5;@OgT zjwCMh{_{ec=ej^VPZH0U#E`@bB=JH?yhsu+mc&aW@lr|rwInW*#NSBbWs-QgBrf*H zoEO`?LK3f(#NSF{QxbnCiC0PD)slFPBwj0t*Gb~_l6Zq8-YAJTN#f0tSiZ%E_R+miUEB)%nyZ%g7* zNqk2V-<8CFN#c8w_`W25Ac_B$OH2IFhWL>rek_U0B>w7)MpDEsour6Ebdn;jppz7_ zTPG=Ek4~~IOC0I~NfC$XBt;yqlN519our5>=_Ey5Stlvt2%V&etLWra4lbi?;;Jr? z6mg_ZQp8@Jq==(*k|M6AlN51vour7Pb&?`hbaG|sf=%pmfux9Q=p;q#*GY;vMkgua znmS1l*V0LfI94Yq;%9VnX({4oT_7pq+B!)Q3!S8hpVLW-xQ zWm<|j&IOVpZlIGCaYLP?h@aO7f6b@sZLVF zs!meG&2*9?eo-eW;^sO@5x3Auia0?(Vd;WR5w~=Kq=;MTBt_g>Cn@5Wbdn-&qmvYI zTb-nc+vy}l{Ib5ZBt_ib1(G7}ppz8Qbdn+t=p;p)sFM_Nl1@^@L7k+ClfD0DNs73m z10+S9qLUPHC!M5-JL@Dx+(joT;;uSL5vS@TMcmE%Z+CNnBJS=0NfGzZNs3t0Ns73q zPEy2aI!O_~qLUPHFP)@_doNwMw+j?;ADyI#`|2b`+)pPd;{G~G5x=UF6!B|1NfD>( zBtI}QpDqQk|O?4Cn@5Obdn4?KSqC z_kfk7mN758{6rqS{KQY2d!Dkm^w7Om*tvP{F3Z7K(C8$CViJF9PNS0yib})3S6bEQ@l0h+v zr<&8~B!glSe`ZdjlMISUoNZ2{lZ>UMn8cquK%v<}^CVpqRw-%xQF%X&Drgc)kNPI?14z z#4xAPNe0CvUSLk6lMISUywIFRf1&rEHiKdkFLHoJCm9rzc(FN+PBJJa@e*?yon%l< z;-%&^`b(D%*bItE{Ivr#I?14z#6{*bI?14z#NU|H=p=(;5-&5S(U+EFP)y?G4$$Z% zgJKdFo73ndgJKe|FsIQ;2E`;^X-;F=Ey^i7F^M;t)956FViIpMr!6g=WKc}v%?{A$B!glSZ!xFQNe0Cv{@$EM zCm9rzc&j3~fq85EQFM+azhl0h+vcbn7bB!glS z|71?1lMISUyvKZLX>^i7F^PY6fJP@76q9(bIgL&-C?@efa~hpwP)y?e-v7eV=p=(; z5+CpY8l7ZNOk&HNMkg5*llY)HjZQKsCh;NffAJv)=p=(;5+C*e8l7ZNOyVQvG&;$k zn8d%B)956FViF(q{udv0fKD^i7F^PXOr_o6U#Uz#=cYsDG z85EQFggK2)GAJhTNpl*VWKc}v-_2=sl0h+9#-|*h(MblyB>uykMkg5*lbGf-I?14z z#HY<^bdo_aKkfa;XB?o>Ne0CvK5I^+lMISUe9oLkCm9rz_`ErdPBJLw=a&wA!2ueb zWKc}vi{>;s$)K3Tm&|E&l0h+vFPqcoB!gluE%6lxXmpZ6F^R95)956FViI37r_o6U z#U%dIoJJ=Z6w7IeuRB1alMISUe8ZeZCm9rz*fyupNe0CvzG+UQlML#c-v6|TZ#h7t zlMISUeA}ExCm9rzxYV3RCm9rz_>MV^PBN(Pc>mKTzUu&uPBJJa@n7aNI?14z#P`f; zbdo_aiSL`!=p=($S|;%W2WWJXK{1K{HmA`^2E`HnGe7ELf$nMrWO5gTW@n7IW9q18m|D z4`9J6jWs&!BpVDiDYlqw6IXCQ3sz~Y(ODj!8bk<2W7;I8( zG1(^exSs{9G}h>>lWZ{9q}XCPE!)JQ9>9WC8f$dcNj4a4Qfx8VCJu8y3sz~Y(ODh5R3DvdQd>m(ZtHmRj$i^(=|vi@3(vgv zkt>H^|B(H=9v=JZ`o|o5%&D)wA(e$UbV^YqyLM;tiq zw0qq5Hvjvp@Bi8%H{JR1*fUN%?^{p2{k`sa;J|^C&afP~|ITu7ryn?ZXE|eS=a|hs z?{@jcQ+6&MxO{v0?(*#^_djsr83*pXbGQ7-_45us>AU+*J=Xuk{@>3y-M6K+1E<~W Fe*hs9Bq;y@ delta 119 zcmdn(}un#K1m-_1_`#}qWTMFzJJUWxC_!dF$;&d>tJfoyviPBZ%eb7zo6Z4K{oewvl0AfFOpQ LObiSGjzNY1(3K)1 diff --git a/tests/testthat/data/mixed.parquet b/tests/testthat/data/mixed.parquet index 9fbb6eaf2037d0a0b92e098bcd5543d8a88935b5..7a8ef5a08451ac9db2cb65428862433b97d7a0f8 100644 GIT binary patch literal 22906 zcmeI)2YglK-8k@*kc5OG^I{@Vae#=ZBwn)A8e{LVe+xzD-x+;g6lb5Eo6CP|^FP-a4C zT0;HM(wI9QGkLJLViIJiV~t27ZXnc zmr$HUF6A;Vrvz7UC0B7ZB`L)a2;j2o*TH4awL&V3gxLlMJiF5RBqyCs&EUp zQk68)sYZ2bP?K8JCWAWEr5^QZz-=_75shg=Q<~A7+i5{dTG5&|w51*G=|D$1(U~rE zr5oMpK~H+on?CfVAN?7?Kn5|GAq-_0!x_OyMsWvsGMc-%n=xcEmT`<{0u!0UWTr5c zX-sDZGnvI~=5P;lna91{$NfCOd={{fMJ#3s5AqO8S;lf!u##1*W({ju$9gufkxgu7 z3tQR7c6P9nhk1lu?B-D(<8hwgNuJ_q_V5gQ*~fmK%75n-sCOb<{jSUJ>KU8AMha`@iD*P6Mo4_KIJoh#pnE*FZc~#@)ck64X5}m z-|{=YX3b$}8 zRY@bAYE-8NHK|2yGN?mc>QSEt+(tth(U>MQr5Vk+offpD6|HGQTiVf{4s@gwo#{eX zy3w5;^rRQP=|f-o(VqbfWDtWH!cc}WoDqy<6nAhZqq&Q_8AB#x8OL}gFp)`2W(rf8 z#&l*dlUd9r6ojS}4t`ROYr%`9Gd@X6!Pte8Ce^0aZW3Ds6kx#y~kxusj}T{(5$HI0^MEU30> z+I`ox+}`uey{o1_aAU{4QxdALp0OaM=drb|_N|_|sM5d>4=t#^X4aCMM}B?s&3$WT zKU6g{GVYQZYv(MhKBe%rt@p3JXGQHfSJqirW8K_U_2wt{dTal>d21Rjtv2-v zJ&rD~v#8dlg%9^RzP;Dm2R1F*HQ>bFsl{t=Ui|2glgHMzIkbMTxZAX7Zz0>^4U9wcdU8op|tUNF0H$B?aM1N z=9Ou8Waqk9)-+mCcS+rc*T1%*<&NI(9(j1f8(TW=n|5ivM>f8>qvtE@+dcQlrnh$u z{OHJ%db>8i`}oLjK704MU0dFNIx|<^%j)mm`oX>_MXqcA{O)ZZ9SlzIjd*cyM7}Zh z>gjzEKYDy`$=--#dn00_k1oG;QZ9!{W?E=_V%A|2T=+1_EOBaC%n8TK2jrBEIc4L2 zV%g}>4U~jAZEO3WIyI7rN4o?)0E1z35FJ`qGd73}7IG7|alcGK}GjU?iisgF6|`UEIwWG8xM_#xsG5 zOky%qn94M!GlQATVm5QQhq=t-Uhd<59$-ETSjZw4vxEnEh@~uJIV)JnDps?GwX9=3 z8`#JuHnWATY-2k+*vZ2@!Y+36D39?tPw*s9@icpQhP~`#KhJW2gB;>8M|h6sd4U&s ziKD#CF<#+SUgLG%;5cvc7H{(o@A4k+bAk`}kdOG7U+@XPg4bfgoV=|We!(VZUjq!+#ELtpyQp8*VH5Q7=QP=+y_5sYLMcW@`8 zxr@6QLndPx$9N_%kx5Ku3R9WJbY?J&ae4`-|!`0@ipIYir?}rzvDZ8&-eU+Kk_F|^Jl`fcm$E;B8q5Y$W1J9I#1cmy@{*7ITtop1Qi#G7 zp(r7WaWU~Ea0$gp$!m&DMu2?q)?s;RHPD> zN#!PPrV6)kD^*D&ooZC41~sWgZ8E4sUFuPv2HZwN8qt_0G^H8Mxt$iYq!q1cLtEO> zo(^=R6P@WoSGv)i9`vLaz3D?=`q7^O3}g_48NyJ8F`N;MWE6LBC!@KGyBR|!V;RSI zCNPmnOlAsGnZ|TxFq2u#W)Al-mwDXFecaCj%x3`$S;S(N@E{Mdlw~Yu1uI#_YSyrp zb*yIt8`;EWwy>3LY-a~Md6-Ao#cm$uF&^g$p5!T>W)IJ>mwoK#Sq^ZJLmcJ^&+$Aj z@FFj9l$SZiE4<2Uyv`dO=S|+?ZQkKs-s62v@Btt45g+plKH-;~o(^=R6P@WoSGv)i9`vLaz3D?=`q7^O3}g_4 z8NyJ8F`N;MWE6LBC!@KGyBR|!V;RSICNPmnOlAsGnZ|TxFf)H_mGHQ0cnS}(hNtt5 zzs8l$3gkJxf0on5$A^MZ&S?ajSOp7`ZZu5Q+mpkgG zDnp}!h`6{Q!kH0CE~1DghTOywM;`K$kNjLj0SZ!x!W5w>A&PM^@g#5w#YyB+F5_}a za0OR#6<1S|Qd~o6uB8muQI_kufg34D63L`co(fc?5|v5iCT^w*w{R;}Nh6(VRHp_t zsYPuvs6$=qQJ)6<%uCU;ji^Eqg$g)!3gsL-j)|OOCtEe}*<&ZC0q&T2nPa@dtGveR zyuoqa(Jn-bb|G4{3(=xoh!*Wav}hNiMaQQ`N|;nGIwnXWnH0)Xfr?b3a&$~|6>GC6 zMY`-LFI{$2kS;qaN|zm#q|1)V(q%`gbQchtnwqO-Vo)wFceu_Bl1ZUF6{tuhD#zuH zt0hJDBukeaDbi&}dFisFf^^wYQM&A?Bwcn?mM%L|rMrMSPXx(1eyV*Cm7H4h@4tW) zU&y~mR7z5}S^uicelGb$rKGMJ8UC*uwRFt`+QhrnqDzDF1KV_#b3a80VT@+4}ow_KT zCOdUeI8AoyqHvlE)J133N7u4j{-6HH`Tw(}@UJUq|F4&giORs7ru08#!MTUHGMEw-}C5NfqOcMO93S{z6GM%z&--DLLX z|K}Uir^n^_Pk+oBmq&x@c*)TGb4Ol(s5)~cl?ykiJ}+D}+$Ars%k5WUF;NvR&^LF| zg!{(X>Gy`y`~)X#34LFm$j8&e&FTIAxE1mfv^M_dPu8ERXZfG7-JIU<|HfA1d3(Pu zl1&Ir`s+oqd-8>^vObj>lUY@XmgYewOWDH-!oN!>ND0c3Mk6{hkW8j9hxsgJEnC^m z9u9GoAk>nzZ zXky4sEOF!^FZsyNMHHYQg(yrBiV~t27ZXncmr$HUF6A;Vrvz7UC0B7ZB`L) za2;j2o*TH4awL&V3gxLlMJiF5RBqyCs&EUpQk68)sYZ2bP?K8JCWAWEr5^QZz-=_7 z5shg=Q<~A7+i5{dTG5&|w51*G=|D$1(U~rEr5oMpK~H+on?CfVAN?7?Kn5|GAq-_0 z!x_OyMsWvsGMc-%n=xco4>AeHiUi|Cg7G531d(8(NH9qxm@E=Z5ecS>1k*%<=_0`l zkzl4sFiRwuEfUNT3GNXI=86RKM1p%og8M{*`$d8WM1uJu!2*$Bp-8YuBv>pGED;GF z6bT*@36_ck%S3|ZBEbrgV5LZ~N+eh<608vk)`|q{M1u7q!3L3Fqe!qxB-ktxY!L~z ziUiw4g6$%~4v}D|Nbs;o@Q6sTOC;DW5RKc(D3;MpWIGz>2K96J)yN;+$s+o17umpX(6o-`}0!pByGx zcP9vP%0}l{PTBaMS~e=~OgU}qoVInO`Z5<$L=!`9Vu>RUdC5n9E}{ShDMVq4P?QkG zxR`hnxP;;)aw(T_IVHG)E4hlRDM=}=p)}W0hU+NH_1wUXlp~2`QYcRaDpHBcq;eBC zQ-xc&m8ztXPBp4igPPQ$HW}2RF7>ES18$=sjc800n$nEs+)fKx(u&r!p)KubPX{{E ziOzJPE8XZ$4|>vz-t?g_{pimC1~Q1j3}Gn47|sa(owp{it$%cPg0mHATD5cATWR6W zUDn+Rd=chrkU@PK(TtX~r6XPGNnZvsl#z@klW|NSr<0s>BL81Ikw-e2M{x&tGMc-% zn=xc+h>_DtwjY++56eYzI?49KKKo(6$Uo{NMC&@XzfK{)fQu~YzKtV; zP*gaM3PSn9aY)Iaexfov6dQilJt(%PWI|#>{Zc_od5$`h=f-a$ip_6PYUhAbr<*6n zpY!_(>9Z!4%XyXt|8yhy&TeGLnT^bUDWcf+=B1uL5K-#>O%aJd{Ik&2SN}AZ*t5BG z|Le#9x_PND7e$sjG%nH}I9mhyDaYTAIQ+wGiyc24ZBLepEw=r5WKqqi#Q1#aO`0@p zanqS^DgN`w8^hCx9@Wy*(;E|>>ZPZTNUzo-y;gd9kM!z2vX)hoph0-9dU|@DaP<6a z^ep_+GbvkzSJueNs%Ch$N2755;kgD`3u}jG|FWqcW^1o)ZIE&~T%-)ZpR0V7Hb?>2ZyuL1o_Bvrm4>4tI%;gjPpkDz%j%@}uG{x5yYTWkOT delta 133 zcmeyhiSd3Lw-7!>3mZUxaIv4%+>1T51XC*KLT6Y+I)baW~Q5{@92qhlZtPtJ?@ UCTt_azyLuEo0%9G0vv-30b~^L$lz(OOM z(2Q2Jp%dLm;|PwKO6oX6A5P&cEd}Bktif8WM-w(+BQ~QMThM~7XvH>c#}2e%C)&}0 zPVB~>B+FiOqX+x2A88!KA@t%1j^Z19i(~i>-{S`y#|iw1pU{Vs_yxb=6n@8PoWWWA zf#f-s^SFRN@fZGvVv+++^n(li;l>s4;7VMDtKr1}T!Vqgz_l2J!N`OGABMn>K#co8 zl;JuAaXoIpFl1pkZp2L(fe^wNiEP}ATabfMxD~fyG;)!L+c5@Xk&ioYC&poXjQc-< z;Vu;5ZcM}^L|`I{7z$B@$tcDYOvOF87jaC(eJDXG%21B`F&z)!LCnBRR3up*!o#S< zBbbFpQH9xf40A9S^DrL?RAT`i#}inHMR*cVp$3cbG-|N~&)``sMIDmMSf0c4SdJI4 z0``SjFGG{kK@Ojvdf?3cX69CJMbq zp$!yDQD`HDHc@CZgQYcNK0~9()p|2=(h(cdesFy;A>ET#vrN{8wgWttZ zise(b>%Xn86~hKNhjQd{$2&X&otMszME`$wB-*ujNVB6LP?;~kyLL|XyhXDOj|^u> zen*`Qm)&JDymo6tiU)ar!((sFmoFU(1OxF@UFg?>u!4pskS}*efJ26pW4<#*zN2~J z_6|q5ev>13`3p?#3JuQj^&GHI3VVFLy^c(N)PjDmSzcZ;W5UI&@}G8&wjWCq(Ntr2 zVosT9nz00nSz#}Q_O@AMFNsq7W%e}f-<`#&;UhDR;tDgGFsIrtrxzb(sT1MzKKnCW NdTLBR`%kz$?;pW*u&4k4 delta 131 zcmaEq`PhGhiz=U_k12x$TXAwmYHp$^+vM$r^6X**(m{R1PE@K`ckdKp>uc&G?(J TjSK?=1ToBHVqgex3^D`&KgA?o diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R index d81026b..9f3c8d0 100644 --- a/tests/testthat/test-read-parquet-5.R +++ b/tests/testthat/test-read-parquet-5.R @@ -130,6 +130,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$x, rep(0:399, 6)) expect_equal(tab$y, rep(0:399, 6)) expect_equal(tab$s, as.character(rep(0:399, 6))) + expect_equal(tab$f, rep(0:399, 6)) pf <- test_path("data/mixed2.parquet") expect_snapshot({ @@ -140,6 +141,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$x, rep(0:399, 6)) expect_equal(tab$y, rep(0:399, 6)) expect_equal(tab$s, as.character(rep(0:399, 6))) + expect_equal(tab$f, rep(0:399, 6)) pf <- test_path("data/mixed-miss.parquet") expect_snapshot({ @@ -150,4 +152,5 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$x, 0:2399) expect_equal(tab$y, 0:2399) expect_equal(tab$s, as.character(0:2399)) + expect_equal(tab$f, 0:2399) }) From 4c5d6d36bd1c6844e49e3b88ff9df88bf14dde3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 8 Feb 2025 10:13:08 +0100 Subject: [PATCH 11/18] Work around rchk false positive Plus also simplify logic. --- src/RParquetReader.cpp | 44 ++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 4d965fe..bfe9b57 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -1492,8 +1492,8 @@ void convert_column_to_r_ba_decimal_dict_nomiss(postprocess *pp, uint32_t cl) { int32_t scale = pp->metadata.r_types[cl].scale; double fct = std::pow(10.0, scale); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { + // first the non-dict parts, if any if (pp->byte_arrays[cl].size() > 0) { - // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1504,35 +1504,29 @@ void convert_column_to_r_ba_decimal_dict_nomiss(postprocess *pp, uint32_t cl) { } } - std::vector &cps = pp->chunk_parts[cl][rg]; - bool rg_dict_converted = false; - int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - SEXP tmp = R_NilValue; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_present = cps[cpi].num_present; - bool hasdict = cps[cpi].dict; - if (!hasdict) continue; - // convert dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - if (!rg_dict_converted && dict_len > 0) { - rg_dict_converted = true; - tmp = PROTECT(Rf_allocVector(REALSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - REAL(tmp)[i] = parse_decimal(ba.buffer.data() + ba.offsets[i], ba.lengths[i]) / fct; - } - } + // convert dict, if any + if (pp->dicts[cl].size() == 0) continue; + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (dict_len == 0) continue; + SEXP tmp = PROTECT(Rf_allocVector(REALSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + REAL(tmp)[i] = parse_decimal(ba.buffer.data() + ba.offsets[i], ba.lengths[i]) / fct; + } - // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; - uint32_t *end = didx + cp_num_present; - int64_t from = rg_offset + cp_offset; + // fill in dicts + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto &cp : cps) { + if (!cp.dict) continue; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp.offset; + uint32_t *end = didx + cp.num_present; + int64_t from = rg_offset + cp.offset; while (didx < end) { REAL(x)[from++] = REAL(tmp)[*didx++]; } } - if (!Rf_isNull(tmp)) UNPROTECT(1); + UNPROTECT(1); } } From dabb06297c8d767f947237cc587eb33374e9847a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 8 Feb 2025 10:30:39 +0100 Subject: [PATCH 12/18] Simplify conversion to R types w/ mixed dict & non-dict chunks --- src/RParquetReader.cpp | 207 ++++++++++++++++++----------------------- 1 file changed, 91 insertions(+), 116 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index bfe9b57..1a54aed 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -1315,8 +1315,8 @@ void convert_column_to_r_ba_string_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, lcl); SET_VECTOR_ELT(pp->facdicts, lcl, Rf_allocVector(VECSXP, pp->metadata.num_row_groups)); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { + // first the non-dict parts, if any if (pp->byte_arrays[cl].size() > 0) { - // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1332,43 +1332,37 @@ void convert_column_to_r_ba_string_dict_nomiss(postprocess *pp, uint32_t cl) { } } - std::vector &cps = pp->chunk_parts[cl][rg]; - bool rg_dict_converted = false; - int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - SEXP tmp = R_NilValue; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_present = cps[cpi].num_present; - bool hasdict = cps[cpi].dict; - if (!hasdict) continue; - // convert dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - if (!rg_dict_converted && dict_len > 0) { - rg_dict_converted = true; - tmp = PROTECT(Rf_allocVector(STRSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - SEXP xi = Rf_mkCharLenCE( - (char*) ba.buffer.data() + ba.offsets[i], - ba.lengths[i], - CE_UTF8 - ); - SET_STRING_ELT(tmp, i, xi); - } - SET_VECTOR_ELT(VECTOR_ELT(pp->facdicts, lcl), rg, tmp); - } + // convert dict, if any + if (pp->dicts[cl].size() == 0) continue; + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (dict_len == 0) continue; + SEXP tmp = PROTECT(Rf_allocVector(STRSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + SEXP xi = Rf_mkCharLenCE( + (char*) ba.buffer.data() + ba.offsets[i], + ba.lengths[i], + CE_UTF8 + ); + SET_STRING_ELT(tmp, i, xi); + } + SET_VECTOR_ELT(VECTOR_ELT(pp->facdicts, lcl), rg, tmp); - // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; - uint32_t *end = didx + cp_num_present; - int64_t from = rg_offset + cp_offset; + // fill in dicts + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto &cp : cps) { + if (!cp.dict) continue; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp.offset; + uint32_t *end = didx + cp.num_present; + int64_t from = rg_offset + cp.offset; while (didx < end) { SET_STRING_ELT(x, from, STRING_ELT(tmp, *didx)); from++; didx++; } } - if (!Rf_isNull(tmp)) UNPROTECT(1); + UNPROTECT(1); } } @@ -1577,8 +1571,8 @@ void convert_column_to_r_ba_raw_nodict_nomiss(postprocess *pp, uint32_t cl) { void convert_column_to_r_ba_raw_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { + // first the non-dict parts, if any if (pp->byte_arrays[cl].size() > 0) { - // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1591,39 +1585,33 @@ void convert_column_to_r_ba_raw_dict_nomiss(postprocess *pp, uint32_t cl) { } } - std::vector &cps = pp->chunk_parts[cl][rg]; - bool rg_dict_converted = false; - int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - SEXP tmp = R_NilValue; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_present = cps[cpi].num_present; - bool hasdict = cps[cpi].dict; - if (!hasdict) continue; - // convert dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - if (!rg_dict_converted && dict_len > 0) { - rg_dict_converted = true; - tmp = PROTECT(Rf_allocVector(VECSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - SEXP xi = Rf_allocVector(RAWSXP, ba.lengths[i]); - memcpy(RAW(xi), ba.buffer.data() + ba.offsets[i], ba.lengths[i]); - SET_VECTOR_ELT(tmp, i, xi); - } - } + // convert dict, if any + if (pp->dicts[cl].size() == 0) continue; + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (dict_len == 0) continue; + SEXP tmp = PROTECT(Rf_allocVector(VECSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + SEXP xi = Rf_allocVector(RAWSXP, ba.lengths[i]); + memcpy(RAW(xi), ba.buffer.data() + ba.offsets[i], ba.lengths[i]); + SET_VECTOR_ELT(tmp, i, xi); + } - // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; - uint32_t *end = didx + cp_num_present; - int64_t from = rg_offset + cp_offset; + // fill in + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto &cp : cps) { + if (!cp.dict) continue; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp.offset; + uint32_t *end = didx + cp.num_present; + int64_t from = rg_offset + cp.offset; while (didx < end) { SET_VECTOR_ELT(x, from, VECTOR_ELT(tmp, *didx)); from++; didx++; } } - if (!Rf_isNull(tmp)) UNPROTECT(1); + UNPROTECT(1); } } @@ -1707,8 +1695,8 @@ void convert_column_to_r_ba_uuid_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); char uuid[37]; for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { + // first the non-dict parts, if any if (pp->byte_arrays[cl].size() > 0) { - // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1726,44 +1714,37 @@ void convert_column_to_r_ba_uuid_dict_nomiss(postprocess *pp, uint32_t cl) { } } - std::vector &cps = pp->chunk_parts[cl][rg]; - bool rg_dict_converted = false; - int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - SEXP tmp = R_NilValue; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_present = cps[cpi].num_present; - bool hasdict = cps[cpi].dict; - if (!hasdict) continue; - // convert dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - if (!rg_dict_converted && dict_len > 0) { - rg_dict_converted = true; - tmp = PROTECT(Rf_allocVector(STRSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - unsigned char *s = (unsigned char*) ba.buffer.data() + ba.offsets[i]; - snprintf( - uuid, 37, - "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", - s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[8], s[9], - s[10], s[11], s[12], s[13], s[14], s[15] - ); - SET_STRING_ELT(tmp, i, Rf_mkCharLenCE(uuid, 36, CE_UTF8)); - } - } + // convert dict, if any + if (pp->dicts[cl].size() == 0) continue; + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (dict_len == 0) continue; + SEXP tmp = PROTECT(Rf_allocVector(STRSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + unsigned char *s = (unsigned char*) ba.buffer.data() + ba.offsets[i]; + snprintf( + uuid, 37, + "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7], s[8], s[9], + s[10], s[11], s[12], s[13], s[14], s[15] + ); + SET_STRING_ELT(tmp, i, Rf_mkCharLenCE(uuid, 36, CE_UTF8)); + } - // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; - uint32_t *end = didx + cp_num_present; - int64_t from = rg_offset + cp_offset; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto &cp : cps) { + if (!cp.dict) continue; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp.offset; + uint32_t *end = didx + cp.num_present; + int64_t from = rg_offset + cp.offset; while (didx < end) { SET_STRING_ELT(x, from, STRING_ELT(tmp, *didx)); from++; didx++; } } - if (!Rf_isNull(tmp)) UNPROTECT(1); + UNPROTECT(1); } } @@ -1813,8 +1794,8 @@ void convert_column_to_r_ba_float16_nodict_nomiss(postprocess *pp, uint32_t cl) void convert_column_to_r_ba_float16_dict_nomiss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { + // first the non-dict parts, if any if (pp->byte_arrays[cl].size() > 0) { - // first the non-dict parts, if any std::vector rgba = pp->byte_arrays[cl][rg]; for (auto it = rgba.begin(); it != rgba.end(); ++it) { int64_t from = it->from; @@ -1826,38 +1807,32 @@ void convert_column_to_r_ba_float16_dict_nomiss(postprocess *pp, uint32_t cl) { } } - std::vector &cps = pp->chunk_parts[cl][rg]; - bool rg_dict_converted = false; - int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - SEXP tmp = R_NilValue; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_present = cps[cpi].num_present; - bool hasdict = cps[cpi].dict; - if (!hasdict) continue; - // convert dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - if (!rg_dict_converted && dict_len > 0) { - rg_dict_converted = true; - tmp = PROTECT(Rf_allocVector(REALSXP, dict_len)); - tmpbytes &ba = pp->dicts[cl][rg].bytes; - for (uint32_t i = 0; i < dict_len; i++) { - uint16_t *f = (uint16_t*) (ba.buffer.data() + ba.offsets[i]); - REAL(tmp)[i] = float16_to_double(*f); - } - } + // convert dict, if any + if (pp->dicts[cl].size() == 0) continue; + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (dict_len == 0) continue; + SEXP tmp = PROTECT(Rf_allocVector(REALSXP, dict_len)); + tmpbytes &ba = pp->dicts[cl][rg].bytes; + for (uint32_t i = 0; i < dict_len; i++) { + uint16_t *f = (uint16_t*) (ba.buffer.data() + ba.offsets[i]); + REAL(tmp)[i] = float16_to_double(*f); + } - // fill in - uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; - uint32_t *end = didx + cp_num_present; - int64_t from = rg_offset + cp_offset; + // fill in + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto &cp : cps) { + if (!cp.dict) continue; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp.offset; + uint32_t *end = didx + cp.num_present; + int64_t from = rg_offset + cp.offset; while (didx < end) { REAL(x)[from] = REAL(tmp)[*didx]; from++; didx++; } } - if (!Rf_isNull(tmp)) UNPROTECT(1); + UNPROTECT(1); } } From 46dd21adb5da37ddd9f235a79b2125dc9d60b614 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 8 Feb 2025 10:40:13 +0100 Subject: [PATCH 13/18] Dict + non-dict mix tests for DOUBLE --- tests/testthat/_snaps/read-parquet-5.md | 15 +++++++++++++++ tests/testthat/data/create-data.py | 5 ++++- tests/testthat/data/mixed-miss.parquet | Bin 45372 -> 56602 bytes tests/testthat/data/mixed.parquet | Bin 22906 -> 28510 bytes tests/testthat/data/mixed2.parquet | Bin 13538 -> 16872 bytes tests/testthat/test-read-parquet-5.R | 3 +++ 6 files changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md index 4502c0a..79876d0 100644 --- a/tests/testthat/_snaps/read-parquet-5.md +++ b/tests/testthat/_snaps/read-parquet-5.md @@ -63,6 +63,7 @@ 3 INT64 REQUIRED 4 BYTE_ARRAY REQUIRED 5 FLOAT REQUIRED + 6 DOUBLE REQUIRED Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -83,6 +84,10 @@ 14 DATA_PAGE 1024 RLE_DICTIONARY 15 DATA_PAGE 1024 PLAIN 16 DATA_PAGE 352 PLAIN + 17 DICTIONARY_PAGE 400 PLAIN + 18 DATA_PAGE 1024 RLE_DICTIONARY + 19 DATA_PAGE 1024 PLAIN + 20 DATA_PAGE 352 PLAIN --- @@ -95,6 +100,7 @@ 3 INT64 REQUIRED 4 BYTE_ARRAY REQUIRED 5 FLOAT REQUIRED + 6 DOUBLE REQUIRED Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -115,6 +121,10 @@ 14 DATA_PAGE 1024 RLE_DICTIONARY 15 DATA_PAGE 1024 RLE_DICTIONARY 16 DATA_PAGE 352 RLE_DICTIONARY + 17 DICTIONARY_PAGE 400 PLAIN + 18 DATA_PAGE 1024 RLE_DICTIONARY + 19 DATA_PAGE 1024 RLE_DICTIONARY + 20 DATA_PAGE 352 RLE_DICTIONARY --- @@ -127,6 +137,7 @@ 3 INT64 OPTIONAL 4 BYTE_ARRAY OPTIONAL 5 FLOAT OPTIONAL + 6 DOUBLE OPTIONAL Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -147,4 +158,8 @@ 14 DATA_PAGE 1024 RLE_DICTIONARY 15 DATA_PAGE 1024 PLAIN 16 DATA_PAGE 352 PLAIN + 17 DICTIONARY_PAGE 1024 PLAIN + 18 DATA_PAGE 1024 RLE_DICTIONARY + 19 DATA_PAGE 1024 PLAIN + 20 DATA_PAGE 352 PLAIN diff --git a/tests/testthat/data/create-data.py b/tests/testthat/data/create-data.py index 669286f..c24e20e 100644 --- a/tests/testthat/data/create-data.py +++ b/tests/testthat/data/create-data.py @@ -9,7 +9,7 @@ table = pa.table(data = data, schema = schema) pq.write_table( table, - 'float.parquet', + 'tests/testthat/data/float.parquet', row_group_size = 1500, data_page_size = 400, use_dictionary = False @@ -22,12 +22,14 @@ pa.field(name = 'y', type = pa.int64(), nullable = False), pa.field(name = "s", type = pa.utf8(), nullable = False), pa.field(name = 'f', type = pa.float32(), nullable = False), + pa.field(name = 'd', type = pa.float64(), nullable = False), ]) data = [ list(range(400)) * 6, list(range(400)) * 6, [ str(x) for x in range(400) ] * 6, list(range(400)) * 6, + list(range(400)) * 6, ] table = pa.table(data = data, schema = schema) pq.write_table( @@ -50,6 +52,7 @@ 'y': pa.array(range(2400), type=pa.int64()), 's': pa.array([ str(x) for x in range(2400) ], type=pa.utf8()), 'f': pa.array(range(2400), type=pa.float32()), + 'd': pa.array(range(2400), type=pa.float64()), }) pq.write_table( table, diff --git a/tests/testthat/data/mixed-miss.parquet b/tests/testthat/data/mixed-miss.parquet index d97fa2ac0e9cdd961fdf45198bf316bb427ce4e0..2d0504be7f0928de85d4bbc749b542ef8659ca95 100644 GIT binary patch delta 10309 zcmc)Q0gPJp9l-I|_S#*Wo?cAva_uhWVrq9Wr^5yfF+?9Ht*F>xhaGn4sHGMwR;<{v zjtT2nWezLrv5s}9tRaRNtW1d`h8W_AVTKrDh#`g;Vu&Gz7-EPahM0Z-aMu;nyV6X< zrJwiz|KI=p-Vc87J$Sq{^V#aN@2p9F^7`%xuXokNi97TY-th_P8cFiXlC*u5G}63d z>}p9~KPrumjY{L#gegp82G?OLwqX`GU=BAe%*ekRIl~cEEFN9M;t_T%9%WCEP83OGQA8C@ zQTeCFymV)vfD8jHa6rrj5k(SN6j4P}gde$lILCka_@^%)zf!n!fN-wAWnca?1H`Wo zesqv<&+_Sy4HCb6xND$r&$93S*8{~b8-9GSaB=)EdG7pXu=wS|-2;Ym{4bgQw*$s6 z6aMU=;hsyU_Y4|;>2U9Q?}e{p@l`wn@ztw<3Nmysi0>a09Sdx5zy&Y-2qKIq;t;h; zBa1wWD5HuxnrMql|93Y^bjLA;8O%Zf6=dk}!{t9Q^C^Z#MU`A-fx|8qmm z?;mphKMpzn^Fz-6ry=JL3_1S`L(YF{$oXF!F#m(!>-X<$^F@A@^Vfb+UWlIFbp7bk zXsXxaXZxfy`<=CK{pa+L*LwJMe^Q?7{<@!7H?eMdF{1N|yc8RhFU0n(CC78}^couP z_$q1bs*#|)_D9kP-|wT?2yq|q-Nyprk$|`-5RU@5FYri8I^yO)4RM2@g}6!3!vG^p zFv9{XY_P)tCtPsD3m^OlAczpch#-m>;z%GVZiG^F(#Rl-9P%ijh!V=Epo$vmXrPG} z+USTIpg-Ry(H+G`jAIj~FpU{(#VqEa5c!pKR8T{P7CPu*fDtB`VSyDk*x`T^F1X=^ z4}Js?LIV1@-&*kFePPPpKP7e4qAKoB8>5kV9&#F2m}$s&a`GRPu_JPIhHgfc3q zqJ}yeXrhHSI-&pzS5yFxf;c5qP(y|mI_P155hj>nffY8`;eZn^xZ#BlegqIi2w_AJ zMGSEy#Pye?lR_F9WRXK21r$+285LAfLmdq?(Lx&?QGn2u6u{H{aiAnt1vO-7p@SX< z7-51L7Fc0}9S%6*f*W4=;70&Kgb+pqQN$2OLR^1IIw_=)K^8gWQ9uzTluH{3gVPdK@AyN=%9xIMwnoR1y%vM0Y#KhMg>*WP)7qzw9rOJ6yVI@1rYfaP(lSYWN4v-9tId;f*BTA zVS^nGIN^dDUijcg06~NhMg&pB5QithB8e2z$RLXx@+hE)63VEciW=%@potdR=!gRR zb>ISc=7>>12^G|kp@j~57+{16W>{c_4R$!-gbQwX;e#Im1Q9|Q5kwI~TwMPNI!UCE zMh02rkVgSUlu$+mRn$;N15LEhMn@DNJZJ$tOE&^TDgf8t4^n`?4Ojs3&rvi5lu$tp z8CvL|hXF>IV1@-&*kFePPPpKP7e4qAKoB8>5kV9&#KrZOKoTjWkwF$Y~O#d7u@i|2R{M`B7`s^h$1Ggzc>;|B84IV1@-&*kFePPPpKP7e4qAKoB8>5kVB57>hU(NFs$a zGRPu_JPIhHgfc3qqJ}yeXrhHSI-&rtT($t7t!z7sIVhln3TnvELI*tzFv0{gEU>}` zI~;Jr1vk9#!H)of2qBCJqT>3G(TO90BvMEtgDi5$qktkxD5HWZYN(@uCR$^ov+Y;p z?h^=}7feVKr{*T2lXDZj>C;-;l&B~a1wqz zj{r^~h|>t+48k~z2wp}M=MclIOV|Hv3=%kxB;G^{ZzGL&kiomi;yvW>KJxeg1$>Ai zK0*l}ql`~b!KbL=Gt}VuoJAd9pn)&Z#8+tHYqaqVI`~#xSO0iHqC0|h7{vx`#8nu_ zHQ0nnOkoSAaV=(W9k#-=jm0c(z#MLZ0^6a)tx(}MsBt@F%tMQv(BU5Fu?q(5h7o&U z!d{s1Ff4ckR&o73O2>vnu;VZsI07e*!iD2-<5_rd0zRCCAI~FzQwZWTLO6pk&LV=B z5yd&g#QFCsoj6`Y0_Ty$n@Hhpr11_ico$i`haBEV9v`594^hNNDB)w2@d+yU6jgkN znmGSHM;%|FfiKa-S7_mDwDAo(_*R?(=f0m)K;++sS=@j*+yn)-Ly23V!fjCFcF35= z|9c8}I0uG#3g~+u1H6a&7=TH?f+@$@0HfPkxzLwe91t*KSMx zYwgmL=Zil7FM4X-w|3$F^IOIyPoLhvZnlk1o~f-{@V>EWeRR!F#ouww!b@-5VR=BK z(P-~up*bWLcZr4Om{`n<#m+-)s-m+)yF-lU?-$E%S9E&a`Nu_X`%yL(kq7V8hy?WJ zHLeA9a*Jp0L9ye5n)z-kHfHMH<4e2V+1vF&vHrWeKHl3{tQV=Z4_w%_P0VP->+ii( z(OK*s5S!j-T-?9L(|gstp`9Ad-5Sk~;@owB7jq znmfB=vDndl)zZ|AW8aado;l#WalgZH-QmcOaaE_OgwQby8ebLh=jMlG>j=Rg8# z9AYcBvx+bjNDf9i7vGgkvO#Z~^fhT|q>8 z07w;(5$FpRbO#AKf|+0znCTwr=#=j03DOOuq2kWPAPayTC#N8=5ZGuZs5D3%#7qX6 zT@N$`$OBtw2)ED^Y>OL66}p8U*+4NDSEvP!j-gORPJxcjj*dV#COf)-gkXk(*ntr9 zJV0zT^E4biz|IExpd4fXPz}r*j!r;97_gmu_l}B~j|>9?95KW&GcW`=1{neX%o=bP diff --git a/tests/testthat/data/mixed.parquet b/tests/testthat/data/mixed.parquet index 7a8ef5a08451ac9db2cb65428862433b97d7a0f8..f185ca7f9c908b5d5c159d5a2700972a36135b2d 100644 GIT binary patch delta 5113 zcmeHK%}*0S6n|SvY0-_1VVYK(SPfP}P?3bh7!!6%`BKn=P(IZnU=cy_6T)To;K3uN z|3Lo$G(>MECXIgq-b_qPuX->TZ_2^V{NZ|2R<*UIP5 z%KEWnKhs;-TCkU8Jq!*5=pld>1>Bo$>LJj4Uht}3@FVIFK|~l4L7YI;Bcg}~L?faJ zaVGQOcwC^Nu6o_TRkU?OOp10X;!<=-(J4ij6x~wvNO4Ju%Tn}8(I>?fDXvP9@cBVQ z_vjUJsz9IpDh*TWs18`wYgTp8st#Gzq*WcZsv}l)6jiaq@tBn!x2h9X^}1EPVO4Kh z)s!z{Oa;An%6!+X`Tmfn?>?y#M@U(CucU7|(^Jm$v^_1z`HVeJ^lfK4?M%-)(gN-{ z@-XL2-*u+vo#_QtIDYnf&gS=>=|!Jr1Y`tfwAq~9ggL{D<}8a5gH8`v4O}VHp%?q-_3FU?d>1^-QLOkaS`Sif40Ma zQiQ#O`O{*|g}wWwV(jhBmy0qV-1YVi`DG30x4En_V9yV;Z#7L!_cK59Fi$w>-6qW% zo&y1B*wy;xc$dp1bcJmlHEf{-TkKyaXaLoUdtR%qvDvxIooT@?287(X${#{DAs{Lh zLN*~Fj1a6q2-$>yv9nhqgls~9PWFm~(5?vD-u^B!g>s_$r(WnR>O5nuI#c*5vS>^*B@xoWHzK4H4Gy(bJnHy#jl_RuNOO{WBY zS~rw%8xV!A1K35SZUX`eTnFR|!PVi0a^D6p^U-h>rV6dfO56r0^DF4QLbm}?(0S!< z17h!HV4KU7Bb7039tm?27R^;)lgpGNm7)1B$^Aa3GPwRra=+iHj3~|hzNa#6MZCS` zHsB7;w?feKXliz5VUpEmPA5+tVznnBs7ym3cQONMjJlT7MZ?Mc;?e^TWW}N9`D0~&r#7u1z(~fMn&fwK6;*N@eV3u{ZtxNs>o{WF4bC|;-h4l zZs+8H+W2y2W#q`jSeiOc^OzwGBE~PTY3WtJK9tc% z>MqU@*EhJ`p&TVzb#mTAPsNtF@s4IM)Wo^$V2V13%or_c9ybi8c)Ouf(P0cLqm9;w RCc*$a|M05{Ai)RD{RSHO9ZvuN delta 412 zcmcb2kMY+g#trgee3CJy3=(X`$r-7+iJ}~nv$NIN#SW-S%1HWbUYH%ms3mseBr8xr zjYDkH4JHu=21ywS9><^{|8Oga9*H+h`XErAKDi^;zCO^=(a|{_NH|7<2p16F(G^6b z2Y^(8SiWFEcM#JN%mlN*O!r7fr*ub8kZvFi6?ZNMSpecV1%ZXYMms@d!NMRS8Dw@n z&=jBq*g`|Ng`Qwr+(4?(E%e9+in+MLEN~2kDsl>RbOtk%9bG^)%uuix#5@l$8`T3E sjvioV1AR~qG61Ls<_$+Dpdbv`PFBoU5%ZB@V1OfrP-X^(0LLIh07U?8+W-In diff --git a/tests/testthat/data/mixed2.parquet b/tests/testthat/data/mixed2.parquet index 687c721bbb2bd10725c8c504bae43abe9b655c33..6dcb2031b18d01258da9ccdde1f2c44b4508e1cf 100644 GIT binary patch delta 2232 zcma)6&rcIU6n+a$vu#|dgl%?X4~>C}N|Yib#+b0vmI4A&Du`f}VyWe)q9`<6w#JJ` zk|ri59_mdKV@P`NXipvu{slHBCLVkA=zBY@VGA9)>Gytp^Ua&LlkKmSKd+?ykHIN5 z*Qb8c#?^vTg?50P4$vooqcW(pf#H%tR%GxXIuIJdhwvlLAvzHOL^t9hq6cv~e?y6w zD9kCc1DJwZ2ZXo_a~I)mh`T6v!`zKQ!1mY`N{Wd+ zrxZFS&oPbL)7$oR+MdqX(>wNb)}G$Ar*oK^hZ@h@>3jAxVNdVd(+Bo+!JZ~ve(Ne| z?qqe=LUn#9(|cc3h$AFzyv*r`Vmc+Jmm1S1HBUF@iGC!eGh%vKNSm-CSty|Jje^IwAJ+f;WKXwnjROI^B8PZWO#`z12~ zKGm%cfYYNRr?w#J{l4Hm2nOE$lmg$sNxteJIG|`GIw$BoszU{}1<-g@C)T{L-?!qH z0onWJuhapc`?OOm9-o}O&X4BS3awY@*=70g)j@NHF&3FX!&b=*lVLk#LS&+=xbzxD zks-37DY8~b0S`lK6!mT6Qc`BZj2cjlWi2~L*4ocXc{rcJZYImH5Ct`2Y`4}h${S{g zsc*PTMcfeUbfC!0|Ax!NFq-N}!!#|On6B+_jgp{MQN~6s!49+DM0HmxF~&DcQCL4g jnu<6;#L}l&*viQaS*Ol_=BR!7MP-J<`!B-O&@I8%RUNor^&hfH+P;lQ%j@)hC0E33POV8f^$S z+7oP^8(1Tn(H_}AQ5RR3QI4TdMNWZ^&R}M;qYH?J844DInCAgz!_Cu$INJm4K%iU8 lK?VTTz#Q!81Qdh;+sUgPRm6N`7#QG)!I+tWA;2-n5CBOQX1D+V diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R index 9f3c8d0..6f70022 100644 --- a/tests/testthat/test-read-parquet-5.R +++ b/tests/testthat/test-read-parquet-5.R @@ -131,6 +131,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$y, rep(0:399, 6)) expect_equal(tab$s, as.character(rep(0:399, 6))) expect_equal(tab$f, rep(0:399, 6)) + expect_equal(tab$d, rep(0:399, 6)) pf <- test_path("data/mixed2.parquet") expect_snapshot({ @@ -142,6 +143,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$y, rep(0:399, 6)) expect_equal(tab$s, as.character(rep(0:399, 6))) expect_equal(tab$f, rep(0:399, 6)) + expect_equal(tab$d, rep(0:399, 6)) pf <- test_path("data/mixed-miss.parquet") expect_snapshot({ @@ -153,4 +155,5 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$y, 0:2399) expect_equal(tab$s, as.character(0:2399)) expect_equal(tab$f, 0:2399) + expect_equal(tab$d, 0:2399) }) From fadc7c980f0248323b217eb175bd311514f2d938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 8 Feb 2025 12:18:16 +0100 Subject: [PATCH 14/18] Fix reading INT96, and add tests Also test mixed dict + non-dict pages. --- src/RParquetReader.cpp | 17 +++++++---------- tests/testthat/_snaps/read-parquet-5.md | 15 +++++++++++++++ tests/testthat/data/create-data.py | 13 +++++++++++++ tests/testthat/data/mixed-miss.parquet | Bin 56602 -> 70231 bytes tests/testthat/data/mixed.parquet | Bin 28510 -> 35420 bytes tests/testthat/data/mixed2.parquet | Bin 16872 -> 20537 bytes tests/testthat/test-read-parquet-5.R | 3 +++ 7 files changed, 38 insertions(+), 10 deletions(-) diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index 1a54aed..e43a222 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -349,7 +349,7 @@ rtype::rtype(parquet::SchemaElement &sel) { tmptype = INTSXP; type_conversion = INT96_DOUBLE; elsize = sizeof(int) * 3; - psize = 8 * 3; + psize = 4 * 3; rsize = 3; classes.push_back("POSIXct"); classes.push_back("POSIXt"); @@ -1130,14 +1130,11 @@ void convert_column_to_r_int96_dict_nomiss(postprocess *pp, uint32_t cl) { std::vector &cps = pp->chunk_parts[cl][rg]; bool rg_dict_converted = false; int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_values = cps[cpi].num_values; - bool hasdict = cps[cpi].dict; - double *beg = REAL(x) + rg_offset + cp_offset; - double *end = beg + cp_num_values; - if (!hasdict) { - int96_t *src = src0 + rg_offset + cp_offset; + for (auto &cp : cps) { + double *beg = REAL(x) + rg_offset + cp.offset; + double *end = beg + cp.num_values; + if (!cp.dict) { + int96_t *src = src0 + rg_offset + cp.offset; while (beg < end) { *beg++ = impala_timestamp_to_milliseconds(*src++); } @@ -1154,7 +1151,7 @@ void convert_column_to_r_int96_dict_nomiss(postprocess *pp, uint32_t cl) { } } double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp.offset; while (beg < end) { *beg++ = dict[*didx++]; } diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md index 79876d0..00de204 100644 --- a/tests/testthat/_snaps/read-parquet-5.md +++ b/tests/testthat/_snaps/read-parquet-5.md @@ -64,6 +64,7 @@ 4 BYTE_ARRAY REQUIRED 5 FLOAT REQUIRED 6 DOUBLE REQUIRED + 7 INT96 REQUIRED Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -88,6 +89,10 @@ 18 DATA_PAGE 1024 RLE_DICTIONARY 19 DATA_PAGE 1024 PLAIN 20 DATA_PAGE 352 PLAIN + 21 DICTIONARY_PAGE 400 PLAIN + 22 DATA_PAGE 1024 RLE_DICTIONARY + 23 DATA_PAGE 1024 PLAIN + 24 DATA_PAGE 352 PLAIN --- @@ -101,6 +106,7 @@ 4 BYTE_ARRAY REQUIRED 5 FLOAT REQUIRED 6 DOUBLE REQUIRED + 7 INT96 REQUIRED Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -125,6 +131,10 @@ 18 DATA_PAGE 1024 RLE_DICTIONARY 19 DATA_PAGE 1024 RLE_DICTIONARY 20 DATA_PAGE 352 RLE_DICTIONARY + 21 DICTIONARY_PAGE 400 PLAIN + 22 DATA_PAGE 1024 RLE_DICTIONARY + 23 DATA_PAGE 1024 RLE_DICTIONARY + 24 DATA_PAGE 352 RLE_DICTIONARY --- @@ -138,6 +148,7 @@ 4 BYTE_ARRAY OPTIONAL 5 FLOAT OPTIONAL 6 DOUBLE OPTIONAL + 7 INT96 OPTIONAL Code as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output @@ -162,4 +173,8 @@ 18 DATA_PAGE 1024 RLE_DICTIONARY 19 DATA_PAGE 1024 PLAIN 20 DATA_PAGE 352 PLAIN + 21 DICTIONARY_PAGE 1024 PLAIN + 22 DATA_PAGE 1024 RLE_DICTIONARY + 23 DATA_PAGE 1024 PLAIN + 24 DATA_PAGE 352 PLAIN diff --git a/tests/testthat/data/create-data.py b/tests/testthat/data/create-data.py index c24e20e..dfb3322 100644 --- a/tests/testthat/data/create-data.py +++ b/tests/testthat/data/create-data.py @@ -17,12 +17,14 @@ import pyarrow as pa import pyarrow.parquet as pq +from datetime import datetime schema = pa.schema(fields=[ pa.field(name = 'x', type = pa.int32(), nullable = False), pa.field(name = 'y', type = pa.int64(), nullable = False), pa.field(name = "s", type = pa.utf8(), nullable = False), pa.field(name = 'f', type = pa.float32(), nullable = False), pa.field(name = 'd', type = pa.float64(), nullable = False), + pa.field(name = "i96", type = pa.timestamp('ms', tz='UTC'), nullable = False), ]) data = [ list(range(400)) * 6, @@ -30,11 +32,14 @@ [ str(x) for x in range(400) ] * 6, list(range(400)) * 6, list(range(400)) * 6, + [ pa.scalar(datetime(x, 1, 1), type=pa.timestamp('ms', tz='UTC')) + for x in range(1800, 2200) ] * 6, ] table = pa.table(data = data, schema = schema) pq.write_table( table, 'tests/testthat/data/mixed.parquet', + use_deprecated_int96_timestamps = True, data_page_size = 400, dictionary_pagesize_limit = 400 ) @@ -42,21 +47,29 @@ pq.write_table( table, 'tests/testthat/data/mixed2.parquet', + use_deprecated_int96_timestamps = True, data_page_size = 400 ) import pyarrow as pa import pyarrow.parquet as pq +from datetime import datetime table = pa.table({ 'x': pa.array(range(2400), type=pa.int32()), 'y': pa.array(range(2400), type=pa.int64()), 's': pa.array([ str(x) for x in range(2400) ], type=pa.utf8()), 'f': pa.array(range(2400), type=pa.float32()), 'd': pa.array(range(2400), type=pa.float64()), + 'i96': pa.array( + [ pa.scalar(datetime(x, 1, 1), type=pa.timestamp('ms', tz='UTC')) + for x in range(1, 2401) ], + type = pa.timestamp('ms', tz='UTC') + ), }) pq.write_table( table, 'tests/testthat/data/mixed-miss.parquet', + use_deprecated_int96_timestamps = True, data_page_size = 400, dictionary_pagesize_limit = 400 ) diff --git a/tests/testthat/data/mixed-miss.parquet b/tests/testthat/data/mixed-miss.parquet index 2d0504be7f0928de85d4bbc749b542ef8659ca95..7d25585b975bb024b0476e21a75c608b59801128 100644 GIT binary patch delta 12918 zcmZA72~%ZQ;_?+Lq_i5i}-(A<*$6hjvr7QAejwn}2)Dq4~sU7ojQ%g`% z6RC(q2|*%}qLjSPh^~t56-1)54w?!wrcSWg8QQwQY*%>Z1|A-8+7o(uLxvAX`@$?g zcsv?h$3Rg47zM)maUhxiQ4^tB5`@l^pfCjVCd0Zg_#F;YBcNg`I81|MQJ^~mQf9)> zSr8rr_h*B6E*y>p?fHZ4%ZNcM;8qe0SqXbq zf!bm-yEL+{hDsRU%tLi9O!dLG;_ zLh&Uqz5?l`(B>MT38|15+Nu zgC}733=ThsZZBcwEBICip>N<`Jq&vb2i}3^2UyVvUp{IoNMwRP6K^$v%~#ml4C>!u z=@0ny6DG94_1`e)ucm_hE|Jy)bD7R<*iVp#_c9PD2UpvIRXfOS4@wHKP!SqBg1-`6 zR)&6^Ax9M?3hKB(173Fl->z_83(UH~79D8U9p?6cmwMoB0A~!Lw=rxs0l8if(;J?e zf=6FCWdJB=dkm3bDydi89l>35sG#v5=tpG?G3(bKLGG18!I}?buL9j0vG(sRT z6h2LciQ#Z#3JjhKyCOj~3Kma?4>KTe7F>&l0kdJp98ijdMf2e8d>9i4m*b)TBG@Ke zwc=t}AX$QMmV)0hxUd||l3;5xv|k0WDe!VN_^gGq>%cS(Hm`@a=`cG3o^1q=&2Tys zdTxP?tstEPv$nzG?clltit@l{m(c&~cN0Z>A!;8~?+53DP`dWsALwQVE6w4X1%&p8dzLV4fKdMj1`;&~!wPHo zG6aHc;nq;F84i2Jpl%OKN5CfsnBWB0onep*>~e+9?vUUC?>!;V8?O3*l`rJ_fs#Kg z90Lsj;4cZp%j2Nm1jv~P3c;{o61)xp-^p-349up$mI!DU33I2x%P8=k0cU1H?`YT@ z19Ed9W-dI91&{e~Y5|zUL;6CHmI(DfGl5vU7+jV@Q6d;FhqM*&HwmIv!lPB-v>FQ5 zfZjS-n+h#y5U~L&(qY6#IJyaRG9e`ker$oTY$(qG@pd?r3lgn7Oxg*}yC7r_+}R7Z z`(gh9=yC{_9fr?GAm|v}%m?d2*nJ$-ieSk}Xe@?tr=hF_2A+kybD(?yBp2b`B?!0z zS4t&lc@4IgLB|^qcN6Mw!RR}1@h+H`L-u{>@DS!zz^h6a^$5;YL!Vm6d<^nWVa_vn z{v14CLdh%W^%^$50htDfek;`9lXpb74{)*(j6cDK&mi>$rhkQ+W*GS$j{gAtUy#}Y ze||&cUz)K})J<=uyo_TT2J)q$yDY4hgI{f7N;`Pa9_$q0up(%866Q}UcpRjUhc**oMiA5lgG&gU2nB;MNDYU- zQ()RusEP!~C@7c?J!Zn1S@0_wB4)$GIba_PN9KXf0$3FXlJD_2c@dNgwK#k+99#lj z6Cr6Cd|eKclHhhS*sg+oDWI_i64%1#bucjvZmfsF>99KkR5!um&F~=;0=K}`tzeY{ zxvh6%rCj2|9ng>m{=4AvZs@lca`u730a$PlULOMABXIsGnB~Kk0%&&}=AM9;Mc`cw zXHG%y64-nOr&MQc%gCDOUOj3{K4Iq98hu(u$BP4x< z=1&mP1b4oGZ8Plu23>x@vY+tz7XO^nW>09_8)lorvp(Qq2B*!1`tR9~n9(1ktzgywcsvkX2SbrH7}>yjTM!L{sNqm8 z24{OH907Wcu+9m7JHu2LsBi@bcR1z&x?Ygt4L^M#+!rMG{ZQ-=hsS{SSV#_p=5Y`@ z0q#zOp~0|!5@?3P^2yK?20>HcRs;-*ggw(hZ8|KS0Uu|=_-H7LfkAU%=Uh;k2a@>` ze7^w3#zW~suoT8}6QI))h+hiziQvB+F0BBIWXN6#3MnvuHPo$vQS0D*D)e0sSsOq; z1Lkgo7n{H<6H2n6_g10)H)RuLw?XuFc$y3Dc~HC)jCVu&9%!=I!@>g~`{TybOll zfP*)o>upH717GjLq;j}@A8a4Oz6#K&g2YGgxf&+c!i~o;_$lmu2C6S$@k{vdN~pha zuZh>*z<>tW@fMWd!=exHwh_jBf-9e){}<|G$ZRLf-?jsBwgNm;1kX-zS_yipKt^YfQG;3P@K^&}HQ{7e zFw%zg-9V%Z)4M};4{+9pLIcn@f_295+XSZeg38|D&<7;P`l7BmthRuk{UF>D?puM~ zKsY=Iw5=g|2z;}F(4la57z`D|0Xxtf0m~hr$q|B_;pRxNc7@$;pymNfJfYDG#`#FF zY!nRigS^q8JO(5I@NO&wjDsuV!Ez#O4}y-9AT9*zLt%6nTnq;N1%2we}uiX-slC~zk{dm!MzcRKZ5aRNN<8RUo{nM zW;E-H{@rT6>59}uY8k3(+f+@3CyGjHa%ytg`8U%vWFOY@ST zBqcm^bors%`pogfPtI-70;#{@Fa0b(P1Ie@My5)NeMf01kbxd@u%<2il81=)@UR2e zE5ebEprZtzNe6E0f^84jrw1AakZ1^>jbNe) z+~^5|d&6#1Q0)tg&ESJMjOz#2`ojP#*fBtY%7b9hV0dc{V{G7xE%YA-+lGUp9mLtg zn-SpW2p62dd?ajjf%a|?>kh9xz{d;DdV}dG$n=G_LL<-ihi7BJbF9#5rvr&S$3w;h zkO_iW!SHwzxQ0T}WH1bev?=g60-_?}(KK+H4h1tnZx*bLhL#wJm;)8Gv&@KjaCAQC z#6e0t{8$KK5@7+#6F|HK4lM<(WstNSnpZ$bGTd1SwkfcGHFQ}E%hti?R0vuRH#dN_ z(8;?ug4$+Sk_nAjFm5Z9Wy8R2khdL_cL<#($s@ko2?4v|${w)X2ix~U$Ab`e2A8b|G{)0rQIB)kzq23eKH|K4&2FEXbdSITzshMew`~C09Vw>nd)%1~S(n z`UX6?32wLHH$oD2sITjvI>qr0{t3Dt%X02A@V6yJ_E-WkpB|8*TL%7 z@aqjsX@CcBh2^(<4@W*gw~w&$6MXv&lfS^duQ2Qz9QY1hf5M7i@TCQU|G@3P^rDSa z58<}Dw+#d8GLR?>pX6YIJX~)FgFC=31?b!n5;_UJht~$+YXldJ!K^22=>_dfA+`^^>rG>SkHw$v7j~|mMnnAIHCT>FC>;Nf`JK;w-}U{!lFcYy9~yxfGbJR zeQDCytS*J0S0TI%?q3JHn{fCRXy1Y4yYTHEgx-g{4`65o9H<1%N3gsa znra~UG2D6rL!QB&=b-))mcD|Ibuj*oP=D9!iG$w4&Uc{l0TLSF{YM!48A_YL>MP_n zL#OW${{tF+f`1EK`VAI;>BAh6p73u?g*FV#mxekS@Rft}ZJ}>F$Z8Mm6omQbDiU9G z1TQ5hQHEZfVWTR@s6(^{Jm~^%UE!n_7K#|=T>7*b8(Pfv*K z4V9+g*cbB6pu5BZSNDTo{b7m~JQx6WgW$+u=r#ma+Q2tkm^=*b4ToWNaKIkAI=~7? z_~Hb?BjL6S*to%7cTo3)L@)T{4HHJeb>G%+TBC`({GoFIB#ea*fe<(zu1)}}AlMNM zN+GZ?6dESOm~gl}1^P{eoJdfNf(6sz^$hTx1s9^hOz6WcbD&);#LffB%lYUX2WR5J zbP;S8s#0z-#4LelOTl9qoLUYhNsy5Y(yL%*3e>I!*R@cz4h+*EZ9Rz6Au0nNZ3L&y zP?!mNTVU;0VWyUBh}Z@Xw}br-IFbiCyI|FB_`U}w?}PIFF#I4KJOo{jK+;k8dJHBN z!0ke?JpucQK%*EEPr>KYF!2oBI17W%3-!1A0#Wr6EWQjMuE4mfaP1llxDGpRfbuO^ zbQ|8@fid^sN;&j@0NWmdVkN{?!J9|mR|6Mn!TbqqeG2WLL+lH9^%8s}b$IqQnASsP z1GIeyv){wB58(L`PJe=)O_1>gWSU{tH+cLVTz|sJUtshb*8iaoMIwEn52s5pP~8Tc zWuQA=G!0?75i}V?a8J0^3x=4&o<2hTtD6y*n!`s67~dbRTf(3LuyY`&42A@2 zcs~Tj+Cu42uo@1zV(4TK@gtzY0sNidk~3JiK#nUYxWjx8sPhD0Z;+h#!M?tb0rAN_H6=Y?C+XL73 z!hrpdcL0v$*9Pw7USY7vbe4 z@V)|PO2PCRY%T-28xV67p4|eEJ8lY-q zK=W@1`Aak25g7S`M}3GlfO_sd~y5|k!`)hftMflg~6el0Yt1OGI*v>q(dAtwVATK|{M{LRF=Oz_&tdNiP=5u9b@1smOsI$J4KVl}?0OHK8zJE% zeE0-`O>p%KST)0rZ=m!87XE~WUohr3T>e8J_7fQjH^m$&1{9@1vOor3%YtuPxF8Q^ z?O{s?Xr~CV9pPms@K%O1DqyM#o7F&017fYXHq5~PaAl(CI>OrkOxEex{ z5g3|CFs&zudP9^cJn92ZW>9Djdi`K+e`v9UsRN*5AdDCcN3B8E22yO{$503x4)?_% zwueI_K+6%5ouJtnLR{dEt5E+#-HH1>poj=8JeW5h z>K4GLcsREZ`Up#wl>qWfV9ru_o(Nvcp=1U0N`{RqK{f@VSHqJv;Iw zzf=Zs#zv^w1S2!ycorCJh16{LlLL|4p(+;~^B{jGbl(kY_Q0>bJawG1-|*juj-m$# z427qTYwXm%Xql>=lO+pJ9fjwMJzAeHo|EPIqC`sG{*Yno1I8nVIZd~tu<{su&xgr{ zaPK$_D}sY3q3bDFaT>mqz@)Qq`yAL@fV~$%<1!>(fls9{;Tqg1gTXgo*G=er8y4RY z7W2bh2rP%I_hG<8*iivWRj}|8ysd^YwQ%_{^b?wP+cQvn0SjKj>sR3S8ZNv6vj*7m z7TUju*bnfs5xhUa+0S75MX0~cUx{+xAm%$f{Q>U3ptuE0{y_R)TDOgqkzbL_j5h4o zN`s3moR9+pc}Qyqf7`<}1*lR4r%q6y1U*z>O=oCPg$Q+cr~wjtO+4BabhKerH~6jt zVcns;2MpJTg9e~w1WCs5)dVK>f;&Rv+V+8ceL=$ekxHb$1h#}7olt;iK2YBlU0nTt`B=o0gWwyCNM-Pbegg0K`=K~i=S z9S!Zrz`Ou>H5Pov!P)W9XCh<<3G3f>5^;72JP!rWFgP6!JtJV_RFIhlv!dYfba0yq zCuf0C46L6GQgdN?EL6_}=LK*)4)hnox<&9?xYo$UP`Lyg65*I+8S1WplqC3(3}LI_ zehP@!z@fFEl?use(7YZ((&0`94BZ6#H$#^!ShfY4wn9)2+}sA%xv*yksO^L$yP$D5 zjNdEtf7w3bzypwX5L6C>EVfzW_coO1^q5c$%E`f_@z~UTapN9?? zVcsREy9}dB;oMc|QwCYrLH;I4=G?;Px54W!l-z?}_hI7$kgb5|N_bKQZq-m+1ICYG z!xNBt1~Z;R%?lX$3Xa!-!5c`ehd&Jv`3|bygJUD)e}wLzh2P1VCiwLQrZmHYZ(#oe zj{JmfEwJ)8eE&lmP8JypchGxM3=ETogEG)n4py{2d9Hj1#fBWNf=qB49^ zfeET`T}_yOkOu7R0xDf0K?~k%!&n_C)di~_kgErs3?SYR8jQf-1TOUii{6l93JQH; zz8Tb+gKs}L-yix4eV8=>+6{uagN6EkVNLY1fit$ydl+mQ4zhL-V-HV9fV(3UJAuha zNOyrYZZN|gYCXWk3r={0!6-=cg}*{GPV73Id_WcvvHufGratA{ZV{0{c)n zIvI4rVbv7)9syyIP(BTYPltmuKx-BxMZ?z^m^26O%mv$dux~zS#KE$7_`DD%3Tu8N z0j!q@^}l;5QFR$CUJi{bU|cd>TL}YFAa6A&uZ2bH;B6`dtcNQbpnnEz+Xx*uLtG}j z$pXKva4{Rq=|h{X+l~KwcWl4ISfnPS8Lwa^>LM2tGG%g9u$sbp-6}I>H95mJU8Exh z4-l!JGt|`9oR;rbby-ScB+^!s)3g%FDQUJIRcpgx^_mK4^@4j+Mqdir8fCwf>uIAV z!+o}maO36l0!r+OMCu>v6?wmryn5Mjc~y}}Q@e|T*vH4y-};}qyL^{66Sn?hJF!^o z&0k@26^moVcHv@2v3Px8wTUD^7<3m7FZj=4vDhwFh&}@U{9K4|9(L^Pr?d`+2nuKY z&o)AM4fc)fCl-f`*>MtH?f*UMEp}*)!dV?h`+3LO1^jzHv1DHBAwR+Ycf!^2x3};?T0gPHe00T;Xf0I#jTgZo9w)W72q?52rJThb^k1b zaEgD<=PVXm346lfe>NdHhYOi2h!sx2ds_E}F*^p?Y;D)q));6{<9pZt{1NSv5sAFS HJ{JE6X^zQ0 delta 470 zcmcblQU9t6GgeE@8o6FXBRu5Dk&owv;8M8<55OUu}PD- zfFf!fV$+_mi7+ro%1H1y1_k+tTS0V5EMV3LfvV}M0*nszp^lD@F1|p*F$+Yvg9yiR z5a9$O+_Hd5Er1MH7grG9DG)@0nIHy)>68s(8I=K*f^?=kJAy0#vYZ?%Kr)VzAOdV2 zhyi6rr2}oNcT9J50x=-K$fMgs&zQV9D z6{OTR1ZqCSD={EdZjO$gAV)ftJAy+3YAVDGh=Ivi4Fm}SJ>UY?pX}%i=7R`Vu&bww X3o)ul#>g-*AR`8576yg@#~?!hBw%^E diff --git a/tests/testthat/data/mixed.parquet b/tests/testthat/data/mixed.parquet index f185ca7f9c908b5d5c159d5a2700972a36135b2d..552e460b109e49cba9b0f98ca7dd4bb027293528 100644 GIT binary patch literal 35420 zcmeI*2V7IEM=F#S zgdBrcEp!AvG@uDBNT3aILNVw<5Bew$0~n$NO2P=G!1W1bP!>`cqZ~|Nit?xcGgL$+ zRE9aKpem}NI%>cIHDQTbsEsG&=rx0 zLNsE~4Y7zrcl1C!63`QgNJ204Mjs?21%1&E{V@OoF$ja9z!0QjDAF(t!!ZKs7>Q9B zjWHODaTt#bOu$4;!emUrR7^uAreg+XVism&4(1{Y^DrL^un>!|7)!7e%di|PuoA1V z8f%b^wOEJs*no}Lgw5E3t=NX`*nyqch26-(9^_&#_F+E`;2;j+Fpl6Tj^Q{?;3Q7r zG|u2G&fz>R;36*JGOpk%uHiav;3o2L3%79xckvbO;XWSVAs*ptJjOS8f~RCfiG(&T=fDCdt!5J=ag&W-A0Z(|r8$R$wOSFO?{1JfG2t*Ltpe=$Cf_7++P;@{? zbV3+9qYJ_jfv$)|6rvG>Ziq!3x}yi;k$|2^L=t+TH~JtMDd>xS=#K#yh(Q<(1%@CM zLy?AI7>*H0$4HFAXpF&FjKg?H1!;O|>62o0r3xddkyJ;jV`d@|CzwdYXP2>H`N1W= zkf!%)pN8IGK~vX6mZcb?rCZuEI45h!&|*eaJu+qUQil~cv5CHvGjHgKlIBk7Ci3}d zBgf7s}9R$UHd$`=eDTfrgm!KFP5MvSj4OVu@XT9`hen#V-TklckMC)M+;FV zS}zS))^7juDJ$A$cR4)6b;ZqS z9$Yhb_h3Ow-^?RBYtK;KGPOewWzX9?!l;_pERVJG_m4KQ?RM?Z+64#4n>&v(^IW&^ z@FdID%R>*ZTXb}qRrrxvp6eGMpXreF=-T1+OHR&}4b`vcwPES$1s;=XcQ~?P+1Vw2 z3%q7~ZCrkSMezD=*N<#madA!9-cc33H?6$9KKktP4o5ewy1F^x&XL*Pn^#}op7Qe1 z^`o2D+}y3uDqhKFOZKh3>1FD4Jho-+orA)=T#XaC8U}-TRNw8@_|3l`_)4zEnOqHB z?Njrc57**pqL4|~nJJ$p!l$1m3iDUTPQt182Cv z6>e~c2Rz{gZ}`9$Ezt^o@J9ezBM?DogSH4p2-=}NLeT*o(FtMbj4lXA1iB&;QHVwi zx*-;E=#CzUM*?~x5lQHU-spp5q@XYQp+5#-AO>MD6c~b33`H7-VK_!09V0OcqcH|! zF%ILAfeDz1Ntlc&n2Kr0#B|KSOw7V;%)wk_VIJmV0TyBr7GnvPVi}fW1y*7eR$~pa zu@>vF9viR`o3I&Muoc^|9XqfSyRaKM*n?c`#XjuE0UX339L5nG#W5Vm37o_!oW>cP z#W|eE1zf}>T*eh##Wh^V4ctT?Zs9iW;4Z$xJ>16wJj5e>jmP)~Pw*7a@EkAj60h(r zUgJBw5#Gsk{?31WWbC<16~@w0#&xAR+I;@u_Ldbt(y!ZF;w3&5#a|897C*O!4Q$a6 zcCbeyIKUB&(F9G=49(F3GRWZsXSl!>Zg7VOJmCdz_`nw}(F%U>M*vzQ5J6~zwg^TD z+Mzu{(E%ON31R4rE(k{ix*`%$h(-*$Ar^7yjvk0d0(v45N$7>%=!0aWpfCENKL%hR z24OH17=ly`MH+@-I7T2HBQXl2F$QBX4卼jUn2afyifPEibj-j^%))HU!CYiv z9_C{K7Ge<=V+odG8J1%OR$>)aV-2#g7VEGc8?X_Zuo+vh72B{KJFpYGup2qpgIw&z zKJ3Q<9K<0U#t|IFF&xJUoWv=d#u=Q&Ih@A@T*M_@#uZ$}HC)FH+(aI3;WqBzF22G& z+{Xhv#3Ou-$M^7 zChs;GwQr~wPqge7XBHtL`*>Y+XwzzWu| zfh`)s4)$mS2RNcJnxH9~p*dPW205JI3>Ub<4es!OC%oVdANZmrTEP$g2taEDA_#5J z7QqNXJG4hAI-nyuAq<_-1>uN5S41KT(TG7e#3Bye(F5^FKu;th3BAx8eUOY4^hH1P z#{dk(APj~ALy(H0NW(A;#|WfjBt~I0#$YVQVLUQ00TVF^lQ9KTF%6lRjv1JVS(uGE zn2RjT!+b2jLM*~!EWuJN!*Z;^O02?atU)%`Vjb3F12$q4He(C6VjH$&2XV2o5;g0+{PW; z#aFn8`*?tdc!aO<7~kLtp5hsv;{{&g6~4u5e1|vq9zWnmyv0v=ho22~pMNjW(9kQT z{`2SIaE1$9;Rbhjz!P5Zh7Wwv60P6|e*~a40uh8ZXp3NkpdH#H6dlkJoe+l3=z?%W zperH~g=oZ}8)6ZM?&yJdB%mh}k%V68jXp?53i_fS`eOhFVh{#Hfgwo6P^4iPhGPWM zF%qLN8e=flP`9ag+*SM#53!0LA3Of4n?KP+S2dGzzu+TZWvcpk>=%1HR-qxiP~d>< zN1h?LRDc;Oq7o{@992*i)leNZV1b&jL@m@t9n?iV)JFqY!5TKO zMMK!Z9*y7tM>IwgG(|HsM+?XxhZCIP0#~@f9Ukz67rfyEUwq+9(fp05QW~X7^4uw< ze(vy`P(OEyvk+eQ8+MP zQ8+T+C+IplYPp&TR(i$6b*5kgTQr0n?9m7gdd2kIn4@THn6D^onXf1sGG9^HF<(*G zGhb0OV!onqV7{VoWWG=E+#%Sg{Zo&D#KzI}_rCyJej$GmiLG_laevj7eJR%?v2|RO zBHr2RwvG;3tQKq?9Y1m`*IuyHbaXV)5O)XEhibpnL$&8(rCUBcQv2bv;*p(+`1G-{ z`N-;x6=gAr?QhnDEU!Pdek#hkNSvoA>mqTUqO6O=d5W?w66Yz(x=5Vo6YC*0SzVOPXmOsRY(|Uod}1?NoW4w7ur3;jwe_%l zzt19lzP6}H*CMf2ZeK6*)3vr~bZb`r4NI(T^~SKsQu_4IBC%F2$a#(0sfIN3 z!y~WnSDjkcR^lerPU0`{w-7i2KGx+3ywS`3L@B&DrBvAPval4{*n%7!!YN!r9v}SE zzzAhw0yCJSIxJBaR%i$ZG=&T<@PH5e5Qt!eA`B6TMjR5*3n>@?1yYfQ;Yi0QjKMf$ zU?L`CDl#zxvoHr)n2&{6jHOtPl~|2ztiuLu!WL}94(vh>ah*^Cg@GD29#2w^=Vgbj=kHZnrk#0X(CBZMuC5VkTx*v1H9J0pZ0j1YD* zLfFL!VK*a$97YIx7$M{`LfFd)VIL!e{frO}FhV%Uc+mMHM^t%V0wyTmoj}1$E%fOucOUw=Nr5-0`8_wpMRjYqGx7f&|*oK{4n; z5Bew$0~n$NO2P=GP#R@W7E&0a986$}@~8kaR753IhB>OBDypG6YQO?DVToF(jXJ1{ zdZ>>Eu!1#gV2g&ZgFPC-0gh;lCTNOgXpR<;K@KN4!v(HzgF8In2`_lV2fk>DR`7#A z0?-yO@@Ab&+FbNX<5OD68zDc_xdUj(1Kzym(;Lm+|?iZDbV8gWQKFQi}q z6i7uH)H})QKjityF`bo{{o|I&M`wJ6*#0#p0d((V@3@mstsNlsm1uwQO zcrm!(#gKv*+ZDXnzTm~sf)_g!yx6he#ZCn;hUu9re~&WmX4w0?nx(q;;tTdAoFOev z5`}$ia+ks(;^%#P(&v49uh0AT-kDBp|1h$~fZ zM5`Wj9%u<~-DA!}qJZ+EdLvqVD6&)!hxi^h-Yqs~jbsHG{1Ao&u+S-d`|97mTg=f@Z|Q-`zZ^ZysUg0%tFDH3 zbwhn#!P>*e)If68mw2=y8~sSN0Fu#~{1iyKwIP?<64wy2t{tfyO2&2|KXfEfVdPwA z;uKESL=dw`GBS#M7fm9%k<+oHd3Un12Qf_`!+VmKiKKHca-uhBl1!GS5aWJiXn*o- z0O>G@934y?hLA<6q;wil3?ttRC+*V7!I8vnG?_n!lpIF}j3EzxF5;%+OnN8}?B{Q>#-h9$$0lBk~_$?;8mXJEjNak{)y^XD<-hC9>)=DSwrWxJF)GC!KGSlX;}+ZL;hRG5(6A z-6PNMla3F`(MQDbFSN``?)w z=-LQ1HolU~)!?$B7U?G;_q9o2F_NoG8t9W*#fhFFNiIR|mL&eA$ga|)Zdo#2N_5JR zBolJWl(eiswwn>lN@QwfqE&??R3+D|5$_sgiv_7+Nha1JLLJh*F1b>VxHljhtw>cH zGTxTFZAiM=lZ%ariz8Xvm{e{`#xx_}Hz$!Ya!yX<&SbRZ5#E%T~Cocm?SRgqbL>jjxOM{8D9Z79Zo`#YR9m$bSq)}(GunQ?2K?Zjvk0VJ) zG&vYU8pe`&ail~KG9aEjOdxF%$=)Pl-J8tmLyD)6l)mI%Khk;t$r(uM4JI=cL^qZ6 z9!hSf5x?PN=Lk}JB$+mfXpbR@W68~N#3zGnn?Pz#B9kW*jj1Gl8o8E9JZF$iGfDN? zWWpTsZZ3(PM=sANZVSo!MZ|mw8Ml=DxQs-vAm>*S=hY;84XL=6j9N$DtS4PJk~5n~ zi!Ef;R#JXD8L@-B+DW?XCMR=9(_FG_FDbX5q#Yp750Z|D$+08E@fcZroRm39hMXc# zPLuX$$)R(^{sLKWkr-Vj1Fw*;uae;FWd99fBmVtm-6Do}NWZ(}{#PXEKFNJR8ayJi zz9#zLkmM)i?o;Cboa}x<>b@e=za=`~k)$^y?|b6=BiZ(rSiU1ue&&x*Q^TTag03W9 zlP_G?B3{~Lvks}QOD5`(pY=(c0l92Q+)I)TMx;tVd; zqbrg(l}MyHIa`Ius*zRINd*g%UXy%lNxIY~r|OVq^~m!2q?{EQW=&q$kWLNBF+0+@ z5n19u$~GoLnvkbWNoaF&xCOD7lZ8&ClnWW;O1^d@As*y_C$aS=Sw5shOVYm;dEiHa z0?6Lh#43o)ZbS5gNlFO$svQXkCA&M2dY#COFjA}w=@m|HN03&LWJeUK6+@QAZ-BpHLqPlHM95L1Jumr^a*l~R4U zrdfRKO?R27KEVCO9^lT=kiM?|!2xcb4T>=3X5h3@HI$oy>ZSC5@lv|?UG+kmW!=Ae zAuYL-%3W|xW4+6+*OX9+ddEw><8_m##NWE(C9x4KLiN=z3&BjPX=*3v7@2Y{Qcb>Wd}OGG@#`Rq4W<^lW+oEx z{~V<}H(n6W)%x|4VA@nFVZE#=9{b|v#Tapkll8bxN&a>ARbA)Gmm0>IZ7nv$Tf7T0 zGx^~Eb-?_}5Ao#+zyF_96@DWI`8SfJx{=J28pi8_ERG$}u$a0^!_5AdcS0 zmdIc4{)-@sXEQY|4y9`H2;`rwCcbjQ@`Fo-|8V;GR~zx`Bk(rRBK^6h#kNaa?eAW) zp}hK!e6{eK(RqI};xAQoL)m$n##>@6<{a0u*tANk)*tCm(PHJ{V#FJM|KTEzodaWS#M|Y9 zyYlw)r~1oXm7j{&>eeR6KiMfH|N7)9$|*r&Ke#^Sr|rZqwoGJJ2R`wXNBQxo@+b3E z-d}EnkNms-{!{l5@&0rFl%sMExxaH#{_Q!b?v(OQ|MvF6mDlYr{v67m*ZP;+^Hkml z59<3HQC+R_rafZ0;NqHp{c`0!VTzN#_;~XXEx1QYXw&WSyHckNX#JRzZ8=fuSLWb-~<6O&@%dzf20)VHp0 UWg5rg2XK_r+7VnCQq*&vot8Bi%mXS%Z^$b68PV+ELuocy6fG&CJ#db*<% zm~ z1ZypK1bY#xA0*`Hm;ok}fld=qNCmUg9bLd0k{zAFd=TLZcF<&*E)~fb83qPq#PFM$ Kfg!*#$PfVGVsMQB diff --git a/tests/testthat/data/mixed2.parquet b/tests/testthat/data/mixed2.parquet index 6dcb2031b18d01258da9ccdde1f2c44b4508e1cf..bfccad38cf6e4cea86445388855a75802794e6e5 100644 GIT binary patch delta 2630 zcmZA0dt8iJ7{Kv&rZGF(iHez;v=TM4*c62%z0;M78Wn{|l$1m$i4nQeTC>Q#b+@i< z?$%{lw{^d)6`T9rV$rUw+y35>{bSE(KHu}4bDsA(=Y2nO9!!+&DwS=q3ovQMM`(_? zL~2Tv8o^!=4!8?~jZ7traI>*i3qn?;POZ2a1>xObTQoE=!tCzwE*1vI!{r_jk^ny> zf=h3h)d&7bg0y~cF&PXguz3JDr^2)}6TTV*0|vvn3n#%i)kb0#1*F z4!N*4548C(aSZ%f0DZ>638`qiLRd8c?2BOhBzQ6z5~sq^Y2Y(Mr#2~;&m^o{3n9;i!T^a8lI5Mmd@PfOsdWw59eYA=UTE8zA@h+Yl*)N2XM|d3ose#h>zbF!lL9i(p8ic~MF!(zhQo6v| z2ndLR^zQlrw)i)S3yyXTkMih@1_(zk#M^`1V_n&4Z!y;r9~gyb!i80=Fgb z%~E*33^KlhiscZx63SOWqcu>x7T&CbLF?hy4buG&+C;czGkj47(|>^Kau~P`&TofK zJ7ME4aM}Y?euS#M(0@OiIRO3#Vcj8cI1EKc;Q3KVIu0j)f%Ye1^(m-#1`5x@Q`0%@ zbpeiD1m8=r;y19ZfU#HL@m1(q35TzNw{#6lZ-UKj$iD;k??T*tIQRfOAHm|sVEqJg zpTeDI5c3@Nzkn81u;4H0|EGLKm{ScmUPIIy*!>nf-oxAv+(o%e-#A;P7%FF?(h9oN zf*ne5Q^B0t@WC1~)lgvrVRd1vEx6i4aXoljAJVmO$pM0N(*9*mgbkfx#urfC5K6MILf~;IB!t6}&d@pnN+Y38R~XX`9z;W&5e{{SR?$CW5gy?C%3D`@+J0@L7Koj!1!91E56dFMz7r<7dWOMD6`3lnXDs9nbd3{09IXkLF zqp@2LzmHowR$F;Wt+m-MFwiqWq(zA_qL?p55fX-6Nuz|cAc`qcz7zzRvqf!(U@1(F z&F~h*G?77=)CWnEQi~BoEWK1DG(IjS*O2h(aWT*ADmuj{O6y&RGuw`!vzQi5XxTH$ zvK{qkme3Me!mtF1*bLaz2$d7|msYPNoz~nHbR^emsTOf2BKH>V;Y!{9ozt z3}%BvK9v&VYLW~wHE)$7y(QnJrl@84W4-X2R7)p{%|0GGJWU#N)?BFoIhL_BXJC@n Zv7>pt=;oa1EC~Pm2^k7Oh!%}r{{jiXY*+vQ delta 466 zcmdnFfbm5$;|4)HKFJ(Y1_`#}7dK|+p>5CfAzOmrWlf&_scZ~^O2c60{wL4+&V)suhvt4PMk RFfbq^hKtM$3;~Woh5)^HdUgN+ diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R index 6f70022..1e58a88 100644 --- a/tests/testthat/test-read-parquet-5.R +++ b/tests/testthat/test-read-parquet-5.R @@ -132,6 +132,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$s, as.character(rep(0:399, 6))) expect_equal(tab$f, rep(0:399, 6)) expect_equal(tab$d, rep(0:399, 6)) + expect_equal(tab$i96, rep(as.POSIXct(as.Date(sprintf('%d-01-01', 1800:2199))), 6)) pf <- test_path("data/mixed2.parquet") expect_snapshot({ @@ -144,6 +145,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$s, as.character(rep(0:399, 6))) expect_equal(tab$f, rep(0:399, 6)) expect_equal(tab$d, rep(0:399, 6)) + expect_equal(tab$i96, rep(as.POSIXct(as.Date(sprintf('%d-01-01', 1800:2199))), 6)) pf <- test_path("data/mixed-miss.parquet") expect_snapshot({ @@ -156,4 +158,5 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$s, as.character(0:2399)) expect_equal(tab$f, 0:2399) expect_equal(tab$d, 0:2399) + expect_equal(tab$i96, as.POSIXct(as.Date(sprintf('%d-01-01', 1:2400)))) }) From 11da9d4598eef5386d531b683abdff3d836a2fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 8 Feb 2025 18:49:27 +0100 Subject: [PATCH 15/18] Fix reading INT32 DECIMALs, add tests In mixed dict + non-dict column chunks. --- src/RParquetReader.cpp | 153 +++++++++-------- tests/testthat/_snaps/read-parquet-5.md | 60 ++++++- tests/testthat/data/create-data.py | 209 ++++++++++++++++-------- tests/testthat/data/decimal.parquet | Bin 0 -> 7141 bytes tests/testthat/data/decimal2.parquet | Bin 0 -> 14188 bytes tests/testthat/data/mixed-miss.parquet | Bin 70231 -> 70188 bytes tests/testthat/test-read-parquet-5.R | 43 ++++- 7 files changed, 302 insertions(+), 163 deletions(-) create mode 100644 tests/testthat/data/decimal.parquet create mode 100644 tests/testthat/data/decimal2.parquet diff --git a/src/RParquetReader.cpp b/src/RParquetReader.cpp index e43a222..74fbc08 100644 --- a/src/RParquetReader.cpp +++ b/src/RParquetReader.cpp @@ -272,6 +272,7 @@ rtype::rtype(parquet::SchemaElement &sel) { type = tmptype = REALSXP; elsize = sizeof(double); type_conversion = INT32_DECIMAL; + psize = 4; scale = sel.scale; if (sel.__isset.logicalType) { scale = sel.logicalType.DECIMAL.scale; @@ -1308,9 +1309,8 @@ void convert_column_to_r_ba_string_nodict_nomiss(postprocess *pp, uint32_t cl) { } void convert_column_to_r_ba_string_dict_nomiss(postprocess *pp, uint32_t cl) { - uint32_t lcl = cl; - SEXP x = VECTOR_ELT(pp->columns, lcl); - SET_VECTOR_ELT(pp->facdicts, lcl, Rf_allocVector(VECSXP, pp->metadata.num_row_groups)); + SEXP x = VECTOR_ELT(pp->columns, cl); + SET_VECTOR_ELT(pp->facdicts, cl, Rf_allocVector(VECSXP, pp->metadata.num_row_groups)); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { // first the non-dict parts, if any if (pp->byte_arrays[cl].size() > 0) { @@ -1343,7 +1343,7 @@ void convert_column_to_r_ba_string_dict_nomiss(postprocess *pp, uint32_t cl) { ); SET_STRING_ELT(tmp, i, xi); } - SET_VECTOR_ELT(VECTOR_ELT(pp->facdicts, lcl), rg, tmp); + SET_VECTOR_ELT(VECTOR_ELT(pp->facdicts, cl), rg, tmp); // fill in dicts int64_t rg_offset = pp->metadata.row_group_offsets[rg]; @@ -1366,23 +1366,23 @@ void convert_column_to_r_ba_string_dict_nomiss(postprocess *pp, uint32_t cl) { void convert_column_to_r_ba_string_miss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - uint32_t num_present = pp->present[cl][rg].num_present; - bool hasmiss = num_present != num_values; - if (hasmiss) { - // need to rewrite - int64_t beg = pp->metadata.row_group_offsets[rg]; - int64_t endm1 = beg + num_values - 1; - int64_t pendm1 = beg + num_present -1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { - SET_STRING_ELT(x, endm1--, STRING_ELT(x, pendm1--)); - presm1--; - } else { - SET_STRING_ELT(x, endm1--, NA_STRING); - presm1--; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto &cp : cps) { + if (cp.num_present != cp.num_values) { + // need to rewrite + int64_t beg = rg_offset + cp.offset; + int64_t endm1 = beg + cp.num_values - 1; + int64_t pendm1 = beg + cp.num_present -1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp.offset + cp.num_values - 1; + while (beg <= endm1) { + if (*presm1) { + SET_STRING_ELT(x, endm1--, STRING_ELT(x, pendm1--)); + presm1--; + } else { + SET_STRING_ELT(x, endm1--, NA_STRING); + presm1--; + } } } } @@ -1455,23 +1455,23 @@ void convert_column_to_r_ba_decimal_nodict_nomiss(postprocess *pp, uint32_t cl) void convert_column_to_r_ba_decimal_miss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - uint32_t num_present = pp->present[cl][rg].num_present; - bool hasmiss = num_present != num_values; - if (hasmiss) { - // need to rewrite - double *beg = REAL(x) + pp->metadata.row_group_offsets[rg]; - double *endm1 = beg + num_values - 1; - double *pendm1 = beg + num_present -1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { - *endm1-- = *pendm1--; - presm1--; - } else { - *endm1-- = NA_REAL; - presm1--; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto &cp : cps) { + if (cp.num_present != cp.num_values) { + // need to rewrite + double *beg = REAL(x) + rg_offset + cp.offset; + double *endm1 = beg + cp.num_values - 1; + double *pendm1 = beg + cp.num_present -1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp.offset + cp.num_values - 1; + while (beg <= endm1) { + if (*presm1) { + *endm1-- = *pendm1--; + presm1--; + } else { + *endm1-- = NA_REAL; + presm1--; + } } } } @@ -1615,23 +1615,23 @@ void convert_column_to_r_ba_raw_dict_nomiss(postprocess *pp, uint32_t cl) { void convert_column_to_r_ba_raw_miss(postprocess *pp, uint32_t cl) { SEXP x = VECTOR_ELT(pp->columns, cl); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - uint32_t num_values = pp->metadata.row_group_num_rows[rg]; - if (num_values == 0) continue; - uint32_t num_present = pp->present[cl][rg].num_present; - bool hasmiss = num_present != num_values; - if (hasmiss) { - // need to rewrite - int64_t beg = pp->metadata.row_group_offsets[rg]; - int64_t endm1 = beg + num_values - 1; - int64_t pendm1 = beg + num_present -1; - uint8_t *presm1 = pp->present[cl][rg].map.data() + num_values - 1; - while (beg <= endm1) { - if (*presm1) { - SET_VECTOR_ELT(x, endm1--, VECTOR_ELT(x, pendm1--)); - presm1--; - } else { - SET_VECTOR_ELT(x, endm1--, R_NilValue); - presm1--; + int64_t rg_offset = pp->metadata.row_group_offsets[rg]; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto &cp : cps) { + if (cp.num_present != cp.num_values) { + // need to rewrite + int64_t beg = rg_offset + cp.offset; + int64_t endm1 = beg + cp.num_values - 1; + int64_t pendm1 = beg + cp.num_present -1; + uint8_t *presm1 = pp->present[cl][rg].map.data() + cp.offset + cp.num_values - 1; + while (beg <= endm1) { + if (*presm1) { + SET_VECTOR_ELT(x, endm1--, VECTOR_ELT(x, pendm1--)); + presm1--; + } else { + SET_VECTOR_ELT(x, endm1--, R_NilValue); + presm1--; + } } } } @@ -1883,37 +1883,32 @@ void convert_column_to_r_int32_decimal_dict_nomiss(postprocess *pp, uint32_t cl) int32_t scale = pp->metadata.r_types[cl].scale; double fct = std::pow(10.0, scale); for (auto rg = 0; rg < pp->metadata.num_row_groups; rg++) { - std::vector &cps = pp->chunk_parts[cl][rg]; - bool rg_dict_converted = false; + if (pp->dicts[cl].size() > 0){ + uint32_t dict_len = pp->dicts[cl][rg].dict_len; + if (dict_len > 0) { + double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); + double *dend = dbeg + dict_len - 1; + int32_t *fdend = ((int32_t*) dbeg) + dict_len - 1; + while (dbeg <= dend) { + *dend-- = static_cast(*fdend--) / fct; + } + } + } int64_t rg_offset = pp->metadata.row_group_offsets[rg]; - for (uint32_t cpi = 0; cpi < cps.size(); cpi++) { - int64_t cp_offset = cps[cpi].offset; - uint32_t cp_num_values = cps[cpi].num_values; - bool hasdict = cps[cpi].dict; - double *beg = REAL(x) + rg_offset + cp_offset; - if (!hasdict) { - double *end = beg + cp_num_values - 1; - int32_t *fend = ((int32_t*) beg) + cp_num_values - 1; + std::vector &cps = pp->chunk_parts[cl][rg]; + for (auto cp = cps.rbegin(); cp != cps.rend(); ++cp) { + double *beg = REAL(x) + rg_offset + cp->offset; + if (!cp->dict) { + double *end = beg + cp->num_values - 1; + int32_t *fend = ((int32_t*) (REAL(x) + rg_offset)) + cp->offset + cp->num_values - 1; while (beg <= end) { *end-- = static_cast(*fend--) / fct; } } else { - // Convert the dictionary first - uint32_t dict_len = pp->dicts[cl][rg].dict_len; - if (!rg_dict_converted && dict_len > 0) { - rg_dict_converted = true; - double *dbeg = (double*) pp->dicts[cl][rg].buffer.data(); - double *dend = dbeg + dict_len - 1; - int32_t *fdend = ((int32_t*) dbeg) + dict_len - 1; - while (dbeg <= dend) { - *dend-- = static_cast(*fdend--) / fct; - } - } - // fill in the dict - double *end = beg + cp_num_values; + double *end = beg + cp->num_values; double *dict = (double*) pp->dicts[cl][rg].buffer.data(); - uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp_offset; + uint32_t *didx = pp->dicts[cl][rg].indices.data() + cp->offset; while (beg < end) { *beg++ = dict[*didx++]; } diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md index 00de204..c6e84dd 100644 --- a/tests/testthat/_snaps/read-parquet-5.md +++ b/tests/testthat/_snaps/read-parquet-5.md @@ -153,28 +153,76 @@ as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) Output page_type num_values encoding - 1 DICTIONARY_PAGE 1024 PLAIN + 1 DICTIONARY_PAGE 1009 PLAIN 2 DATA_PAGE 1024 RLE_DICTIONARY 3 DATA_PAGE 1024 PLAIN 4 DATA_PAGE 352 PLAIN - 5 DICTIONARY_PAGE 1024 PLAIN + 5 DICTIONARY_PAGE 1018 PLAIN 6 DATA_PAGE 1024 RLE_DICTIONARY 7 DATA_PAGE 1024 PLAIN 8 DATA_PAGE 352 PLAIN - 9 DICTIONARY_PAGE 1024 PLAIN + 9 DICTIONARY_PAGE 1014 PLAIN 10 DATA_PAGE 1024 RLE_DICTIONARY 11 DATA_PAGE 1024 PLAIN 12 DATA_PAGE 352 PLAIN - 13 DICTIONARY_PAGE 1024 PLAIN + 13 DICTIONARY_PAGE 1013 PLAIN 14 DATA_PAGE 1024 RLE_DICTIONARY 15 DATA_PAGE 1024 PLAIN 16 DATA_PAGE 352 PLAIN - 17 DICTIONARY_PAGE 1024 PLAIN + 17 DICTIONARY_PAGE 1018 PLAIN 18 DATA_PAGE 1024 RLE_DICTIONARY 19 DATA_PAGE 1024 PLAIN 20 DATA_PAGE 352 PLAIN - 21 DICTIONARY_PAGE 1024 PLAIN + 21 DICTIONARY_PAGE 1016 PLAIN 22 DATA_PAGE 1024 RLE_DICTIONARY 23 DATA_PAGE 1024 PLAIN 24 DATA_PAGE 352 PLAIN +# mixing RLE_DICTIONARY and PLAIN, DECIMAL + + Code + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + Output + type repetition_type + 1 REQUIRED + 2 FIXED_LEN_BYTE_ARRAY REQUIRED + 3 FIXED_LEN_BYTE_ARRAY OPTIONAL + Code + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + Output + page_type num_values encoding + 1 DICTIONARY_PAGE 400 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 176 PLAIN + 4 DICTIONARY_PAGE 400 PLAIN + 5 DATA_PAGE 1024 RLE_DICTIONARY + 6 DATA_PAGE 176 PLAIN + +--- + + Code + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + Output + type repetition_type + 1 REQUIRED + 2 INT32 REQUIRED + 3 INT32 OPTIONAL + 4 INT64 REQUIRED + 5 INT64 OPTIONAL + Code + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + Output + page_type num_values encoding + 1 DICTIONARY_PAGE 400 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 176 PLAIN + 4 DICTIONARY_PAGE 400 PLAIN + 5 DATA_PAGE 1024 RLE_DICTIONARY + 6 DATA_PAGE 176 PLAIN + 7 DICTIONARY_PAGE 400 PLAIN + 8 DATA_PAGE 1024 RLE_DICTIONARY + 9 DATA_PAGE 176 PLAIN + 10 DICTIONARY_PAGE 400 PLAIN + 11 DATA_PAGE 1024 RLE_DICTIONARY + 12 DATA_PAGE 176 PLAIN + diff --git a/tests/testthat/data/create-data.py b/tests/testthat/data/create-data.py index dfb3322..9fa4fe5 100644 --- a/tests/testthat/data/create-data.py +++ b/tests/testthat/data/create-data.py @@ -1,75 +1,142 @@ -import pyarrow as pa -import pyarrow.parquet as pq -schema = pa.schema(fields=[ - pa.field(name = "f", type = pa.float32(), nullable = False), -]) -data = [ - list(range(400)) * 10, -] -table = pa.table(data = data, schema = schema) -pq.write_table( - table, - 'tests/testthat/data/float.parquet', - row_group_size = 1500, - data_page_size = 400, - use_dictionary = False -) -import pyarrow as pa -import pyarrow.parquet as pq -from datetime import datetime -schema = pa.schema(fields=[ - pa.field(name = 'x', type = pa.int32(), nullable = False), - pa.field(name = 'y', type = pa.int64(), nullable = False), - pa.field(name = "s", type = pa.utf8(), nullable = False), - pa.field(name = 'f', type = pa.float32(), nullable = False), - pa.field(name = 'd', type = pa.float64(), nullable = False), - pa.field(name = "i96", type = pa.timestamp('ms', tz='UTC'), nullable = False), -]) -data = [ - list(range(400)) * 6, - list(range(400)) * 6, - [ str(x) for x in range(400) ] * 6, - list(range(400)) * 6, - list(range(400)) * 6, - [ pa.scalar(datetime(x, 1, 1), type=pa.timestamp('ms', tz='UTC')) - for x in range(1800, 2200) ] * 6, -] -table = pa.table(data = data, schema = schema) -pq.write_table( - table, - 'tests/testthat/data/mixed.parquet', - use_deprecated_int96_timestamps = True, - data_page_size = 400, - dictionary_pagesize_limit = 400 -) +def do_float(): + import pyarrow as pa + import pyarrow.parquet as pq + schema = pa.schema(fields=[ + pa.field(name = "f", type = pa.float32(), nullable = False), + ]) + data = [ + list(range(400)) * 10, + ] + table = pa.table(data = data, schema = schema) + pq.write_table( + table, + 'tests/testthat/data/float.parquet', + row_group_size = 1500, + data_page_size = 400, + use_dictionary = False + ) -pq.write_table( - table, - 'tests/testthat/data/mixed2.parquet', - use_deprecated_int96_timestamps = True, - data_page_size = 400 -) +def do_mixed(): + import pyarrow as pa + import pyarrow.parquet as pq + from datetime import datetime + schema = pa.schema(fields=[ + pa.field(name = 'x', type = pa.int32(), nullable = False), + pa.field(name = 'y', type = pa.int64(), nullable = False), + pa.field(name = "s", type = pa.utf8(), nullable = False), + pa.field(name = 'f', type = pa.float32(), nullable = False), + pa.field(name = 'd', type = pa.float64(), nullable = False), + pa.field(name = "i96", type = pa.timestamp('ms', tz='UTC'), nullable = False), + ]) + data = [ + list(range(400)) * 6, + list(range(400)) * 6, + [ str(x) for x in range(400) ] * 6, + list(range(400)) * 6, + list(range(400)) * 6, + [ pa.scalar(datetime(x, 1, 1), type=pa.timestamp('ms', tz='UTC')) + for x in range(1800, 2200) ] * 6, + ] + table = pa.table(data = data, schema = schema) + pq.write_table( + table, + 'tests/testthat/data/mixed.parquet', + use_deprecated_int96_timestamps = True, + data_page_size = 400, + dictionary_pagesize_limit = 400 + ) + + pq.write_table( + table, + 'tests/testthat/data/mixed2.parquet', + use_deprecated_int96_timestamps = True, + data_page_size = 400 + ) -import pyarrow as pa -import pyarrow.parquet as pq -from datetime import datetime -table = pa.table({ - 'x': pa.array(range(2400), type=pa.int32()), - 'y': pa.array(range(2400), type=pa.int64()), - 's': pa.array([ str(x) for x in range(2400) ], type=pa.utf8()), - 'f': pa.array(range(2400), type=pa.float32()), - 'd': pa.array(range(2400), type=pa.float64()), - 'i96': pa.array( + import pyarrow as pa + import pyarrow.parquet as pq + from datetime import datetime + import random + schema = pa.schema(fields=[ + pa.field(name = 'x', type = pa.int32()), + pa.field(name = 'y', type = pa.int64()), + pa.field(name = "s", type = pa.utf8()), + pa.field(name = 'f', type = pa.float32()), + pa.field(name = 'd', type = pa.float64()), + pa.field(name = "i96", type = pa.timestamp('ms', tz='UTC')), + ]) + data = [ + list(range(2400)), + list(range(2400)), + [ str(x) for x in range(2400) ], + list(range(2400)), + list(range(2400)), [ pa.scalar(datetime(x, 1, 1), type=pa.timestamp('ms', tz='UTC')) - for x in range(1, 2401) ], - type = pa.timestamp('ms', tz='UTC') - ), -}) -pq.write_table( - table, - 'tests/testthat/data/mixed-miss.parquet', - use_deprecated_int96_timestamps = True, - data_page_size = 400, - dictionary_pagesize_limit = 400 -) + for x in range(1, 2401) ], + ] + + for col in range(len(data)): + for i in range(20): + data[col][random.randint(0, 2400-1)] = None + + table = pa.table(data = data, schema = schema) + pq.write_table( + table, + 'tests/testthat/data/mixed-miss.parquet', + use_deprecated_int96_timestamps = True, + data_page_size = 400, + dictionary_pagesize_limit = 400 + ) + +def do_decimal(): + import pyarrow as pa + import pyarrow.parquet as pq + import random + random.seed(10) + fields = [ + pa.field(name = 'dba', type = pa.decimal128(7), nullable = False), + pa.field(name = 'dbam', type = pa.decimal128(7)), + ] + schema = pa.schema(fields = fields) + data = [ + list(range(400)) * 3, + list(range(400)) * 3, + ] + for i in range(10): + data[1][random.randint(0, 1200-1)] = None + + table = pa.table(data = data, schema = schema) + pq.write_table( + table, + 'tests/testthat/data/decimal.parquet', + data_page_size = 400, + dictionary_pagesize_limit = 400 + ) + + fields2 = fields + [ + pa.field(name = 'di64', type = pa.decimal128(11), nullable = False), + pa.field(name = 'di64m', type = pa.decimal128(11)), + ] + schema2 = pa.schema(fields = fields2) + data2 = data + data + table2 = pa.table(data = data2, schema = schema2) + pq.write_table( + table2, + 'tests/testthat/data/decimal2.parquet', + store_decimal_as_integer = True, + data_page_size = 400, + dictionary_pagesize_limit = 400 + ) + +if __name__ == "__main__": + import sys + if len(sys.argv) == 1: + do_float() + do_mixed() + elif sys.argv[1] == 'float': + do_float() + elif sys.argv[1] == 'mixed': + do_mixed() + elif sys.argv[1] == 'decimal': + do_decimal() diff --git a/tests/testthat/data/decimal.parquet b/tests/testthat/data/decimal.parquet new file mode 100644 index 0000000000000000000000000000000000000000..146bf267970c834329d0c0acfa7d2c276293adfe GIT binary patch literal 7141 zcmeI%cT^PD9>DS4Wq}3JAhNhiQHml}0W}geiMt?8jED_v5l|_LQj9f;E1)7`L+ly_ zQ4vr9QHd!kq7*wCdjV_+7CNQDfj~#tH5_L@2@#jtES`WJF>LrXmW_h(Ro-;WJFf49vtV%*Gsi zjxX>f=0b&eh(kOQFdqxB5Q$iXBrHZUmS8ECVL4VH1uKz?RalKRSc`R7j}6#}P526% zu?1;J$5v!u8@6KyGO-g`$VLu!VK??57kS9XUKC&-_TvBw@ih+O8+?oJ@I4OUFpi)I zM^TJpIF1sOq73CYfs;6e(>Q~(IEMV2o4AEaRG}KTaR+yC z5BE`n2dG6I9^w%m;|c1~fJQt;6Q1EYUf?Bu#4G%SpYaP`<5z(|NF}5Yil6~8G@%7; zNT35|hTE zIHD`M!3oZAL3i{(PrQ#_aK#7cjXvm$5AhK`Mn5Q^gd5z^A0F_87rfzv0r15@e1bvn z!(jMh2!>)9hGPT*5Qvc&h0z#;vG^3@5QJcaU_2&ZB0>>{a717dCLF&8S#Lmc9ffcaQ}g-FCABw;a?!8Oe&Bj8b}{&J4jXf zQhlkGRLfK*R(GgN^&DXH))oy3I!(nwow)*$MAmPMYMzF~z$W1EmU;2o`j%d6`=!Or zZzXdHyL>n;e&KuO%0!tWJz-Hh8~@CJBkA)Ow|5FIU8~r-V2O!qOwHvZTNf_t*e^~Z zS7s!xu<}Z=2`tK3w6cp|y4O18wxm_|0r_E9inc9Y(`{T)qTFqJ^1AL}6`6rYw=dby zE2_G5o!gG3oBGUbthsV@$Fj{It3*1+?wQNe+!76JM;2$U*y_35(tEx8&XjEf*1CjW zE#A3u$Dl3BMaKQJQg;r?^xrx1Sk|iS5xK!->-%S~-ZiQ)=E2ot*=zQEdN@wU#3N^I zUPx(*?Wp59>-L78P50j5v1|RlNtg4(uN~jD;lR|&qD3a2yElFvTT`)fRLSm5-^{45 zF5BR_=d17Lyli}Mtz^&ULvsZhx~5*aTaLs_+S!dR%}qPHP~XaDqgP(~vBk3P5!Xxe zww5e2cS|z$&d(@YX)`2ibXoqk6KkA8$~SuN-F|9=Yi#ZHvb{UbZ0;AYYvxmsd2XxM zO1m-T1v}60@Z0LM$!A~I57_~GBW{%M%f7T{+|eYn0sC{V>p%mtzNG}HE!b$?&2Qq zqXrL9i#j~SBRs|v)T057c#0-G!*jgAOZ|hTEIHD_Q~(IEMV2o4AEav`i%}X?8GESqR2s0{#|hHeS2{eqQ0faa+{!Pagw3k@Z7Mf zYRzPs%fz#pLASM*n)aSvUsH8kdwGc>LC?a!xU_92?^K7NisG_%HiI*K)BKN>OC16W zCRbG)JJH@Ls5seT$nlf%9&^JlYqR+_Z=$9-LNtF>liS$7bKoY`k|aJM>A1?>~R_yrJ&?la7Z~B5ebQuzK^N z1l{&7lWXcNiW3dG_~(Q*SRPL{?iKT}ropOoxus`{LwIAS@>IJK`I8?sc0Re*CA1ob=q`YC*)M{K&fJj@Jug?pN%Z^rGv{Z)d+~d{poc>dzCok(wz+bFzHRUwNlGW5)jgsc#Xg3%Lo$cv#b^#N zavW7St)WrVSLPg2n(7>@HPE={q{{`-PqjWV{UEmHLSC%)pbj6-dD+zPRNK#75igOt zOp^?@?7vvnGp0$BlHr|d7kDticIEbg8+*@sCALe=9Gu~^uJ?SqRawJwMt(h8(|&bM zU_r!{gY)e#bshWN=f8-Id@nl%A4@iKTkU%#f8yz{6pO7RjrT=pT-+UWVtb^?fhp?8 z2(6?kfBOAU{MLP~k{epFX46rTgp~}DQ1kUIQ}q`$b$iPM-!j4fw-fwdeTJqYiI|5t z#3KPD_W$AYGqJpHPk~G;HY999WRO%WkhYOZqym}7_~4+_fDTHka0V_3ttKB~O<`LnP_8>2WmYl+eJiH7WED$`T= z`wtq?`^|-rsYKn>ORrD`%V2fEGC-l2p-_e@JQRvhgaup1XQm9k01PRDC{!)qfXD^RDU@g+OK=6%-Tw*@S71AyH9|K`}8? iXP8f)5EC0db&9#OYggy4PI3p+H?Pw?_(%Eig6|*MVZnv~ literal 0 HcmV?d00001 diff --git a/tests/testthat/data/decimal2.parquet b/tests/testthat/data/decimal2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..12b024aed2f33b93b205d3c3ec2cdcb3b6742a05 GIT binary patch literal 14188 zcmeI(2UHZ-{=o5FSgNRqEG|VwRHQ0kL#zu(R{<#XYQT3bKP_2?r-J}o^o$% zsYIHjD^1t+kZx9%inK(cV;c1Xl|>{m6rczRlpqwSKox3GM>A-kIa;74G|>vwCu$9C zNTGu^kUGfT!yXQB#9%nV87^>z8{FZ6AsC8b@Prq5~DC0KJbMf#=swA zF%IJ~0Rad^5GG<0f)RpHgdrReh(r`7V+x`XgIL5N9#b(52}s0r%)m^{!Uy;evyp^3 zNJa`$F&FbNA8A;CbS%UoEXEQn#WE~M238;wE3pczu?B0g4(qW28}Sh~VKcU1E4CpE z+pz;Xk&Ru*!EWSY5B6do@{o@L>_;IE;2;j+Fh0f+e1cE$89v8R9K&%G;RK3t5~omt zQk3B|&fqM*z&V`91zbcqF5xo1#1&k{HC)FH+{7*1##gw53RI#BcX1E*@c<7|jYp`# zV?4oAJj2&`j#|{=1?uq)Ug8yA<6FGJclaJZ;4OZHaPf3mNGiabW zTA(E~(F$5<4Q)uFgEo*s7kbc#0otM++QSeXV1$n7gw8NV7nr~lW-x~ZEYTI+U4AZ91(~_6eeQ|q7j2w#33G2F%1bw#B|KS zOw7Uu_z<&^ggHn?3Q{o_^DrN2Sb%ga#3C%l5-f#OBu&$jK2x!fCTU7FrOHxe1DQl9 zkV%Bzr6V$VZ;OmnwfbVQ+H8@6s%+5aq&bSJS|&cnHqS{>(KK>dGiXcl+-5SH&}+xG zq|9$=XrCsNZ%tj$+Qd8C=lIsS3)@%)madU+o3~isHn#fO@on>$wjY$Ns%xK>w!EWD zhKX-c)`AsXJh!^6wcnn;(%h#Y^m@_ug{!+wC`!|H*s*AB_t5fe-xE6)ukRgQRl3$; z=aP+m6YHw4pV+x{)4(JJH9g1dWm_E5v`qbqvzKpkUS{OF&T&`9cK04@;jNca`))Gz_N9Ncdyzr=5Xwz8z*a9-t!wASY1&K}OcTOxW}BtBOp)|kz=`u2qQSO4CSQzR}g z600g**g9aIBHbjZnKZqbg|tF}_4m6;#V@*vuthqiRDV>Vf|W`%jg3uX@{7XgL z?YWW+(Eh)(DXV|0NDOkoCdSillp(G6Cxh7G!-2YR9xdczidV28fwhyECVffxii z9N>t-aDp>j;0iam!vjMw6vN;NFL+}(MqngHVKjW;3qOp3KgMDl#$y5k5Qrd5#3TeG z1fd8+I3f^Vi0PPtnV5wS@F8X+33HH)6r^G<=3ze4umI^; zh(%b8C0L4OSdI*=KqgjV6;@*n)?yvjV*@tgBW%KEY{6D+Ll(AU2X-PGyO4w3$i*J) z#XjUA9|hQtLL9(B9KvCIj3f92pW-uoj-xn+<0!%j6yqdLp#-HU!)ctsS$u(WIFAds zh;m%QWqgS%xQc7IjvKg%Teyv{a0eBrL>2Dh9`54-9-Cw%9BWvN%R3F( zI-;z+v&Y&VQB{}AE_LzT;<(soWQ}&th@4UR;rDB#xjqHK7jq^(*4gWSWLj;_{l{(c zCKSzUWgh%QRxt7OD(i>`Pjm}I%XfI~4t}b4F!IL1n8y#E>K~4-I)zMLPYGOS(ynXl_>U#NH*%2QOOHu0)nMuQj1ddAkPW@Nc$ zn)x0{HC?e|$OgOVZzN`!*-X^tCG%CezJ=k}kIXf{+I8G#vwl#} z@wjFccydu&hgBZe3xdymBwuJ8rFS4)@ygzSGdrU64^0-HBs7y|{O&EH?f>`P89RqO%>rFrKO^?$4-A8GE_A%PO{RoZQL>p=5&l59Ax(&}*#m!Yk z3Q&XuN>GLhRG|iSG=m13qXk+*6Rn_y*3gC&I%oqKbfE`*7@#fMp*;-I0Y>PEPUs9{ zbb$#>VFq(pz!F{24OXy*4Z5QTdZHJ4!xnvDhrZ~C{uqFP7z8=&;Q&Vrh7+9O0#~@f z9Ud5hp%?~Fc)=UPF#;no3ZvlzU-)4R{4o~eFdh>SfItLcA|@diAqYhn!V!T;L}4Rh6PB+LM*~!EWuJN!*XO` z1v0S`tFRhtuommE9viR`A7K+VV+*!o8?vw+JFpYk*o7SIMlSYXFZLl1`6$4C6yg95 z;t&qwV;sRJ_!OVva~#Dn97hpOpcp4{3MD8-8BXI2&f*K4!+Bi5MU>+bF5^pF!Bt$t zb=<&B+`?^qg*&J~C8}^2_i!H%@DSB_gc>}?6FkK;e2wR*MIBzC9^c?4Ug0&q#T$Hw z@9_iP;>YHywckp_VzpLHmp@;J0~|3JPH=_`T;T?Hcwh*IVi-K(1#b+;2#mxijD`<< z;fFEs$5@QRcuYV50uh9Xn1o=2AQWK;M+71fg~^zLXv82EafruMOhW<^FɀSMFE zKE!M!VGfd!f>g}KJj_QL79br9u?UNstM(ISWrQS5zz9i*seU~>qx!B;CIn9niCLQ8 z$fc-hw&uTfwnj&~`4^%1dmW^rvDXNbd%uxOVNx?mg)nPesiq?Ny&+UmE$s0~*yBGm zh)P6aH8qi#OG5#QkU$B_P=PAcppItHKy$P}OK74Mw9p#bkU|G-AcHRSpbrDIMLV>I zAv(Ya9nlG$VT>*?fho*j4hvYKE4sl7*04c$^gvJaLT}ij5A4tv{m>r+Fc5v6=8P-naRC=mj!U?V zFL4D|aShjT12=ICxA7J3paPYs!d=|MeLTQJRO1o;Lq96DkD4O$rk|8Q9htwo{QC*B zI6@L;a)k6-vpMetRgF9G9+l?%q1L-xKAUED{+r&rYMR|?n%()^&hEUHO0)mzEv4lB zajxR$f$n?t9>1LHX}Y~^y1gXK3qT-(FcFgwj1Yt(4B?1CB%<)^r*wbz3EltO(>aN* ze@~H2IW8!4QdEGHXA26QMKZ;SfdQS>i3?g-_zO#}0rC}O67r*dT_E8A$;7Z;JsJvh z8jjErj*xJKl2Fii$j>D@6;Pm!{wZo?U+3KZzn^wzm&XKNfo&RCAlAqez z@Skn0HpgG5xU2E=*~UkL4P?I+GyUT6RKqER#lLi##;>mNJXQWWd;1s{?iCxq%@qIk znY7;@wf^~;bh38|U$?QUPS!cGHcc{+X~@03hmE#t>kw(hmeLmdv+KitU1W#uEHnV?>^ZQ4t1~+@+S%l$A+o~3#&1n)StkLS5QNy zvyk~er=lJq*V#LIXt;O0P~tO9*guXn2VbFSLY{r1u$21>3!$MurH1v98#^VzaN4BUxUeac4Xtgv ZT6eY5^)P7od$z`<=$s<&sPcDq{{RXMrIG*u literal 0 HcmV?d00001 diff --git a/tests/testthat/data/mixed-miss.parquet b/tests/testthat/data/mixed-miss.parquet index 7d25585b975bb024b0476e21a75c608b59801128..2d1a51d49e079197fbbf1ec48c0e7303466e6c2f 100644 GIT binary patch delta 7599 zcmc&(3v^RO8oqPy+%!!e2_-i~=t96!1GJc?d5Y3h3zoG?t5wV4d3b3Bw6L)53hQl= zVl61NKW-6GI4nZTDJ(LJMVDnOA|MKum-2A2R#4=i2JC9(x&NdOLR+9K1#-?e^Upss z|Nqb9-uq?d^X0UBIqjUD{BVLom4YOtI4Mo3(JMlfAZ+PY`+>fw>w%&(qu2 z2ck5W6-yIE5H&H-TT{g@XgU*(APAx$u-|DX5_mbZnoZVuL|7f3$>fLv0!5Ko?EA<& z2|a35PJqQ=e4JnhQ&%g!C<%g@@)G{m>89R|nRHtI3Zv)ZojcS7w)oltv_u#`t85Ez=i3P$vc zqE$xaGnm@1yBKbTynY>fVQi(kGz(^%Bn)pZpR7tAXd%%Xp2qx5(y>+}=ZZK{=c!(g z4fzjet5Q|g!iT=@X-2D-&ldlgwGA`I3)M+W<1n+JkpV?yN{~%LAKK3I-4(R*01sM{gq;97~^PL|$BJT&BqJCX!9wrC9=)Yl_l@Icj&E^u~8 zjJi^=5Bg1@_5^t@c3yK`P+KuT?TJ0jXm$NffXba*bqLh%9?+{HlB+hnmZ4VbTZ}&u zqiHA5#DsCV*D$8Go40E?x`Lnn&4*hy1`Vxm8_J#nGHe`-8Ic3ej2Mf!Po@s{u|<$P zd>ZT>F&h`xiMDPT)RxYv(;YQfcauU28ovuexr8++-vp?^PRgDN3LrR`Hp|rI%s=w~ z7*X5Wn7Z6ihjndj-R5?EF3ot$LAr58nt(gGYIGV@kG>n>kg405e+)JE;|eykp>B87 zyZXjbJLPN8PD^-D*8sIqxJ5a>o`Ryeio6o*ZUEFqqxf2C76|p=p6SqB``gO_59_gi zFmV03!h35#z5No8!Ef_kHU_$nvxI29&cw+5CX_Sb!SPSfDV?beEDzrfhsXOe_8FBU z#yBO{pcoX5qKS4Ht&-R4H4__htW~<&UNUEZM>*NzyxW6V!5R85=Qtspf!N3iA*YC( z6dvF#(@#~&zwOS8=W0JkriMgwi>mBN4mc6&okUrw-R5wcX@bq}K>IaG+hZ~Fb2lbL z6=&&a^ZA9`OB{j|*^}JUM8X+8??*$m)QO4UiqHyfga_e8WulEOocKI}sHgj@G{F(E zR0zT(vyjj(Frb}lLTyWu7 zAalunGLSTCn&;Hvi8y|#L-XopWdoL5gH~`(s)yPr+@f4$DBD-1UR)ai}z3jt)WMF-|4gf$g*T3Oz-Xre4vU5-{-(=uUb>r;Z1W>nYP-P28IR5MMPTijDQ7x=CGfLgKNUi$Os@NA_ z^9au+P>&6_lLvd;aH!3l;CA)kF1Sh1svcKDRC?&m@)zufdT>>unvZp-75Y9Gt)X6< z$zq1&>4hQqaCjXrm7<)9YL`i?MR6o8%kCp4T6AyX=dQ>sjHN~Mf)fR2gCloFAT zVP}U2+s!hi@ngxgoC@Vi?1vdsM4aGu`F`Rewb2g$+HVgeyobt@rczsNw6z_zJF}m> zYGbvHQD4=_b=WYrFr2v4gM7m#lB?5ezr)Ul!DT$vCw!DXs zr+6TDrruPVq^!a7$rJVGll!^}f=p_6vbSb-5ZQaP@aQ;|ZOcC*va*7u1RgAmfoX+V zu)A;=B+Qx0%H|YE%sFqV2nUOHj6_m@D45jaF?gk#f~g?nnoQQ95^7fI9Gv18@*Opm zNN0|c&|#Z7#Xs8vHJsRU{{0Ro;+siy?OFO*fZ~TWP<%Yh7ofVtLLz}IH1@!-g~hyW zCIXLVrA=>gE+5#hoWpMrS(k-(NigK~Ev)(mj zbAsV`gjVn)>LRpMCmM7Ik%+A^?V=NKADtd#f+c& zRD_oq$$L(NZ>tU- z_^>xj-LYGOAl;yCE1-7kqtdGZ-E`Qp>o3w@gX#zF;C;#==Hnh>yuZ;DOZjA|1j~25 zA-xgQHfi@RFn$^@Es3vV$$JQ)NBSrWVc_2G&c-R+w5(}#OZn)2RD4}9i@_oAy~?jM ztP$bSfhqhG6zqKn4(;6n%k~vO-~FRCllaP0czy3S7<*s|$omVWWt03+^!WrP?vsec zx^WpdikC3Ue0Zr=xrrz>gVPj${nNa3B;0;*Kb$|f9d;c0m6+L#_lZ%d3f?v7afL3h zOb_D#?nU_dDi!nf+D>#rDWV*2tc1X|**Y3{cGDaGoL(JBi zWaWo0L7zg5=9DF-17?BK2)u?9W)fNJtJt6+_lfU>lJTOPrKNjFMNSx*Kq{)kNj!_PcN13?IbL!14_Uh)n8)${t~WnL*8lP)lQRG)+nT+eDAQiNPyS?Ux$ALPaThI2W#9Yw8*yB0Pe_C9)dMuopGIA@DMu*FFNv&`(zfyV!AEchFvJZ1~ZS-SC?Okb}?ppNBVC zM!-MLn3I2gbV2NnTrz;VG7U?0EJhH)7M zaSb|si?Ef-ehNu!MASenxLAG2;?Z)s&4x|(S7ph~vf42r>!F4fs$m{2 ze%d=1y0N~d5CR*RCH_AEQOwjg3PN2jQ2yW*YCCiEdqsmD^D=U^k!AJY0a$NKV5hPo z2nG+8<=Cqs5=1qIVBF9?I5J(&ipN@k)sF51crNu@bUGF^vt5kfY*!=h$+plSyyQbf zRt)Zu2s=J8(AZOoms`n^+_SdfE7YUCL(QG3yd&0flltQD7|9~uV(^&zpz?(Rd#~om zV@!I6Prm|56J?{uzHpbr0aOx+r>}pTodscSwnZwX$ zn3UvyH<&rC1K^qQ{ePbw*nOCk!WZuWn3O#udYEM896kZ}WJhCdb|4vndj_rrIG-5q}XI44U>laq3baZk=fJd%?sfE~%10Z@VVNpmrAqKs9OR^XmV zFK{sGXsR`AS^!p0n(wr5pSlp7KbLZ9NnQTWpDs+iPMx?dOZ;wR6?vt6?tYM{L_se~s zd^}-ysmG1E2Hch_{}k|-(uVJ*Y02MJU|DWtCp&{kbVR3Y!P95cT~RaU(tpkMO?B@@lV?jS0 zxgfy{qMWI7y_Bl&7=E^9VKAU)(V!Lxa#}e{7lQLaE0?X7#IRk4 zR<#dYf2-n_U;c*pioT6!<$-|nK`FY29;cU=)MhcnMLZb4Tg{1Yedz2L)0d3G`o)jp zl*Qp%nwGMq9KTtd$uU_+(<}yiR0}%HIWvy`8F$^}yr*`_9A)qEzgJb3c0Tn<=B4GG za&=#a7{BVM^w8jqgseiNa)13zJ97E-ihY;Mlx{5!9SK z{D?Y9W}XBND#FXo7+!!<+X_%GUVM_pS$c~DA)l`$CvU08rLRpOvOjv}hr>C%{MtGL zZ(yuwzm&P+BuNVmjqI)bPy$+3`CC8mem2U%)u3=)KiAXna zOyRTO!S@R%^VzTHMX<9aMMWCI`RVFS1nl^B$!9(~6_Up}MD0|#_(hktW87_D6|1bg)+MpP_@bL``ar!|nsLMuOLZc5}omf6<- zv9n`UO@v4@v1EHJ%n~yT-~O&2+CQ8M1!$~!6;_Iwh5bne>|3)CURQ~$)VWy9DF0UYs`@%5|i+yF`XVYR)b=Ws`>r}-Yv7uz}yB9K0)T!gQH>RSUU-bCJySb^8}8jYdd_f_PnZ7lzX9>ShYjYwwqW9A!|h_LQf8tDW)tuKIO(BnHjP8{b3d!{+YeKZ}Dy-O9@+y zE!n79uY(Li{^6)OEyGL_FioakK(|2pn5JgLLogf2^w~8#0wEE4HdKOPT@{#i*iAC; zQ?@h!xh(PZH)HO(!SJyAkzml9PV6Iwom;{5)xW^BsS08uy*p5RlNbicJuw&_54%xK z40}0m^?Hrz*eMcYk=)yZ!4fMpmU4}rgVa=7NV!s}^~>g8id!231A>H}%avd{c3o?# zc4vk_BuMVu5Eznj`nblh^9wLF?DR9e^{uYYAl~hj6e8Mxl0wC#=NPx*^2H#7r?*Kq N9Q?56ObCo5{{@osFIfNp diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R index 1e58a88..6cb76ad 100644 --- a/tests/testthat/test-read-parquet-5.R +++ b/tests/testthat/test-read-parquet-5.R @@ -147,16 +147,45 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$d, rep(0:399, 6)) expect_equal(tab$i96, rep(as.POSIXct(as.Date(sprintf('%d-01-01', 1800:2199))), 6)) + skip_on_cran() pf <- test_path("data/mixed-miss.parquet") expect_snapshot({ as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) }) - tab <- read_parquet(pf) - expect_equal(tab$x, 0:2399) - expect_equal(tab$y, 0:2399) - expect_equal(tab$s, as.character(0:2399)) - expect_equal(tab$f, 0:2399) - expect_equal(tab$d, 0:2399) - expect_equal(tab$i96, as.POSIXct(as.Date(sprintf('%d-01-01', 1:2400)))) + d1 <- as.data.frame(read_parquet(pf)) + d2 <- as.data.frame(arrow::read_parquet(pf)) + expect_equal(d1[,1:5], d2[,1:5]) + # arrow does not read INT86 into a time stamp, so compare manually + expect_equal(is.na(d1[,6]), is.na(d2[,6])) + bs6 <- as.POSIXct(as.Date(sprintf('%d-01-01', 1:2400))) + bs6[is.na(d1[,6])] <- NA + expect_equal(d1[,6], bs6) }) + +test_that("mixing RLE_DICTIONARY and PLAIN, DECIMAL", { + skip_on_cran() + pf <- test_path("data/decimal.parquet") + expect_snapshot({ + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + }) + t1 <- read_parquet(pf) + t2 <- arrow::read_parquet(pf) + expect_equal( + as.data.frame(t1), + as.data.frame(t2) + ) + + pf <- test_path("data/decimal2.parquet") + expect_snapshot({ + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + }) + t1 <- as.data.frame(read_parquet(pf)) + t2 <- as.data.frame(arrow::read_parquet(pf)) + expect_equal(t1[,1], t2[,1]) + expect_equal(t1[,2], t2[,2]) + expect_equal(t1[,3], t2[,3]) + expect_equal(t1[,4], t2[,4]) +}) \ No newline at end of file From cfe713f480df12a7e54a83e13f91ba1bf49f50e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 8 Feb 2025 19:56:15 +0100 Subject: [PATCH 16/18] More mixed dict + non-dict column chunks tests --- tests/testthat/_snaps/read-parquet-5.md | 40 +++++++++++ tests/testthat/data/binary.parquet | Bin 0 -> 7031 bytes tests/testthat/data/create-data.py | 90 ++++++++++++++++++++++++ tests/testthat/data/float16.parquet | Bin 0 -> 4140 bytes tests/testthat/test-read-parquet-5.R | 35 ++++++++- 5 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/data/binary.parquet create mode 100644 tests/testthat/data/float16.parquet diff --git a/tests/testthat/_snaps/read-parquet-5.md b/tests/testthat/_snaps/read-parquet-5.md index c6e84dd..5a2a196 100644 --- a/tests/testthat/_snaps/read-parquet-5.md +++ b/tests/testthat/_snaps/read-parquet-5.md @@ -226,3 +226,43 @@ 11 DATA_PAGE 1024 RLE_DICTIONARY 12 DATA_PAGE 176 PLAIN +# mixing RLE_DICTIONARY and PLAIN, BYTE_ARRAY + + Code + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + Output + type repetition_type + 1 REQUIRED + 2 BYTE_ARRAY REQUIRED + 3 BYTE_ARRAY OPTIONAL + Code + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + Output + page_type num_values encoding + 1 DICTIONARY_PAGE 400 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 176 PLAIN + 4 DICTIONARY_PAGE 400 PLAIN + 5 DATA_PAGE 1024 RLE_DICTIONARY + 6 DATA_PAGE 176 PLAIN + +# mixing RLE_DICTIONARY and PLAIN, FLOAT16 + + Code + as.data.frame(read_parquet_schema(pf)[, c("type", "repetition_type")]) + Output + type repetition_type + 1 REQUIRED + 2 FIXED_LEN_BYTE_ARRAY REQUIRED + 3 FIXED_LEN_BYTE_ARRAY OPTIONAL + Code + as.data.frame(read_parquet_pages(pf)[, c("page_type", "num_values", "encoding")]) + Output + page_type num_values encoding + 1 DICTIONARY_PAGE 400 PLAIN + 2 DATA_PAGE 1024 RLE_DICTIONARY + 3 DATA_PAGE 176 PLAIN + 4 DICTIONARY_PAGE 401 PLAIN + 5 DATA_PAGE 1024 RLE_DICTIONARY + 6 DATA_PAGE 176 PLAIN + diff --git a/tests/testthat/data/binary.parquet b/tests/testthat/data/binary.parquet new file mode 100644 index 0000000000000000000000000000000000000000..400aaa38c8f965ae18d07ae39759403ee537da10 GIT binary patch literal 7031 zcmeI13se(V8pmhyB0=27z+eJF0Tp>_9uN|0cb)J?SOK+y+OB~`0*FFjh1#AY0wO9Z zBBBUZUP5>Zh(fJxm1jU}5n0987O+(Ww7jYw&#q^`0d28$*XdbL?e20 z;WlDi{%0ea)>}j1~KT!U<*1j*ny4=63~&s9&}`of{qL_ z(7gipGl*!4H+2taBAN5g{{S)kA+HIVSQxbDbp`u>i9-|1j>Qr`cCJ_^r9xT|%VZdW zC3CO`TGm(`8)IV(jKP;6<`kKXiwZ!I$Qo-zSz8bale;)4ZY!0^G50&0vo%4BZR&Pt zac=y!Xuheq_P31X$y_`yafhzCuiEci-nLyc1p%5vcz)7uL#e8->s>0|T?ZGgtxx7T7N%s1l^tsThQi%h z_7T0BOvj=ATiYwD;&A;wUTwCv|GBV=~8LRik=ezH6$f`(dZzv|~#Si9)V7 z7Vj@wnj_y~;9QbkyiDy|ysD|>Ks0ryHg^EWtk^}+w8nyUPky~d6^ z`R>&Pt^3WFm8@y4K6d_~Ah0dl{Y2rHS<})iO^eZCrhs$U)!+LXvwLvYbTVKOWr$ws{BT6#O<~tOKK}_eHuI3 zf9rhh@vpy#qfVRUSy$Q9kgRL<{)M`#+bzh`dh}vFs|>eB}FC!Z&9q#o_4N162Y zyx%8LVN~SkVtLc7v2F_NKN}TP7%yC^NMs8Gqn*ViMi4Z9p5Y)@YSF|$Nrf4ViRAXkL+yCi>|va9e&+1(E*)v+(-n+yAvG5~f0T!HW~& zxW~@;4S8${cylhF^}nz%SA7$xGdi&#L^5(!u+4F_xIOg7o46419CtQMQLN3X;WtWzub%q;}8miZN9b}iKl7n3zXg6Xd;ti=BC+e8^ zHfmw7wy&d(Gm8dFS`HDBP_ZVBluQ(SKT(oh=t z&4t`$Ef+_GEJRBal``zrbaTG0%wQ|U~K2dw7Xno(1NnN*x+()&(rm&WT3@dTy;>bU@X+RFo$78!dDx=`e>=1J(a8OQ zM+>{p#%t*=SpQhi+qiS4c!Q2{TpUW%=1Sv?x5m?T45YD$Nqizp&(g0h&LkmeI^RM1 za0qo`&Y0ttE{LQ!YhzY^H4(!U7ag8h*;yG$b^V>u+p%Lq!&J>}V|fzWBH$Zl+FXwe z+sF}`hn~eU_3|neeTlx!>bmIR5xOT=7^q1XMlrk$?83TgB1aj2MKgU&op*8VRAbDv0{3)F|I>*U`wP9Y9X@zMvrJd}6nJDhY_mL2 zvZ8F&C!6V38D;)8q1R4tv%0$A{m(x72SwkrOAyeMV&s_Nd98Zg#gFk_X5j`WL#bCP z6z2=W4Qn^RG?uN4mAtSjq?u1%6<%byF<~YvprG`pU-Qwa72>27D8do_ZVr#nvo=Hc zm=4blVX%3G0HeTh{dRACjg8soJTvB`M{B%A@KOiCCR`=yb$!cVY|e=UZqIyYlgV_|kYMY~i3kE{~b`p7W3K zPK!!R0>0`Je<&bb=V-JS#rq*XugodVcB9Ag^Sb;VB2-}G49TGXZ8=z+_~rR zedpxl%Q^X-d)0F794r9ig0a5fRLmKKfe-+kL4y9W z(Qb4aT}IC6HhPR+BOpYCgpd(9p(f0Pm0$@w;UruHN4Nmb8&}(m^`O2GT`#k{sDfy2*ahLk^H$a)<64>dq}sUZr`LRv(NX$dW*Wpp%+ z(@I)RleC$(&{jH^W@#I3ryaDDZlGOsC(Y5lw43gyJ@f$WrH5$12pJJ0W+aT1kulK> z&L|l*Lo#N@!dRJHhGlGwopCTurh#!WoeanHGH#}y@h}67mlVJcKZJtQH7X1Ees;Cg6@@=r-y{_mKxZLIdb2@}faBgq|Z{3NQ&x!6uO@)Fd`Vm?Wkt zCaEdPBs0x7MVn$w7{F{H*zZO}VXO!fVL@0>gxD{}B=!Sr7?A!b@lBLJayX#%*gZaa(?I`)n!=Ka_@}~fMXjxLva(D{b=4M?Pg7Sb_bKXjR?Z+>EqA->imPUu zyKLdHM|bU;XSvxxbsyb*V3GCP-hHw4dk!tleKv5nyT0V`GS+Y8J8=#6BeAxS>2yy+ zX-%U2y`=qdjb(Kyj=5{T>S-*mS2-0$@5DD%H0m0(jr7H)%4Wi~;>!N`=Do+6&P_jj zb+NgsC7t_pM z=7F1y%;gh@&gOW(y>cM2<+I;>JoM~`dzV`dpU(sSZ$&6w&I{XxW8a6pt|Q%rqRGhz zl_#q&7K`Vtz2AGX=5mQ7c4tIVYweYCsjdlLX|20jCCltPnACRkw+Es(Ke>OUt^SL{ z`1ZFZB)2!*s8N=`kNVmhZ`G@7k`E=HYP!=*wyk~8cdGfz7IXK`2`Q(K-EFhnYC=~} zAHUaOeb{#>rQ^hdv$=nK^5AMm%h%^w|F_>w?R0(9Z5unobglE`!^_}Bm*9GrAUKad z)fe4@zq|g-Te<`{y9B~OclC!m{Q2_W0%deCN^K*GHe0g~M2Ga#^di8(Y zs{vR~Fm|0sxayY(R~Aylau=STgxHtIr2H*Za@r_kU^!y*!-Vkg^_&SU$9yxeR z;Oi$=^ov4Wu4rz)a?IpJ`xH%Ue$v=!Dra2j+=66m2H{$HqqQJq-0XBNf6Tlcsp0do z+_jpv9ckkiWqW!`=M^rWur$Z}=tf(iYT~jy5EwGwrk)gAD4MEmw`nFPmPq2u<`-$F zrc}vR-E1$?O;a7l3qls`)KAydtLwC)3;q=KS7s zv3ya9>4T5|5BN`adTGR?kvqr zESV_Z5xTf+Wm5Sx%~AcCvQ;TnvokJLEG}Q2c3{!Q?{1$d&r%&;wk-%-Qen~5B$h@R zKCf7#t5+RKs9aLH*3e9xTz%*B%5_9b`h^|X(!J}cwyYaR4QKamU^=qDxm3Bd>LYYE z=MUfAIa`&Tem+kyA}q#X&FC&1GtGF;u`%=V&%?@xT>?-zvrE|haP(dt+VHTspYfJF z9{c8W2fxc7)vGADx?2DWo^=ZXiblLB@Sj!SFMK8|YU0`BZG@u0U%zieR^UIL7++)8 z1F`C&6pp*5Y{&}tY>`WU~r`H zpN+`jF}~QCAK-64G7+x<@M^H(PkI^c$<#1UMC3VX-UDCF`q-(sv>p5)`a z{uAdlpQsr2=beTZdCnUtxgzJKU;N+VE9AMpP5dHHe4L+%-y)aOiOH4~!;#|Be5vt9 zgD(%Bl)k)p(!AmnH|)eG%hRyv6JDmOSVwJ$_ig3v!vQ}4Vu>|l)5c$|+A=fKYMq&} mX;XHN Date: Sat, 8 Feb 2025 20:28:46 +0100 Subject: [PATCH 17/18] Fix tests on older R `as.POSIXct()` behaves pretty interestingly... Trick is from parsedate. --- tests/testthat/helper.R | 4 ++++ tests/testthat/test-read-parquet-5.R | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 03c6ee9..d1147c9 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -43,3 +43,7 @@ test_write <- function(d, schema = NULL, encoding = NULL) { redact_maxint64 <- function(x) { gsub("922337203685477[0-9][0-9][0-9][0-9]", "922337203685477xxxx", x) } + +utcts <- function(x) { + as.POSIXct(as.POSIXlt(as.Date(x), tz = "UTC")) +} diff --git a/tests/testthat/test-read-parquet-5.R b/tests/testthat/test-read-parquet-5.R index b6e0bcf..d6293ae 100644 --- a/tests/testthat/test-read-parquet-5.R +++ b/tests/testthat/test-read-parquet-5.R @@ -132,7 +132,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$s, as.character(rep(0:399, 6))) expect_equal(tab$f, rep(0:399, 6)) expect_equal(tab$d, rep(0:399, 6)) - expect_equal(tab$i96, rep(as.POSIXct(as.Date(sprintf('%d-01-01', 1800:2199))), 6)) + expect_equal(tab$i96, rep(utcts(sprintf('%d-01-01', 1800:2199)), 6)) pf <- test_path("data/mixed2.parquet") expect_snapshot({ @@ -145,7 +145,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(tab$s, as.character(rep(0:399, 6))) expect_equal(tab$f, rep(0:399, 6)) expect_equal(tab$d, rep(0:399, 6)) - expect_equal(tab$i96, rep(as.POSIXct(as.Date(sprintf('%d-01-01', 1800:2199))), 6)) + expect_equal(tab$i96, rep(utcts(sprintf('%d-01-01', 1800:2199)), 6)) skip_on_cran() pf <- test_path("data/mixed-miss.parquet") @@ -158,7 +158,7 @@ test_that("mixing RLE_DICTIONARY and PLAIN", { expect_equal(d1[,1:5], d2[,1:5]) # arrow does not read INT86 into a time stamp, so compare manually expect_equal(is.na(d1[,6]), is.na(d2[,6])) - bs6 <- as.POSIXct(as.Date(sprintf('%d-01-01', 1:2400))) + bs6 <- utcts(sprintf('%d-01-01', 1:2400)) bs6[is.na(d1[,6])] <- NA expect_equal(d1[,6], bs6) }) From f7bfb2fbcd2ad084649d11f2fb47fbeb01ab7bd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Cs=C3=A1rdi?= Date: Sat, 8 Feb 2025 20:36:39 +0100 Subject: [PATCH 18/18] Add NEWS for #110 [ci skip] --- NEWS.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS.md b/NEWS.md index 70ef11f..c820812 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,10 @@ * `read_parquet()` now correctly reads `FLOAT` columns from files with multiple row groups. +* `read_parquet()` now correctly reads Parquet files that have column + chunks with both dictionary encoded and not dictionary encoded + pages (#110). + # nanoparquet 0.4.0 * API changes: