From 5608c2752e668087dce345fac02ef68169e2e248 Mon Sep 17 00:00:00 2001 From: Billy Lanchantin Date: Thu, 14 Nov 2024 16:32:23 -0500 Subject: [PATCH] Fix decimal empty list bug (#1015) * Restrict decimal generation Before we were generating values that the Rust side couldn't handle. * Add a decode failure hint Sometimes this decode failure happens because Elixir Decimals can hold a wider range of values than Rust (Arrow) Decimals. Here, we're giving a hint about why the decode may have failed. * Fix `{:list, {:decimal, ...}}` for empty lists The bug turned out to be that in the presence of empty lists, we weren't casting when we should have been. * Remove decimal-related TODOs After restricting the decimal generator such that it only produced valid Decimals and fixing the empty list bug, it now seems like it's safe to include decimals in the property tests. --- native/explorer/src/series/from_list.rs | 19 ++++++++++++++----- test/explorer/data_frame_test.exs | 24 ++++++------------------ test/explorer/series_test.exs | 4 ++++ test/support/generator.ex | 19 ++++++++++++++----- 4 files changed, 38 insertions(+), 28 deletions(-) diff --git a/native/explorer/src/series/from_list.rs b/native/explorer/src/series/from_list.rs index 94c631f69..f1287ddbf 100644 --- a/native/explorer/src/series/from_list.rs +++ b/native/explorer/src/series/from_list.rs @@ -242,7 +242,7 @@ pub fn s_from_list_decimal( .map(|ex_decimal| AnyValue::Decimal(ex_decimal.signed_coef(), ex_decimal.scale())) .map_err(|error| { ExplorerError::Other(format!( - "cannot decode a valid decimal from term. error: {error:?}" + "cannot decode a valid decimal from term; check that `coef` fits into an `i128`. error: {error:?}" )) }), TermType::Atom => Ok(AnyValue::Null), @@ -264,13 +264,22 @@ pub fn s_from_list_decimal( let mut series = Series::from_any_values(name.into(), &values, true)?; - if let DataType::Decimal(result_precision, result_scale) = series.dtype() { - let p: Option = Some(precision.unwrap_or(result_precision.unwrap_or(38))); - let s: Option = Some(scale.unwrap_or(result_scale.unwrap_or(0))); + match series.dtype() { + DataType::Decimal(result_precision, result_scale) => { + let p: Option = Some(precision.unwrap_or(result_precision.unwrap_or(38))); + let s: Option = Some(scale.unwrap_or(result_scale.unwrap_or(0))); - if *result_precision != p || *result_scale != s { + if *result_precision != p || *result_scale != s { + series = series.cast(&DataType::Decimal(p, s))?; + } + } + // An empty list will result in the `Null` dtype. + DataType::Null => { + let p = Some(precision.unwrap_or(38)); + let s = Some(scale.unwrap_or(0)); series = series.cast(&DataType::Decimal(p, s))?; } + other_dtype => panic!("expected dtype to be Decimal. found: {other_dtype:?}"), } Ok(ExSeries::new(series)) diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index d6360bebe..0b3fba4f0 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -4730,9 +4730,7 @@ defmodule Explorer.DataFrameTest do describe "properties" do property "should be able to create a DataFrame from valid rows" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4742,9 +4740,7 @@ defmodule Explorer.DataFrameTest do property "should be able to create a DataFrame from valid columns" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), cols <- Explorer.Generator.columns(dtypes), max_runs: 1_000 ) do @@ -4770,9 +4766,7 @@ defmodule Explorer.DataFrameTest do @tag :skip property "can dump any DataFrame (without duration) to CSV" do check all( - # TODO: remove `:decimal` once we fix whatever bug(s) this is - # finding. - dtypes <- Explorer.Generator.dtypes(exclude: [:decimal, :duration]), + dtypes <- Explorer.Generator.dtypes(exclude: :duration), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4785,9 +4779,7 @@ defmodule Explorer.DataFrameTest do @tag :skip property "can dump any DataFrame to IPC" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4800,9 +4792,7 @@ defmodule Explorer.DataFrameTest do @tag :skip property "can dump any DataFrame to NDJSON" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4815,9 +4805,7 @@ defmodule Explorer.DataFrameTest do @tag :skip property "can dump any DataFrame to PARQUET" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do diff --git a/test/explorer/series_test.exs b/test/explorer/series_test.exs index 0eef59ba5..df750af44 100644 --- a/test/explorer/series_test.exs +++ b/test/explorer/series_test.exs @@ -503,6 +503,10 @@ defmodule Explorer.SeriesTest do assert Series.dtype(s) == {:decimal, 38, 5} end + test "{:list, {:decimal, _, _}} works with empty lists" do + Series.from_list([[Decimal.new("3.21")], []], dtype: {:list, {:decimal, 3, 2}}) + end + test "mixing dates and integers with `:date` dtype" do s = Series.from_list([1, nil, ~D[2024-06-13]], dtype: :date) diff --git a/test/support/generator.ex b/test/support/generator.ex index 881d55d45..172fdfcb1 100644 --- a/test/support/generator.ex +++ b/test/support/generator.ex @@ -295,8 +295,8 @@ defmodule Explorer.Generator do date: constant(:date), datetime: tuple({constant(:datetime), time_unit(), constant("Etc/UTC")}), decimal: - bind(integer(0..37), fn scale -> - bind(integer((scale + 1)..38), fn precision -> + bind(integer(0..19), fn scale -> + bind(integer((scale + 1)..20), fn precision -> tuple({constant(:decimal), constant(precision), constant(scale)}) end) end), @@ -397,14 +397,23 @@ defmodule Explorer.Generator do |> map(&elem(&1, 1)) end + @max_u64 2 ** 64 - 1 @spec datetime(pos_integer(), pos_integer()) :: gen(Decimal.t()) defp decimal(precision, scale) do + # Smallest integer with `(scale + 1)` digits. + min_scale = 10 ** scale + # Largest integer with `precision` digits. + max_precision = 10 ** precision - 1 + tuple({ one_of([constant("-"), constant("+")]), - string(?0..?9, min_length: precision - scale, max_length: precision - scale), - string(?0..?9, min_length: scale, max_length: scale) + integer(min_scale..min(max_precision, @max_u64)) }) - |> map(fn {sign, integer_part, fractional_part} -> + |> map(fn {sign, coef} -> + # By construction, we are guaranteed that precision > scale. + {integer_part, fractional_part} = + coef |> Integer.to_string() |> String.split_at(precision - scale) + Decimal.new(sign <> integer_part <> "." <> fractional_part) end) end