diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index c7c991cb3..4ead18e54 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -110,6 +110,7 @@ defmodule Explorer.Backend.LazySeries do last: 1, count: 1, nil_count: 1, + size: 1, skew: 2, correlation: 4, covariance: 3, @@ -707,7 +708,7 @@ defmodule Explorer.Backend.LazySeries do end defp dtype_for_agg_operation(op, _) - when op in [:count, :nil_count, :n_distinct, :argmin, :argmax], + when op in [:count, :nil_count, :size, :n_distinct, :argmin, :argmax], do: {:u, 32} defp dtype_for_agg_operation(op, series) @@ -958,10 +959,10 @@ defmodule Explorer.Backend.LazySeries do defp to_elixir_ast(other), do: other @impl true - def size(_series) do - raise """ - cannot retrieve the size of a lazy series, use count/1 instead - """ + def size(series) do + data = new(:size, [lazy_series!(series)], {:u, 32}) + + Backend.Series.new(data, {:u, 32}) end @impl true diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index 847bb9beb..2a98f4731 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -5795,7 +5795,7 @@ defmodule Explorer.DataFrame do end @doc """ - Counts the number of null elements in each column. + Counts the number of `nil` elements in each column. 
## Examples diff --git a/lib/explorer/polars_backend/expression.ex b/lib/explorer/polars_backend/expression.ex index 02201ecce..5cfc70d19 100644 --- a/lib/explorer/polars_backend/expression.ex +++ b/lib/explorer/polars_backend/expression.ex @@ -24,6 +24,7 @@ defmodule Explorer.PolarsBackend.Expression do binary_in: 2, coalesce: 2, count: 1, + size: 1, day_of_week: 1, day_of_year: 1, week_of_year: 1, diff --git a/lib/explorer/polars_backend/series.ex b/lib/explorer/polars_backend/series.ex index 76ddbeca4..fb2245d13 100644 --- a/lib/explorer/polars_backend/series.ex +++ b/lib/explorer/polars_backend/series.ex @@ -173,7 +173,8 @@ defmodule Explorer.PolarsBackend.Series do # Aggregation @impl true - def count(series), do: Shared.apply_series(series, :s_size) + # There is no `count` equivalent in Polars, so we need to make our own. + def count(series), do: size(series) - nil_count(series) @impl true def nil_count(series), do: Shared.apply_series(series, :s_nil_count) diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex index 7ca471732..858e92e36 100644 --- a/lib/explorer/series.ex +++ b/lib/explorer/series.ex @@ -1088,15 +1088,26 @@ defmodule Explorer.Series do def dtype(%Series{dtype: dtype}), do: dtype @doc """ - Returns the size of the series. + Returns the number of elements in the series. - This is not allowed inside a lazy series. Use `count/1` instead. + See also: + + * `count/1` - counts only the non-`nil` elements. + * `nil_count/1` - counts all `nil` elements. ## Examples + Basic example: + iex> s = Explorer.Series.from_list([~D[1999-12-31], ~D[1989-01-01]]) iex> Explorer.Series.size(s) 2 + + With lists: + + iex> s = Explorer.Series.from_list([[1, 2, 3], [4, 5]]) + iex> Explorer.Series.size(s) + 2 """ @doc type: :introspection @spec size(series :: Series.t()) :: non_neg_integer() | lazy_t() @@ -4624,41 +4635,65 @@ defmodule Explorer.Series do end @doc """ - Counts the number of elements in a series. 
+ Counts the number of non-`nil` elements in a series. + + See also: - In the context of lazy series and `Explorer.Query`, - `count/1` counts the elements inside the same group. - If no group is in use, then count is going to return - the size of the series. - It is also going to result in a lazy series of `{:u, 32}` - dtype. + * `nil_count/1` - counts only the `nil` elements. + * `size/1` - counts all elements. ## Examples + Without `nil`: + iex> s = Explorer.Series.from_list(["a", "b", "c"]) iex> Explorer.Series.count(s) 3 + With `nil`: + + iex> s = Explorer.Series.from_list(["a", nil, "c"]) + iex> Explorer.Series.count(s) + 2 + + With `:nan` (`:nan` does not count as `nil`): + + iex> s = Explorer.Series.from_list([1, :nan, 3]) + iex> Explorer.Series.count(s) + 3 """ @doc type: :aggregation def count(series), do: apply_series(series, :count) @doc """ - Counts the number of null elements in a series. + Counts the number of `nil` elements in a series. + + When used in a query on grouped data, `nil_count/1` is a per-group operation. - In the context of lazy series and `Explorer.Query`, - `count/1` counts the elements inside the same group. - If no group is in use, then count is going to return - the size of the series. - It is also going to result in a lazy series of `{:u, 32}` - dtype. + See also: + + * `count/1` - counts only the non-`nil` elements. + * `size/1` - counts all elements.
## Examples - iex> s = Explorer.Series.from_list(["a", nil, "c", nil, nil]) + Without `nil`s: + + iex> s = Explorer.Series.from_list(["a", "b", "c"]) iex> Explorer.Series.nil_count(s) - 3 + 0 + With `nil`s: + + iex> s = Explorer.Series.from_list(["a", nil, "c"]) + iex> Explorer.Series.nil_count(s) + 1 + + With `:nan`s (`:nan` does not count as `nil`): + + iex> s = Explorer.Series.from_list([1, :nan, 3]) + iex> Explorer.Series.nil_count(s) + 0 """ @doc type: :aggregation def nil_count(series), do: apply_series(series, :nil_count) @@ -6072,7 +6107,7 @@ defmodule Explorer.Series do s64 [nil] > - It raises an exception if the string is invalid JSON. + It raises an exception if the string is invalid JSON. """ @doc type: :string_wise @spec json_decode(Series.t(), dtype()) :: Series.t() diff --git a/native/explorer/src/expressions.rs b/native/explorer/src/expressions.rs index 98a8b7238..a98469ec6 100644 --- a/native/explorer/src/expressions.rs +++ b/native/explorer/src/expressions.rs @@ -567,6 +567,13 @@ pub fn expr_nil_count(expr: ExExpr) -> ExExpr { ExExpr::new(expr.null_count()) } +#[rustler::nif] +pub fn expr_size(expr: ExExpr) -> ExExpr { + let expr = expr.clone_inner(); + + ExExpr::new(expr.len()) +} + #[rustler::nif] pub fn expr_n_distinct(expr: ExExpr) -> ExExpr { let expr = expr.clone_inner(); diff --git a/native/explorer/src/lib.rs b/native/explorer/src/lib.rs index ebc80610e..d5e28144b 100644 --- a/native/explorer/src/lib.rs +++ b/native/explorer/src/lib.rs @@ -240,6 +240,7 @@ rustler::init!( expr_sum, expr_variance, expr_product, + expr_size, expr_skew, expr_correlation, expr_covariance, diff --git a/test/explorer/data_frame/grouped_test.exs b/test/explorer/data_frame/grouped_test.exs index 4eb9294aa..e768565e5 100644 --- a/test/explorer/data_frame/grouped_test.exs +++ b/test/explorer/data_frame/grouped_test.exs @@ -751,6 +751,44 @@ defmodule Explorer.DataFrame.GroupedTest do end end + describe "size/1, count/1, and count_nil/1" do + test "work with `nil` and 
`:nan` correctly" do + df = + DF.new( + a: [1, 2, 3], + b: [1, nil, 3], + c: [1, :nan, 3], + group: [1, 1, 2] + ) + |> DF.group_by(:group) + + assert df + |> DF.summarise( + a_count: count(a), + a_nil_count: nil_count(a), + a_size: size(a), + b_nil_count: nil_count(b), + b_count: count(b), + b_size: size(b), + c_count: count(c), + c_nil_count: nil_count(c), + c_size: size(c) + ) + |> DF.to_columns(atom_keys: true) == %{ + a_count: [2, 1], + a_nil_count: [0, 0], + a_size: [2, 1], + b_count: [1, 1], + b_nil_count: [1, 0], + b_size: [2, 1], + c_count: [2, 1], + c_nil_count: [0, 0], + c_size: [2, 1], + group: [1, 2] + } + end + end + describe "n_columns/1" do test "groups don't affect counting of columns", %{df: df} do df1 = DF.group_by(df, ["year"])