Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add eager Series.count/1 and lazy Series.size/1 #844

Merged
merged 3 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ defmodule Explorer.Backend.LazySeries do
last: 1,
count: 1,
nil_count: 1,
size: 1,
skew: 2,
correlation: 4,
covariance: 3,
Expand Down Expand Up @@ -707,7 +708,7 @@ defmodule Explorer.Backend.LazySeries do
end

defp dtype_for_agg_operation(op, _)
when op in [:count, :nil_count, :n_distinct, :argmin, :argmax],
when op in [:count, :nil_count, :size, :n_distinct, :argmin, :argmax],
do: {:u, 32}

defp dtype_for_agg_operation(op, series)
Expand Down Expand Up @@ -958,10 +959,10 @@ defmodule Explorer.Backend.LazySeries do
# Catch-all clause: returns the given term unchanged.
# NOTE(review): the other `to_elixir_ast/1` clauses are outside this view — presumably
# this covers terms that need no conversion; confirm against the preceding clauses.
defp to_elixir_ast(other), do: other

@impl true
def size(_series) do
raise """
cannot retrieve the size of a lazy series, use count/1 instead
"""
# Builds a lazy `:size` operation over the given series and wraps it in a
# backend series; both the operation and the resulting series use `{:u, 32}`.
def size(series) do
  u32 = {:u, 32}
  lazy = new(:size, [lazy_series!(series)], u32)

  Backend.Series.new(lazy, u32)
end

@impl true
Expand Down
2 changes: 1 addition & 1 deletion lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5795,7 +5795,7 @@ defmodule Explorer.DataFrame do
end

@doc """
Counts the number of null elements in each column.
Counts the number of `nil` elements in each column.

## Examples

Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ defmodule Explorer.PolarsBackend.Expression do
binary_in: 2,
coalesce: 2,
count: 1,
size: 1,
day_of_week: 1,
day_of_year: 1,
week_of_year: 1,
Expand Down
3 changes: 2 additions & 1 deletion lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ defmodule Explorer.PolarsBackend.Series do
# Aggregation

@impl true
def count(series), do: Shared.apply_series(series, :s_size)
# Polars has no `count` equivalent, so it is derived here:
# the total size of the series minus its number of `nil` entries.
def count(series) do
  total = size(series)

  total - nil_count(series)
end

@impl true
# Delegates to `Shared.apply_series` with the `:s_nil_count` operation.
def nil_count(series) do
  Shared.apply_series(series, :s_nil_count)
end
Expand Down
73 changes: 54 additions & 19 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -1088,15 +1088,26 @@ defmodule Explorer.Series do
# Reads the `dtype` field of the series struct; only `%Series{}` arguments match.
def dtype(%Series{} = series), do: series.dtype

@doc """
Returns the size of the series.
Returns the number of elements in the series.

This is not allowed inside a lazy series. Use `count/1` instead.
See also:

* `count/1` - counts only the non-`nil` elements.
* `nil_count/1` - counts all `nil` elements.

## Examples

Basic example:

iex> s = Explorer.Series.from_list([~D[1999-12-31], ~D[1989-01-01]])
iex> Explorer.Series.size(s)
2

With lists:

iex> s = Explorer.Series.from_list([[1, 2, 3], [4, 5]])
iex> Explorer.Series.size(s)
2
"""
@doc type: :introspection
@spec size(series :: Series.t()) :: non_neg_integer() | lazy_t()
Expand Down Expand Up @@ -4624,41 +4635,65 @@ defmodule Explorer.Series do
end

@doc """
Counts the number of elements in a series.
Counts the number of non-`nil` elements in a series.

See also:

In the context of lazy series and `Explorer.Query`,
`count/1` counts the elements inside the same group.
If no group is in use, then count is going to return
the size of the series.
It is also going to result in a lazy series of `{:u, 32}`
dtype.
* `nil_count/1` - counts only the `nil` elements.
* `size/1` - counts all elements.

## Examples

Without `nil`:

iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.count(s)
3

With `nil`:

iex> s = Explorer.Series.from_list(["a", nil, "c"])
iex> Explorer.Series.count(s)
2

With `:nan` (`:nan` does not count as `nil`):

iex> s = Explorer.Series.from_list([1, :nan, 3])
iex> Explorer.Series.count(s)
3
"""
@doc type: :aggregation
# Dispatches the `:count` operation through `apply_series`.
def count(series) do
  apply_series(series, :count)
end

@doc """
Counts the number of null elements in a series.
Counts the number of `nil` elements in a series.

When used in a query on grouped data, `nil_count/1` is a per-group operation.

In the context of lazy series and `Explorer.Query`,
`count/1` counts the elements inside the same group.
If no group is in use, then count is going to return
the size of the series.
It is also going to result in a lazy series of `{:u, 32}`
dtype.
See also:

* `count/1` - counts only the non-`nil` elements.
* `size/1` - counts all elements.

## Examples

iex> s = Explorer.Series.from_list(["a", nil, "c", nil, nil])
Without `nil`s:

iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.nil_count(s)
3
0

With `nil`s:

iex> s = Explorer.Series.from_list(["a", nil, "c"])
iex> Explorer.Series.nil_count(s)
1

With `:nan`s (`:nan` does not count as `nil`):

iex> s = Explorer.Series.from_list([1, :nan, 3])
iex> Explorer.Series.nil_count(s)
0
"""
@doc type: :aggregation
# Dispatches the `:nil_count` operation through `apply_series`.
def nil_count(series) do
  apply_series(series, :nil_count)
end
Expand Down Expand Up @@ -6072,7 +6107,7 @@ defmodule Explorer.Series do
s64 [nil]
>

It raises an exception if the string is invalid JSON.
It raises an exception if the string is invalid JSON.
"""
@doc type: :string_wise
@spec json_decode(Series.t(), dtype()) :: Series.t()
Expand Down
7 changes: 7 additions & 0 deletions native/explorer/src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,13 @@ pub fn expr_nil_count(expr: ExExpr) -> ExExpr {
ExExpr::new(expr.null_count())
}

/// Calls `len()` on the inner expression of `expr` and wraps the result
/// in a new `ExExpr`.
#[rustler::nif]
pub fn expr_size(expr: ExExpr) -> ExExpr {
    ExExpr::new(expr.clone_inner().len())
}

#[rustler::nif]
pub fn expr_n_distinct(expr: ExExpr) -> ExExpr {
let expr = expr.clone_inner();
Expand Down
1 change: 1 addition & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ rustler::init!(
expr_sum,
expr_variance,
expr_product,
expr_size,
expr_skew,
expr_correlation,
expr_covariance,
Expand Down
38 changes: 38 additions & 0 deletions test/explorer/data_frame/grouped_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,44 @@ defmodule Explorer.DataFrame.GroupedTest do
end
end

# The describe title names `nil_count/1`, matching the function actually
# exercised in the queries below.
describe "size/1, count/1, and nil_count/1" do
  test "work with `nil` and `:nan` correctly" do
    # Column `a` has no missing values, `b` contains a `nil`, and `c` contains a `:nan`.
    # Rows 1 and 2 belong to group 1 and row 3 to group 2, so every aggregate
    # below produces one value per group.
    df =
      DF.new(
        a: [1, 2, 3],
        b: [1, nil, 3],
        c: [1, :nan, 3],
        group: [1, 1, 2]
      )
      |> DF.group_by(:group)

    # Only the `nil` in `b` lowers `count` and raises `nil_count`;
    # the `:nan` in `c` is counted as a regular element.
    assert df
           |> DF.summarise(
             a_count: count(a),
             a_nil_count: nil_count(a),
             a_size: size(a),
             b_nil_count: nil_count(b),
             b_count: count(b),
             b_size: size(b),
             c_count: count(c),
             c_nil_count: nil_count(c),
             c_size: size(c)
           )
           |> DF.to_columns(atom_keys: true) == %{
             a_count: [2, 1],
             a_nil_count: [0, 0],
             a_size: [2, 1],
             b_count: [1, 1],
             b_nil_count: [1, 0],
             b_size: [2, 1],
             c_count: [2, 1],
             c_nil_count: [0, 0],
             c_size: [2, 1],
             group: [1, 2]
           }
  end
end

describe "n_columns/1" do
test "groups don't affect counting of columns", %{df: df} do
df1 = DF.group_by(df, ["year"])
Expand Down
Loading