Skip to content

Commit

Permalink
Merge pull request #1 from srzeszut/knn_imputer
Browse files Browse the repository at this point in the history
Knn imputer
  • Loading branch information
srzeszut authored Nov 28, 2024
2 parents 108475d + e23a9dd commit 071fb27
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 48 deletions.
74 changes: 36 additions & 38 deletions lib/scholar/impute/knn_imputter.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ defmodule Scholar.Impute.KNNImputter do
Imputer for completing missing values using k-Nearest Neighbors.
Each sample's missing values are imputed using the mean value from
`n_neighbors` nearest neighbors found in the training set. Two samples are
close if the features that neither is missing are close.
`num_neighbors` nearest neighbors found in the training set. Two samples are
close if the features that neither is missing are close.
"""
import Nx.Defn
import Scholar.Metrics.Distance
Expand All @@ -14,15 +14,15 @@ defmodule Scholar.Impute.KNNImputter do

opts_schema = [
missing_values: [
type: {:or, [:float, :integer, {:in, [:nan]}]},
type: {:or, [:float, :integer, {:in, [:infinity, :neg_infinity, :nan]}]},
default: :nan,
doc: ~S"""
The placeholder for the missing values. All occurrences of `:missing_values` will be imputed.
The default value expects there are no NaNs in the input tensor.
"""
],
number_of_neighbors: [
num_neighbors: [
type: :pos_integer,
default: 2,
doc: "The number of nearest neighbors."
Expand All @@ -35,9 +35,9 @@ defmodule Scholar.Impute.KNNImputter do
Imputter for completing missing values using k-Nearest Neighbors.
Preconditions:
* `num_neighbors` is a positive integer.
* number of neighbors must be less than number valid of rows - 1 (valid row is row with more than 1 non nan value) otherwise it is better to use simple imputter
* when you set a value different than :nan in `missing_values` there should be no NaNs in the input tensor
* The number of neighbors must be less than the number of valid rows - 1.
* A valid row is a row with more than one non-NaN value. Otherwise it is better to use a simpler imputter.
* When you set a value different than `:nan` in `missing_values`, there should be no NaNs in the input tensor.
## Options
Expand All @@ -47,15 +47,14 @@ defmodule Scholar.Impute.KNNImputter do
The function returns a struct with the following parameters:
* `:missing_values` - the same value as in `:missing_values`
* `:missing_values` - the same value as in the `:missing_values` option
* `:statistics` - The imputation fill value for each feature. Computing statistics can result in
[`Nx.Constant.nan/0`](https://hexdocs.pm/nx/Nx.Constants.html#nan/0) values.
* `:statistics` - The imputation fill value for each feature. Computing statistics can result in NaN values.
## Examples
iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]])
iex> Scholar.Impute.KNNImputter.fit(x, number_of_neighbors: 2)
iex> Scholar.Impute.KNNImputter.fit(x, num_neighbors: 2)
%Scholar.Impute.KNNImputter{
statistics: Nx.tensor(
[
Expand All @@ -77,20 +76,18 @@ defmodule Scholar.Impute.KNNImputter do
input_rank = Nx.rank(x)

if input_rank != 2 do
raise ArgumentError, "Wrong input rank. Expected: 2, got: #{inspect(input_rank)}"
raise ArgumentError, "wrong input rank. Expected: 2, got: #{inspect(input_rank)}"
end

x =
if opts[:missing_values] != :nan,
do: Nx.select(Nx.equal(x, opts[:missing_values]), Nx.Constants.nan(), x),
else: x
missing_values = opts[:missing_values]

num_neighbors = opts[:number_of_neighbors]
x =
if missing_values != :nan,
do: Nx.select(Nx.equal(x, missing_values), :nan, x),
else: x

placeholder_value = Nx.Constants.nan() |> Nx.tensor()

statistics = knn_impute(x, placeholder_value, num_neighbors: num_neighbors)
missing_values = opts[:missing_values]
statistics = knn_impute(x, num_neighbors: opts[:num_neighbors], missing_values: missing_values)
%__MODULE__{statistics: statistics, missing_values: missing_values}
end

Expand All @@ -104,7 +101,7 @@ defmodule Scholar.Impute.KNNImputter do
## Examples
iex> x = Nx.tensor([[40.0, 2.0],[4.0, 5.0],[7.0, :nan],[:nan, 8.0],[11.0, 11.0]])
iex> imputer = Scholar.Impute.KNNImputter.fit(x, number_of_neighbors: 2)
iex> imputer = Scholar.Impute.KNNImputter.fit(x, num_neighbors: 2)
iex> Scholar.Impute.KNNImputter.transform(imputer, x)
Nx.tensor(
[
Expand All @@ -121,31 +118,28 @@ defmodule Scholar.Impute.KNNImputter do
Nx.select(mask, statistics, x)
end

defnp knn_impute(x, placeholder_value, opts \\ []) do
defnp knn_impute(x, opts \\ []) do
mask = Nx.is_nan(x)
{num_rows, num_cols} = Nx.shape(x)
num_neighbors = opts[:num_neighbors]

placeholder_value = Nx.tensor(:nan)

values_to_impute = Nx.broadcast(placeholder_value, x)

{_, values_to_impute} =
while {{row = 0, mask, num_neighbors, num_rows, x}, values_to_impute},
Nx.less(row, num_rows) do
row < num_rows do
{_, values_to_impute} =
while {{col = 0, mask, num_neighbors, num_cols, row, x}, values_to_impute},
Nx.less(col, num_cols) do
if mask[row][col] > 0 do
col < num_cols do
if mask[row][col] do
{rows, cols} = Nx.shape(x)

neighbor_avg =
calculate_knn(x, row, col, rows: rows, num_neighbors: opts[:num_neighbors])

indices =
[Nx.stack(row), Nx.stack(col)]
|> Nx.concatenate()
|> Nx.stack()

values_to_impute = Nx.indexed_put(values_to_impute, indices, Nx.stack(neighbor_avg))
values_to_impute = Nx.put_slice(values_to_impute, [row, col], Nx.reshape(neighbor_avg, {1, 1}))
{{col + 1, mask, num_neighbors, cols, row, x}, values_to_impute}
else
{{col + 1, mask, num_neighbors, num_cols, row, x}, values_to_impute}
Expand All @@ -171,15 +165,12 @@ defmodule Scholar.Impute.KNNImputter do
# to the row is under its index in the tensor
{_, row_distances} =
while {{i = 0, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances},
Nx.less(i, rows) do
i < rows do

potential_donor = x[i]

distance =
if i == nan_row do
Nx.Constants.infinity(Nx.type(row_with_value_to_fill))
else
nan_euclidian(row_with_value_to_fill, nan_col, potential_donor)
end
calculate_distance(row_with_value_to_fill, nan_col, potential_donor,nan_row)

row_distances = Nx.indexed_put(row_distances, Nx.new_axis(i, 0), distance)
{{i + 1, x, row_with_value_to_fill, rows, nan_row, nan_col}, row_distances}
Expand All @@ -192,8 +183,15 @@ defmodule Scholar.Impute.KNNImputter do
Nx.sum(values) / num_neighbors
end

# Returns the distance between `row` (the row whose value is being imputed) and
# `potential_donor`, skipping column `nan_col` via `nan_euclidean/3`.
# Presumably a row must get an infinite distance to itself so that it is never
# picked as its own neighbor — TODO confirm.
# NOTE(review): at the call site `row` is bound to the row tensor while
# `nan_row` is that row's index, and the donor's loop index is not passed in,
# so the pinned match below does not compare the donor's position with
# `nan_row`. Confirm the intended self-exclusion check, and verify that `case`
# with a pinned pattern is accepted inside `defn`.
defnp calculate_distance(row,nan_col,potential_donor,nan_row) do
case row do
^nan_row -> Nx.Constants.infinity(Nx.type(row))
_ -> nan_euclidean(row, nan_col, potential_donor)
end
end

# nan_col is the column of the value to impute
defnp nan_euclidian(row, nan_col, potential_neighbor) do
defnp nan_euclidean(row, nan_col, potential_neighbor) do
{coordinates} = Nx.shape(row)

# minus nan column
Expand Down
21 changes: 11 additions & 10 deletions test/scholar/impute/knn_imputter_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ defmodule KNNImputterTest do

knn_imputer =
%KNNImputter{statistics: statistics, missing_values: missing_values} =
jit_fit.(x, missing_values: :nan, number_of_neighbors: 2)
jit_fit.(x, missing_values: :nan, num_neighbors: 2)

assert missing_values == :nan

Expand Down Expand Up @@ -47,7 +47,7 @@ defmodule KNNImputterTest do

knn_imputter =
%KNNImputter{statistics: statistics, missing_values: missing_values} =
jit_fit.(x, missing_values: :nan, number_of_neighbors: 1)
jit_fit.(x, missing_values: :nan, num_neighbors: 1)

assert missing_values == :nan

Expand All @@ -72,13 +72,14 @@ defmodule KNNImputterTest do

test "missing values different than :nan" do
x = generate_data()
x = Nx.select(Nx.is_nan(x), Nx.tensor(19.0), x)
x = Nx.select(Nx.is_nan(x), 19.0, x)
# x = Nx.select(Nx.equal(x,19), :nan, x)
jit_fit = Nx.Defn.jit(&KNNImputter.fit/2)
jit_transform = Nx.Defn.jit(&KNNImputter.transform/2)

knn_imputter =
%KNNImputter{statistics: statistics, missing_values: missing_values} =
jit_fit.(x, missing_values: 19.0, number_of_neighbors: 2)
jit_fit.(x, missing_values: 19.0, num_neighbors: 2)

assert missing_values == 19.0

Expand All @@ -103,25 +104,25 @@ defmodule KNNImputterTest do
end

describe "errors" do
test "Wrong impute rank" do
test "invalid impute rank" do
x = Nx.tensor([1, 2, 2, 3])

assert_raise ArgumentError,
"Wrong input rank. Expected: 2, got: 1",
"wrong input rank. Expected: 2, got: 1",
fn ->
KNNImputter.fit(x, missing_values: 1, number_of_neighbors: 2)
KNNImputter.fit(x, missing_values: 1, num_neighbors: 2)
end
end

test "Invalid n_neighbors value" do
test "invalid n_neighbors value" do
x = generate_data()

jit_fit = Nx.Defn.jit(&KNNImputter.fit/2)

assert_raise NimbleOptions.ValidationError,
"invalid value for :number_of_neighbors option: expected positive integer, got: -1",
"invalid value for :num_neighbors option: expected positive integer, got: -1",
fn ->
jit_fit.(x, missing_values: 1.0, number_of_neighbors: -1)
jit_fit.(x, missing_values: 1.0, num_neighbors: -1)
end
end
end
Expand Down

0 comments on commit 071fb27

Please sign in to comment.