From 96cfca52c932e8fe9b42b188a00107478c2b7319 Mon Sep 17 00:00:00 2001 From: Philip Capel Date: Wed, 7 Feb 2024 12:32:12 -0600 Subject: [PATCH 1/6] Add failing test reproduction of issue * https://github.com/elixir-explorer/explorer/issues/848 --- test/explorer/data_frame_test.exs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index 18fa48fb5..5f9e7ffb4 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -2763,6 +2763,14 @@ defmodule Explorer.DataFrameTest do end end + test "with keyword and a column that is duplicated" do + df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) + + assert_raise ArgumentError, ~r"duplicate column name \"g\"", fn -> + DF.rename(df, a: "first", a: "second") + end + end + test "with a map and a column that doesn't exist" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) From 4ede84feac492426f1433c04ef60313767d25892 Mon Sep 17 00:00:00 2001 From: Philip Capel Date: Wed, 7 Feb 2024 13:00:54 -0600 Subject: [PATCH 2/6] Add column check for pairs * Check pairs for duplicated source column values --- lib/explorer/data_frame.ex | 10 ++++++++++ test/explorer/data_frame_test.exs | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index 9eee4a461..6c255853c 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -3609,6 +3609,7 @@ defmodule Explorer.DataFrame do df pairs -> + maybe_raise_column_duplicate(pairs) pairs_map = Map.new(pairs) old_dtypes = df.dtypes @@ -3633,6 +3634,15 @@ defmodule Explorer.DataFrame do end end + defp maybe_raise_column_duplicate(pairs) when is_column_pairs(pairs) do + Enum.reduce(pairs, MapSet.new(), fn {col, _val}, seen -> + case col in seen do + true -> raise ArgumentError, "duplicate column name \"#{col}\" in rename" + false -> MapSet.put(seen, col) + end + end) + end + defp check_new_names_length!(df, names) do width = n_columns(df) n_new_names = length(names) diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index 5f9e7ffb4..14904d7d3 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -2766,7 +2766,7 @@ defmodule Explorer.DataFrameTest do test "with keyword and a column that is duplicated" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) - assert_raise ArgumentError, ~r"duplicate column name \"g\"", fn -> + assert_raise ArgumentError, ~r"duplicate column name \"a\"", fn -> DF.rename(df, a: "first", a: "second") end end From 70bca937a743380685a2216012d4cd9461cf227e Mon Sep 17 00:00:00 2001 From: Philip Date: Tue, 13 Feb 2024 09:01:49 -0600 Subject: [PATCH 3/6] Refactor maybe_raise_column_duplicate * Refactor the reduction to use a list instead of a map. This preserves the ability to name the duplicated column while simplifying the procedure. Co-authored-by: Philip Sampaio --- lib/explorer/data_frame.ex | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index 6c255853c..b42e21c23 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -3635,11 +3635,10 @@ defmodule Explorer.DataFrame do end defp maybe_raise_column_duplicate(pairs) when is_column_pairs(pairs) do - Enum.reduce(pairs, MapSet.new(), fn {col, _val}, seen -> - case col in seen do - true -> raise ArgumentError, "duplicate column name \"#{col}\" in rename" - false -> MapSet.put(seen, col) - end + Enum.reduce(pairs, [], fn {col, _val}, seen -> + if col in seen, do: raise(ArgumentError, "duplicate column name \"#{col}\" in rename") + + [col | seen] end) end From 2ce4eb7cada9918ddc08869aff4573620f24ed3d Mon Sep 17 00:00:00 2001 From: Philip Capel Date: Tue, 13 Feb 2024 16:24:14 -0600 Subject: [PATCH 4/6] Refactor to a more performant check --- lib/explorer/data_frame.ex | 13 ++++--------- test/explorer/data_frame_test.exs | 18 +++++++++++++++++- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index b42e21c23..b3cef91d5 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -3609,8 +3609,11 @@ defmodule Explorer.DataFrame do df pairs -> - maybe_raise_column_duplicate(pairs) pairs_map = Map.new(pairs) + + if Enum.count(pairs) != map_size(pairs_map), + do: raise(ArgumentError, "duplicate source column for rename") + old_dtypes = df.dtypes for {name, _} <- pairs do @@ -3634,14 +3637,6 @@ defmodule Explorer.DataFrame do end end - defp maybe_raise_column_duplicate(pairs) when is_column_pairs(pairs) do - Enum.reduce(pairs, [], fn {col, _val}, seen -> - if col in seen, do: raise(ArgumentError, "duplicate column name \"#{col}\" in rename") - - [col | seen] - end) - end - defp check_new_names_length!(df, names) do width = n_columns(df) n_new_names = length(names) diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index 14904d7d3..36c11be69 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -2766,11 +2766,27 @@ defmodule Explorer.DataFrameTest do test "with keyword and a column that is duplicated" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) - assert_raise ArgumentError, ~r"duplicate column name \"a\"", fn -> + assert_raise ArgumentError, ~r"duplicate source or target column for rename", fn -> DF.rename(df, a: "first", a: "second") end end + test "with mix of column names and indices" do + df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) + + assert_raise ArgumentError, ~r"duplicate source or target column for rename", fn -> + DF.rename(df, [{"a", "first"}, {0, "g"}]) + end + end + + test "with binary tuples and a column that is duplicated" do + df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) + + assert_raise ArgumentError, ~r"duplicate source or target column for rename", fn -> + DF.rename(df, [{"a", "first"}, {"a", "second"}]) + end + end + test "with a map and a column that doesn't exist" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) From c0a5afbf4b74dcc30ab6cd5fa547c5200d97e8c4 Mon Sep 17 00:00:00 2001 From: Philip Capel Date: Wed, 14 Feb 2024 11:37:03 -0600 Subject: [PATCH 5/6] Check for Err in result of df_rename_columns * df.rename propagates error, resulting in RuntimeError for duplicated columns --- native/explorer/src/dataframe.rs | 2 +- test/explorer/data_frame_test.exs | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/native/explorer/src/dataframe.rs b/native/explorer/src/dataframe.rs index 4b32a5c22..0ca0ad0a4 100644 --- a/native/explorer/src/dataframe.rs +++ b/native/explorer/src/dataframe.rs @@ -533,7 +533,7 @@ pub fn df_rename_columns( ) -> Result { let mut df = df.clone(); for (original, new_name) in renames { - df.rename(original, new_name).expect("should rename"); + df.rename(original, new_name)?; } Ok(ExDataFrame::new(df)) diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index 36c11be69..2ba594f72 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -2766,7 +2766,7 @@ defmodule Explorer.DataFrameTest do test "with keyword and a column that is duplicated" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) - assert_raise ArgumentError, ~r"duplicate source or target column for rename", fn -> + assert_raise ArgumentError, ~r"duplicate source column for rename", fn -> DF.rename(df, a: "first", a: "second") end end @@ -2774,7 +2774,7 @@ defmodule Explorer.DataFrameTest do test "with mix of column names and indices" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) - assert_raise ArgumentError, ~r"duplicate source or target column for rename", fn -> + assert_raise ArgumentError, ~r"duplicate source column for rename", fn -> DF.rename(df, [{"a", "first"}, {0, "g"}]) end end @@ -2782,11 +2782,19 @@ defmodule Explorer.DataFrameTest do test "with binary tuples and a column that is duplicated" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) - assert_raise ArgumentError, ~r"duplicate source or target column for rename", fn -> + assert_raise ArgumentError, ~r"duplicate source column for rename", fn -> DF.rename(df, [{"a", "first"}, {"a", "second"}]) end end + test "with binary tuples and a target column that is duplicated" do + df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) + + assert_raise RuntimeError, ~r"duplicate column names found", fn -> + DF.rename(df, [{"a", "first"}, {"b", "first"}]) + end + end + test "with a map and a column that doesn't exist" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) From 275f913aa5844048755413c56727f9d589340b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Wed, 14 Feb 2024 20:40:04 +0100 Subject: [PATCH 6/6] Apply suggestions from code review --- test/explorer/data_frame_test.exs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index 2ba594f72..d12915843 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -2771,7 +2771,7 @@ defmodule Explorer.DataFrameTest do end end - test "with mix of column names and indices" do + test "with mix of column name and index that is duplicated" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) assert_raise ArgumentError, ~r"duplicate source column for rename", fn -> @@ -2779,7 +2779,7 @@ defmodule Explorer.DataFrameTest do end end - test "with binary tuples and a column that is duplicated" do + test "with string column names that are duplicated" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) assert_raise ArgumentError, ~r"duplicate source column for rename", fn -> @@ -2787,7 +2787,7 @@ defmodule Explorer.DataFrameTest do end end - test "with binary tuples and a target column that is duplicated" do + test "with string column names and a target that is duplicated" do df = DF.new(a: [1, 2, 3], b: ["a", "b", "c"]) assert_raise RuntimeError, ~r"duplicate column names found", fn ->