From 85b7af36e1f832f57df41e1bfb1aff59fd05dca6 Mon Sep 17 00:00:00 2001 From: Samuel Oranyeli Date: Thu, 6 Feb 2025 12:55:00 +1100 Subject: [PATCH] [BUGFIX] droplabel when combined with other selectors does not exclude labels (#1444) * fix bug for droplabel * upload-artifact@v3 -> upload-artifact@v4 * update example in docs --------- Co-authored-by: samuel.oranyeli --- .github/workflows/docs.yml | 2 +- janitor/functions/select.py | 75 ++++++++++++++++---------- tests/functions/test_select_columns.py | 21 ++++++++ 3 files changed, 68 insertions(+), 30 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 944cd9876..618d9e922 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -36,7 +36,7 @@ jobs: - name: Build docs run: mkdocs build - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: website path: site/ diff --git a/janitor/functions/select.py b/janitor/functions/select.py index 418ff5251..3f2e90c2f 100644 --- a/janitor/functions/select.py +++ b/janitor/functions/select.py @@ -152,13 +152,13 @@ def select_columns( Exclude columns with the `DropLabel` class: >>> from janitor import DropLabel - >>> df.select_columns(DropLabel(slice("name", "awake")), "conservation") - brainwt bodywt conservation - 0 NaN 50.000 lc - 1 0.01550 0.480 NaN - 2 NaN 1.350 nt - 3 0.00029 0.019 lc - 4 0.42300 600.000 domesticated + >>> df.select_columns(DropLabel(slice("name", "awake"))) + brainwt bodywt + 0 NaN 50.000 + 1 0.01550 0.480 + 2 NaN 1.350 + 3 0.00029 0.019 + 4 0.42300 600.000 Selection on MultiIndex columns: >>> d = {'num_legs': [4, 4, 2, 2], @@ -673,7 +673,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811 Returns an array of integers. """ - level_label = {} + index = getattr(df, axis) if not isinstance(index, pd.MultiIndex): return _select_index(list(arg), df, axis) @@ -687,6 +687,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811 "in the MultiIndex, and should either be all " "strings or integers." ) + level_label = {} for key, value in arg.items(): if isinstance(value, dispatch_callable): indexer = index.get_level_values(key) @@ -757,14 +758,14 @@ def _index_dispatch(arg, df, axis): # noqa: F811 def _column_sel_dispatch(cols, df, axis): # noqa: F811 """ Base function for selection on a Pandas Index object. - Returns the inverse of the passed label(s). + Returns the inverse of the passed label(s), + or the set difference if it is part of a list of labels. Returns an array of integers. """ arr = _select_index(cols.label, df, axis) index = np.arange(getattr(df, axis).size) - arr = _index_converter(arr, index) - return np.delete(index, arr) + return _index_converter(arr, index) @_select_index.register(set) @@ -797,27 +798,43 @@ def _index_dispatch(arg, df, axis): # noqa: F811 indices = index.get_indexer_for(list(arg)) if (indices != -1).all(): return indices - # treat multiple DropLabel instances as a single unit - checks = (isinstance(entry, DropLabel) for entry in arg) - if sum(checks) > 1: - drop_labels = (entry for entry in arg if isinstance(entry, DropLabel)) - drop_labels = [entry.label for entry in drop_labels] - drop_labels = DropLabel(drop_labels) - arg = [entry for entry in arg if not isinstance(entry, DropLabel)] - arg.append(drop_labels) - indices = [_select_index(entry, df, axis) for entry in arg] + + include = [] + exclude = [] + for entry in arg: + if isinstance(entry, DropLabel): + exclude.append(entry) + else: + outcome = _select_index(entry, df, axis) + include.append(outcome) + if exclude: + if len(exclude) > 1: + exclude = [entry.label for entry in exclude] + exclude = DropLabel(exclude) + else: + exclude = exclude[0] + exclude = _select_index(exclude, df, axis) + len_exclude = len(exclude) + if len_exclude and not include: + index_arr = np.arange(getattr(df, axis).size) + return np.delete(index_arr, exclude) + if include and len_exclude: + include = [_index_converter(arr, index) for arr in include] + include = np.concatenate(include) + mask = np.isin(include, exclude) + return include[~mask] # single entry does not need to be combined # or materialized if possible; # this offers more performance - if len(indices) == 1: - if is_scalar(indices[0]): - return indices - indices = indices[0] - if is_list_like(indices): - indices = np.asanyarray(indices) - return indices - indices = [_index_converter(arr, index) for arr in indices] - return np.concatenate(indices) + if len(include) == 1: + if is_scalar(include[0]): + return include + include = include[0] + if is_list_like(include): + include = np.asanyarray(include) + return include + include = [_index_converter(arr, index) for arr in include] + return np.concatenate(include) def _index_converter(arr, index): diff --git a/tests/functions/test_select_columns.py b/tests/functions/test_select_columns.py index adfff33c9..1444a7117 100644 --- a/tests/functions/test_select_columns.py +++ b/tests/functions/test_select_columns.py @@ -44,6 +44,27 @@ def test_select_column_names_droplabel(dataframe, invert, expected): assert_frame_equal(df, dataframe[expected]) +def test_select_column_names_droplabel_mix(): + "Test DropLabel, mixed with other labels" + data = { + "rowid": [1, 2], + "species": ["Adelie", "Adelie"], + "island": ["Torgersen", "Torgersen"], + "bill_length_mm": [39.1, 39.5], + "bill_depth_mm": [18.7, 17.4], + "flipper_length_mm": [181.0, 186.0], + "body_mass_g": [3750.0, 3800.0], + "sex": ["male", "female"], + "year": [2007, 2007], + } + + df = pd.DataFrame(data) + selectors = [pd.api.types.is_numeric_dtype, DropLabel(["year", "rowid"])] + actual = df.select(selectors) + expected = df.select_dtypes("number").drop(columns=["year", "rowid"]) + assert_frame_equal(actual, expected) + + @pytest.mark.functions def test_select_column_names_droplabel_multiple(dataframe): "Base DataFrame"