Skip to content

Commit

Permalink
[BUGFIX] droplabel when combined with other selectors does not exclude labels (#1444)
Browse files Browse the repository at this point in the history

* fix bug for droplabel

* upload-artifact@v3 ->  upload-artifact@v4

* update example in docs

---------

Co-authored-by: samuel.oranyeli <samuel.oranyeli@grow.inc>
  • Loading branch information
samukweku and samuel.oranyeli authored Feb 6, 2025
1 parent 4e71e88 commit 85b7af3
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 30 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
- name: Build docs
run: mkdocs build

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: website
path: site/
Expand Down
75 changes: 46 additions & 29 deletions janitor/functions/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,13 @@ def select_columns(
Exclude columns with the `DropLabel` class:
>>> from janitor import DropLabel
>>> df.select_columns(DropLabel(slice("name", "awake")), "conservation")
brainwt bodywt conservation
0 NaN 50.000 lc
1 0.01550 0.480 NaN
2 NaN 1.350 nt
3 0.00029 0.019 lc
4 0.42300 600.000 domesticated
>>> df.select_columns(DropLabel(slice("name", "awake")))
brainwt bodywt
0 NaN 50.000
1 0.01550 0.480
2 NaN 1.350
3 0.00029 0.019
4 0.42300 600.000
Selection on MultiIndex columns:
>>> d = {'num_legs': [4, 4, 2, 2],
Expand Down Expand Up @@ -673,7 +673,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811
Returns an array of integers.
"""
level_label = {}

index = getattr(df, axis)
if not isinstance(index, pd.MultiIndex):
return _select_index(list(arg), df, axis)
Expand All @@ -687,6 +687,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811
"in the MultiIndex, and should either be all "
"strings or integers."
)
level_label = {}
for key, value in arg.items():
if isinstance(value, dispatch_callable):
indexer = index.get_level_values(key)
Expand Down Expand Up @@ -757,14 +758,14 @@ def _index_dispatch(arg, df, axis): # noqa: F811
def _column_sel_dispatch(cols, df, axis): # noqa: F811
"""
Base function for selection on a Pandas Index object.
Returns the inverse of the passed label(s).
Returns the inverse of the passed label(s),
or the set difference if it is part of a list of labels.
Returns an array of integers.
"""
arr = _select_index(cols.label, df, axis)
index = np.arange(getattr(df, axis).size)
arr = _index_converter(arr, index)
return np.delete(index, arr)
return _index_converter(arr, index)


@_select_index.register(set)
Expand Down Expand Up @@ -797,27 +798,43 @@ def _index_dispatch(arg, df, axis): # noqa: F811
indices = index.get_indexer_for(list(arg))
if (indices != -1).all():
return indices
# treat multiple DropLabel instances as a single unit
checks = (isinstance(entry, DropLabel) for entry in arg)
if sum(checks) > 1:
drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
drop_labels = [entry.label for entry in drop_labels]
drop_labels = DropLabel(drop_labels)
arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
arg.append(drop_labels)
indices = [_select_index(entry, df, axis) for entry in arg]

include = []
exclude = []
for entry in arg:
if isinstance(entry, DropLabel):
exclude.append(entry)
else:
outcome = _select_index(entry, df, axis)
include.append(outcome)
if exclude:
if len(exclude) > 1:
exclude = [entry.label for entry in exclude]
exclude = DropLabel(exclude)
else:
exclude = exclude[0]
exclude = _select_index(exclude, df, axis)
len_exclude = len(exclude)
if len_exclude and not include:
index_arr = np.arange(getattr(df, axis).size)
return np.delete(index_arr, exclude)
if include and len_exclude:
include = [_index_converter(arr, index) for arr in include]
include = np.concatenate(include)
mask = np.isin(include, exclude)
return include[~mask]
# single entry does not need to be combined
# or materialized if possible;
# this offers more performance
if len(indices) == 1:
if is_scalar(indices[0]):
return indices
indices = indices[0]
if is_list_like(indices):
indices = np.asanyarray(indices)
return indices
indices = [_index_converter(arr, index) for arr in indices]
return np.concatenate(indices)
if len(include) == 1:
if is_scalar(include[0]):
return include
include = include[0]
if is_list_like(include):
include = np.asanyarray(include)
return include
include = [_index_converter(arr, index) for arr in include]
return np.concatenate(include)


def _index_converter(arr, index):
Expand Down
21 changes: 21 additions & 0 deletions tests/functions/test_select_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,27 @@ def test_select_column_names_droplabel(dataframe, invert, expected):
assert_frame_equal(df, dataframe[expected])


@pytest.mark.functions
def test_select_column_names_droplabel_mix():
    """Check that DropLabel excludes labels when mixed with other selectors.

    A numeric-dtype callable and a ``DropLabel`` are passed together in a
    single list; the result is compared against the numeric columns of the
    frame with "year" and "rowid" dropped.
    """
    data = {
        "rowid": [1, 2],
        "species": ["Adelie", "Adelie"],
        "island": ["Torgersen", "Torgersen"],
        "bill_length_mm": [39.1, 39.5],
        "bill_depth_mm": [18.7, 17.4],
        "flipper_length_mm": [181.0, 186.0],
        "body_mass_g": [3750.0, 3800.0],
        "sex": ["male", "female"],
        "year": [2007, 2007],
    }

    df = pd.DataFrame(data)
    # "year" and "rowid" are numeric, so the callable alone would select
    # them; the DropLabel in the same list is what must remove them.
    selectors = [pd.api.types.is_numeric_dtype, DropLabel(["year", "rowid"])]
    actual = df.select(selectors)
    expected = df.select_dtypes("number").drop(columns=["year", "rowid"])
    assert_frame_equal(actual, expected)


@pytest.mark.functions
def test_select_column_names_droplabel_multiple(dataframe):
"Base DataFrame"
Expand Down

0 comments on commit 85b7af3

Please sign in to comment.