[BUGFIX] Failures in pivot_wider (#1337)
* handle deprecation warnings where possible; use vectorized option for truncate_datetime; fix failing tests in pivot_wider

* changelog

* fix test failures for pivot_wider; fix deprecation warnings where possible

* restore fastpath in _range_indices for conditional_join

* Delete examples/notebooks/bla.ipynb

* fix docstrings xarray

* fix docstrings xarray

* remove dropna for equi join test

* updates

* changelog

* changelog

---------

Co-authored-by: samuel.oranyeli <samuel.oranyeli@grow.inc>
Co-authored-by: Eric Ma <ericmjl@users.noreply.github.com>
3 people authored Mar 13, 2024
1 parent 65c105b commit 519efbf
Showing 18 changed files with 96 additions and 104 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@
- [ENH] `change_index_dtype` added. - @samukweku Issue #1314
- [ENH] Add `glue` and `axis` parameters to `collapse_levels`. - Issue #211 @samukweku
- [ENH] `row_to_names` now supports multiple rows conversion to columns. - @samukweku Issue #1333
- [ENH] Fix deprecation warnings from pandas. `truncate_datetime` now uses a vectorized option. - @samukweku #1337

## [v0.26.0] - 2023-09-18

2 changes: 1 addition & 1 deletion janitor/functions/change_type.py
@@ -97,7 +97,7 @@ def change_type(
elif ignore_exception == "fillna":
if isinstance(column_name, Hashable):
column_name = [column_name]
df[column_name] = df[column_name].applymap(_convert, dtype=dtype)
df[column_name] = df[column_name].map(_convert, dtype=dtype)
else:
raise ValueError("Unknown option for ignore_exception")

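For context on the `change_type` edit above: pandas 2.1 deprecated `DataFrame.applymap` and renamed it to the element-wise `DataFrame.map`, which forwards extra keyword arguments to the callable in the same way. A minimal sketch of the migration; the `_convert` helper here is a hypothetical stand-in, not the one defined in `change_type.py`:

    import pandas as pd

    def _convert(value, dtype):
        # hypothetical converter: coerce, fall back to NA on failure
        try:
            return dtype(value)
        except (TypeError, ValueError):
            return pd.NA

    df = pd.DataFrame({"a": ["1", "2"], "b": ["3", "oops"]})

    # pandas < 2.1 spelling (warns on newer versions):
    # df[["a", "b"]] = df[["a", "b"]].applymap(_convert, dtype=int)

    # pandas >= 2.1 spelling used in the diff:
    df[["a", "b"]] = df[["a", "b"]].map(_convert, dtype=int)
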
3 changes: 3 additions & 0 deletions janitor/functions/conditional_join.py
@@ -1074,6 +1074,9 @@ def _range_indices(

right_index = np.concatenate(right_index)
left_index = left_index.repeat(repeater)

if fastpath:
return left_index, right_index
# here we search for actual positions
# where left_c is </<= right_c
# safe to index the arrays, since we are picking the positions
15 changes: 11 additions & 4 deletions janitor/functions/pivot.py
@@ -9,7 +9,6 @@
import pandas as pd
import pandas_flavor as pf
from pandas.api.types import (
is_categorical_dtype,
is_extension_array_dtype,
is_list_like,
)
@@ -1619,7 +1618,11 @@ def _computations_pivot_wider(
indexer = out.index
if index_expand and index:
any_categoricals = (indexer.get_level_values(name) for name in index)
any_categoricals = any(map(is_categorical_dtype, any_categoricals))
any_categoricals = (
isinstance(entry, pd.CategoricalIndex)
for entry in any_categoricals
)
any_categoricals = any(any_categoricals)
if any_categoricals:
indexer = _expand(indexer, retain_categories=True)
out = out.reindex(index=indexer)
@@ -1629,7 +1632,11 @@
any_categoricals = (
indexer.get_level_values(name) for name in names_from
)
any_categoricals = any(map(is_categorical_dtype, any_categoricals))
any_categoricals = (
isinstance(entry, pd.CategoricalIndex)
for entry in any_categoricals
)
any_categoricals = any(any_categoricals)
if any_categoricals:
retain_categories = True
if flatten_levels & (
@@ -1834,7 +1841,7 @@ def _expand(indexer, retain_categories):
categories=arr.categories,
ordered=arr.ordered,
)
if is_categorical_dtype(arr)
if isinstance(arr, pd.CategoricalIndex)
else arr.unique()
)
for arr in indexer
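The `pivot_wider` changes above swap the deprecated `pandas.api.types.is_categorical_dtype` helper for a plain `isinstance` check against `pd.CategoricalIndex`. A small self-contained sketch of the same test on a made-up two-level index:

    import pandas as pd

    idx = pd.MultiIndex.from_product(
        [pd.CategoricalIndex(["a", "b"]), [1, 2]], names=["group", "num"]
    )

    # get_level_values returns a CategoricalIndex for categorical levels,
    # so the isinstance check replaces the deprecated is_categorical_dtype call
    any_categoricals = any(
        isinstance(idx.get_level_values(name), pd.CategoricalIndex)
        for name in idx.names
    )
    print(any_categoricals)  # True
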
4 changes: 1 addition & 3 deletions janitor/functions/sort_column_value_order.py
@@ -5,8 +5,6 @@

from janitor.utils import check, check_column

from .remove_columns import remove_columns # noqa: F401


@pf.register_dataframe_method
def sort_column_value_order(
@@ -65,7 +63,7 @@ def sort_column_value_order(
if not column_value_order:
raise ValueError("column_value_order dictionary cannot be empty")

df = df.assign(cond_order=df[column].replace(column_value_order))
df = df.assign(cond_order=df[column].map(column_value_order))

sort_by = ["cond_order"]
if columns is not None:
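On the `sort_column_value_order` change: `Series.replace` with a dict can trigger pandas' downcasting `FutureWarning` on recent versions, while `Series.map` does a plain lookup. One behavioural difference, sketched below with made-up data: keys missing from the mapping become `NaN` under `map`, whereas `replace` would have passed them through unchanged.

    import pandas as pd

    s = pd.Series(["low", "high", "medium"])
    order = {"low": 0, "medium": 1, "high": 2}

    # ranked = s.replace(order)   # may warn about deprecated downcasting
    ranked = s.map(order)         # plain dict lookup; unmapped values -> NaN
    print(ranked.tolist())        # [0, 2, 1]
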
77 changes: 20 additions & 57 deletions janitor/functions/truncate_datetime.py
@@ -1,7 +1,6 @@
"""Implementation of the `truncate_datetime` family of functions."""

import datetime as dt

import numpy as np
import pandas as pd
import pandas_flavor as pf
from pandas.api.types import is_datetime64_any_dtype
@@ -48,67 +47,31 @@ def truncate_datetime_dataframe(
A pandas DataFrame with all valid datetimes truncated down
to the specified precision.
"""
ACCEPTABLE_DATEPARTS = ("YEAR", "MONTH", "DAY", "HOUR", "MINUTE", "SECOND")
# idea from Stack Overflow
# https://stackoverflow.com/a/28783971/7175713
# https://numpy.org/doc/stable/reference/arrays.datetime.html
ACCEPTABLE_DATEPARTS = {
"YEAR": "datetime64[Y]",
"MONTH": "datetime64[M]",
"DAY": "datetime64[D]",
"HOUR": "datetime64[h]",
"MINUTE": "datetime64[m]",
"SECOND": "datetime64[s]",
}
datepart = datepart.upper()
if datepart not in ACCEPTABLE_DATEPARTS:
raise ValueError(
"Received an invalid `datepart` precision. "
f"Please enter any one of {ACCEPTABLE_DATEPARTS}."
)

dt_cols = [
column
for column, coltype in df.dtypes.items()
if is_datetime64_any_dtype(coltype)
]
if not dt_cols:
# avoid copying df if no-op is expected
return df

df = df.copy()
# NOTE: use **kwargs of `applymap` instead of lambda when we upgrade to
# pandas >= 1.3.0
df[dt_cols] = df[dt_cols].applymap(
lambda x: _truncate_datetime(x, datepart=datepart),
)

return df


def _truncate_datetime(timestamp: dt.datetime, datepart: str) -> dt.datetime:
"""Truncate a given timestamp to the given datepart.
Truncation will only occur on valid timestamps (datetime-like objects).
Args:
timestamp: Expecting a datetime from python `datetime` class (dt).
datepart: Truncation precision, YEAR, MONTH, DAY,
HOUR, MINUTE, SECOND.
Returns:
A truncated datetime object to the precision specified by
datepart.
"""
if pd.isna(timestamp):
return timestamp

recurrence = [0, 1, 1, 0, 0, 0] # [YEAR, MONTH, DAY, HOUR, MINUTE, SECOND]
ENUM = {
"YEAR": 0,
"MONTH": 1,
"DAY": 2,
"HOUR": 3,
"MINUTE": 4,
"SECOND": 5,
0: timestamp.year,
1: timestamp.month,
2: timestamp.day,
3: timestamp.hour,
4: timestamp.minute,
5: timestamp.second,
}
dictionary = {}

for i in range(ENUM[datepart] + 1):
recurrence[i] = ENUM[i]
for label, series in df.items():
if is_datetime64_any_dtype(series):
dtype = ACCEPTABLE_DATEPARTS[datepart]
# TODO: add branch for pyarrow arrays
series = np.array(series._values, dtype=dtype)
dictionary[label] = series

return dt.datetime(*recurrence)
return pd.DataFrame(dictionary)
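The rewritten `truncate_datetime_dataframe` above leans on NumPy's datetime64 unit casting (per the Stack Overflow answer linked in the code): casting a datetime column to a coarser unit truncates every element at once, instead of looping over individual timestamps with `applymap`. A standalone sketch of the idea with made-up data:

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.to_datetime(["2024-03-13 14:25:37", "2021-07-04 01:02:03"]))

    # cast to day precision; NaT values survive the cast untouched
    per_day = s.to_numpy().astype("datetime64[D]")
    # array(['2024-03-13', '2021-07-04'], dtype='datetime64[D]')

    # back to a nanosecond-resolution Series with times truncated to midnight
    truncated = pd.Series(per_day.astype("datetime64[ns]"))
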
6 changes: 3 additions & 3 deletions janitor/xarray/functions.py
@@ -48,7 +48,7 @@ def clone_using(
... np.ones((4, 6)), new_name='new_and_improved', use_coords=False,
... )
>>> new_da
<xarray.DataArray 'new_and_improved' (ax_1: 4, ax_2: 6)>
<xarray.DataArray 'new_and_improved' (ax_1: 4, ax_2: 6)> Size: 192B
array([[1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1.],
@@ -130,10 +130,10 @@ def convert_datetime_to_number(
... )
>>> da_minutes = da.convert_datetime_to_number("s", dim="time")
>>> da_minutes
<xarray.DataArray (time: 6)>
<xarray.DataArray (time: 6)> Size: 48B
array([2, 8, 0, 1, 7, 7])
Coordinates:
* time (time) float64 0.0 60.0 120.0 180.0 240.0 300.0
* time (time) float64 48B 0.0 60.0 120.0 180.0 240.0 300.0
Args:
da_or_ds: XArray object.
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -49,12 +49,12 @@ markers = [

[tool.ruff]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F", "I"]
ignore = []
lint.select = ["E", "F", "I"]
lint.ignore = []

# Allow fix for all enabled rules (when `--fix`) is provided.
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
unfixable = []
lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
lint.unfixable = []

# Exclude a variety of commonly ignored directories.
exclude = [
@@ -86,7 +86,7 @@ exclude = [
line-length = 88

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

# Assume Python 3.10
target-version = "py310"
1 change: 1 addition & 0 deletions tests/conftest.py
@@ -7,6 +7,7 @@
from janitor.testing_utils import date_data

os.environ["NUMBA_DISABLE_JIT"] = "1"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

TEST_DATA_DIR = "tests/test_data"
EXAMPLES_DIR = "examples/"
13 changes: 9 additions & 4 deletions tests/functions/test_complete.py
@@ -435,15 +435,18 @@ def test_complete_multiple_groupings():

def test_fill_value_scalar(taxonomy_df):
"""Test output if the fill_value is a scalar."""
result = taxonomy_df.complete(
"Year", "Taxon", fill_value=0, sort=False
).astype({"Abundance": int})
result = (
taxonomy_df.complete("Year", "Taxon", fill_value=0, sort=False)
.sort_values("Taxon", ignore_index=True)
.astype({"Abundance": int})
)
expected = (
taxonomy_df.set_index(["Year", "Taxon"])
.unstack(fill_value=0)
.stack(future_stack=True)
.reset_index()
.astype({"Taxon": "object"})
.sort_values("Taxon", ignore_index=True)
)

assert_frame_equal(result, expected)
@@ -525,7 +528,9 @@ def test_complete_groupby():
expected = (
df.set_index("year")
.groupby("state")
.apply(lambda x: x.reindex(range(x.index.min(), x.index.max() + 1)))
.value.apply(
lambda x: x.reindex(range(x.index.min(), x.index.max() + 1))
)
.drop(columns="state")
.reset_index()
)
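A side note on the `stack(future_stack=True)` call in the expected frame above: pandas 2.1 added the `future_stack` keyword to opt in to the new `stack` implementation, which sidesteps the FutureWarning newer pandas can emit for the legacy behaviour and keeps all-NaN rows rather than silently dropping them. A minimal sketch with a made-up frame:

    import pandas as pd

    df = pd.DataFrame(
        {"Year": [1999, 1999, 2000], "Taxon": ["a", "b", "a"], "Abundance": [4, 5, 2]}
    )

    expected = (
        df.set_index(["Year", "Taxon"])
        .unstack(fill_value=0)
        # future_stack=True (pandas >= 2.1) opts in to the new implementation
        .stack(future_stack=True)
        .reset_index()
    )
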
10 changes: 5 additions & 5 deletions tests/functions/test_conditional_join.py
@@ -3470,8 +3470,10 @@ def test_ge_eq_and_le_datess_numba_indices(df, right):
expected = pd.Index(expected)

actual, _ = get_join_indices(
df[["B", "A", "E"]],
right[["Floats", "Integers", "Dates", "Numeric"]],
df[["B", "A", "E"]].dropna(subset=["E"]),
right[["Floats", "Integers", "Dates", "Numeric"]].dropna(
subset=["Dates"]
),
[
("A", "Integers", "<"),
("E", "Dates", "=="),
@@ -3520,12 +3522,10 @@
@pytest.mark.turtle
def test_ge_eq_and_le_datess_indices(df, right):
"""compare join indices for multiple conditions."""

expected = (
df.reset_index()
.dropna(subset=["E"])
.merge(
right.dropna(subset=["Dates"]),
right,
left_on="E",
right_on="Dates",
how="inner",
2 changes: 1 addition & 1 deletion tests/functions/test_groupby_agg.py
@@ -24,7 +24,7 @@ def test_groupby_agg():
by="date",
new_column_name="date_average",
agg_column_name="values",
agg=np.mean,
agg="mean",
)
assert df.shape[0] == df_new.shape[0]
assert "date_average" in df_new.columns
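The one-line test change above tracks a FutureWarning in newer pandas: passing NumPy callables such as `np.mean` to `agg` now warns and suggests the string alias, which dispatches to the same optimized groupby reduction. A minimal sketch with made-up data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"date": ["d1", "d1", "d2"], "values": [1.0, 3.0, 5.0]})

    # warns on recent pandas, suggesting the string alias instead:
    # averages = df.groupby("date")["values"].agg(np.mean)

    averages = df.groupby("date")["values"].agg("mean")
    print(averages.to_dict())  # {'d1': 2.0, 'd2': 5.0}
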
30 changes: 21 additions & 9 deletions tests/functions/test_groupby_topk.py
@@ -27,37 +27,49 @@ def test_dtype_by(df):
def test_ascending_groupby_k_2(df):
"""Test ascending group by, k=2"""
expected = (
df.groupby("result", sort=False)
df.groupby("result", sort=False)[["age", "major", "ID"]]
.apply(lambda d: d.sort_values("age").head(2))
.droplevel(0)
.reset_index("result")
.sort_index(axis="columns")
)
assert_frame_equal(
df.groupby_topk("result", "age", 2, ignore_index=False), expected
df.groupby_topk("result", "age", 2, ignore_index=False).sort_index(
axis="columns"
),
expected,
)


def test_ascending_groupby_non_numeric(df):
"""Test output for non-numeric column"""
expected = (
df.groupby("result", sort=False)
df.groupby("result", sort=False)[["age", "major", "ID"]]
.apply(lambda d: d.sort_values("major").head(2))
.droplevel(0)
.reset_index("result")
.sort_index(axis="columns")
)
assert_frame_equal(
df.groupby_topk("result", "major", 2, ignore_index=False), expected
df.groupby_topk("result", "major", 2, ignore_index=False).sort_index(
axis="columns"
),
expected,
)


def test_descending_groupby_k_3(df):
"""Test descending group by, k=3"""
expected = (
df.groupby("result", sort=False)
df.groupby("result", sort=False)[["age", "major", "ID"]]
.apply(lambda d: d.sort_values("age", ascending=False).head(3))
.droplevel(0)
.reset_index("result")
.reset_index(drop=True)
.sort_index(axis="columns")
)
assert_frame_equal(
df.groupby_topk("result", "age", 3, ascending=False), expected
df.groupby_topk("result", "age", 3, ascending=False).sort_index(
axis="columns"
),
expected,
)


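The `groupby_topk` test updates above follow a common pattern: selecting only the non-grouping columns before `groupby(...).apply(...)` keeps the grouping column out of the frames handed to `apply`, which avoids the DeprecationWarning newer pandas raises when `apply` operates on the grouping columns (`include_groups=False`, added in pandas 2.2, is another way to opt out). A condensed sketch with made-up data:

    import pandas as pd

    df = pd.DataFrame(
        {"result": ["pass", "pass", "fail"], "age": [21, 23, 22], "ID": [1, 2, 3]}
    )

    # selecting columns first keeps "result" out of the frames handed to apply,
    # so no deprecation warning about operating on the grouping columns
    top2 = (
        df.groupby("result", sort=False)[["age", "ID"]]
        .apply(lambda d: d.sort_values("age").head(2))
        .droplevel(0)
    )
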