[BUGFIX] Failures in pivot_wider (#1337)
* handle deprecation warnings where possible; use vectorized option for truncate_datetime; fix failing tests in pivot_wider

* changelog

* fix test failures for pivot_wider; fix deprecation warnings where possible

* restore fastpath in _range_indices for conditional_join

* Delete examples/notebooks/bla.ipynb

* fix docstrings xarray

* fix docstrings xarray

* remove dropna for equi join test

* updates

* changelog

* changelog

---------

Co-authored-by: samuel.oranyeli <samuel.oranyeli@grow.inc>
Co-authored-by: Eric Ma <ericmjl@users.noreply.github.com>
3 people authored Mar 13, 2024
1 parent 65c105b commit 519efbf
Showing 18 changed files with 96 additions and 104 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@
- [ENH] `change_index_dtype` added. - @samukweku Issue #1314
- [ENH] Add `glue` and `axis` parameters to `collapse_levels`. - Issue #211 @samukweku
- [ENH] `row_to_names` now supports multiple rows conversion to columns. - @samukweku Issue #1333
- [ENH] Fix deprecation warnings from pandas. `truncate_datetime` now uses a vectorized option. - @samukweku #1337

## [v0.26.0] - 2023-09-18

2 changes: 1 addition & 1 deletion janitor/functions/change_type.py
@@ -97,7 +97,7 @@ def change_type(
elif ignore_exception == "fillna":
if isinstance(column_name, Hashable):
column_name = [column_name]
df[column_name] = df[column_name].applymap(_convert, dtype=dtype)
df[column_name] = df[column_name].map(_convert, dtype=dtype)
else:
raise ValueError("Unknown option for ignore_exception")

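For context on the `change_type` edit above: pandas 2.1 deprecated `DataFrame.applymap` and renamed it to the element-wise `DataFrame.map`, which forwards extra keyword arguments to the callable in the same way. A minimal sketch of the migration; the `_convert` helper here is a hypothetical stand-in, not the one defined in `change_type.py`:

    import pandas as pd

    def _convert(value, dtype):
        # hypothetical converter: coerce, fall back to NA on failure
        try:
            return dtype(value)
        except (TypeError, ValueError):
            return pd.NA

    df = pd.DataFrame({"a": ["1", "2"], "b": ["3", "oops"]})

    # pandas < 2.1 spelling (warns on newer versions):
    # df[["a", "b"]] = df[["a", "b"]].applymap(_convert, dtype=int)

    # pandas >= 2.1 spelling used in the diff:
    df[["a", "b"]] = df[["a", "b"]].map(_convert, dtype=int)
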
3 changes: 3 additions & 0 deletions janitor/functions/conditional_join.py
@@ -1074,6 +1074,9 @@ def _range_indices(

right_index = np.concatenate(right_index)
left_index = left_index.repeat(repeater)

if fastpath:
return left_index, right_index
# here we search for actual positions
# where left_c is </<= right_c
# safe to index the arrays, since we are picking the positions
15 changes: 11 additions & 4 deletions janitor/functions/pivot.py
@@ -9,7 +9,6 @@
import pandas as pd
import pandas_flavor as pf
from pandas.api.types import (
is_categorical_dtype,
is_extension_array_dtype,
is_list_like,
)
@@ -1619,7 +1618,11 @@ def _computations_pivot_wider(
indexer = out.index
if index_expand and index:
any_categoricals = (indexer.get_level_values(name) for name in index)
any_categoricals = any(map(is_categorical_dtype, any_categoricals))
any_categoricals = (
isinstance(entry, pd.CategoricalIndex)
for entry in any_categoricals
)
any_categoricals = any(any_categoricals)
if any_categoricals:
indexer = _expand(indexer, retain_categories=True)
out = out.reindex(index=indexer)
@@ -1629,7 +1632,11 @@
any_categoricals = (
indexer.get_level_values(name) for name in names_from
)
any_categoricals = any(map(is_categorical_dtype, any_categoricals))
any_categoricals = (
isinstance(entry, pd.CategoricalIndex)
for entry in any_categoricals
)
any_categoricals = any(any_categoricals)
if any_categoricals:
retain_categories = True
if flatten_levels & (
@@ -1834,7 +1841,7 @@ def _expand(indexer, retain_categories):
categories=arr.categories,
ordered=arr.ordered,
)
if is_categorical_dtype(arr)
if isinstance(arr, pd.CategoricalIndex)
else arr.unique()
)
for arr in indexer
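The `pivot_wider` changes above swap the deprecated `pandas.api.types.is_categorical_dtype` helper for a plain `isinstance` check against `pd.CategoricalIndex`. A small self-contained sketch of the same test on a made-up two-level index:

    import pandas as pd

    idx = pd.MultiIndex.from_product(
        [pd.CategoricalIndex(["a", "b"]), [1, 2]], names=["group", "num"]
    )

    # get_level_values returns a CategoricalIndex for categorical levels,
    # so the isinstance check replaces the deprecated is_categorical_dtype call
    any_categoricals = any(
        isinstance(idx.get_level_values(name), pd.CategoricalIndex)
        for name in idx.names
    )
    print(any_categoricals)  # True
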
4 changes: 1 addition & 3 deletions janitor/functions/sort_column_value_order.py
@@ -5,8 +5,6 @@

from janitor.utils import check, check_column

from .remove_columns import remove_columns # noqa: F401


@pf.register_dataframe_method
def sort_column_value_order(
@@ -65,7 +63,7 @@ def sort_column_value_order(
if not column_value_order:
raise ValueError("column_value_order dictionary cannot be empty")

df = df.assign(cond_order=df[column].replace(column_value_order))
df = df.assign(cond_order=df[column].map(column_value_order))

sort_by = ["cond_order"]
if columns is not None:
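On the `sort_column_value_order` change: `Series.replace` with a dict can trigger pandas' downcasting `FutureWarning` on recent versions, while `Series.map` does a plain lookup. One behavioural difference, sketched below with made-up data: keys missing from the mapping become `NaN` under `map`, whereas `replace` would have passed them through unchanged.

    import pandas as pd

    s = pd.Series(["low", "high", "medium"])
    order = {"low": 0, "medium": 1, "high": 2}

    # ranked = s.replace(order)   # may warn about deprecated downcasting
    ranked = s.map(order)         # plain dict lookup; unmapped values -> NaN
    print(ranked.tolist())        # [0, 2, 1]
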
77 changes: 20 additions & 57 deletions janitor/functions/truncate_datetime.py
@@ -1,7 +1,6 @@
"""Implementation of the `truncate_datetime` family of functions."""

import datetime as dt

import numpy as np
import pandas as pd
import pandas_flavor as pf
from pandas.api.types import is_datetime64_any_dtype
@@ -48,67 +47,31 @@ def truncate_datetime_dataframe(
A pandas DataFrame with all valid datetimes truncated down
to the specified precision.
"""
ACCEPTABLE_DATEPARTS = ("YEAR", "MONTH", "DAY", "HOUR", "MINUTE", "SECOND")
# idea from Stack Overflow
# https://stackoverflow.com/a/28783971/7175713
# https://numpy.org/doc/stable/reference/arrays.datetime.html
ACCEPTABLE_DATEPARTS = {
"YEAR": "datetime64[Y]",
"MONTH": "datetime64[M]",
"DAY": "datetime64[D]",
"HOUR": "datetime64[h]",
"MINUTE": "datetime64[m]",
"SECOND": "datetime64[s]",
}
datepart = datepart.upper()
if datepart not in ACCEPTABLE_DATEPARTS:
raise ValueError(
"Received an invalid `datepart` precision. "
f"Please enter any one of {ACCEPTABLE_DATEPARTS}."
)

dt_cols = [
column
for column, coltype in df.dtypes.items()
if is_datetime64_any_dtype(coltype)
]
if not dt_cols:
# avoid copying df if no-op is expected
return df

df = df.copy()
# NOTE: use **kwargs of `applymap` instead of lambda when we upgrade to
# pandas >= 1.3.0
df[dt_cols] = df[dt_cols].applymap(
lambda x: _truncate_datetime(x, datepart=datepart),
)

return df


def _truncate_datetime(timestamp: dt.datetime, datepart: str) -> dt.datetime:
"""Truncate a given timestamp to the given datepart.
Truncation will only occur on valid timestamps (datetime-like objects).
Args:
timestamp: Expecting a datetime from python `datetime` class (dt).
datepart: Truncation precision, YEAR, MONTH, DAY,
HOUR, MINUTE, SECOND.
Returns:
A truncated datetime object to the precision specified by
datepart.
"""
if pd.isna(timestamp):
return timestamp

recurrence = [0, 1, 1, 0, 0, 0] # [YEAR, MONTH, DAY, HOUR, MINUTE, SECOND]
ENUM = {
"YEAR": 0,
"MONTH": 1,
"DAY": 2,
"HOUR": 3,
"MINUTE": 4,
"SECOND": 5,
0: timestamp.year,
1: timestamp.month,
2: timestamp.day,
3: timestamp.hour,
4: timestamp.minute,
5: timestamp.second,
}
dictionary = {}

for i in range(ENUM[datepart] + 1):
recurrence[i] = ENUM[i]
for label, series in df.items():
if is_datetime64_any_dtype(series):
dtype = ACCEPTABLE_DATEPARTS[datepart]
# TODO: add branch for pyarrow arrays
series = np.array(series._values, dtype=dtype)
dictionary[label] = series

return dt.datetime(*recurrence)
return pd.DataFrame(dictionary)
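The rewritten `truncate_datetime_dataframe` above leans on NumPy's datetime64 unit casting (per the Stack Overflow answer linked in the code): casting a datetime column to a coarser unit truncates every element at once, instead of looping over individual timestamps with `applymap`. A standalone sketch of the idea with made-up data:

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.to_datetime(["2024-03-13 14:25:37", "2021-07-04 01:02:03"]))

    # cast to day precision; NaT values survive the cast untouched
    per_day = s.to_numpy().astype("datetime64[D]")
    # array(['2024-03-13', '2021-07-04'], dtype='datetime64[D]')

    # back to a nanosecond-resolution Series with times truncated to midnight
    truncated = pd.Series(per_day.astype("datetime64[ns]"))
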
6 changes: 3 additions & 3 deletions janitor/xarray/functions.py
@@ -48,7 +48,7 @@ def clone_using(
... np.ones((4, 6)), new_name='new_and_improved', use_coords=False,
... )
>>> new_da
<xarray.DataArray 'new_and_improved' (ax_1: 4, ax_2: 6)>
<xarray.DataArray 'new_and_improved' (ax_1: 4, ax_2: 6)> Size: 192B
array([[1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1.],
[1., 1., 1., 1., 1., 1.],
@@ -130,10 +130,10 @@ def convert_datetime_to_number(
... )
>>> da_minutes = da.convert_datetime_to_number("s", dim="time")
>>> da_minutes
<xarray.DataArray (time: 6)>
<xarray.DataArray (time: 6)> Size: 48B
array([2, 8, 0, 1, 7, 7])
Coordinates:
* time (time) float64 0.0 60.0 120.0 180.0 240.0 300.0
* time (time) float64 48B 0.0 60.0 120.0 180.0 240.0 300.0
Args:
da_or_ds: XArray object.
10 changes: 5 additions & 5 deletions pyproject.toml
@@ -49,12 +49,12 @@ markers = [

[tool.ruff]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F", "I"]
ignore = []
lint.select = ["E", "F", "I"]
lint.ignore = []

# Allow fix for all enabled rules (when `--fix`) is provided.
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
unfixable = []
lint.fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
lint.unfixable = []

# Exclude a variety of commonly ignored directories.
exclude = [
@@ -86,7 +86,7 @@ exclude = [
line-length = 88

# Allow unused variables when underscore-prefixed.
dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"

# Assume Python 3.10
target-version = "py310"
1 change: 1 addition & 0 deletions tests/conftest.py
@@ -7,6 +7,7 @@
from janitor.testing_utils import date_data

os.environ["NUMBA_DISABLE_JIT"] = "1"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

TEST_DATA_DIR = "tests/test_data"
EXAMPLES_DIR = "examples/"
13 changes: 9 additions & 4 deletions tests/functions/test_complete.py
@@ -435,15 +435,18 @@ def test_complete_multiple_groupings():

def test_fill_value_scalar(taxonomy_df):
"""Test output if the fill_value is a scalar."""
result = taxonomy_df.complete(
"Year", "Taxon", fill_value=0, sort=False
).astype({"Abundance": int})
result = (
taxonomy_df.complete("Year", "Taxon", fill_value=0, sort=False)
.sort_values("Taxon", ignore_index=True)
.astype({"Abundance": int})
)
expected = (
taxonomy_df.set_index(["Year", "Taxon"])
.unstack(fill_value=0)
.stack(future_stack=True)
.reset_index()
.astype({"Taxon": "object"})
.sort_values("Taxon", ignore_index=True)
)

assert_frame_equal(result, expected)
@@ -525,7 +528,9 @@ def test_complete_groupby():
expected = (
df.set_index("year")
.groupby("state")
.apply(lambda x: x.reindex(range(x.index.min(), x.index.max() + 1)))
.value.apply(
lambda x: x.reindex(range(x.index.min(), x.index.max() + 1))
)
.drop(columns="state")
.reset_index()
)
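A side note on the `stack(future_stack=True)` call in the expected frame above: pandas 2.1 added the `future_stack` keyword to opt in to the new `stack` implementation, which sidesteps the FutureWarning newer pandas can emit for the legacy behaviour and keeps all-NaN rows rather than silently dropping them. A minimal sketch with a made-up frame:

    import pandas as pd

    df = pd.DataFrame(
        {"Year": [1999, 1999, 2000], "Taxon": ["a", "b", "a"], "Abundance": [4, 5, 2]}
    )

    expected = (
        df.set_index(["Year", "Taxon"])
        .unstack(fill_value=0)
        # future_stack=True (pandas >= 2.1) opts in to the new implementation
        .stack(future_stack=True)
        .reset_index()
    )
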
10 changes: 5 additions & 5 deletions tests/functions/test_conditional_join.py
@@ -3470,8 +3470,10 @@ def test_ge_eq_and_le_datess_numba_indices(df, right):
expected = pd.Index(expected)

actual, _ = get_join_indices(
df[["B", "A", "E"]],
right[["Floats", "Integers", "Dates", "Numeric"]],
df[["B", "A", "E"]].dropna(subset=["E"]),
right[["Floats", "Integers", "Dates", "Numeric"]].dropna(
subset=["Dates"]
),
[
("A", "Integers", "<"),
("E", "Dates", "=="),
@@ -3520,12 +3522,10 @@
@pytest.mark.turtle
def test_ge_eq_and_le_datess_indices(df, right):
"""compare join indices for multiple conditions."""

expected = (
df.reset_index()
.dropna(subset=["E"])
.merge(
right.dropna(subset=["Dates"]),
right,
left_on="E",
right_on="Dates",
how="inner",
2 changes: 1 addition & 1 deletion tests/functions/test_groupby_agg.py
@@ -24,7 +24,7 @@ def test_groupby_agg():
by="date",
new_column_name="date_average",
agg_column_name="values",
agg=np.mean,
agg="mean",
)
assert df.shape[0] == df_new.shape[0]
assert "date_average" in df_new.columns
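The one-line test change above tracks a FutureWarning in newer pandas: passing NumPy callables such as `np.mean` to `agg` now warns and suggests the string alias, which dispatches to the same optimized groupby reduction. A minimal sketch with made-up data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"date": ["d1", "d1", "d2"], "values": [1.0, 3.0, 5.0]})

    # warns on recent pandas, suggesting the string alias instead:
    # averages = df.groupby("date")["values"].agg(np.mean)

    averages = df.groupby("date")["values"].agg("mean")
    print(averages.to_dict())  # {'d1': 2.0, 'd2': 5.0}
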
30 changes: 21 additions & 9 deletions tests/functions/test_groupby_topk.py
@@ -27,37 +27,49 @@ def test_dtype_by(df):
def test_ascending_groupby_k_2(df):
"""Test ascending group by, k=2"""
expected = (
df.groupby("result", sort=False)
df.groupby("result", sort=False)[["age", "major", "ID"]]
.apply(lambda d: d.sort_values("age").head(2))
.droplevel(0)
.reset_index("result")
.sort_index(axis="columns")
)
assert_frame_equal(
df.groupby_topk("result", "age", 2, ignore_index=False), expected
df.groupby_topk("result", "age", 2, ignore_index=False).sort_index(
axis="columns"
),
expected,
)


def test_ascending_groupby_non_numeric(df):
"""Test output for non-numeric column"""
expected = (
df.groupby("result", sort=False)
df.groupby("result", sort=False)[["age", "major", "ID"]]
.apply(lambda d: d.sort_values("major").head(2))
.droplevel(0)
.reset_index("result")
.sort_index(axis="columns")
)
assert_frame_equal(
df.groupby_topk("result", "major", 2, ignore_index=False), expected
df.groupby_topk("result", "major", 2, ignore_index=False).sort_index(
axis="columns"
),
expected,
)


def test_descending_groupby_k_3(df):
"""Test descending group by, k=3"""
expected = (
df.groupby("result", sort=False)
df.groupby("result", sort=False)[["age", "major", "ID"]]
.apply(lambda d: d.sort_values("age", ascending=False).head(3))
.droplevel(0)
.reset_index("result")
.reset_index(drop=True)
.sort_index(axis="columns")
)
assert_frame_equal(
df.groupby_topk("result", "age", 3, ascending=False), expected
df.groupby_topk("result", "age", 3, ascending=False).sort_index(
axis="columns"
),
expected,
)


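The `groupby_topk` test updates above follow a common pattern: selecting only the non-grouping columns before `groupby(...).apply(...)` keeps the grouping column out of the frames handed to `apply`, which avoids the DeprecationWarning newer pandas raises when `apply` operates on the grouping columns (`include_groups=False`, added in pandas 2.2, is another way to opt out). A condensed sketch with made-up data:

    import pandas as pd

    df = pd.DataFrame(
        {"result": ["pass", "pass", "fail"], "age": [21, 23, 22], "ID": [1, 2, 3]}
    )

    # selecting columns first keeps "result" out of the frames handed to apply,
    # so no deprecation warning about operating on the grouping columns
    top2 = (
        df.groupby("result", sort=False)[["age", "ID"]]
        .apply(lambda d: d.sort_values("age").head(2))
        .droplevel(0)
    )
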