diff --git a/CHANGELOG.md b/CHANGELOG.md index b4e1219eb..a910e3a78 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## [Unreleased] +- [ENH] Added `convert_excel_date` and `convert_matlab_date` methods for polars - Issue #1352 - [ENH] Added a `complete` method for polars. - Issue #1352 @samukweku - [ENH] `read_commandline` function now supports polars - Issue #1352 - [ENH] Improved performance for non-equi joins when using numba - @samukweku PR #1341 diff --git a/janitor/functions/convert_date.py b/janitor/functions/convert_date.py index 471bc8ebf..fbd19747f 100644 --- a/janitor/functions/convert_date.py +++ b/janitor/functions/convert_date.py @@ -1,24 +1,22 @@ -import datetime as dt -from typing import Hashable +from typing import Hashable, Union import pandas as pd import pandas_flavor as pf -from pandas.api.types import is_numeric_dtype from pandas.errors import OutOfBoundsDatetime -from janitor.utils import deprecated_alias +from janitor.utils import deprecated_alias, refactored_function @pf.register_dataframe_method -@deprecated_alias(column="column_name") +@deprecated_alias(column="column_names") def convert_excel_date( - df: pd.DataFrame, column_name: Hashable + df: pd.DataFrame, column_names: Union[Hashable, list] ) -> pd.DataFrame: """Convert Excel's serial date format into Python datetime format. - This method mutates the original DataFrame. + This method does not mutate the original DataFrame. - Implementation is also from + Implementation is based on [Stack Overflow](https://stackoverflow.com/questions/38454403/convert-excel-style-date-with-pandas). Examples: @@ -38,40 +36,36 @@ def convert_excel_date( Args: df: A pandas DataFrame. - column_name: A column name. - - Raises: - ValueError: If there are non numeric values in the column. + column_names: A column name, or a list of column names. Returns: A pandas DataFrame with corrected dates. """ # noqa: E501 - if not is_numeric_dtype(df[column_name]): - raise ValueError( - "There are non-numeric values in the column. " - "All values must be numeric." + if not isinstance(column_names, list): + column_names = [column_names] + # https://stackoverflow.com/a/65460255/7175713 + dictionary = { + column_name: pd.to_datetime( + df[column_name], unit="D", origin="1899-12-30" ) + for column_name in column_names + } - df[column_name] = pd.TimedeltaIndex( - df[column_name], unit="d" - ) + dt.datetime( - 1899, 12, 30 - ) # noqa: W503 - return df + return df.assign(**dictionary) @pf.register_dataframe_method -@deprecated_alias(column="column_name") +@deprecated_alias(column="column_names") def convert_matlab_date( - df: pd.DataFrame, column_name: Hashable + df: pd.DataFrame, column_names: Union[Hashable, list] ) -> pd.DataFrame: """Convert Matlab's serial date number into Python datetime format. - Implementation is also from + Implementation is based on [Stack Overflow](https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python). - This method mutates the original DataFrame. + This method does not mutate the original DataFrame. Examples: >>> import pandas as pd @@ -84,29 +78,38 @@ def convert_matlab_date( 2 737124.498500 3 737124.000000 >>> df.convert_matlab_date('date') - date - 0 2018-03-06 00:00:00.000000 - 1 2018-03-05 19:34:50.563200 - 2 2018-03-05 11:57:50.399999 - 3 2018-03-05 00:00:00.000000 + date + 0 2018-03-06 00:00:00.000000000 + 1 2018-03-05 19:34:50.563199671 + 2 2018-03-05 11:57:50.399998876 + 3 2018-03-05 00:00:00.000000000 Args: df: A pandas DataFrame. - column_name: A column name. + column_names: A column name, or a list of column names. Returns: A pandas DataFrame with corrected dates. """ # noqa: E501 - days = pd.Series([dt.timedelta(v % 1) for v in df[column_name]]) - df[column_name] = ( - df[column_name].astype(int).apply(dt.datetime.fromordinal) - + days - - dt.timedelta(days=366) - ) - return df + # https://stackoverflow.com/a/49135037/7175713 + if not isinstance(column_names, list): + column_names = [column_names] + dictionary = { + column_name: pd.to_datetime(df[column_name] - 719529, unit="D") + for column_name in column_names + } + return df.assign(**dictionary) + +@pf.register_dataframe_method @pf.register_dataframe_method +@refactored_function( + message=( + "This function will be deprecated in a 1.x release. " + "Please use `pd.to_datetime` instead." + ) +) @deprecated_alias(column="column_name") def convert_unix_date(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame: """Convert unix epoch time into Python datetime format. @@ -116,6 +119,11 @@ def convert_unix_date(df: pd.DataFrame, column_name: Hashable) -> pd.DataFrame: This method mutates the original DataFrame. + !!!note + + This function will be deprecated in a 1.x release. + Please use `pd.to_datetime` instead. + Examples: >>> import pandas as pd >>> import janitor diff --git a/janitor/polars/__init__.py b/janitor/polars/__init__.py index 1485ad3f2..972db2c2b 100644 --- a/janitor/polars/__init__.py +++ b/janitor/polars/__init__.py @@ -1,5 +1,6 @@ from .clean_names import clean_names, make_clean_names from .complete import complete +from .dates_to_polars import convert_excel_date, convert_matlab_date from .pivot_longer import pivot_longer, pivot_longer_spec from .row_to_names import row_to_names @@ -10,4 +11,6 @@ "make_clean_names", "row_to_names", "complete", + "convert_excel_date", + "convert_matlab_date", ] diff --git a/janitor/polars/dates_to_polars.py b/janitor/polars/dates_to_polars.py new file mode 100644 index 000000000..cf3246a77 --- /dev/null +++ b/janitor/polars/dates_to_polars.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +from janitor.utils import import_message + +from .polars_flavor import register_expr_method + +try: + import polars as pl +except ImportError: + import_message( + submodule="polars", + package="polars", + conda_channel="conda-forge", + pip_install=True, + ) + + +@register_expr_method +def convert_excel_date(expr: pl.Expr) -> pl.Expr: + """ + Convert Excel's serial date format into Python datetime format. + + Inspiration is from + [Stack Overflow](https://stackoverflow.com/questions/38454403/convert-excel-style-date-with-pandas). + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame({"date": [39690, 39690, 37118]}) + >>> df + shape: (3, 1) + ┌───────┐ + │ date │ + │ --- │ + │ i64 │ + ╞═══════╡ + │ 39690 │ + │ 39690 │ + │ 37118 │ + └───────┘ + >>> expression = pl.col('date').convert_excel_date().alias('date_') + >>> df.with_columns(expression) + shape: (3, 2) + ┌───────┬────────────┐ + │ date ┆ date_ │ + │ --- ┆ --- │ + │ i64 ┆ date │ + ╞═══════╪════════════╡ + │ 39690 ┆ 2008-08-30 │ + │ 39690 ┆ 2008-08-30 │ + │ 37118 ┆ 2001-08-15 │ + └───────┴────────────┘ + + !!! info "New in version 0.28.0" + + Returns: + A polars Expression. + """ # noqa: E501 + expression = pl.duration(days=expr) + expression += pl.date(year=1899, month=12, day=30) + return expression + + +@register_expr_method +def convert_matlab_date(expr: pl.Expr) -> pl.Expr: + """ + Convert Matlab's serial date number into Python datetime format. + + Implementation is from + [Stack Overflow](https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python). + + + Examples: + >>> import polars as pl + >>> import janitor.polars + >>> df = pl.DataFrame({"date": [737125.0, 737124.815863, 737124.4985, 737124]}) + >>> df + shape: (4, 1) + ┌───────────────┐ + │ date │ + │ --- │ + │ f64 │ + ╞═══════════════╡ + │ 737125.0 │ + │ 737124.815863 │ + │ 737124.4985 │ + │ 737124.0 │ + └───────────────┘ + >>> expression = pl.col('date').convert_matlab_date().alias('date_') + >>> df.with_columns(expression) + shape: (4, 2) + ┌───────────────┬─────────────────────────┐ + │ date ┆ date_ │ + │ --- ┆ --- │ + │ f64 ┆ datetime[μs] │ + ╞═══════════════╪═════════════════════════╡ + │ 737125.0 ┆ 2018-03-06 00:00:00 │ + │ 737124.815863 ┆ 2018-03-05 19:34:50.563 │ + │ 737124.4985 ┆ 2018-03-05 11:57:50.399 │ + │ 737124.0 ┆ 2018-03-05 00:00:00 │ + └───────────────┴─────────────────────────┘ + + !!! info "New in version 0.28.0" + + Returns: + A polars Expression. + """ # noqa: E501 + # https://stackoverflow.com/questions/13965740/converting-matlabs-datenum-format-to-python + expression = expr.sub(719529).mul(86_400_000) + expression = pl.duration(milliseconds=expression) + expression += pl.datetime(year=1970, month=1, day=1) + return expression diff --git a/tests/functions/test_convert_excel_date.py b/tests/functions/test_convert_excel_date.py index a9f54212f..5d7804bdb 100644 --- a/tests/functions/test_convert_excel_date.py +++ b/tests/functions/test_convert_excel_date.py @@ -18,15 +18,3 @@ def test_convert_excel_date(): ) assert df["hire_date"].dtype == "M8[ns]" - - -@pytest.mark.functions -def test_convert_excel_date_with_string_data(): - """Raises ValueError if values of column are not numeric""" - df = pd.read_excel( - Path(pytest.EXAMPLES_DIR) / "notebooks" / "dirty_data.xlsx", - engine="openpyxl", - ).clean_names() - - with pytest.raises(ValueError): - df.convert_excel_date("certification") diff --git a/tests/polars/functions/test_convert_excel_date_polars.py b/tests/polars/functions/test_convert_excel_date_polars.py new file mode 100644 index 000000000..654a3a786 --- /dev/null +++ b/tests/polars/functions/test_convert_excel_date_polars.py @@ -0,0 +1,11 @@ +import polars as pl + +import janitor.polars # noqa: F401 + + +def test_convert_excel_date(): + df = pl.DataFrame({"dates": [42580.3333333333]}) + + expression = pl.col("dates").convert_excel_date().alias("dd") + expression = df.with_columns(expression).get_column("dd") + assert expression.dtype.is_temporal() is True diff --git a/tests/polars/functions/test_convert_matlab_date_polars.py b/tests/polars/functions/test_convert_matlab_date_polars.py new file mode 100644 index 000000000..1e40b2c7c --- /dev/null +++ b/tests/polars/functions/test_convert_matlab_date_polars.py @@ -0,0 +1,20 @@ +import polars as pl + +import janitor.polars # noqa: F401 + + +def test_convert_matlab_date(): + df = pl.DataFrame( + { + "dates": [ + 733_301.0, + 729_159.0, + 734_471.0, + 737_299.563_296_356_5, + 737_300.000_000_000_0, + ] + } + ) + expression = pl.col("dates").convert_matlab_date().alias("dd") + expression = df.with_columns(expression).get_column("dd") + assert expression.dtype.is_temporal() is True