From d4f1c964dc26326dd058ab8e60b9767dc4d12f37 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Sat, 13 Jul 2024 15:05:51 +0100 Subject: [PATCH] remove DateTime --- aeon/transformations/date.py | 385 ------------------ aeon/transformations/tests/test_date.py | 268 ------------ docs/api_reference/transformations.rst | 18 - .../transformations/transformations.ipynb | 9 - 4 files changed, 680 deletions(-) delete mode 100644 aeon/transformations/date.py delete mode 100644 aeon/transformations/tests/test_date.py diff --git a/aeon/transformations/date.py b/aeon/transformations/date.py deleted file mode 100644 index 903292a294..0000000000 --- a/aeon/transformations/date.py +++ /dev/null @@ -1,385 +0,0 @@ -"""Extract calendar features from datetimeindex.""" - -__maintainer__ = [] -__all__ = ["DateTimeFeatures"] - -import warnings - -import numpy as np -import pandas as pd -from deprecated.sphinx import deprecated - -from aeon.transformations.base import BaseTransformer - -_RAW_DUMMIES = [ - ["child", "parent", "dummy_func", "feature_scope"], - ["year", "year", "year", "minimal"], - ["quarter", "year", "quarter", "efficient"], - ["month", "year", "month", "minimal"], - ["week", "year", "week_of_year", "efficient"], - ["day", "year", "day_of_year", "efficient"], - ["month", "quarter", "month_of_quarter", "comprehensive"], - ["week", "quarter", "week_of_quarter", "comprehensive"], - ["day", "quarter", "day_of_quarter", "comprehensive"], - ["week", "month", "week_of_month", "comprehensive"], - ["day", "month", "day", "efficient"], - ["day", "week", "weekday", "minimal"], - ["hour", "day", "hour", "minimal"], - ["minute", "hour", "minute", "minimal"], - ["second", "minute", "second", "minimal"], - ["millisecond", "second", "millisecond", "minimal"], - ["day", "week", "is_weekend", "comprehensive"], -] - - -# TODO: remove in v0.11.0 -@deprecated( - version="0.10.0", - reason="DateTimeFeatures will be removed in version 0.11.0.", - category=FutureWarning, -) -class DateTimeFeatures(BaseTransformer): - """DateTime feature extraction for use in e.g. tree based models. - - DateTimeFeatures uses a date index column and generates date features - identifying e.g. year, week of the year, day of the week. - - Parameters - ---------- - ts_freq : str, default="day" - Restricts selection of items to those with a frequency lower than - the frequency of the time series given by ts_freq. - E.g. if monthly data is provided and ts_freq = ("M"), it does not make - sense to derive dummies with higher frequency like weekly dummies. - Has to be provided by the user due to the abundance of different - frequencies supported by Pandas (e.g. every pandas allows freq of every 4 days). - Interaction with other arguments: - Used to narrow down feature selection for feature_scope, since only - features with a frequency lower than ts_freq are considered. Will be ignored - for the calculation of manually specified features, but when provided will - raise a warning if manual features have a frequency higher than ts_freq. - Only supports the following frequencies: - * Y - year - * Q - quarter - * M - month - * W - week - * D - day - * H - hour - * T - minute - * S - second - * L - millisecond - feature_scope: str, default="minimal" - Specify how many calendar features you want to be returned. - E.g., rarely used features like week of quarter will only be returned - with feature_scope = "comprehensive". - * "minimal" - * "efficient" - * "comprehensive" - manual_selection: str, default=None - Manual selection of dummys. Notation is child of parent for precise notation. - Will ignore specified feature_scope, but will still check with warning against - a specified ts_freq. - Examples for possible values: - * None - * day_of_year - * day_of_month - * day_of_quarter - * is_weekend - * year (special case with no lower frequency). - keep_original_columns : boolean, optional, default=False - Keep original columns in X passed to `.transform()`. - - Examples - -------- - >>> from aeon.transformations.date import DateTimeFeatures - >>> from aeon.datasets import load_airline - >>> y = load_airline() - - Returns columns `y`, `year`, `month_of_year` - >>> transformer = DateTimeFeatures(ts_freq="M") - >>> y_hat = transformer.fit_transform(y) - - Returns columns `y`, `month_of_year` - >>> transformer = DateTimeFeatures(ts_freq="M", manual_selection=["month_of_year"]) - >>> y_hat = transformer.fit_transform(y) - - Returns columns 'y', 'year', 'quarter_of_year', 'month_of_year', 'month_of_quarter' - >>> transformer = DateTimeFeatures(ts_freq="M", feature_scope="comprehensive") - >>> y_hat = transformer.fit_transform(y) - - Returns columns 'y', 'year', 'quarter_of_year', 'month_of_year' - >>> transformer = DateTimeFeatures(ts_freq="M", feature_scope="efficient") - >>> y_hat = transformer.fit_transform(y) - - Returns columns 'y', 'year', 'month_of_year' - >>> transformer = DateTimeFeatures(ts_freq="M", feature_scope="minimal") - >>> y_hat = transformer.fit_transform(y) - """ - - _tags = { - "input_data_type": "Series", - # what is the abstract type of X: Series, or Panel - "output_data_type": "Series", - # what abstract type is returned: Primitives, Series, Panel - "instancewise": True, # is this an instance-wise transform? - "X_inner_type": [ - "pd.Series", - "pd.DataFrame", - "pd-multiindex", - "pd_multiindex_hier", - ], - "y_inner_type": "None", - "capability:multivariate": True, - "fit_is_empty": True, - "transform-returns-same-time-index": True, - "enforce_index_type": [pd.DatetimeIndex, pd.PeriodIndex], - "skip-inverse-transform": True, - "python_dependencies": "pandas>=1.2.0", # from DateTimeProperties - } - - def __init__( - self, - ts_freq=None, - feature_scope="minimal", - manual_selection=None, - keep_original_columns=False, - ): - self.ts_freq = ts_freq - self.feature_scope = feature_scope - self.manual_selection = manual_selection - self.dummies = _prep_dummies(_RAW_DUMMIES) - self.keep_original_columns = keep_original_columns - - super().__init__() - - def _transform(self, X, y=None): - """Transform X and return a transformed version. - - private _transform containing the core logic, called from transform - - Parameters - ---------- - X : pd.Series or pd.DataFrame - Data to be transformed - y : ignored argument for interface compatibility - Additional data, e.g., labels for transformation - - Returns - ------- - Xt : pd.Series or pd.DataFrame, same type as X - transformed version of X - """ - _check_ts_freq(self.ts_freq, self.dummies) - _check_feature_scope(self.feature_scope) - _check_manual_selection(self.manual_selection, self.dummies) - - if isinstance(X.index, pd.MultiIndex): - time_index = X.index.get_level_values(-1) - else: - time_index = X.index - - x_df = pd.DataFrame(index=X.index) - if isinstance(time_index, pd.PeriodIndex): - x_df["date_sequence"] = time_index.to_timestamp() - elif isinstance(time_index, pd.DatetimeIndex): - x_df["date_sequence"] = time_index - else: - raise ValueError("Index type not supported") - - if self.manual_selection is None: - if self.ts_freq is not None: - supported = _get_supported_calendar(self.ts_freq, DUMMIES=self.dummies) - supported = supported[supported["feature_scope"] <= self.feature_scope] - calendar_dummies = supported[["dummy_func", "dummy"]] - else: - supported = self.dummies[ - self.dummies["feature_scope"] <= self.feature_scope - ] - calendar_dummies = supported[["dummy_func", "dummy"]] - else: - if self.ts_freq is not None: - supported = _get_supported_calendar(self.ts_freq, DUMMIES=self.dummies) - if not all( - elem in supported["dummy"] for elem in self.manual_selection - ): - warnings.warn( - "Level of selected dummy variable " - + " lower level than base ts_frequency.", - stacklevel=2, - ) - calendar_dummies = self.dummies.loc[ - self.dummies["dummy"].isin(self.manual_selection), - ["dummy_func", "dummy"], - ] - else: - calendar_dummies = self.dummies.loc[ - self.dummies["dummy"].isin(self.manual_selection), - ["dummy_func", "dummy"], - ] - - df = [ - _calendar_dummies(x_df, dummy) for dummy in calendar_dummies["dummy_func"] - ] - df = pd.concat(df, axis=1) - df.columns = calendar_dummies["dummy"] - - if self.keep_original_columns: - Xt = pd.concat([X, df], axis=1, copy=True) - else: - # Remove the name `"dummy"` from column index. - Xt = df.rename_axis(None, axis="columns") - - return Xt - - -def _check_manual_selection(manual_selection, DUMMIES): - if (manual_selection is not None) and ( - not all(elem in DUMMIES["dummy"].unique() for elem in manual_selection) - ): - raise ValueError( - "Invalid manual_selection specified, must be in: " - + ", ".join(DUMMIES["dummy"].unique()) - ) - - -def _check_feature_scope(feature_scope): - if feature_scope not in ["minimal", "efficient", "comprehensive"]: - raise ValueError( - "Invalid feature_scope specified," - + "must be in minimal,efficient,comprehensive" - + "(minimal lowest number of variables)" - ) - - -def _check_ts_freq(ts_freq, DUMMIES): - if (ts_freq is not None) & (ts_freq not in DUMMIES["ts_frequency"].unique()): - raise ValueError( - "Invalid ts_freq specified, must be in: " - + ", ".join(DUMMIES["ts_frequency"].unique()) - ) - - -def _calendar_dummies(x, funcs): - date_sequence = x["date_sequence"].dt - if funcs == "week_of_year": - # The first week of an ISO year is the first (Gregorian) - # calendar week of a year containing a Thursday. - # So it is possible that a week in the new year is still - # indexed starting in last year (week 52 or 53) - cd = date_sequence.isocalendar()["week"] - elif funcs == "week_of_month": - cd = (date_sequence.day - 1) // 7 + 1 - elif funcs == "month_of_quarter": - cd = (np.floor(date_sequence.month / 4) + 1).astype(np.int64) - elif funcs == "week_of_quarter": - col_names = x.columns - x_columns = col_names.intersection(["year", "quarter", "week"]).to_list() - x_columns.append("date_sequence") - df = x.copy(deep=True) - df = df[x_columns] - if "year" not in x_columns: - df["year"] = df["date_sequence"].dt.year - if "quarter" not in x_columns: - df["quarter"] = df["date_sequence"].dt.quarter - if "week" not in x_columns: - df["week"] = df["date_sequence"].dt.isocalendar()["week"] - df["qdate"] = ( - df["date_sequence"] + pd.tseries.offsets.DateOffset(days=1) - ) - pd.tseries.offsets.QuarterBegin(startingMonth=1) - df["qweek"] = df["qdate"].dt.isocalendar()["week"] - df.loc[(df["quarter"] == 1) & (df["week"] < 52), "qweek"] = 0 - cd = df["week"] - df["qweek"] + 1 - elif funcs == "millisecond": - cd = date_sequence.microsecond * 1000 - elif funcs == "day_of_quarter": - quarter = date_sequence.quarter - quarter_start = pd.DatetimeIndex( - date_sequence.year.map(str) - + "-" - + (3 * quarter - 2).map(int).map(str) - + "-01" - ) - values = ( - (x["date_sequence"] - quarter_start) / pd.to_timedelta("1D") + 1 - ).astype(int) - cd = values - elif funcs == "is_weekend": - cd = date_sequence.day_of_week > 4 - else: - cd = getattr(date_sequence, funcs) - cd = pd.DataFrame(cd) - cd = cd.rename(columns={cd.columns[0]: funcs}) - cd[funcs] = np.int64(cd[funcs]) - return cd - - -def _get_supported_calendar(ts_freq, DUMMIES): - rank = DUMMIES.loc[DUMMIES["ts_frequency"] == ts_freq, "rank"].max() - matches = DUMMIES.loc[DUMMIES["rank"] <= rank] - if matches.shape[0] == 0: - raise ValueError("Seasonality or Frequency not supported") - return matches - - -def _prep_dummies(DUMMIES): - """Use to prepare dummy data. - - Includes defining function call names and ranking - of date information based on frequency (e.g. year - has a lower frequency than week). - """ - DUMMIES = pd.DataFrame(DUMMIES[1:], columns=DUMMIES[0]) - - date_order = [ - "year", - "quarter", - "month", - "week", - "day", - "hour", - "minute", - "second", - "millisecond", - ] - - DUMMIES["fourier"] = DUMMIES["child"] + "_in_" + DUMMIES["parent"] - DUMMIES["dummy"] = DUMMIES["child"] + "_of_" + DUMMIES["parent"] - DUMMIES.loc[DUMMIES["dummy"] == "year_of_year", "dummy"] = "year" - DUMMIES.loc[DUMMIES["dummy_func"] == "is_weekend", ["dummy", "fourier"]] = ( - "is_weekend" - ) - - DUMMIES["child"] = ( - DUMMIES["child"].astype("category").cat.reorder_categories(date_order) - ) - - flist = ["minimal", "efficient", "comprehensive"] - - DUMMIES["feature_scope"] = ( - DUMMIES["feature_scope"].astype("category").cat.reorder_categories(flist) - ) - - DUMMIES["feature_scope"] = pd.Categorical(DUMMIES["feature_scope"], ordered=True) - - DUMMIES["rank"] = DUMMIES["child"].cat.codes - - col = DUMMIES["child"] - DUMMIES.insert(0, "ts_frequency", col) - - DUMMIES = DUMMIES.replace( - { - "ts_frequency": { - "year": "Y", - "quarter": "Q", - "month": "M", - "week": "W", - "day": "D", - "hour": "H", - "minute": "T", - "second": "S", - "millisecond": "L", - } - } - ) - - return DUMMIES diff --git a/aeon/transformations/tests/test_date.py b/aeon/transformations/tests/test_date.py deleted file mode 100644 index 50bdd54fd4..0000000000 --- a/aeon/transformations/tests/test_date.py +++ /dev/null @@ -1,268 +0,0 @@ -"""Unit tests of DateTimeFeatures functionality.""" - -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - -from aeon.datasets import load_airline, load_longley -from aeon.forecasting.model_selection import temporal_train_test_split -from aeon.testing.data_generation import _make_hierarchical -from aeon.transformations.date import DateTimeFeatures -from aeon.utils.validation._dependencies import _check_estimator_deps - -if _check_estimator_deps(DateTimeFeatures, severity="none"): - # Load multivariate dataset longley and apply calendar extraction - - y, X = load_longley() - y_train, y_test, X_train, X_test = temporal_train_test_split(y, X) - - # Test that comprehensive feature_scope works for weeks - pipe = DateTimeFeatures( - ts_freq="W", feature_scope="comprehensive", keep_original_columns=True - ) - pipe.fit(X_train) - test_full_featurescope = pipe.transform(X_train).columns.to_list() - - # Test that minimal feature_scope works for weeks - pipe = DateTimeFeatures( - ts_freq="W", feature_scope="minimal", keep_original_columns=True - ) - pipe.fit(X_train) - test_reduced_featurescope = pipe.transform(X_train).columns.to_list() - - # Test that comprehensive feature_scope works for months - pipe = DateTimeFeatures( - ts_freq="M", feature_scope="comprehensive", keep_original_columns=True - ) - pipe.fit(X_train) - test_changing_frequency = pipe.transform(X_train).columns.to_list() - - # Test that manual_selection works for with provided arguments - # Should ignore feature scope and raise warning for second_of_minute, - # since ts_freq = "M" is provided. - # (dummies with frequency higher than ts_freq) - pipe = DateTimeFeatures( - ts_freq="M", - feature_scope="comprehensive", - manual_selection=["year", "second_of_minute"], - keep_original_columns=True, - ) - pipe.fit(X_train) - test_manspec_with_tsfreq = pipe.transform(X_train).columns.to_list() - - # Test that manual_selection works for with provided arguments - # Should ignore feature scope and raise no warning for second_of_minute, - # since ts_freq is not provided. - - pipe = DateTimeFeatures( - manual_selection=["year", "second_of_minute"], keep_original_columns=True - ) - pipe.fit(X_train) - test_manspec_wo_tsfreq = pipe.transform(X_train).columns.to_list() - - # Test that prior test works for with univariate dataset - y = load_airline() - y_train, y_test = temporal_train_test_split(y) - - pipe = DateTimeFeatures( - manual_selection=["year", "second_of_minute"], keep_original_columns=True - ) - pipe.fit(y_train) - test_univariate_data = pipe.transform(y_train).columns.to_list() - - # Test that prior test also works when Index is converted to DateTime index - y.index = y.index.to_timestamp().astype("datetime64[ns]") - y_train, y_test = temporal_train_test_split(y) - pipe = DateTimeFeatures( - manual_selection=["year", "second_of_minute"], keep_original_columns=True - ) - pipe.fit(y_train) - test_diffdateformat = pipe.transform(y_train).columns.to_list() - - pipe = DateTimeFeatures( - ts_freq="L", feature_scope="comprehensive", keep_original_columns=True - ) - pipe.fit(y_train) - y_train_t = pipe.transform(y_train) - test_full = y_train_t.columns.to_list() - test_types = y_train_t.select_dtypes(include=["int64"]).columns.to_list() - - -# Test `is_weekend` works in manual selection -@pytest.fixture -def df_datetime_daily_idx(): - """Create timeseries with Datetime index, daily frequency.""" - return pd.DataFrame( - data={"y": [1, 1, 1, 1, 1, 1, 1]}, - index=pd.date_range(start="2000-01-01", freq="D", periods=7), - ) - - -@pytest.fixture() -def df_panel(): - """Create hierarchical data.""" - return _make_hierarchical(hierarchy_levels=(2,), min_timepoints=3, max_timepoints=3) - - -all_args = [ - "Number of airline passengers", - "year", - "quarter_of_year", - "month_of_year", - "week_of_year", - "day_of_year", - "month_of_quarter", - "week_of_quarter", - "day_of_quarter", - "week_of_month", - "day_of_month", - "day_of_week", - "hour_of_day", - "minute_of_hour", - "second_of_minute", - "millisecond_of_second", - "is_weekend", -] - - -@pytest.mark.skipif( - not _check_estimator_deps(DateTimeFeatures, severity="none"), - reason="skip test if required soft dependencies not available", -) -@pytest.mark.parametrize( - "test_input,expected", - [ - ( - test_full_featurescope, - [ - "GNPDEFL", - "GNP", - "UNEMP", - "ARMED", - "POP", - "year", - "quarter_of_year", - "month_of_year", - "week_of_year", - "month_of_quarter", - "week_of_quarter", - "week_of_month", - ], - ), - ( - test_reduced_featurescope, - ["GNPDEFL", "GNP", "UNEMP", "ARMED", "POP", "year", "month_of_year"], - ), - ( - test_changing_frequency, - [ - "GNPDEFL", - "GNP", - "UNEMP", - "ARMED", - "POP", - "year", - "quarter_of_year", - "month_of_year", - "month_of_quarter", - ], - ), - ( - test_manspec_with_tsfreq, - ["GNPDEFL", "GNP", "UNEMP", "ARMED", "POP", "year", "second_of_minute"], - ), - ( - test_manspec_wo_tsfreq, - ["GNPDEFL", "GNP", "UNEMP", "ARMED", "POP", "year", "second_of_minute"], - ), - ( - test_univariate_data, - ["Number of airline passengers", "year", "second_of_minute"], - ), - ( - test_diffdateformat, - ["Number of airline passengers", "year", "second_of_minute"], - ), - ( - test_full, - all_args, - ), - ( - test_types, - all_args[1:], - ), - ], -) -def test_eval(test_input, expected): - """Tests which columns are returned for different arguments. - - For a detailed description what these arguments do, - and how they interact see docstring of DateTimeFeatures. - """ - assert len(test_input) == len(expected) - assert all([a == b for a, b in zip(test_input, expected)]) - - -@pytest.mark.skipif( - not _check_estimator_deps(DateTimeFeatures, severity="none"), - reason="skip test if required soft dependencies not available", -) -def test_manual_selection_is_weekend(df_datetime_daily_idx): - """Tests that "is_weekend" returns correct result in `manual_selection`.""" - transformer = DateTimeFeatures( - manual_selection=["is_weekend"], keep_original_columns=True - ) - - Xt = transformer.fit_transform(df_datetime_daily_idx) - expected = pd.DataFrame( - data={"y": [1, 1, 1, 1, 1, 1, 1], "is_weekend": [1, 1, 0, 0, 0, 0, 0]}, - index=df_datetime_daily_idx.index, - ) - assert_frame_equal(Xt, expected) - - -@pytest.mark.skipif( - not _check_estimator_deps(DateTimeFeatures, severity="none"), - reason="skip test if required soft dependencies not available", -) -def test_transform_panel(df_panel): - """Test `.transform()` on panel data.""" - transformer = DateTimeFeatures( - manual_selection=["year", "month_of_year", "day_of_month"], - keep_original_columns=True, - ) - Xt = transformer.fit_transform(df_panel) - - expected = pd.DataFrame( - index=df_panel.index, - data={ - "c0": df_panel["c0"].values, - "year": [2000, 2000, 2000, 2000, 2000, 2000], - "month_of_year": [1, 1, 1, 1, 1, 1], - "day_of_month": [1, 2, 3, 1, 2, 3], - }, - ) - assert_frame_equal(Xt, expected) - - -@pytest.mark.skipif( - not _check_estimator_deps(DateTimeFeatures, severity="none"), - reason="skip test if required soft dependencies not available", -) -def test_keep_original_columns(df_panel): - """Test `.transform()` on panel data.""" - transformer = DateTimeFeatures( - manual_selection=["year", "month_of_year", "day_of_month"], - keep_original_columns=False, - ) - Xt = transformer.fit_transform(df_panel) - - expected = pd.DataFrame( - index=df_panel.index, - data={ - "year": [2000, 2000, 2000, 2000, 2000, 2000], - "month_of_year": [1, 1, 1, 1, 1, 1], - "day_of_month": [1, 2, 3, 1, 2, 3], - }, - ) - assert_frame_equal(Xt, expected) diff --git a/docs/api_reference/transformations.rst b/docs/api_reference/transformations.rst index 24fd52cdc0..2d07ddd600 100644 --- a/docs/api_reference/transformations.rst +++ b/docs/api_reference/transformations.rst @@ -299,24 +299,6 @@ Missing value imputation Imputer -Seasonality and Date-Time Features -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. currentmodule:: aeon.transformations.date - -.. autosummary:: - :toctree: auto_generated/ - :template: class.rst - - DateTimeFeatures - -.. currentmodule:: aeon.transformations.fourier - -.. autosummary:: - :toctree: auto_generated/ - :template: class.rst - - FourierFeatures Window-based series transforms ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/examples/transformations/transformations.ipynb b/examples/transformations/transformations.ipynb index 2aa195bc0e..53ff123f8d 100644 --- a/examples/transformations/transformations.ipynb +++ b/examples/transformations/transformations.ipynb @@ -798,15 +798,6 @@ "start_time": "2024-03-01T16:16:30.881930Z" } } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [], - "metadata": { - "collapsed": false - } } ], "metadata": {