From 44bd0c974cdb8291ab2a080c2a0ea93cf5512720 Mon Sep 17 00:00:00 2001 From: Tony Bagnall Date: Mon, 5 Aug 2024 12:58:32 +0200 Subject: [PATCH] [DEP] Remove FeatureSelection transformer (#1890) * remove Theta * remove * API --- aeon/transformations/feature_selection.py | 223 ------------------ .../tests/test_feature_selection.py | 90 ------- docs/api_reference/transformations.rst | 14 -- 3 files changed, 327 deletions(-) delete mode 100644 aeon/transformations/feature_selection.py delete mode 100644 aeon/transformations/tests/test_feature_selection.py diff --git a/aeon/transformations/feature_selection.py b/aeon/transformations/feature_selection.py deleted file mode 100644 index f71b246a7d..0000000000 --- a/aeon/transformations/feature_selection.py +++ /dev/null @@ -1,223 +0,0 @@ -"""Implements feature selection algorithms.""" - -__maintainer__ = [] -__all__ = ["FeatureSelection"] - -import math - -import pandas as pd -from deprecated.sphinx import deprecated - -from aeon.transformations.base import BaseTransformer -from aeon.utils.validation.forecasting import check_regressor - - -# TODO: remove in v0.11.0 -@deprecated( - version="0.10.0", - reason="DateTimeFeatures will be removed in version 0.11.0.", - category=FutureWarning, -) -class FeatureSelection(BaseTransformer): - """ - Select exogenous features. - - Transformer to enable tuneable feauture selection of exogenous data. The - FeatureSelection implements multiple methods to select features (columns). - In case X is a pd.Series, then it is just passed through, unless method="none", - then None is returned in transform(). - - Parameters - ---------- - method : str - The method of how to select the features. Implemeted methods are: - * "feature-importances": Use feature_importances_ of the regressor (meta-model) - to select n_columns with highest importance values. - Requires parameter n_columns. - * "random": Randomly select n_columns features. Requires parameter n_columns. - * "columns": Select features by given names. - * "none": Remove all columns by setting Z to None. - * "all": Select all given features. - n_columns : int, default = None - Number of feautres (columns) to select. n_columns must be <= - number of X columns. Some methods require n_columns to be given. - regressor : sklearn-like regressor, default=None - Used as meta-model for the method "feature-importances". The given - regressor must have an attribute "feature_importances_". If None, - then a GradientBoostingRegressor(max_depth=5) is used. - random_state : int, RandomState instance or None, default=None - Used to set random_state of the default regressor and to - set random.seed() if method="random". - columns : list of str - A list of columns to select. If columns is given. - - Attributes - ---------- - columns_ : list of str - List of columns that have been selected as features. - regressor_ : sklearn-like regressor - Fitted regressor (meta-model). - n_columns_: int - Derived from number of features if n_columns is None, then - n_columns_ is calculated as int(math.ceil(Z.shape[1] / 2)). So taking - half of given features only as default. - feature_importances_ : dict or None - A dictionary with column name as key and feature imporatnce value as value. - The dict is sorted descending on value. This attribute is a dict if - method="feature-importances", else None. - - Examples - -------- - >>> from aeon.transformations.feature_selection import FeatureSelection - >>> from aeon.datasets import load_longley - >>> y, X = load_longley() - >>> transformer = FeatureSelection(method="feature-importances", n_columns=3) - >>> X_hat = transformer.fit_transform(X, y) - """ - - _tags = { - "input_data_type": "Series", - # what is the abstract type of X: Series, or Panel - "output_data_type": "Series", - # what abstract type is returned: Primitives, Series, Panel - "instancewise": True, - "X_inner_type": ["pd.DataFrame", "pd.Series"], - "y_inner_type": "pd.DataFrame", - "fit_is_empty": False, - "transform-returns-same-time-index": True, - "skip-inverse-transform": True, - "capability:multivariate": True, - } - - def __init__( - self, - method="feature-importances", - n_columns=None, - regressor=None, - random_state=None, - columns=None, - ): - self.n_columns = n_columns - self.method = method - self.regressor = regressor - self.random_state = random_state - self.columns = columns - - super().__init__() - - def _fit(self, X, y=None): - """Fit transformer to X and y. - - private _fit containing the core logic, called from fit - - Parameters - ---------- - X : pd.Series or pd.DataFrame - Data to fit transform to - y : pd.DataFrame, default=None - Additional data, e.g., labels for transformation - - Returns - ------- - self: a fitted instance of the estimator - """ - self.n_columns_ = self.n_columns - self.feature_importances_ = None - - if self.method == "none": - self.set_tags(**{"output_data_type": "Primitives"}) - - # multivariate X - if not isinstance(X, pd.Series): - if self.method == "feature-importances": - self.regressor_ = check_regressor( - regressor=self.regressor, random_state=self.random_state - ) - self._check_n_columns(X) - # fit regressor with X as exog data and y as endog data (target) - self.regressor_.fit(X=X, y=y) - if not hasattr(self.regressor_, "feature_importances_"): - raise ValueError( - """The given regressor must have an - attribute feature_importances_ after fitting.""" - ) - # create dict with columns name (key) and feauter importance (value) - d = dict(zip(X.columns, self.regressor_.feature_importances_)) - # sort d descending - d = {k: d[k] for k in sorted(d, key=d.get, reverse=True)} - self.feature_importances_ = d - self.columns_ = list(d.keys())[: self.n_columns_] - elif self.method == "random": - self._check_n_columns(X) - self.columns_ = list( - X.sample( - n=self.n_columns_, random_state=self.random_state, axis=1 - ).columns - ) - elif self.method == "columns": - if self.columns is None: - raise AttributeError("Parameter columns must be given.") - self.columns_ = self.columns - elif self.method == "none": - self.columns_ = None - elif self.method == "all": - self.columns_ = list(X.columns) - else: - raise ValueError("Incorrect method given. Try another method.") - return self - - def _transform(self, X, y=None): - """Transform X and return a transformed version. - - private _transform containing the core logic, called from transform - - Parameters - ---------- - X : pd.Series or pd.DataFrame - Data to be transformed - y : ignored argument for interface compatibility - Additional data, e.g., labels for transformation - - Returns - ------- - Xt : pd.Series or pd.DataFrame, same type as X - transformed version of X - """ - # multivariate case - if not isinstance(X, pd.Series): - if self.method == "none": - Xt = None - else: - Xt = X[self.columns_] - # univariate case - else: - if self.method == "none": - Xt = None - else: - Xt = X - return Xt - - def _check_n_columns(self, Z): - if not isinstance(self.n_columns_, int): - self.n_columns_ = int(math.ceil(Z.shape[1] / 2)) - - @classmethod - def get_test_params(cls, parameter_set="default"): - """Return testing parameter settings for the estimator. - - Parameters - ---------- - parameter_set : str, default="default" - Name of the set of test parameters to return, for use in tests. If no - special parameters are defined for a value, will return `"default"` set. - - - Returns - ------- - params : dict or list of dict, default = {} - Parameters to create testing instances of the class - Each dict are parameters to construct an "interesting" test instance, i.e., - `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. - `create_test_instance` uses the first (or only) dictionary in `params` - """ - return {"method": "all"} diff --git a/aeon/transformations/tests/test_feature_selection.py b/aeon/transformations/tests/test_feature_selection.py deleted file mode 100644 index 74065091d9..0000000000 --- a/aeon/transformations/tests/test_feature_selection.py +++ /dev/null @@ -1,90 +0,0 @@ -"""Test FeatureSelection transformer.""" - -__maintainer__ = [] -__all__ = [] - -import math - -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal -from sklearn.tree import DecisionTreeRegressor - -from aeon.datasets import load_longley -from aeon.forecasting.model_selection import temporal_train_test_split -from aeon.transformations.feature_selection import FeatureSelection - -y, X = load_longley() -y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, test_size=3) - - -@pytest.mark.parametrize( - "method", ["feature-importances", "random", "columns", "none", "all"] -) -@pytest.mark.parametrize("n_columns", [None, 2]) -@pytest.mark.parametrize("random_state", [1, 3]) -def test_feature_selection(method, n_columns, random_state): - columns = ["GNP", "UNEMP"] if method == "columns" else None - transformer = FeatureSelection( - method=method, columns=columns, n_columns=n_columns, random_state=random_state - ) - transformer.fit(X=X_train, y=y_train) - X_hat = transformer.transform(X=X_test, y=y_test) - if method != "none": - assert isinstance(X_hat, pd.DataFrame) - else: - assert X_hat is None - - if method == "feature-importances": - if n_columns is None: - n_columns = int(math.ceil(X_train.shape[1] / 2)) - else: - assert X_hat.shape[1] == n_columns - assert isinstance(transformer.feature_importances_, dict) - assert len(transformer.feature_importances_) == X_train.shape[1] - assert isinstance(transformer.feature_importances_, dict) - # test custom regressor - transformer_f1 = FeatureSelection( - method=method, regressor=DecisionTreeRegressor() - ) - transformer_f1.fit(X=X_train, y=y_train) - _ = transformer_f1.transform(X=X_test, y=y_test) - transformer_f2 = FeatureSelection(method=method) - transformer_f2.fit(X=X_train, y=y_train) - _ = transformer_f2.transform(X=X_test, y=y_test) - - assert ( - transformer_f1.feature_importances_ != transformer_f2.feature_importances_ - ) - - if method == "random": - if n_columns is None: - n_columns = int(math.ceil(X_train.shape[1] / 2)) - else: - assert X_hat.shape[1] == n_columns - # test random state - transformer_rand1 = FeatureSelection( - method=method, random_state=random_state - ) - transformer_rand1.fit(X_train) - X_hat_rand1 = transformer_rand1.transform(X_test) - - transformer_rand2 = FeatureSelection(method=method, random_state=3) - transformer_rand2.fit(X_train) - X_hat_rand2 = transformer_rand2.transform(X_test) - - if random_state == 3: - assert_frame_equal(X_hat_rand1, X_hat_rand2) - if random_state != 3: - with pytest.raises(AssertionError): - assert_frame_equal(X_hat_rand1, X_hat_rand2) - if method == "columns": - if columns is None: - assert X_hat.shape[1] == X_train.shape[1] - else: - assert X_hat.shape[1] == len(columns) - for c in columns: - assert c in X_hat.columns - - -test_feature_selection("random", 2, None) diff --git a/docs/api_reference/transformations.rst b/docs/api_reference/transformations.rst index 10ead8c943..b38db9bd14 100644 --- a/docs/api_reference/transformations.rst +++ b/docs/api_reference/transformations.rst @@ -242,20 +242,6 @@ These transformers create a series based on a sequence of sliding windows. HOG1DTransformer - -FeatureSelection -~~~~~~~~~~~~~~~~ - -These transformers select features in `X` based on `y`. - -.. currentmodule:: aeon.transformations.feature_selection - -.. autosummary:: - :toctree: auto_generated/ - :template: class.rst - - FeatureSelection - .. currentmodule:: aeon.transformations.collection.channel_selection .. autosummary::