From 3b6eb49746e2b5b6ff20012793f6253d764b6b06 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Thu, 17 Oct 2024 09:59:10 +0200 Subject: [PATCH 1/4] Invert order of CV so it runs the biggest first --- julearn/api.py | 8 ++++---- julearn/model_selection/final_model_cv.py | 5 +++-- .../tests/test_final_model_cv.py | 12 +++++------ julearn/models/tests/test_models.py | 2 +- julearn/tests/test_api.py | 20 +++++++++---------- pyproject.toml | 4 ++-- 6 files changed, 26 insertions(+), 25 deletions(-) diff --git a/julearn/api.py b/julearn/api.py index ec8288b67..ea0e4bf32 100644 --- a/julearn/api.py +++ b/julearn/api.py @@ -594,13 +594,13 @@ def run_cross_validation( ) if include_final_model: - # If we include the final model, we need to remove the last item in + # If we include the final model, we need to remove the first item in # the scores as this is the final model - pipeline = scores["estimator"][-1] + pipeline = scores["estimator"][0] if return_estimator == "final": scores.pop("estimator") - scores = {k: v[:-1] for k, v in scores.items()} - fold_sizes = fold_sizes[:-1] + scores = {k: v[1:] for k, v in scores.items()} + fold_sizes = fold_sizes[1:] n_repeats = getattr(cv_outer, "n_repeats", 1) n_folds = len(scores["fit_time"]) // n_repeats diff --git a/julearn/model_selection/final_model_cv.py b/julearn/model_selection/final_model_cv.py index 4ceb271d6..c27f24569 100644 --- a/julearn/model_selection/final_model_cv.py +++ b/julearn/model_selection/final_model_cv.py @@ -68,11 +68,12 @@ def split( profitting for joblib calls. """ - yield from self.cv.split(X, y, groups) + # For the first fold, train on all samples and return only 2 for test all_inds = np.arange(len(X)) - # For the last fold, train on all samples and return only 2 for testing yield all_inds, all_inds[:2] + yield from self.cv.split(X, y, groups) + def get_n_splits(self) -> int: """Get the number of splits. diff --git a/julearn/model_selection/tests/test_final_model_cv.py b/julearn/model_selection/tests/test_final_model_cv.py index fb2977ed1..8670908e2 100644 --- a/julearn/model_selection/tests/test_final_model_cv.py +++ b/julearn/model_selection/tests/test_final_model_cv.py @@ -31,13 +31,13 @@ def test_final_model_cv() -> None: all_sk = list(sklearn_cv.split(X, y)) assert len(all_ju) == len(all_sk) + 1 - for i in range(10): - assert_array_equal(all_ju[i][0], all_sk[i][0]) - assert_array_equal( all_ju[i][1], all_sk[i][1]) + for i in range(1, 11): + assert_array_equal(all_ju[i][0], all_sk[i-1][0]) + assert_array_equal(all_ju[i][1], all_sk[i-1][1]) - assert all_ju[-1][0].shape[0] == n_samples - assert all_ju[-1][1].shape[0] == 2 - assert_array_equal(all_ju[-1][0], np.arange(n_samples)) + assert all_ju[0][0].shape[0] == n_samples + assert all_ju[0][1].shape[0] == 2 + assert_array_equal(all_ju[0][0], np.arange(n_samples)) def test_final_model_cv_mdsum() -> None: diff --git a/julearn/models/tests/test_models.py b/julearn/models/tests/test_models.py index 011986cc8..fcf68f5e7 100644 --- a/julearn/models/tests/test_models.py +++ b/julearn/models/tests/test_models.py @@ -189,7 +189,7 @@ def test_naive_bayes_estimators( "estimator": DecisionTreeClassifier(random_state=42), }, ), - ("gradientboost", GradientBoostingClassifier, {}), + ("gradientboost", GradientBoostingClassifier, {"random_state": 42}), ], ) def test_classificationestimators( diff --git a/julearn/tests/test_api.py b/julearn/tests/test_api.py index af7a3641a..17a0cbe9b 100644 --- a/julearn/tests/test_api.py +++ b/julearn/tests/test_api.py @@ -415,8 +415,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None: scoring = "accuracy" np.random.seed(42) - cv_outer = RepeatedKFold(n_splits=3, n_repeats=2) - cv_inner = RepeatedKFold(n_splits=3, n_repeats=2) + cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9) + cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10) model_params = {"svm__C": [0.01, 0.001]} search_params = {"cv": cv_inner} @@ -438,8 +438,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None: # Now do the same with scikit-learn np.random.seed(42) - cv_outer = RepeatedKFold(n_splits=3, n_repeats=2) - cv_inner = RepeatedKFold(n_splits=3, n_repeats=2) + cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9) + cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10) clf = make_pipeline(SVC()) gs = GridSearchCV( @@ -672,8 +672,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None: scoring = "accuracy" np.random.seed(42) - cv_outer = RepeatedKFold(n_splits=2, n_repeats=1) - cv_inner = RepeatedKFold(n_splits=2, n_repeats=1) + cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9) + cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10) search_params = {"cv": cv_inner} actual1, actual_estimator1 = run_cross_validation( @@ -701,8 +701,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None: ) np.random.seed(42) - cv_outer = RepeatedKFold(n_splits=2, n_repeats=1) - cv_inner = RepeatedKFold(n_splits=2, n_repeats=1) + cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9) + cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10) search_params = {"cv": cv_inner} actual2, actual_estimator2 = run_cross_validation( X=X, @@ -718,8 +718,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None: # Now do the same with scikit-learn np.random.seed(42) - cv_outer = RepeatedKFold(n_splits=2, n_repeats=1) - cv_inner = RepeatedKFold(n_splits=2, n_repeats=1) + cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9) + cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10) clf = make_pipeline(SVC()) grid = [ diff --git a/pyproject.toml b/pyproject.toml index bcb3b73bf..4db54d213 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,9 +38,9 @@ classifiers = [ ] dependencies = [ "numpy>=1.24,<1.27", - "pandas>=1.5.0,<2.2", + "pandas>=1.5.0,<2.3", "statsmodels>=0.13,<0.15", - "scikit-learn>=1.2.0,<1.5.0", + "scikit-learn>=1.2.0,<1.6.0", ] dynamic = ["version"] From 1f47a4f6f41df1d7fd76e22580f128f3dadebb3f Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Thu, 17 Oct 2024 11:42:58 +0200 Subject: [PATCH 2/4] Add newsfragment --- docs/changes/newsfragments/275.enh | 1 + 1 file changed, 1 insertion(+) create mode 100644 docs/changes/newsfragments/275.enh diff --git a/docs/changes/newsfragments/275.enh b/docs/changes/newsfragments/275.enh new file mode 100644 index 000000000..a79cb03bf --- /dev/null +++ b/docs/changes/newsfragments/275.enh @@ -0,0 +1 @@ +Place the final model CV split at the beginning instead of the end of the CV iterator wrapper by `Fede Raimondo`_ \ No newline at end of file From d9afff37749d4f722bf34dbfb696887a9f3ffb39 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Thu, 17 Oct 2024 15:35:20 +0200 Subject: [PATCH 3/4] revert pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4db54d213..bcb3b73bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,9 +38,9 @@ classifiers = [ ] dependencies = [ "numpy>=1.24,<1.27", - "pandas>=1.5.0,<2.3", + "pandas>=1.5.0,<2.2", "statsmodels>=0.13,<0.15", - "scikit-learn>=1.2.0,<1.6.0", + "scikit-learn>=1.2.0,<1.5.0", ] dynamic = ["version"] From a3d56ed7bee85b7c6c86da9155f65b51c87d5316 Mon Sep 17 00:00:00 2001 From: Fede Raimondo Date: Thu, 17 Oct 2024 15:36:04 +0200 Subject: [PATCH 4/4] Update scikit-learn and pandas versions --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bcb3b73bf..4db54d213 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,9 +38,9 @@ classifiers = [ ] dependencies = [ "numpy>=1.24,<1.27", - "pandas>=1.5.0,<2.2", + "pandas>=1.5.0,<2.3", "statsmodels>=0.13,<0.15", - "scikit-learn>=1.2.0,<1.5.0", + "scikit-learn>=1.2.0,<1.6.0", ] dynamic = ["version"]