diff --git a/docs/changes/newsfragments/275.enh b/docs/changes/newsfragments/275.enh
new file mode 100644
index 000000000..a79cb03bf
--- /dev/null
+++ b/docs/changes/newsfragments/275.enh
@@ -0,0 +1 @@
+Place the final model CV split at the beginning instead of the end of the CV iterator wrapper by `Fede Raimondo`_
\ No newline at end of file
diff --git a/julearn/api.py b/julearn/api.py
index ec8288b67..ea0e4bf32 100644
--- a/julearn/api.py
+++ b/julearn/api.py
@@ -594,13 +594,13 @@ def run_cross_validation(
         )
 
     if include_final_model:
-        # If we include the final model, we need to remove the last item in
+        # If we include the final model, we need to remove the first item in
         # the scores as this is the final model
-        pipeline = scores["estimator"][-1]
+        pipeline = scores["estimator"][0]
         if return_estimator == "final":
             scores.pop("estimator")
-        scores = {k: v[:-1] for k, v in scores.items()}
-        fold_sizes = fold_sizes[:-1]
+        scores = {k: v[1:] for k, v in scores.items()}
+        fold_sizes = fold_sizes[1:]
 
     n_repeats = getattr(cv_outer, "n_repeats", 1)
     n_folds = len(scores["fit_time"]) // n_repeats
diff --git a/julearn/model_selection/final_model_cv.py b/julearn/model_selection/final_model_cv.py
index 4ceb271d6..c27f24569 100644
--- a/julearn/model_selection/final_model_cv.py
+++ b/julearn/model_selection/final_model_cv.py
@@ -68,11 +68,12 @@ def split(
             profitting for joblib calls.
 
         """
-        yield from self.cv.split(X, y, groups)
+        # For the first fold, train on all samples and return only 2 for test
         all_inds = np.arange(len(X))
-        # For the last fold, train on all samples and return only 2 for testing
         yield all_inds, all_inds[:2]
 
+        yield from self.cv.split(X, y, groups)
+
     def get_n_splits(self) -> int:
         """Get the number of splits.
 
diff --git a/julearn/model_selection/tests/test_final_model_cv.py b/julearn/model_selection/tests/test_final_model_cv.py
index fb2977ed1..8670908e2 100644
--- a/julearn/model_selection/tests/test_final_model_cv.py
+++ b/julearn/model_selection/tests/test_final_model_cv.py
@@ -31,13 +31,13 @@ def test_final_model_cv() -> None:
     all_sk = list(sklearn_cv.split(X, y))
 
     assert len(all_ju) == len(all_sk) + 1
-    for i in range(10):
-        assert_array_equal(all_ju[i][0], all_sk[i][0])
-        assert_array_equal(all_ju[i][1], all_sk[i][1])
+    for i in range(1, 11):
+        assert_array_equal(all_ju[i][0], all_sk[i-1][0])
+        assert_array_equal(all_ju[i][1], all_sk[i-1][1])
 
-    assert all_ju[-1][0].shape[0] == n_samples
-    assert all_ju[-1][1].shape[0] == 2
-    assert_array_equal(all_ju[-1][0], np.arange(n_samples))
+    assert all_ju[0][0].shape[0] == n_samples
+    assert all_ju[0][1].shape[0] == 2
+    assert_array_equal(all_ju[0][0], np.arange(n_samples))
 
 
 def test_final_model_cv_mdsum() -> None:
diff --git a/julearn/models/tests/test_models.py b/julearn/models/tests/test_models.py
index 011986cc8..fcf68f5e7 100644
--- a/julearn/models/tests/test_models.py
+++ b/julearn/models/tests/test_models.py
@@ -189,7 +189,7 @@ def test_naive_bayes_estimators(
                 "estimator": DecisionTreeClassifier(random_state=42),
             },
         ),
-        ("gradientboost", GradientBoostingClassifier, {}),
+        ("gradientboost", GradientBoostingClassifier, {"random_state": 42}),
     ],
 )
 def test_classificationestimators(
diff --git a/julearn/tests/test_api.py b/julearn/tests/test_api.py
index af7a3641a..17a0cbe9b 100644
--- a/julearn/tests/test_api.py
+++ b/julearn/tests/test_api.py
@@ -415,8 +415,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
     scoring = "accuracy"
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
-    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)
+    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10)
 
     model_params = {"svm__C": [0.01, 0.001]}
     search_params = {"cv": cv_inner}
@@ -438,8 +438,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
 
     # Now do the same with scikit-learn
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
-    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)
+    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10)
 
     clf = make_pipeline(SVC())
     gs = GridSearchCV(
@@ -672,8 +672,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
     scoring = "accuracy"
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
 
     search_params = {"cv": cv_inner}
     actual1, actual_estimator1 = run_cross_validation(
@@ -701,8 +701,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
     )
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
     search_params = {"cv": cv_inner}
     actual2, actual_estimator2 = run_cross_validation(
         X=X,
@@ -718,8 +718,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
 
     # Now do the same with scikit-learn
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
 
     clf = make_pipeline(SVC())
     grid = [
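
For reference, below is a minimal standalone sketch (not part of the diff) of the split ordering this patch introduces: the final-model split, which trains on all samples and yields a dummy 2-sample test set so the scoring machinery still has something to evaluate, now comes first, so callers such as run_cross_validation pick the final model with index 0 instead of -1. The class name FinalModelCV is a placeholder assumption; the diff only shows the split() body, not the actual name of the wrapper defined in julearn/model_selection/final_model_cv.py.

    import numpy as np
    from sklearn.model_selection import KFold

    class FinalModelCV:  # hypothetical name for the CV iterator wrapper
        """Wrap a CV splitter so the final-model split is yielded first."""

        def __init__(self, cv):
            self.cv = cv

        def split(self, X, y=None, groups=None):
            # For the first fold, train on all samples and return only 2
            # for test (a dummy test set; its score is discarded later).
            all_inds = np.arange(len(X))
            yield all_inds, all_inds[:2]
            # Then yield the wrapped splitter's regular CV folds.
            yield from self.cv.split(X, y, groups)

        def get_n_splits(self):
            # One extra split for the final model.
            return self.cv.get_n_splits() + 1

    X = np.random.rand(20, 3)
    splits = list(FinalModelCV(KFold(n_splits=5)).split(X))
    assert len(splits) == 6
    assert len(splits[0][0]) == 20  # first split trains on all samples
    assert len(splits[0][1]) == 2   # with a 2-sample dummy test set

This mirrors why api.py now takes scores["estimator"][0] and drops the first entry (v[1:], fold_sizes[1:]) rather than the last, and why the test compares all_ju[1:] against the plain scikit-learn splits.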