From 3b6eb49746e2b5b6ff20012793f6253d764b6b06 Mon Sep 17 00:00:00 2001
From: Fede Raimondo <f.raimondo@fz-juelich.de>
Date: Thu, 17 Oct 2024 09:59:10 +0200
Subject: [PATCH 1/4] Move the final-model CV split first so the biggest fit runs first

---
 julearn/api.py                                |  8 ++++----
 julearn/model_selection/final_model_cv.py     |  5 +++--
 .../tests/test_final_model_cv.py              | 12 +++++------
 julearn/models/tests/test_models.py           |  2 +-
 julearn/tests/test_api.py                     | 20 +++++++++----------
 pyproject.toml                                |  4 ++--
 6 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/julearn/api.py b/julearn/api.py
index ec8288b67..ea0e4bf32 100644
--- a/julearn/api.py
+++ b/julearn/api.py
@@ -594,13 +594,13 @@ def run_cross_validation(
     )
 
     if include_final_model:
-        # If we include the final model, we need to remove the last item in
+        # If we include the final model, we need to remove the first item in
         # the scores as this is the final model
-        pipeline = scores["estimator"][-1]
+        pipeline = scores["estimator"][0]
         if return_estimator == "final":
             scores.pop("estimator")
-        scores = {k: v[:-1] for k, v in scores.items()}
-        fold_sizes = fold_sizes[:-1]
+        scores = {k: v[1:] for k, v in scores.items()}
+        fold_sizes = fold_sizes[1:]
 
     n_repeats = getattr(cv_outer, "n_repeats", 1)
     n_folds = len(scores["fit_time"]) // n_repeats
diff --git a/julearn/model_selection/final_model_cv.py b/julearn/model_selection/final_model_cv.py
index 4ceb271d6..c27f24569 100644
--- a/julearn/model_selection/final_model_cv.py
+++ b/julearn/model_selection/final_model_cv.py
@@ -68,11 +68,12 @@ def split(
         profitting for joblib calls.
 
         """
-        yield from self.cv.split(X, y, groups)
+        # First fold: train on all samples and yield only 2 test samples
         all_inds = np.arange(len(X))
-        # For the last fold, train on all samples and return only 2 for testing
         yield all_inds, all_inds[:2]
 
+        yield from self.cv.split(X, y, groups)
+
     def get_n_splits(self) -> int:
         """Get the number of splits.
 
diff --git a/julearn/model_selection/tests/test_final_model_cv.py b/julearn/model_selection/tests/test_final_model_cv.py
index fb2977ed1..8670908e2 100644
--- a/julearn/model_selection/tests/test_final_model_cv.py
+++ b/julearn/model_selection/tests/test_final_model_cv.py
@@ -31,13 +31,13 @@ def test_final_model_cv() -> None:
     all_sk = list(sklearn_cv.split(X, y))
 
     assert len(all_ju) == len(all_sk) + 1
-    for i in range(10):
-        assert_array_equal(all_ju[i][0], all_sk[i][0])
-        assert_array_equal( all_ju[i][1], all_sk[i][1])
+    for i in range(1, 11):
+        assert_array_equal(all_ju[i][0], all_sk[i-1][0])
+        assert_array_equal(all_ju[i][1], all_sk[i-1][1])
 
-    assert all_ju[-1][0].shape[0] == n_samples
-    assert all_ju[-1][1].shape[0] == 2
-    assert_array_equal(all_ju[-1][0], np.arange(n_samples))
+    assert all_ju[0][0].shape[0] == n_samples
+    assert all_ju[0][1].shape[0] == 2
+    assert_array_equal(all_ju[0][0], np.arange(n_samples))
 
 
 def test_final_model_cv_mdsum() -> None:
diff --git a/julearn/models/tests/test_models.py b/julearn/models/tests/test_models.py
index 011986cc8..fcf68f5e7 100644
--- a/julearn/models/tests/test_models.py
+++ b/julearn/models/tests/test_models.py
@@ -189,7 +189,7 @@ def test_naive_bayes_estimators(
                 "estimator": DecisionTreeClassifier(random_state=42),
             },
         ),
-        ("gradientboost", GradientBoostingClassifier, {}),
+        ("gradientboost", GradientBoostingClassifier, {"random_state": 42}),
     ],
 )
 def test_classificationestimators(
diff --git a/julearn/tests/test_api.py b/julearn/tests/test_api.py
index af7a3641a..17a0cbe9b 100644
--- a/julearn/tests/test_api.py
+++ b/julearn/tests/test_api.py
@@ -415,8 +415,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
     scoring = "accuracy"
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
-    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)
+    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10)
 
     model_params = {"svm__C": [0.01, 0.001]}
     search_params = {"cv": cv_inner}
@@ -438,8 +438,8 @@ def test_tune_hyperparam_gridsearch(df_iris: pd.DataFrame) -> None:
 
     # Now do the same with scikit-learn
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2)
-    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2)
+    cv_outer = RepeatedKFold(n_splits=3, n_repeats=2, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=3, n_repeats=2, random_state=10)
 
     clf = make_pipeline(SVC())
     gs = GridSearchCV(
@@ -672,8 +672,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
     scoring = "accuracy"
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
 
     search_params = {"cv": cv_inner}
     actual1, actual_estimator1 = run_cross_validation(
@@ -701,8 +701,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
     )
 
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
     search_params = {"cv": cv_inner}
     actual2, actual_estimator2 = run_cross_validation(
         X=X,
@@ -718,8 +718,8 @@ def test_tune_hyperparams_multiple_grid(df_iris: pd.DataFrame) -> None:
 
     # Now do the same with scikit-learn
     np.random.seed(42)
-    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1)
-    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1)
+    cv_outer = RepeatedKFold(n_splits=2, n_repeats=1, random_state=9)
+    cv_inner = RepeatedKFold(n_splits=2, n_repeats=1, random_state=10)
 
     clf = make_pipeline(SVC())
     grid = [
diff --git a/pyproject.toml b/pyproject.toml
index bcb3b73bf..4db54d213 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,9 +38,9 @@ classifiers = [
 ]
 dependencies = [
     "numpy>=1.24,<1.27",
-    "pandas>=1.5.0,<2.2",
+    "pandas>=1.5.0,<2.3",
     "statsmodels>=0.13,<0.15",
-    "scikit-learn>=1.2.0,<1.5.0",
+    "scikit-learn>=1.2.0,<1.6.0",
 ]
 dynamic = ["version"]
 

From 1f47a4f6f41df1d7fd76e22580f128f3dadebb3f Mon Sep 17 00:00:00 2001
From: Fede Raimondo <f.raimondo@fz-juelich.de>
Date: Thu, 17 Oct 2024 11:42:58 +0200
Subject: [PATCH 2/4] Add newsfragment

---
 docs/changes/newsfragments/275.enh | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 docs/changes/newsfragments/275.enh

diff --git a/docs/changes/newsfragments/275.enh b/docs/changes/newsfragments/275.enh
new file mode 100644
index 000000000..a79cb03bf
--- /dev/null
+++ b/docs/changes/newsfragments/275.enh
@@ -0,0 +1 @@
+Place the final model CV split at the beginning, instead of at the end, of the CV iterator wrapper so that the largest fit runs first, by `Fede Raimondo`_
\ No newline at end of file

From d9afff37749d4f722bf34dbfb696887a9f3ffb39 Mon Sep 17 00:00:00 2001
From: Fede Raimondo <f.raimondo@fz-juelich.de>
Date: Thu, 17 Oct 2024 15:35:20 +0200
Subject: [PATCH 3/4] revert pyproject.toml

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4db54d213..bcb3b73bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,9 +38,9 @@ classifiers = [
 ]
 dependencies = [
     "numpy>=1.24,<1.27",
-    "pandas>=1.5.0,<2.3",
+    "pandas>=1.5.0,<2.2",
     "statsmodels>=0.13,<0.15",
-    "scikit-learn>=1.2.0,<1.6.0",
+    "scikit-learn>=1.2.0,<1.5.0",
 ]
 dynamic = ["version"]
 

From a3d56ed7bee85b7c6c86da9155f65b51c87d5316 Mon Sep 17 00:00:00 2001
From: Fede Raimondo <f.raimondo@fz-juelich.de>
Date: Thu, 17 Oct 2024 15:36:04 +0200
Subject: [PATCH 4/4] Update scikit-learn and pandas versions

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index bcb3b73bf..4db54d213 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,9 +38,9 @@ classifiers = [
 ]
 dependencies = [
     "numpy>=1.24,<1.27",
-    "pandas>=1.5.0,<2.2",
+    "pandas>=1.5.0,<2.3",
     "statsmodels>=0.13,<0.15",
-    "scikit-learn>=1.2.0,<1.5.0",
+    "scikit-learn>=1.2.0,<1.6.0",
 ]
 dynamic = ["version"]