From 5a74227bc3825ae75a73d032ea0040a9b4563ceb Mon Sep 17 00:00:00 2001
From: Daniel Grindrod
Date: Fri, 1 Nov 2024 02:06:15 +0000
Subject: [PATCH] Flaml: fix lgbm reproducibility (#1369)

* fix: Fixed bug where every underlying LGBMRegressor or LGBMClassifier had n_estimators = 1

* test: Added test showing case where FLAMLised CatBoostModel result isn't reproducible

* fix: Fixing issue where callbacks cause LGBM results to not be reproducible

* Update test/automl/test_regression.py

Co-authored-by: Li Jiang

* fix: Adding back the LGBM EarlyStopping

* refactor: Fix tweaked to ensure other models aren't likely to be affected

* test: Fixed test to allow reproduced results to be better than the FLAML results, when LGBM earlystopping is involved

---------

Co-authored-by: Daniel Grindrod
Co-authored-by: Li Jiang
---
 flaml/automl/model.py              |  7 ++---
 test/automl/test_classification.py |  2 +-
 test/automl/test_regression.py     | 49 +++++++++++++++++++++++++++++-
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/flaml/automl/model.py b/flaml/automl/model.py
index 182b4103ec..024a641dd3 100644
--- a/flaml/automl/model.py
+++ b/flaml/automl/model.py
@@ -1585,18 +1585,17 @@ def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
                     callbacks = None
             if callbacks is None:
                 self._fit(X_train, y_train, **kwargs)
-            else:
-                self._fit(X_train, y_train, callbacks=callbacks, **kwargs)
-            if callbacks is None:
                 # for xgboost>=1.6.0, pop callbacks to enable pickle
                 callbacks = self.params.pop("callbacks")
                 self._model.set_params(callbacks=callbacks[:-1])
+            else:
+                self._fit(X_train, y_train, callbacks=callbacks, **kwargs)
             best_iteration = (
                 getattr(self._model.get_booster(), "best_iteration", None)
                 if isinstance(self, XGBoostSklearnEstimator)
                 else self._model.best_iteration_
             )
-            if best_iteration is not None:
+            if best_iteration is not None and best_iteration > 0:
                 self._model.set_params(n_estimators=best_iteration + 1)
         else:
             self._fit(X_train, y_train, **kwargs)
diff --git a/test/automl/test_classification.py b/test/automl/test_classification.py
index d4ff03b972..4305e3bcff 100644
--- a/test/automl/test_classification.py
+++ b/test/automl/test_classification.py
@@ -493,7 +493,7 @@ def test_reproducibility_of_classification_models(estimator: str):
         "extra_tree",
         "histgb",
         "kneighbor",
-        # "lgbm",
+        "lgbm",
         # "lrl1",
         "lrl2",
         "svc",
diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py
index 87f8ed6b23..892ad1eceb 100644
--- a/test/automl/test_regression.py
+++ b/test/automl/test_regression.py
@@ -339,6 +339,52 @@ def test_reproducibility_of_catboost_regression_model():
     assert pytest.approx(val_loss_flaml) == reproduced_val_loss
 
 
+def test_reproducibility_of_lgbm_regression_model():
+    """FLAML finds the best model for a given dataset, which it then provides to users.
+
+    However, there are reported issues around LGBMs - see here:
+    https://github.com/microsoft/FLAML/issues/1368
+    In this test we take the best LGBM regression model which FLAML provided us, and then retrain and test it on the
+    same folds, to verify that the result is reproducible.
+    """
+    automl = AutoML()
+    automl_settings = {
+        "time_budget": 3,
+        "task": "regression",
+        "n_jobs": 1,
+        "estimator_list": ["lgbm"],
+        "eval_method": "cv",
+        "n_splits": 9,
+        "metric": "r2",
+        "keep_search_state": True,
+        "skip_transform": True,
+        "retrain_full": True,
+    }
+    X, y = fetch_california_housing(return_X_y=True, as_frame=True)
+    automl.fit(X_train=X, y_train=y, **automl_settings)
+    best_model = automl.model
+    assert best_model is not None
+    config = best_model.get_params()
+    val_loss_flaml = automl.best_result["val_loss"]
+
+    # Take the best model, and see if we can reproduce the best result
+    reproduced_val_loss, metric_for_logging, train_time, pred_time = automl._state.task.evaluate_model_CV(
+        config=config,
+        estimator=best_model,
+        X_train_all=automl._state.X_train_all,
+        y_train_all=automl._state.y_train_all,
+        budget=None,
+        kf=automl._state.kf,
+        eval_metric="r2",
+        best_val_loss=None,
+        cv_score_agg_func=None,
+        log_training_metric=False,
+        fit_kwargs=None,
+        free_mem_ratio=0,
+    )
+    assert pytest.approx(val_loss_flaml) == reproduced_val_loss or val_loss_flaml > reproduced_val_loss
+
+
 @pytest.mark.parametrize(
     "estimator",
     [
@@ -347,7 +393,7 @@ def test_reproducibility_of_catboost_regression_model():
         "extra_tree",
         "histgb",
         "kneighbor",
-        # "lgbm",
+        "lgbm",
         "rf",
         "xgboost",
         "xgb_limitdepth",
@@ -376,6 +422,7 @@ def test_reproducibility_of_underlying_regression_models(estimator: str):
         "metric": "r2",
         "keep_search_state": True,
         "skip_transform": True,
+        "retrain_full": False,
     }
     X, y = fetch_california_housing(return_X_y=True, as_frame=True)
     automl.fit(X_train=X, y_train=y, **automl_settings)