Add support for mlflow #77

Merged
merged 330 commits into main from feat/mlflow
Jan 27, 2025
Commits (330)
6fff3fc
install py39 on cirun runner
leifdenby Jun 3, 2024
74b4a10
cleanup: boundary_mask, zarr-opening, utils
sadamov Jun 4, 2024
0a041d1
Merge remote-tracking branch 'origin/main' into feature_dataset_yaml
sadamov Jun 4, 2024
8054e9e
change ami image to gpu
leifdenby Jun 4, 2024
39fbf3a
Merge remote-tracking branch 'upstream/main' into maint/deps-in-pypro…
leifdenby Jun 4, 2024
97aeb2e
use cheaper gpu instance
leifdenby Jun 4, 2024
425123c
adapted tests for zarr-analysis data
sadamov Jun 4, 2024
4dcf671
Readme adapted for yaml zarr analysis workflow
sadamov Jun 4, 2024
6d384f0
samller bugfixes and improvements
sadamov Jun 4, 2024
12ff4f2
Added fixed data config file for testing on Danra
sadamov Jun 4, 2024
03f7769
reducing runtime of tests with smaller sample
sadamov Jun 4, 2024
26f069c
download danra data for test and example (streaming not possible)
sadamov Jun 6, 2024
1f1cbcc
bugfixes after real-life testcase
sadamov Jun 6, 2024
b369306
Merge remote-tracking branch 'origin/main' into feature_dataset_yaml
sadamov Jun 6, 2024
0cdc361
organize .zarr in /data
sadamov Jun 6, 2024
23ca7b3
cleanup
sadamov Jun 6, 2024
81422f1
linter
sadamov Jun 6, 2024
124541b
static dataset doesn't have time dim
sadamov Jun 7, 2024
6140fdb
making two complex functions more modular
sadamov Jun 7, 2024
db6a912
chunk dataset by time
sadamov Jun 8, 2024
1aaa8dc
create list first for performance
sadamov Jun 8, 2024
81856b2
converting to_array is very slow
sadamov Jun 8, 2024
b3da818
allow for forcings to not be normalized
sadamov Jun 8, 2024
7ee5398
allow non_normalized_vars to be null
sadamov Jun 8, 2024
4782103
fixed coastlines using new xy_extent function
sadamov Jun 8, 2024
e0ffc5b
Some projections return inverted axes (rotatedPole)
sadamov Jun 9, 2024
c1f43b7
Docstrings added
sadamov Jun 13, 2024
21fd929
wip
leifdenby Jun 26, 2024
c52f98e
npy mllam nearly done
leifdenby Jul 6, 2024
80f3639
minor adjustment
leifdenby Jul 7, 2024
048f8c6
Merge branch 'main' of https://github.com/mllam/neural-lam into maint…
leifdenby Jul 11, 2024
5aaa239
add pooch and tweak pip cicd testing
leifdenby Jul 11, 2024
66c3b03
combine cicd tests with caching
leifdenby Jul 11, 2024
8566b8f
linting
leifdenby Jul 11, 2024
29bd9e5
add pyg dep
leifdenby Jul 11, 2024
bc7f028
set cirun aws region to frankfurt
leifdenby Jul 11, 2024
2070166
adapt image
leifdenby Jul 11, 2024
e4e86e5
set image
leifdenby Jul 11, 2024
1fba8fe
try different image
leifdenby Jul 11, 2024
02b77cf
add pooch to cicd
leifdenby Jul 11, 2024
b481929
add pdm gpu test
leifdenby Jul 16, 2024
bcec472
start work on readme
leifdenby Jul 16, 2024
c5beec9
Merge branch 'maint/deps-in-pyproject-toml' into datastore
leifdenby Jul 16, 2024
e89facc
Merge branch 'main' into maint/refactor-as-package
leifdenby Jul 16, 2024
0b5687a
Merge branch 'main' of https://github.com/mllam/neural-lam into maint…
leifdenby Jul 16, 2024
095fdbc
turn meps testdata download into pytest fixture
leifdenby Jul 16, 2024
49e9bfe
adapt README for package
leifdenby Jul 16, 2024
12cc02b
remove pdm cicd test (will be in separate PR)
leifdenby Jul 16, 2024
b47f50b
remove pdm in gitignore
leifdenby Jul 16, 2024
90d99ca
remove pdm and pyproject files (will be sep PR)
leifdenby Jul 16, 2024
a91eaaa
add pyproject.toml from main
leifdenby Jul 16, 2024
5508cea
clean out tests
leifdenby Jul 16, 2024
5c623c3
fix linting
leifdenby Jul 16, 2024
08ec168
add cli entrypoints import test
leifdenby Jul 16, 2024
d9cf7ba
Merge branch 'maint/refactor-as-package' into datastore
leifdenby Jul 16, 2024
3954f04
tweak cicd pytest execution
leifdenby Jul 16, 2024
f99fdce
Merge branch 'maint/refactor-as-package' into datastore
leifdenby Jul 16, 2024
db9d96f
Update tests/test_mllam_dataset.py
leifdenby Jul 17, 2024
3c864b2
grid-shape ok
leifdenby Jul 17, 2024
1f54b0e
get_vars_names and units
leifdenby Jul 17, 2024
9b88160
get_vars_names and units 2
leifdenby Jul 17, 2024
a9fdad5
test for stats
leifdenby Jul 23, 2024
555154f
get_dataarray test
leifdenby Jul 24, 2024
8b8a77e
get_dataarray test
leifdenby Jul 24, 2024
41f11cd
boundary_mask
leifdenby Jul 24, 2024
a17de0f
get_xy
leifdenby Jul 24, 2024
0a38a7d
remove TrainingSample dataclass
leifdenby Jul 24, 2024
f65f6b5
test for WeatherDataset.__getitem__
leifdenby Jul 24, 2024
a35100e
test for graph creation
leifdenby Jul 24, 2024
cfb0618
more graph creation tests
leifdenby Jul 24, 2024
8698719
check for consistency of num features across splits
leifdenby Jul 24, 2024
3381404
test for single batch from mllam through model
leifdenby Jul 24, 2024
2a6796c
Add init files to expose classes in editable package
joeloskarsson Jul 24, 2024
8f4e0e0
Linting
joeloskarsson Jul 24, 2024
e657abb
working training_step with datastores!
Jul 25, 2024
effc99b
remove superfluous tests
Jul 25, 2024
a047026
fix for dataset length
Jul 25, 2024
d2c62ed
step length should be int
Jul 25, 2024
58f5d99
step length should be int
Jul 25, 2024
64d43a6
training working with mllam datastore!
Jul 25, 2024
07444f8
adapt neural_lam.train_model for datastores
Jul 25, 2024
d1b6fc1
fixes for npy
Jul 25, 2024
6fe19ac
npyfiles datastore complete
leifdenby Jul 26, 2024
fe65a4d
cleanup for datastore examples
leifdenby Jul 26, 2024
e533794
training on ohm with danra!
Jul 26, 2024
640ac05
use mllam-data-prep v0.2.0
Aug 5, 2024
0f16f13
remove py3.12 from pre-commit
Aug 5, 2024
724548e
cleanup
Aug 8, 2024
a1b2037
all tests passing!
Aug 12, 2024
e35958f
use mllam-data-prep v0.3.0
Aug 12, 2024
8b92318
delete requirements.txt
Aug 13, 2024
658836a
remove .DS_Store
Aug 13, 2024
421efed
use tmate in gpu pdm cicd
Aug 13, 2024
05f1e9f
remove requirements
Aug 13, 2024
3afe0e4
update pdm gpu cicd setup to pdm venv on nvme drive
Aug 13, 2024
f3d028b
don't try to use pdm venv in-project
Aug 13, 2024
2c35662
remove tmate
Aug 13, 2024
5f30255
update README with install instructions
Aug 14, 2024
b2b5631
changelog
Aug 14, 2024
c8ae829
update ci/cd badges to include gpu + gpu
Aug 14, 2024
e7cf2c0
Merge pull request #1 from mllam/package_inits
leifdenby Aug 14, 2024
0b72e9d
add pyproject-flake8 to precommit config
Aug 14, 2024
190d1de
use Flake8-pyproject instead
Aug 14, 2024
791af0a
update README
Aug 14, 2024
58fab84
Merge branch 'maint/deps-in-pyproject-toml' into feat/datastores
Aug 14, 2024
dbe2e6d
Merge branch 'maint/refactor-as-package' into maint/deps-in-pyproject…
Aug 14, 2024
eac6e35
Merge branch 'maint/deps-in-pyproject-toml' into feat/datastores
Aug 14, 2024
799d55e
linting fixes
Aug 14, 2024
57bbb81
train only 1 epoch in cicd and print to stdout
Aug 14, 2024
a955cee
log datastore config
Aug 14, 2024
0a79c74
cleanup doctrings
Aug 15, 2024
9f3c014
Merge branch 'maint/refactor-as-package' into datastore
leifdenby Aug 19, 2024
41364a8
Merge branch 'main' of https://github.com/mllam/neural-lam into maint…
leifdenby Aug 19, 2024
3422298
update changelog
leifdenby Aug 19, 2024
689ef69
move dev deps optional dependencies group
leifdenby Aug 20, 2024
9a0d538
update cicd tests to install dev deps
leifdenby Aug 20, 2024
bddfcaf
update readme with new dev deps group
leifdenby Aug 20, 2024
b96cfdc
quote the skip step the install readme
leifdenby Aug 20, 2024
2600dee
remove unused files
leifdenby Aug 20, 2024
65a8074
Merge branch 'feat/datastores' of https://github.com/leifdenby/neural…
leifdenby Aug 20, 2024
6adf6cc
revert to line length of 80
leifdenby Aug 20, 2024
46b37f8
revert docstring formatting changes
leifdenby Aug 20, 2024
3cd0f8b
pin numpy to <2.0.0
leifdenby Aug 20, 2024
826270a
Merge branch 'maint/deps-in-pyproject-toml' into feat/datastores
leifdenby Aug 20, 2024
4ba22ea
Merge branch 'main' into feat/datastores
leifdenby Aug 20, 2024
1f661c6
fix flake8 linting errors
leifdenby Aug 20, 2024
4838872
Update neural_lam/weather_dataset.py
leifdenby Sep 8, 2024
b59e7e5
Update neural_lam/datastore/multizarr/create_normalization_stats.py
leifdenby Sep 8, 2024
75b1fe7
Update neural_lam/datastore/npyfiles/store.py
leifdenby Sep 8, 2024
7e736cb
Update neural_lam/datastore/npyfiles/store.py
leifdenby Sep 8, 2024
613a7e2
Update neural_lam/datastore/npyfiles/store.py
leifdenby Sep 8, 2024
65e199b
Update tests/test_training.py
leifdenby Sep 8, 2024
4435e26
Update tests/test_datasets.py
leifdenby Sep 8, 2024
4693408
Update README.md
leifdenby Sep 8, 2024
2dfed2c
update README
leifdenby Sep 10, 2024
c3d033d
Merge branch 'main' of https://github.com/mllam/neural-lam into feat/…
leifdenby Sep 10, 2024
4a70268
Merge branch 'feat/datastores' of https://github.com/leifdenby/neural…
leifdenby Sep 10, 2024
66c663f
column_water -> open_water_fraction
leifdenby Sep 10, 2024
11a7978
fix linting
leifdenby Sep 10, 2024
a41c314
static data same for all splits
leifdenby Sep 10, 2024
6f1efd6
forcing_window_size from args
leifdenby Sep 10, 2024
bacb9ec
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
4a9db4e
only use first ensemble member in datastores
leifdenby Sep 10, 2024
4fc2448
Merge branch 'feat/datastores' of https://github.com/leifdenby/neural…
leifdenby Sep 10, 2024
bcaa919
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
90bc594
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
5bda935
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
8e7931d
remove all multizarr functionality
leifdenby Sep 10, 2024
6998683
cleanup and test fixes for recent changes
leifdenby Sep 10, 2024
c415008
Merge branch 'feat/datastores' of https://github.com/leifdenby/neural…
leifdenby Sep 10, 2024
735d324
fix linting
leifdenby Sep 10, 2024
5f2d919
remove multizar example files
leifdenby Sep 10, 2024
5263d2c
normalization -> standardization
leifdenby Sep 10, 2024
ba1bec3
fix import for tests
leifdenby Sep 10, 2024
d04d15e
Update neural_lam/datastore/base.py
leifdenby Sep 10, 2024
743d7a1
fix coord issues and add datastore example plotting cli
leifdenby Sep 12, 2024
ac10d7d
add lru_cache to get_xy_extent
leifdenby Sep 12, 2024
bf8172a
MLLAMDatastore -> MDPDatastore
leifdenby Sep 12, 2024
90ca400
missed renames for MDPDatastore
leifdenby Sep 12, 2024
154139d
update graph plot for datastores
leifdenby Sep 12, 2024
50ee0b0
use relative import
leifdenby Sep 12, 2024
7dfd570
add long_names and refactor npyfiles create weights
leifdenby Sep 12, 2024
2b45b5a
Update neural_lam/weather_dataset.py
leifdenby Sep 23, 2024
aee0b1c
Update neural_lam/weather_dataset.py
leifdenby Sep 23, 2024
8453c2b
Update neural_lam/models/ar_model.py
leifdenby Sep 27, 2024
7f32557
Update neural_lam/weather_dataset.py
leifdenby Sep 27, 2024
67998b8
read projection from datastore config extra section
leifdenby Sep 27, 2024
ac7e46a
NpyFilesDatastore -> NpyFilesDatastoreMEPS
leifdenby Sep 27, 2024
b7bf506
revert tp training with 1 AR step by default
leifdenby Sep 27, 2024
5df2ecf
add missing kwarg to BaseHiGraphModel.__init__
leifdenby Sep 27, 2024
d4d438f
add missing kwarg to HiLAM.__init__
leifdenby Sep 27, 2024
1889771
add missing kwarg to HiLAMParallel
leifdenby Sep 27, 2024
2c3bbde
check that for enough forecast steps given ar_steps
leifdenby Sep 27, 2024
f0a151b
remove numpy<2.0.0 version cap
leifdenby Sep 27, 2024
f3566b0
tweak print statement working in mdp
Oct 1, 2024
dba94b3
fix missed removed argument from cli
Oct 1, 2024
bca1482
remove wandb config log comment, we log now
Oct 1, 2024
fc973c4
ensure loading from checkpoint during train possible
Oct 1, 2024
9fcf06e
get step_length from datastore in plot_error_map
leifdenby Oct 1, 2024
2bbe666
remove step_legnth attr in ARModel
leifdenby Oct 1, 2024
b41ed2f
remove unused obs_mask arg for vis.plot_prediction
leifdenby Oct 1, 2024
7e46194
ensure no reference to multizarr "data_config"
leifdenby Oct 1, 2024
b57bc7a
introduce neural-lam config
leifdenby Oct 2, 2024
2b30715
include meps neural-lam config example
leifdenby Oct 2, 2024
8e7b2e6
fix extra space typo in BaseDatastore
leifdenby Oct 2, 2024
e0300fb
add check and print of train/test/val split in MDPDatastore
leifdenby Oct 2, 2024
a921e35
add experimental mlflow server support
leifdenby Oct 2, 2024
0f30259
more fixes for mlflow logging support
leifdenby Oct 3, 2024
3fbe2d0
Make wandb work again with pytorch_lightning.logger
khintz Oct 3, 2024
e0284a8
upload of artifact to mlflow works, but instantiates a new experiment
khintz Oct 4, 2024
7eed79b
make mlflow use same experiment run id as pl.logger.MLFlowLogger
khintz Oct 7, 2024
27408f2
logger artifact working for both wandb and mlflow
khintz Oct 7, 2024
e61a9e7
support mlflow system metrics logging
khintz Oct 7, 2024
b53bab5
support model logging for mlflow
khintz Oct 7, 2024
de27e9a
log model
khintz Oct 7, 2024
89d8cde
test system metrics
khintz Nov 13, 2024
54c7ca7
make mlflow work also for eval mode
khintz Nov 15, 2024
a47de0c
dummy prints to identify workflow
khintz Nov 21, 2024
10a4494
update mlflow on eval mode
khintz Nov 21, 2024
427a4b1
Merge branch 'main' into feat/mlflow
khintz Nov 21, 2024
78e874d
inspect plot routines
khintz Nov 25, 2024
5904cbe
identified issue, cleanup next
leifdenby Nov 25, 2024
efe0302
use xarray plot only
leifdenby Nov 26, 2024
a489c2e
don't reraise
leifdenby Nov 26, 2024
242d08b
remove debug plot
leifdenby Nov 26, 2024
c1f706c
remove extent calc used in diagnosing issue
leifdenby Nov 26, 2024
88ec9dc
Test order of dimension in eval plots
khintz Nov 28, 2024
d367cdb
Merge branch 'fix/eval-vis-plots' into feat/mlflow
khintz Nov 28, 2024
90f8918
fix tensors on cpu and plot time index
khintz Nov 28, 2024
53f0ea4
restore tests/test_datasets.py
khintz Nov 29, 2024
cfc249f
cleaning up with focus on linting
khintz Nov 29, 2024
b218c8b
update tests
khintz Nov 29, 2024
1f1aed8
use correct data module for input example
khintz Dec 4, 2024
f3abd47
Merge branch 'main' into feat/mlflow
khintz Dec 4, 2024
47932b5
clean log model function
khintz Dec 9, 2024
98dc5c4
Merge branch 'main' into feat/mlflow
khintz Dec 9, 2024
64971ae
revert bad merge
khintz Dec 9, 2024
010f716
remove unused init for datastore
khintz Dec 9, 2024
2620bd1
set logger url
khintz Dec 9, 2024
75a39e6
change type of default logger_url in config
khintz Dec 9, 2024
9d27a4c
linting
khintz Dec 9, 2024
8f42cd1
fix log_image issue in tests
khintz Dec 9, 2024
b5ebe6f
add entry to changelog
khintz Dec 9, 2024
ae69f3f
remove artifacts from earlier merging/rebase
khintz Dec 11, 2024
6e16035
catch error when aws credentials not set
khintz Dec 11, 2024
821443a
remove functions log_model and create_input_example
khintz Jan 7, 2025
d503048
Elaborate warning when no logger is set
khintz Jan 7, 2025
c80d36f
add pynvml to pyproject to allow mlflow to log gpustats
khintz Jan 21, 2025
538c26d
move logger from config to command line arguments
khintz Jan 21, 2025
30cb31a
change logger_project command line argument
khintz Jan 21, 2025
d165fcd
adjust tests after moving loggers to cli
khintz Jan 21, 2025
e1dac04
add choices to logger argument
khintz Jan 21, 2025
5fe9f84
remove unused command line arguments
khintz Jan 21, 2025
97e7bb3
Merge branch 'main' into feat/mlflow
khintz Jan 21, 2025
7a90c57
correct changelog after merging in main
khintz Jan 21, 2025
2ca3959
restore defaults for pl.Trainer
khintz Jan 21, 2025
757b502
use MLFLOW_TRACKING_URI environment variable instead of command-line
khintz Jan 21, 2025
05e1be0
Update README on MLFlow
khintz Jan 21, 2025
f7f90c4
init logger on rank0 and use run_name for mlflow
khintz Jan 21, 2025
3187dc3
move custom_logger to its own file
khintz Jan 22, 2025
d88c23c
correct typo in custom_loggers
khintz Jan 22, 2025
d54e3d8
Merge branch 'main' into feat/mlflow
khintz Jan 23, 2025
90b48ff
improve code to pass linting
khintz Jan 23, 2025
d5c5dcc
Apply suggestions from code review
khintz Jan 27, 2025
1c0d121
Satisfy linting after review
khintz Jan 27, 2025
42dafa1
elaborate docstring for save_dir
khintz Jan 27, 2025
1b9b7f0
remove unnecesary change
khintz Jan 27, 2025
f743890
add inline comment for logger handling
khintz Jan 27, 2025
2d2aa23
warn if logger does not support image logging
khintz Jan 27, 2025
d2ffefc
expand on inline comment on logger key
khintz Jan 27, 2025
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [unreleased](https://github.com/mllam/neural-lam/compare/v0.3.0...HEAD)

 ### Added
+
+- Add support for MLFlow logging and metrics tracking. [\#77](https://github.com/mllam/neural-lam/pull/77)
+  @khintz
+
 - Add support for multi-node training.
   [\#103](https://github.com/mllam/neural-lam/pull/103) @simonkamuk @sadamov
11 changes: 10 additions & 1 deletion README.md
@@ -382,7 +382,9 @@ The graphs used for the different models in the [paper](#graph-based-neural-weat

 The graph-related files are stored in a directory called `graphs`.

-## Weights & Biases Integration
+## Logging your experiments
+
+### Weights & Biases Integration
 The project is fully integrated with [Weights & Biases](https://www.wandb.ai/) (W&B) for logging and visualization, but can just as easily be used without it.
 When W&B is used, training configuration, training/test statistics and plots are sent to the W&B servers and made available in an interactive web interface.
 If W&B is turned off, logging instead saves everything locally to a directory like `wandb/dryrun...`.
@@ -398,6 +400,13 @@ If you would like to turn off W&B and just log things locally, run:
 wandb off
 ```

+### MLFlow Integration
+The project is also integrated with [MLFlow](https://mlflow.org/) for logging and storing artefacts.
+
+MLFlow is not used by default, but can be enabled by passing `--logger mlflow` to the training command. With MLFlow enabled, training configuration, training/test statistics and plots are logged to the MLFlow server. MLFlow is self-hosted and can be run locally or on a server. See the [MLFlow documentation](https://mlflow.org/docs/latest/index.html) for details.
+
+Use the environment variable `MLFLOW_TRACKING_URI` to set the URI of the MLFlow server; if it is not set, MLFlow logging cannot be used. For example, to point at a local server and start training: `MLFLOW_TRACKING_URI=http://localhost:5000 python -m neural_lam.train_model --config_path <config_path> --logger mlflow`.
+
 ## Train Models
 Models can be trained using `python -m neural_lam.train_model --config_path <config_path>`.
 Run `python -m neural_lam.train_model --help` for a full list of training options.
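Putting the two pieces above together, a minimal end-to-end invocation might look like the following (a sketch assuming `mlflow` is installed; the server command and port here are illustrative, matching the localhost example in the README text):

```bash
# Start a local MLFlow tracking server
mlflow server --host 127.0.0.1 --port 5000

# In another shell: point neural-lam at the server and train with MLFlow logging
export MLFLOW_TRACKING_URI=http://localhost:5000
python -m neural_lam.train_model --config_path <config_path> --logger mlflow
```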
68 changes: 68 additions & 0 deletions neural_lam/custom_loggers.py
@@ -0,0 +1,68 @@
# Standard library
import sys

# Third-party
import mlflow
import mlflow.pytorch
import pytorch_lightning as pl
from loguru import logger


class CustomMLFlowLogger(pl.loggers.MLFlowLogger):
    """
    Custom MLFlow logger that adds the `log_image()` functionality not
    present in the default implementation from pytorch-lightning as
    of version `2.0.3` at least.
    """

    def __init__(self, experiment_name, tracking_uri, run_name):
        super().__init__(
            experiment_name=experiment_name, tracking_uri=tracking_uri
        )

        mlflow.start_run(run_id=self.run_id, log_system_metrics=True)
        mlflow.set_tag("mlflow.runName", run_name)
        mlflow.log_param("run_id", self.run_id)

    @property
    def save_dir(self):
        """
        Returns the directory where the MLFlow artifacts are saved.
        Used to define the path to save output when using the logger.

        Returns
        -------
        str
            Path to the directory where the artifacts are saved.
        """
        return "mlruns"

    def log_image(self, key, images, step=None):
        """
        Log a matplotlib figure as an image to MLFlow

        key: str
            Key to log the image under
        images: list
            List of matplotlib figures to log
        step: Union[int, None]
            Step to log the image under. If None, logs under the key directly
        """
        # Third-party
        import botocore
        from PIL import Image

        if step is not None:
            key = f"{key}_{step}"

        # Need to save the image to a temporary file, then log that file.
        # mlflow.log_image should do this automatically, but is buggy.
        temporary_image = f"{key}.png"
        images[0].savefig(temporary_image)

        img = Image.open(temporary_image)
        try:
            mlflow.log_image(img, f"{key}.png")
        except botocore.exceptions.NoCredentialsError:
            logger.error("Error logging image\nSet AWS credentials")
            sys.exit(1)
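As an aside, a minimal usage sketch of this class (the experiment and run names below are hypothetical, and an MLFlow tracking server is assumed to be reachable at the given URI):

```python
# Illustrative only: construct the logger and hand it to a Trainer.
# Assumes an MLFlow tracking server at http://localhost:5000 and that
# MLFLOW_TRACKING_URI is also exported, since the class additionally
# calls the global mlflow API (mlflow.start_run etc.).
import pytorch_lightning as pl

from neural_lam.custom_loggers import CustomMLFlowLogger

training_logger = CustomMLFlowLogger(
    experiment_name="neural_lam",  # hypothetical experiment name
    tracking_uri="http://localhost:5000",
    run_name="graph_lam-example-run",  # shown as the mlflow.runName tag
)

trainer = pl.Trainer(max_epochs=1, logger=training_logger)
```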
77 changes: 58 additions & 19 deletions neural_lam/models/ar_model.py
@@ -1,13 +1,13 @@
 # Standard library
 import os
+import warnings
 from typing import List, Union

 # Third-party
 import matplotlib.pyplot as plt
 import numpy as np
 import pytorch_lightning as pl
 import torch
-import wandb
 import xarray as xr

 # Local
@@ -539,14 +539,26 @@ def plot_examples(self, batch, n_examples, split, prediction=None):

             example_i = self.plotted_examples

-            wandb.log(
-                {
-                    f"{var_name}_example_{example_i}": wandb.Image(fig)
-                    for var_name, fig in zip(
-                        self._datastore.get_vars_names("state"), var_figs
-                    )
-                }
-            )
+            for var_name, fig in zip(
+                self._datastore.get_vars_names("state"), var_figs
+            ):
+                # We need to treat logging images differently for
+                # different loggers. W&B can log multiple images to the
+                # same key, while other loggers, such as MLFlow, need
+                # unique keys for each image.
+                if isinstance(self.logger, pl.loggers.WandbLogger):
+                    key = f"{var_name}_example_{example_i}"
+                else:
+                    key = f"{var_name}_example"
+
+                if hasattr(self.logger, "log_image"):
+                    self.logger.log_image(key=key, images=[fig], step=t_i)
+                else:
+                    warnings.warn(
+                        f"{self.logger} does not support image logging."
+                    )

             plt.close(
                 "all"
             )  # Close all figs for this time step, saves memory
@@ -555,13 +567,15 @@ def plot_examples(self, batch, n_examples, split, prediction=None):
             torch.save(
                 pred_slice.cpu(),
                 os.path.join(
-                    wandb.run.dir, f"example_pred_{self.plotted_examples}.pt"
+                    self.logger.save_dir,
+                    f"example_pred_{self.plotted_examples}.pt",
                 ),
             )
             torch.save(
                 target_slice.cpu(),
                 os.path.join(
-                    wandb.run.dir, f"example_target_{self.plotted_examples}.pt"
+                    self.logger.save_dir,
+                    f"example_target_{self.plotted_examples}.pt",
                 ),
             )

@@ -582,16 +596,16 @@ def create_metric_log_dict(self, metric_tensor, prefix, metric_name):
             datastore=self._datastore,
         )
         full_log_name = f"{prefix}_{metric_name}"
-        log_dict[full_log_name] = wandb.Image(metric_fig)
+        log_dict[full_log_name] = metric_fig

         if prefix == "test":
             # Save pdf
             metric_fig.savefig(
-                os.path.join(wandb.run.dir, f"{full_log_name}.pdf")
+                os.path.join(self.logger.save_dir, f"{full_log_name}.pdf")
             )
             # Save errors also as csv
             np.savetxt(
-                os.path.join(wandb.run.dir, f"{full_log_name}.csv"),
+                os.path.join(self.logger.save_dir, f"{full_log_name}.csv"),
                 metric_tensor.cpu().numpy(),
                 delimiter=",",
             )
@@ -639,8 +653,27 @@ def aggregate_and_plot_metrics(self, metrics_dict, prefix):
                 )
             )

+        # Ensure that log_dict has structure for
+        # logging as dict(str, plt.Figure)
+        assert all(
+            isinstance(key, str) and isinstance(value, plt.Figure)
+            for key, value in log_dict.items()
+        )
+
         if self.trainer.is_global_zero and not self.trainer.sanity_checking:
-            wandb.log(log_dict)  # Log all
+
+            current_epoch = self.trainer.current_epoch
+
+            for key, figure in log_dict.items():
+                # For loggers other than wandb, add the epoch to the key.
+                # Wandb can log multiple images to the same key, while other
+                # loggers, such as MLFlow, need unique keys for each image.
+                if not isinstance(self.logger, pl.loggers.WandbLogger):
+                    key = f"{key}-{current_epoch}"
+
+                if hasattr(self.logger, "log_image"):
+                    self.logger.log_image(key=key, images=[figure])

         plt.close("all")  # Close all figs

     def on_test_epoch_end(self):
@@ -672,9 +705,13 @@ def on_test_epoch_end(self):
             )
         ]

-        # log all to same wandb key, sequentially
-        for fig in loss_map_figs:
-            wandb.log({"test_loss": wandb.Image(fig)})
+        # log all to the same key, sequentially
+        for i, fig in enumerate(loss_map_figs):
+            key = "test_loss"
+            if not isinstance(self.logger, pl.loggers.WandbLogger):
+                key = f"{key}_{i}"
+            if hasattr(self.logger, "log_image"):
+                self.logger.log_image(key=key, images=[fig])

         # also make without title and save as pdf
         pdf_loss_map_figs = [
@@ -683,14 +720,16 @@ def on_test_epoch_end(self):
             )
             for loss_map in mean_spatial_loss
         ]
-        pdf_loss_maps_dir = os.path.join(wandb.run.dir, "spatial_loss_maps")
+        pdf_loss_maps_dir = os.path.join(
+            self.logger.save_dir, "spatial_loss_maps"
+        )
         os.makedirs(pdf_loss_maps_dir, exist_ok=True)
         for t_i, fig in zip(self.args.val_steps_to_log, pdf_loss_map_figs):
             fig.savefig(os.path.join(pdf_loss_maps_dir, f"loss_t{t_i}.pdf"))
         # save mean spatial loss as .pt file also
         torch.save(
             mean_spatial_loss.cpu(),
-            os.path.join(wandb.run.dir, "mean_spatial_loss.pt"),
+            os.path.join(self.logger.save_dir, "mean_spatial_loss.pt"),
         )

         self.spatial_loss_maps.clear()
36 changes: 24 additions & 12 deletions neural_lam/train_model.py
@@ -5,6 +5,7 @@
 from argparse import ArgumentParser

 # Third-party
+# for logging the model:
 import pytorch_lightning as pl
 import torch
 from lightning_fabric.utilities import seed
@@ -182,10 +183,17 @@

     # Logger Settings
     parser.add_argument(
-        "--wandb_project",
+        "--logger",
         type=str,
+        default="wandb",
+        choices=["wandb", "mlflow"],
+        help="Logger to use for training (wandb/mlflow) (default: wandb)",
+    )
+    parser.add_argument(
+        "--logger-project",
+        type=str,
         default="neural_lam",
-        help="Wandb project name (default: neural_lam)",
+        help="Logger project name, e.g. for Wandb (default: neural_lam)",
     )
     parser.add_argument(
         "--val_steps_to_log",
@@ -286,26 +294,26 @@
         f"{prefix}{args.model}-{args.processor_layers}x{args.hidden_dim}-"
         f"{time.strftime('%m_%d_%H')}-{random_run_id:04d}"
     )
+
+    training_logger = utils.setup_training_logger(
+        datastore=datastore, args=args, run_name=run_name
+    )
+
     checkpoint_callback = pl.callbacks.ModelCheckpoint(
         dirpath=f"saved_models/{run_name}",
         filename="min_val_loss",
         monitor="val_mean_loss",
         mode="min",
         save_last=True,
     )
-    logger = pl.loggers.WandbLogger(
-        project=args.wandb_project,
-        name=run_name,
-        config=dict(training=vars(args), datastore=datastore._config),
-    )
     trainer = pl.Trainer(
         max_epochs=args.epochs,
         deterministic=True,
         strategy="ddp",
         accelerator=device_name,
         num_nodes=args.num_nodes,
         devices=devices,
-        logger=logger,
+        logger=training_logger,
         log_every_n_steps=1,
         callbacks=[checkpoint_callback],
         check_val_every_n_epoch=args.val_interval,
@@ -314,11 +322,15 @@

     # Only init once, on rank 0 only
     if trainer.global_rank == 0:
-        utils.init_wandb_metrics(
-            logger, val_steps=args.val_steps_to_log
-        )  # Do after wandb.init
+        utils.init_training_logger_metrics(
+            training_logger, val_steps=args.val_steps_to_log
+        )  # Do after initializing logger
     if args.eval:
-        trainer.test(model=model, datamodule=data_module, ckpt_path=args.load)
+        trainer.test(
+            model=model,
+            datamodule=data_module,
+            ckpt_path=args.load,
+        )
     else:
         trainer.fit(model=model, datamodule=data_module, ckpt_path=args.load)
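The `utils.setup_training_logger` helper called above is not part of the files shown in this diff; a plausible sketch of its dispatch logic, inferred from the CLI arguments, the README, and `CustomMLFlowLogger` in this PR (illustrative only, not the actual implementation):

```python
# Sketch only -- the real helper lives in neural_lam/utils.py and may differ.
import os

import pytorch_lightning as pl

from .custom_loggers import CustomMLFlowLogger


def setup_training_logger(datastore, args, run_name):
    """Create the experiment logger selected via --logger."""
    if args.logger == "wandb":
        logger = pl.loggers.WandbLogger(
            project=args.logger_project,
            name=run_name,
            config=dict(training=vars(args), datastore=datastore._config),
        )
    elif args.logger == "mlflow":
        # MLFLOW_TRACKING_URI must point at a running tracking server
        url = os.environ.get("MLFLOW_TRACKING_URI")
        if url is None:
            raise ValueError("MLFLOW_TRACKING_URI is not set")
        logger = CustomMLFlowLogger(
            experiment_name=args.logger_project,
            tracking_uri=url,
            run_name=run_name,
        )
    return logger
```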