From d2e26a9792a934407c0b11015b4c7c5694a8d9d3 Mon Sep 17 00:00:00 2001
From: SimonKamuk <43374850+SimonKamuk@users.noreply.github.com>
Date: Thu, 23 Jan 2025 09:05:20 +0100
Subject: [PATCH] Add multi-node-training (#103)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Describe your changes

This PR adds support for multi-node GPU training using the SLURM job scheduler. The changes allow setting the number of nodes with the CLI argument `num_nodes`. It is also possible to select a subset of visible GPUs using the argument `devices` (only when not using SLURM).

Replaces https://github.com/mllam/neural-lam/pull/26 with a simpler method based on advice from @sadamov

## Type of change

- [ ] 🐛 Bug fix (non-breaking change that fixes an issue)
- [x] ✨ New feature (non-breaking change that adds functionality)
- [ ] 💥 Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] 📖 Documentation (Addition or improvements to documentation)

## Checklist before requesting a review

- [x] My branch is up-to-date with the target branch - if not update your fork with the changes from the target branch (use `pull` with `--rebase` option if possible).
- [x] I have performed a self-review of my code
- [x] For any new/modified functions/classes I have added docstrings that clearly describe its purpose, expected inputs and returned values
- [x] I have placed in-line comments to clarify the intent of any hard-to-understand passages of my code
- [x] I have updated the [README](README.MD) to cover introduced code changes
- [ ] I have added tests that prove my fix is effective or that my feature works
- [x] I have given the PR a name that clearly describes the change, written in imperative form ([context](https://www.gitkraken.com/learn/git/best-practices/git-commit-message#using-imperative-verb-form)).
- [x] I have requested a reviewer and an assignee (assignee is responsible for merging). This applies only if you have write access to the repo, otherwise feel free to tag a maintainer to add a reviewer and assignee.

## Checklist for reviewers

Each PR comes with its own improvements and flaws. The reviewer should check the following:
- [x] the code is readable
- [ ] the code is well tested
- [x] the code is documented (including return types and parameters)
- [x] the code is easy to maintain

## Author checklist after completed review

- [ ] I have added a line to the CHANGELOG describing this change, in a section reflecting type of change (add section where missing):
  - *added*: when you have added new functionality
  - *changed*: when default behaviour of the code has been changed
  - *fixes*: when your contribution fixes a bug

## Checklist for assignee

- [ ] PR is up to date with the base branch
- [ ] the tests pass
- [ ] author has added an entry to the changelog (and designated the change as *added*, *changed* or *fixed*)
- Once the PR is ready to be merged, squash commits and merge the PR.
---------

Co-authored-by: Simon Kamuk Christiansen
---
 CHANGELOG.md                             |  8 +++++++
 README.md                                | 30 ++++++++++++++++++++++++
 neural_lam/datastore/mdp.py              |  9 +++----
 neural_lam/models/base_graph_model.py    |  2 +-
 neural_lam/models/base_hi_graph_model.py | 10 ++++----
 neural_lam/models/graph_lam.py           |  2 +-
 neural_lam/train_model.py                | 27 +++++++++++++++++++++
 neural_lam/utils.py                      |  7 ++++++
 8 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44c0be1d..f413c970 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [unreleased](https://github.com/mllam/neural-lam/compare/v0.3.0...HEAD)
 
+### Added
+- Add support for multi-node training.
+[\#103](https://github.com/mllam/neural-lam/pull/103) @simonkamuk @sadamov
+
+### Fixed
+- Only print on rank 0 to avoid duplicates of all print statements.
+[\#103](https://github.com/mllam/neural-lam/pull/103) @simonkamuk @sadamov
+
 ## [v0.3.0](https://github.com/mllam/neural-lam/releases/tag/v0.3.0)
 
 This release introduces Datastores to represent input data from different sources (including zarr and numpy) while keeping graph generation within neural-lam.
diff --git a/README.md b/README.md
index 7c7cd3a1..801edd73 100644
--- a/README.md
+++ b/README.md
@@ -448,6 +448,36 @@ python -m neural_lam.train_model --model hi_lam_parallel --graph hierarchical ...
 
 Checkpoint files for our models trained on the MEPS data are available upon request.
 
+### High Performance Computing
+
+The training script can be run on a cluster with multiple GPU nodes. Neural LAM is set up to use PyTorch Lightning's `DDP` backend for distributed training.
+The code can be used on systems both with and without SLURM. If the cluster has multiple nodes, set the `--num_nodes` argument accordingly.
+
+Using SLURM, the job can be submitted with `sbatch slurm_job.sh`, using a shell script like the following.
+```
+#!/bin/bash -l
+#SBATCH --job-name=Neural-LAM
+#SBATCH --time=24:00:00
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --partition=normal
+#SBATCH --mem=444G
+#SBATCH --no-requeue
+#SBATCH --exclusive
+#SBATCH --output=lightning_logs/neurallam_out_%j.log
+#SBATCH --error=lightning_logs/neurallam_err_%j.log
+
+# Load necessary modules or activate environment, for example:
+conda activate neural-lam
+
+srun -ul python -m neural_lam.train_model \
+    --config_path /path/to/config.yaml \
+    --num_nodes $SLURM_JOB_NUM_NODES
+```
+
+On a system without SLURM, where all GPUs are visible, a subset of GPUs can be selected for training with the `--devices` CLI argument, e.g. `--devices 0 1` to use the first two GPUs.
+
 ## Evaluate Models
 
 Evaluation is also done using `python -m neural_lam.train_model --config_path `, but using the `--eval` option. Use `--eval val` to evaluate the model on the validation set and `--eval test` to evaluate on test data.
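To make the documented behaviour concrete, here is a minimal, self-contained sketch (illustrative only, not the project's actual `train_model.py`; only the `--num_nodes` and `--devices` argument names are taken from this PR) of how such CLI flags can be passed through to a PyTorch Lightning `Trainer`:

```python
# Minimal sketch: wire --num_nodes/--devices into a Lightning Trainer.
# Illustrative only; neural-lam's train_model.py handles many more options.
import argparse

import pytorch_lightning as pl

parser = argparse.ArgumentParser()
parser.add_argument("--num_nodes", type=int, default=1)
parser.add_argument("--devices", nargs="+", type=str, default=["auto"])
args = parser.parse_args()

# "auto" lets Lightning pick all visible accelerators;
# otherwise restrict training to the listed device ids.
devices = "auto" if args.devices == ["auto"] else [int(i) for i in args.devices]

trainer = pl.Trainer(
    strategy="ddp",
    accelerator="auto",
    num_nodes=args.num_nodes,  # e.g. $SLURM_JOB_NUM_NODES when launched via srun
    devices=devices,  # "auto", or e.g. [0, 1] from "--devices 0 1"
)
print(f"Configured Trainer with num_nodes={args.num_nodes}, devices={devices}")
```

On a workstation this could be run as e.g. `python sketch.py --devices 0 1`; under SLURM the same kind of script is instead started once per task via `srun`, as in the sbatch example above.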
diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py
index 0d1aac7b..2898df80 100644
--- a/neural_lam/datastore/mdp.py
+++ b/neural_lam/datastore/mdp.py
@@ -13,6 +13,7 @@
 from numpy import ndarray
 
 # Local
+from ..utils import rank_zero_print
 from .base import BaseRegularGridDatastore, CartesianGridShape
 
 
@@ -72,11 +73,11 @@ def __init__(self, config_path, n_boundary_points=30, reuse_existing=True):
             self._ds.to_zarr(fp_ds)
         self._n_boundary_points = n_boundary_points
-        print("The loaded datastore contains the following features:")
+        rank_zero_print("The loaded datastore contains the following features:")
         for category in ["state", "forcing", "static"]:
             if len(self.get_vars_names(category)) > 0:
                 var_names = self.get_vars_names(category)
-                print(f" {category:<8s}: {' '.join(var_names)}")
+                rank_zero_print(f" {category:<8s}: {' '.join(var_names)}")
         # check that all three train/val/test splits are available
         required_splits = ["train", "val", "test"]
@@ -87,12 +88,12 @@ def __init__(self, config_path, n_boundary_points=30, reuse_existing=True):
                 f"splits: {available_splits}"
             )
 
-        print("With the following splits (over time):")
+        rank_zero_print("With the following splits (over time):")
         for split in required_splits:
             da_split = self._ds.splits.sel(split_name=split)
             da_split_start = da_split.sel(split_part="start").load().item()
             da_split_end = da_split.sel(split_part="end").load().item()
-            print(f" {split:<8s}: {da_split_start} to {da_split_end}")
+            rank_zero_print(f" {split:<8s}: {da_split_start} to {da_split_end}")
 
         # find out the dimension order for the stacking to grid-index
         dim_order = None
diff --git a/neural_lam/models/base_graph_model.py b/neural_lam/models/base_graph_model.py
index 6233b4d1..728a9489 100644
--- a/neural_lam/models/base_graph_model.py
+++ b/neural_lam/models/base_graph_model.py
@@ -34,7 +34,7 @@ def __init__(self, args, config: NeuralLAMConfig, datastore: BaseDatastore):
 
         # Specify dimensions of data
         self.num_mesh_nodes, _ = self.get_num_mesh()
-        print(
+        utils.rank_zero_print(
             f"Loaded graph with {self.num_grid_nodes + self.num_mesh_nodes} "
             f"nodes ({self.num_grid_nodes} grid, {self.num_mesh_nodes} mesh)"
         )
diff --git a/neural_lam/models/base_hi_graph_model.py b/neural_lam/models/base_hi_graph_model.py
index 8ec46b4f..13ffcdc2 100644
--- a/neural_lam/models/base_hi_graph_model.py
+++ b/neural_lam/models/base_hi_graph_model.py
@@ -27,10 +27,10 @@ def __init__(self, args, config: NeuralLAMConfig, datastore: BaseDatastore):
         ]  # Needs as python list for later
 
         # Print some useful info
-        print("Loaded hierarchical graph with structure:")
+        utils.rank_zero_print("Loaded hierarchical graph with structure:")
         for level_index, level_mesh_size in enumerate(self.level_mesh_sizes):
             same_level_edges = self.m2m_features[level_index].shape[0]
-            print(
+            utils.rank_zero_print(
                 f"level {level_index} - {level_mesh_size} nodes, "
                 f"{same_level_edges} same-level edges"
             )
@@ -38,8 +38,10 @@ def __init__(self, args, config: NeuralLAMConfig, datastore: BaseDatastore):
             if level_index < (self.num_levels - 1):
                 up_edges = self.mesh_up_features[level_index].shape[0]
                 down_edges = self.mesh_down_features[level_index].shape[0]
-                print(f" {level_index}<->{level_index + 1}")
-                print(f" - {up_edges} up edges, {down_edges} down edges")
+                utils.rank_zero_print(f" {level_index}<->{level_index + 1}")
+                utils.rank_zero_print(
+                    f" - {up_edges} up edges, {down_edges} down edges"
+                )
         # Embedders
         # Assume all levels have same static feature dimensionality
         mesh_dim = self.mesh_static_features[0].shape[1]
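The switch from `print` to `rank_zero_print` relies on Lightning's `rank_zero_only` decorator running the wrapped function only on the process whose rank is 0. A standalone sketch of that behaviour (assuming the documented semantics of `rank_zero_only`; setting `.rank` by hand below only mimics what DDP normally sets from the environment):

```python
# Sketch: rank_zero_only suppresses the call on every process except rank 0,
# so each DDP worker can call rank_zero_print but the message appears once.
from pytorch_lightning.utilities import rank_zero_only


@rank_zero_only
def rank_zero_print(*args, **kwargs):
    """Print only from the rank 0 process."""
    print(*args, **kwargs)


rank_zero_only.rank = 1  # pretend to be a non-zero-rank worker
rank_zero_print("hidden")  # produces no output

rank_zero_only.rank = 0  # pretend to be the rank 0 worker
rank_zero_print("shown once")  # prints
```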
diff --git a/neural_lam/models/graph_lam.py b/neural_lam/models/graph_lam.py
index 68b7d01e..0a5b6b57 100644
--- a/neural_lam/models/graph_lam.py
+++ b/neural_lam/models/graph_lam.py
@@ -27,7 +27,7 @@ def __init__(self, args, config: NeuralLAMConfig, datastore: BaseDatastore):
         # grid_dim from data + static + batch_static
         mesh_dim = self.mesh_static_features.shape[1]
         m2m_edges, m2m_dim = self.m2m_features.shape
-        print(
+        utils.rank_zero_print(
             f"Edges in subgraphs: m2m={m2m_edges}, g2m={self.g2m_edges}, "
             f"m2g={self.m2g_edges}"
         )
diff --git a/neural_lam/train_model.py b/neural_lam/train_model.py
index 74146c89..6894dd0d 100644
--- a/neural_lam/train_model.py
+++ b/neural_lam/train_model.py
@@ -49,6 +49,22 @@ def main(input_args=None):
         default=4,
         help="Number of workers in data loader (default: 4)",
     )
+    parser.add_argument(
+        "--num_nodes",
+        type=int,
+        default=1,
+        help="Number of nodes to use in DDP (default: 1)",
+    )
+    parser.add_argument(
+        "--devices",
+        nargs="+",
+        type=str,
+        default=["auto"],
+        help="Devices to use for training. Can be the string 'auto' or a list "
+        "of integer ids corresponding to the desired devices, e.g. "
+        "'--devices 0 1'. Note that this cannot be used with SLURM, instead "
+        "set 'ntasks-per-node' in the SLURM setup (default: auto)",
+    )
     parser.add_argument(
         "--epochs",
         type=int,
@@ -249,6 +265,15 @@ def main(input_args=None):
     else:
         device_name = "cpu"
 
+    # Set devices to use
+    if args.devices == ["auto"]:
+        devices = "auto"
+    else:
+        try:
+            devices = [int(i) for i in args.devices]
+        except ValueError:
+            raise ValueError("devices should be 'auto' or a list of integers")
+
     # Load model parameters Use new args for model
     ModelClass = MODELS[args.model]
     model = ModelClass(args, config=config, datastore=datastore)
@@ -278,6 +303,8 @@ def main(input_args=None):
         deterministic=True,
         strategy="ddp",
         accelerator=device_name,
+        num_nodes=args.num_nodes,
+        devices=devices,
         logger=logger,
         log_every_n_steps=1,
         callbacks=[checkpoint_callback],
diff --git a/neural_lam/utils.py b/neural_lam/utils.py
index 4a0752e4..2cf20852 100644
--- a/neural_lam/utils.py
+++ b/neural_lam/utils.py
@@ -4,6 +4,7 @@
 # Third-party
 import torch
+from pytorch_lightning.utilities import rank_zero_only
 from torch import nn
 from tueplots import bundles, figsizes
 
 
@@ -233,6 +234,12 @@ def fractional_plot_bundle(fraction):
     return bundle
 
 
+@rank_zero_only
+def rank_zero_print(*args, **kwargs):
+    """Print only from rank 0 process"""
+    print(*args, **kwargs)
+
+
 def init_wandb_metrics(wandb_logger, val_steps):
     """
     Set up wandb metrics to track
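As background on why `--devices` is documented as unusable together with SLURM: when launched through `srun`, Lightning detects the scheduler from SLURM's environment variables and derives the process layout from them, so the per-node process count is controlled by `ntasks-per-node` rather than by the CLI argument. A small sketch, assuming PyTorch Lightning's `SLURMEnvironment` plugin behaves as documented:

```python
# Sketch: check whether Lightning would pick up a SLURM-managed environment.
from pytorch_lightning.plugins.environments import SLURMEnvironment

if SLURMEnvironment.detect():
    print("SLURM detected: process layout comes from SLURM_* variables.")
else:
    print("No SLURM detected: num_nodes/devices come from the CLI arguments.")
```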