From d2e26a9792a934407c0b11015b4c7c5694a8d9d3 Mon Sep 17 00:00:00 2001
From: SimonKamuk <43374850+SimonKamuk@users.noreply.github.com>
Date: Thu, 23 Jan 2025 09:05:20 +0100
Subject: [PATCH] Add multi-node-training (#103)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Describe your changes

This PR adds support for multi-node GPU training using the SLURM job scheduler. The changes allow setting the number of nodes with the CLI argument `num_nodes`. It is also possible to select a subset of visible GPUs using the argument `devices` (only when not using SLURM).

Replaces https://github.com/mllam/neural-lam/pull/26 with a simpler method based on advice from @sadamov

## Type of change

- [ ] 🐛 Bug fix (non-breaking change that fixes an issue)
- [x] ✨ New feature (non-breaking change that adds functionality)
- [ ] 💥 Breaking change (fix or feature that would cause existing functionality to not work as expected)
- [ ] 📖 Documentation (Addition or improvements to documentation)

## Checklist before requesting a review

- [x] My branch is up-to-date with the target branch - if not update your fork with the changes from the target branch (use `pull` with `--rebase` option if possible).
- [x] I have performed a self-review of my code
- [x] For any new/modified functions/classes I have added docstrings that clearly describe its purpose, expected inputs and returned values
- [x] I have placed in-line comments to clarify the intent of any hard-to-understand passages of my code
- [x] I have updated the [README](README.MD) to cover introduced code changes
- [ ] I have added tests that prove my fix is effective or that my feature works
- [x] I have given the PR a name that clearly describes the change, written in imperative form ([context](https://www.gitkraken.com/learn/git/best-practices/git-commit-message#using-imperative-verb-form)).
- [x] I have requested a reviewer and an assignee (assignee is responsible for merging). This applies only if you have write access to the repo, otherwise feel free to tag a maintainer to add a reviewer and assignee.

## Checklist for reviewers

Each PR comes with its own improvements and flaws. The reviewer should check the following:
- [x] the code is readable
- [ ] the code is well tested
- [x] the code is documented (including return types and parameters)
- [x] the code is easy to maintain

## Author checklist after completed review

- [ ] I have added a line to the CHANGELOG describing this change, in a section reflecting type of change (add section where missing):
  - *added*: when you have added new functionality
  - *changed*: when default behaviour of the code has been changed
  - *fixes*: when your contribution fixes a bug

## Checklist for assignee

- [ ] PR is up to date with the base branch
- [ ] the tests pass
- [ ] author has added an entry to the changelog (and designated the change as *added*, *changed* or *fixed*)
- Once the PR is ready to be merged, squash commits and merge the PR.
---------

Co-authored-by: Simon Kamuk Christiansen
---
 CHANGELOG.md                             |  8 +++++++
 README.md                                | 30 ++++++++++++++++++++++++
 neural_lam/datastore/mdp.py              |  9 +++----
 neural_lam/models/base_graph_model.py    |  2 +-
 neural_lam/models/base_hi_graph_model.py | 10 ++++----
 neural_lam/models/graph_lam.py           |  2 +-
 neural_lam/train_model.py                | 27 +++++++++++++++++++++
 neural_lam/utils.py                      |  7 ++++++
 8 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 44c0be1d..f413c970 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [unreleased](https://github.com/mllam/neural-lam/compare/v0.3.0...HEAD)
 
+### Added
+- Add support for multi-node training.
+[\#103](https://github.com/mllam/neural-lam/pull/103) @simonkamuk @sadamov
+
+### Fixed
+- Only print on rank 0 to avoid duplicates of all print statements.
+[\#103](https://github.com/mllam/neural-lam/pull/103) @simonkamuk @sadamov
+
 ## [v0.3.0](https://github.com/mllam/neural-lam/releases/tag/v0.3.0)
 
 This release introduces Datastores to represent input data from different sources (including zarr and numpy) while keeping graph generation within neural-lam.
diff --git a/README.md b/README.md
index 7c7cd3a1..801edd73 100644
--- a/README.md
+++ b/README.md
@@ -448,6 +448,36 @@ python -m neural_lam.train_model --model hi_lam_parallel --graph hierarchical ...
 
 Checkpoint files for our models trained on the MEPS data are available upon request.
 
+### High Performance Computing
+
+The training script can be run on a cluster with multiple GPU nodes. Neural LAM is set up to use PyTorch Lightning's `DDP` backend for distributed training.
+The code can be used on systems both with and without SLURM. If the cluster has multiple nodes, set the `--num_nodes` argument accordingly.
+
+Using SLURM, the job can be submitted with `sbatch slurm_job.sh`, using a shell script like the following.
+```
+#!/bin/bash -l
+#SBATCH --job-name=Neural-LAM
+#SBATCH --time=24:00:00
+#SBATCH --nodes=2
+#SBATCH --ntasks-per-node=4
+#SBATCH --gres=gpu:4
+#SBATCH --partition=normal
+#SBATCH --mem=444G
+#SBATCH --no-requeue
+#SBATCH --exclusive
+#SBATCH --output=lightning_logs/neurallam_out_%j.log
+#SBATCH --error=lightning_logs/neurallam_err_%j.log
+
+# Load necessary modules or activate environment, for example:
+conda activate neural-lam
+
+srun -ul python -m neural_lam.train_model \
+    --config_path /path/to/config.yaml \
+    --num_nodes $SLURM_JOB_NUM_NODES
+```
+
+On a system without SLURM, where all GPUs are visible, a subset of GPUs can be selected for training with the `--devices` CLI argument, e.g. `--devices 0 1` to use the first two GPUs.
+
 ## Evaluate Models
 
 Evaluation is also done using `python -m neural_lam.train_model --config_path `, but using the `--eval` option. Use `--eval val` to evaluate the model on the validation set and `--eval test` to evaluate on test data.
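To make the documented behaviour concrete, here is a minimal, self-contained sketch (illustrative only, not the project's actual `train_model.py`; only the `--num_nodes` and `--devices` argument names are taken from this PR) of how such CLI flags can be passed through to a PyTorch Lightning `Trainer`:

```python
# Minimal sketch: wire --num_nodes/--devices into a Lightning Trainer.
# Illustrative only; neural-lam's train_model.py handles many more options.
import argparse

import pytorch_lightning as pl

parser = argparse.ArgumentParser()
parser.add_argument("--num_nodes", type=int, default=1)
parser.add_argument("--devices", nargs="+", type=str, default=["auto"])
args = parser.parse_args()

# "auto" lets Lightning pick all visible accelerators;
# otherwise restrict training to the listed device ids.
devices = "auto" if args.devices == ["auto"] else [int(i) for i in args.devices]

trainer = pl.Trainer(
    strategy="ddp",
    accelerator="auto",
    num_nodes=args.num_nodes,  # e.g. $SLURM_JOB_NUM_NODES when launched via srun
    devices=devices,  # "auto", or e.g. [0, 1] from "--devices 0 1"
)
print(f"Configured Trainer with num_nodes={args.num_nodes}, devices={devices}")
```

On a workstation this could be run as e.g. `python sketch.py --devices 0 1`; under SLURM the same kind of script is instead started once per task via `srun`, as in the sbatch example above.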
diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py
index 0d1aac7b..2898df80 100644
--- a/neural_lam/datastore/mdp.py
+++ b/neural_lam/datastore/mdp.py
@@ -13,6 +13,7 @@
 from numpy import ndarray
 
 # Local
+from ..utils import rank_zero_print
 from .base import BaseRegularGridDatastore, CartesianGridShape
 
 
@@ -72,11 +73,11 @@ def __init__(self, config_path, n_boundary_points=30, reuse_existing=True):
             self._ds.to_zarr(fp_ds)
         self._n_boundary_points = n_boundary_points
-        print("The loaded datastore contains the following features:")
+        rank_zero_print("The loaded datastore contains the following features:")
         for category in ["state", "forcing", "static"]:
             if len(self.get_vars_names(category)) > 0:
                 var_names = self.get_vars_names(category)
-                print(f" {category:<8s}: {' '.join(var_names)}")
+                rank_zero_print(f" {category:<8s}: {' '.join(var_names)}")
         # check that all three train/val/test splits are available
         required_splits = ["train", "val", "test"]
@@ -87,12 +88,12 @@ def __init__(self, config_path, n_boundary_points=30, reuse_existing=True):
                 f"splits: {available_splits}"
             )
 
-        print("With the following splits (over time):")
+        rank_zero_print("With the following splits (over time):")
         for split in required_splits:
             da_split = self._ds.splits.sel(split_name=split)
             da_split_start = da_split.sel(split_part="start").load().item()
             da_split_end = da_split.sel(split_part="end").load().item()
-            print(f" {split:<8s}: {da_split_start} to {da_split_end}")
+            rank_zero_print(f" {split:<8s}: {da_split_start} to {da_split_end}")
 
         # find out the dimension order for the stacking to grid-index
         dim_order = None
diff --git a/neural_lam/models/base_graph_model.py b/neural_lam/models/base_graph_model.py
index 6233b4d1..728a9489 100644
--- a/neural_lam/models/base_graph_model.py
+++ b/neural_lam/models/base_graph_model.py
@@ -34,7 +34,7 @@ def __init__(self, args, config: NeuralLAMConfig, datastore: BaseDatastore):
 
         # Specify dimensions of data
         self.num_mesh_nodes, _ = self.get_num_mesh()
-        print(
+        utils.rank_zero_print(
             f"Loaded graph with {self.num_grid_nodes + self.num_mesh_nodes} "
             f"nodes ({self.num_grid_nodes} grid, {self.num_mesh_nodes} mesh)"
         )
diff --git a/neural_lam/models/base_hi_graph_model.py b/neural_lam/models/base_hi_graph_model.py
index 8ec46b4f..13ffcdc2 100644
--- a/neural_lam/models/base_hi_graph_model.py
+++ b/neural_lam/models/base_hi_graph_model.py
@@ -27,10 +27,10 @@ def __init__(self, args, config: NeuralLAMConfig, datastore: BaseDatastore):
         ]  # Needs as python list for later
 
         # Print some useful info
-        print("Loaded hierarchical graph with structure:")
+        utils.rank_zero_print("Loaded hierarchical graph with structure:")
         for level_index, level_mesh_size in enumerate(self.level_mesh_sizes):
             same_level_edges = self.m2m_features[level_index].shape[0]
-            print(
+            utils.rank_zero_print(
                 f"level {level_index} - {level_mesh_size} nodes, "
                 f"{same_level_edges} same-level edges"
             )
@@ -38,8 +38,10 @@ def __init__(self, args, config: NeuralLAMConfig, datastore: BaseDatastore):
             if level_index < (self.num_levels - 1):
                 up_edges = self.mesh_up_features[level_index].shape[0]
                 down_edges = self.mesh_down_features[level_index].shape[0]
-                print(f" {level_index}<->{level_index + 1}")
-                print(f" - {up_edges} up edges, {down_edges} down edges")
+                utils.rank_zero_print(f" {level_index}<->{level_index + 1}")
+                utils.rank_zero_print(
+                    f" - {up_edges} up edges, {down_edges} down edges"
+                )
         # Embedders
         # Assume all levels have same static feature dimensionality
         mesh_dim = self.mesh_static_features[0].shape[1]
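The switch from `print` to `rank_zero_print` relies on Lightning's `rank_zero_only` decorator running the wrapped function only on the process whose rank is 0. A standalone sketch of that behaviour (assuming the documented semantics of `rank_zero_only`; setting `.rank` by hand below only mimics what DDP normally sets from the environment):

```python
# Sketch: rank_zero_only suppresses the call on every process except rank 0,
# so each DDP worker can call rank_zero_print but the message appears once.
from pytorch_lightning.utilities import rank_zero_only


@rank_zero_only
def rank_zero_print(*args, **kwargs):
    """Print only from the rank 0 process."""
    print(*args, **kwargs)


rank_zero_only.rank = 1  # pretend to be a non-zero-rank worker
rank_zero_print("hidden")  # produces no output

rank_zero_only.rank = 0  # pretend to be the rank 0 worker
rank_zero_print("shown once")  # prints
```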
diff --git a/neural_lam/models/graph_lam.py b/neural_lam/models/graph_lam.py
index 68b7d01e..0a5b6b57 100644
--- a/neural_lam/models/graph_lam.py
+++ b/neural_lam/models/graph_lam.py
@@ -27,7 +27,7 @@ def __init__(self, args, config: NeuralLAMConfig, datastore: BaseDatastore):
         # grid_dim from data + static + batch_static
         mesh_dim = self.mesh_static_features.shape[1]
         m2m_edges, m2m_dim = self.m2m_features.shape
-        print(
+        utils.rank_zero_print(
             f"Edges in subgraphs: m2m={m2m_edges}, g2m={self.g2m_edges}, "
             f"m2g={self.m2g_edges}"
         )
diff --git a/neural_lam/train_model.py b/neural_lam/train_model.py
index 74146c89..6894dd0d 100644
--- a/neural_lam/train_model.py
+++ b/neural_lam/train_model.py
@@ -49,6 +49,22 @@ def main(input_args=None):
         default=4,
         help="Number of workers in data loader (default: 4)",
     )
+    parser.add_argument(
+        "--num_nodes",
+        type=int,
+        default=1,
+        help="Number of nodes to use in DDP (default: 1)",
+    )
+    parser.add_argument(
+        "--devices",
+        nargs="+",
+        type=str,
+        default=["auto"],
+        help="Devices to use for training. Can be the string 'auto' or a list "
+        "of integer ids corresponding to the desired devices, e.g. "
+        "'--devices 0 1'. Note that this cannot be used with SLURM, instead "
+        "set 'ntasks-per-node' in the SLURM setup (default: auto)",
+    )
     parser.add_argument(
         "--epochs",
         type=int,
@@ -249,6 +265,15 @@ def main(input_args=None):
     else:
         device_name = "cpu"
 
+    # Set devices to use
+    if args.devices == ["auto"]:
+        devices = "auto"
+    else:
+        try:
+            devices = [int(i) for i in args.devices]
+        except ValueError:
+            raise ValueError("devices should be 'auto' or a list of integers")
+
     # Load model parameters Use new args for model
     ModelClass = MODELS[args.model]
     model = ModelClass(args, config=config, datastore=datastore)
@@ -278,6 +303,8 @@ def main(input_args=None):
         deterministic=True,
         strategy="ddp",
         accelerator=device_name,
+        num_nodes=args.num_nodes,
+        devices=devices,
         logger=logger,
         log_every_n_steps=1,
         callbacks=[checkpoint_callback],
diff --git a/neural_lam/utils.py b/neural_lam/utils.py
index 4a0752e4..2cf20852 100644
--- a/neural_lam/utils.py
+++ b/neural_lam/utils.py
@@ -4,6 +4,7 @@
 # Third-party
 import torch
+from pytorch_lightning.utilities import rank_zero_only
 from torch import nn
 from tueplots import bundles, figsizes
 
 
@@ -233,6 +234,12 @@ def fractional_plot_bundle(fraction):
     return bundle
 
 
+@rank_zero_only
+def rank_zero_print(*args, **kwargs):
+    """Print only from rank 0 process"""
+    print(*args, **kwargs)
+
+
 def init_wandb_metrics(wandb_logger, val_steps):
     """
     Set up wandb metrics to track
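As background on why `--devices` is documented as unusable together with SLURM: when launched through `srun`, Lightning detects the scheduler from SLURM's environment variables and derives the process layout from them, so the per-node process count is controlled by `ntasks-per-node` rather than by the CLI argument. A small sketch, assuming PyTorch Lightning's `SLURMEnvironment` plugin behaves as documented:

```python
# Sketch: check whether Lightning would pick up a SLURM-managed environment.
from pytorch_lightning.plugins.environments import SLURMEnvironment

if SLURMEnvironment.detect():
    print("SLURM detected: process layout comes from SLURM_* variables.")
else:
    print("No SLURM detected: num_nodes/devices come from the CLI arguments.")
```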