diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml
new file mode 100644
index 00000000..5c193b63
--- /dev/null
+++ b/.github/workflows/integration-tests.yml
@@ -0,0 +1,17 @@
+name: nightly-ci-integration-tests
+
+on:
+  schedule:
+    - cron: "0 23 * * *" # every day at 23:00 UTC on the default branch
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+      - test/initial-integration-tests
+
+jobs:
+  integration_tests:
+    uses: ecmwf-actions/reusable-workflows/.github/workflows/integration-tests.yml@add-integration-test-anemoi
+    with:
+      python-version: '3.10'
+      install-package: "-e ./training[all,tests] -e ./graphs[all,tests] -e ./models[all,tests]"
diff --git a/tests/integration/basic_config.yaml b/tests/integration/basic_config.yaml
new file mode 100644
index 00000000..4c9f3f77
--- /dev/null
+++ b/tests/integration/basic_config.yaml
@@ -0,0 +1,45 @@
+defaults:
+- data: zarr
+- dataloader: native_grid
+- diagnostics: evaluation
+- hardware: example
+- graph: multi_scale
+- model: gnn
+- training: default
+- _self_
+
+no_validation: False
+# diagnostics:
+#   plot:
+#     callbacks: []
+hardware:
+  files:
+    dataset: aifs-ea-an-oper-0001-mars-o48-1979-19-6h-v6-testset.zarr
+    graph: test_graph.pt
+  paths:
+    data: https://object-store.os-api.cci1.ecmwf.int/ml-tests/test-data/samples/anemoi-integration-tests
+    output: ${oc.env:PWD}/tmp_output/
+
+  # number of GPUs per node and number of nodes (for DDP)
+  accelerator: auto
+  num_gpus_per_node: 1
+  num_nodes: 1
+  num_gpus_per_model: 1
+
+
+model:
+  num_channels: 16
+  # processor:
+  #   attention_implementation: scaled_dot_product_attention
+
+dataloader:
+  limit_batches:
+    training: 100
+    validation: 100
+  training:
+    end: 1979-01-08 18:00:00
+  validation:
+    start: 1979-01-08 12:00:00
+
+training:
+  max_epochs: 2
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 00000000..4b631afb
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,32 @@
+import pytest
+from hydra import compose
+from hydra import initialize
+from omegaconf import DictConfig
+from omegaconf import OmegaConf
+
+
+@pytest.fixture(
+    params=[
+        ["model=gnn"],
+        ["model=graphtransformer"],
+        [
+            "model=transformer",
+            "graph=encoder_decoder_only",
+            "model.processor.attention_implementation=scaled_dot_product_attention",
+        ],
+    ]
+)
+def architecture_config(request) -> DictConfig:
+    overrides = request.param
+    with initialize(version_base=None, config_path="", job_name="test_training"):
+        cfg = compose(config_name="basic_config", overrides=overrides)
+        OmegaConf.resolve(cfg)
+        return cfg
+
+
+@pytest.fixture()
+def stretched_config() -> DictConfig:
+    with initialize(version_base=None, config_path="", job_name="test_stretched"):
+        cfg = compose(config_name="stretched_config")
+        OmegaConf.resolve(cfg)
+        return cfg
diff --git a/tests/integration/stretched_config.yaml b/tests/integration/stretched_config.yaml
new file mode 100644
index 00000000..20ecf17e
--- /dev/null
+++ b/tests/integration/stretched_config.yaml
@@ -0,0 +1,141 @@
+defaults:
+- data: zarr
+- dataloader: native_grid
+- diagnostics: evaluation
+- hardware: example
+- graph: stretched_grid
+- model: graphtransformer
+- training: default
+- _self_
+
+no_validation: True
+
+diagnostics:
+  plot:
+    callbacks: []
+
+dataloader:
+  dataset:
+    cutout:
+      - dataset: ${hardware.files.dataset}
+        thinning: 25
+      - dataset: ${hardware.files.forcing_dataset}
+    adjust: all
+    min_distance_km: 0
+  num_workers:
+    training: 2
+    validation: 2
+  batch_size:
+    training: 2
+    validation: 2
+  limit_batches:
+    training: 2
+    validation: 2
+  training:
+    start: "2017-02-01"
+    end: "2017-02-07"
+  validation:
+    start: "2018-01-01"
+    end: "2018-01-04"
+
+data:
+  forcing:
+  - "cos_latitude"
+  - "cos_longitude"
+  - "sin_latitude"
+  - "sin_longitude"
+  - "cos_julian_day"
+  - "cos_local_time"
+  - "sin_julian_day"
+  - "sin_local_time"
+  - "lsm"
+  # features that are only part of the forecast state
+  # but are not used as the input to the model
+  diagnostic:
+  - tp
+  remapped:
+  normalizer:
+    default: "mean-std"
+    std:
+    - "tp"
+    min-max:
+    max:
+    none:
+    - "cos_latitude"
+    - "cos_longitude"
+    - "sin_latitude"
+    - "sin_longitude"
+    - "cos_julian_day"
+    - "cos_local_time"
+    - "sin_julian_day"
+    - "sin_local_time"
+    - "lsm"
+
+training:
+  loss_scaling:
+    spatial:
+      _target_: anemoi.training.data.scaling.ReweightedGraphAttribute
+      target_nodes: ${graph.data}
+      scaled_attribute: area_weight # it must be a node attribute of the output nodes
+      cutout_weight_frac_of_global: 2
+  max_epochs: 2
+
+hardware:
+  files:
+    dataset: cerra-rr-an-oper-0001-mars-5p5km-1984-2020-6h-v2-hmsi
+    forcing_dataset: aifs-od-an-oper-0001-mars-o96-2016-2023-6h-v6
+    graph: test_stretched_graph.pt
+  paths:
+    output: ${oc.env:PWD}/tmp_output/
+
+  # number of GPUs per node and number of nodes (for DDP)
+  accelerator: auto
+  num_gpus_per_node: 1
+  num_nodes: 1
+  num_gpus_per_model: 1
+
+graph:
+  overwrite: True
+  nodes:
+    data:
+      node_builder:
+        _target_: anemoi.graphs.nodes.ZarrDatasetNodes
+        dataset: ${dataloader.dataset}
+      attributes:
+        weights:
+          _target_: anemoi.graphs.nodes.attributes.SphericalAreaWeights
+          fill_value: 0
+          norm: unit-max
+        cutout_mask:
+          _target_: anemoi.graphs.nodes.attributes.CutOutMask
+    hidden:
+      node_builder:
+        _target_: anemoi.graphs.nodes.StretchedTriNodes
+        lam_resolution: 5
+        global_resolution: 3
+        reference_node_name: data
+        mask_attr_name: cutout_mask
+
+  edges:
+  - source_name: data
+    target_name: hidden
+    edge_builders:
+    - _target_: anemoi.graphs.edges.KNNEdges
+      num_nearest_neighbours: 12
+    attributes: ${graph.attributes.edges}
+  - source_name: hidden
+    target_name: hidden
+    edge_builders:
+    - _target_: anemoi.graphs.edges.MultiScaleEdges
+      x_hops: 1
+    attributes: ${graph.attributes.edges}
+  - source_name: hidden
+    target_name: data
+    edge_builders:
+    - _target_: anemoi.graphs.edges.KNNEdges
+      num_nearest_neighbours: 3
+    attributes: ${graph.attributes.edges}
+
+# model:
+#   attributes:
+#     edges: []
diff --git a/tests/test_integration.py b/tests/integration/test_integration.py
similarity index 100%
rename from tests/test_integration.py
rename to tests/integration/test_integration.py
diff --git a/tests/integration/test_training_cycle.py b/tests/integration/test_training_cycle.py
new file mode 100644
index 00000000..22d9a005
--- /dev/null
+++ b/tests/integration/test_training_cycle.py
@@ -0,0 +1,41 @@
+# (C) Copyright 2024 Anemoi contributors.
+#
+# This software is licensed under the terms of the Apache Licence Version 2.0
+# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# In applying this licence, ECMWF does not waive the privileges and immunities
+# granted to it by virtue of its status as an intergovernmental organisation
+# nor does it submit to any jurisdiction.
+
+import logging
+import shutil
+
+import pytest
+import torch
+
+from anemoi.training.train.train import AnemoiTrainer
+
+LOGGER = logging.getLogger(__name__)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="No GPU available")
+def test_training_cycle_architecture_configs(architecture_config) -> None:
+    AnemoiTrainer(architecture_config).train()
+    shutil.rmtree(architecture_config.hardware.paths.output)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="No GPU available")
+def test_training_cycle_grid_configs(stretched_config) -> None:
+    AnemoiTrainer(stretched_config).train()
+    shutil.rmtree(stretched_config.hardware.paths.output)
+
+
+if __name__ == "__main__":
+    from hydra import compose
+    from hydra import initialize
+    from omegaconf import OmegaConf
+
+    with initialize(version_base=None, config_path="", job_name="test_training"):
+        cfg = compose(config_name="stretched_config", overrides=[])
+        OmegaConf.resolve(cfg)
+        test_training_cycle_architecture_configs(cfg)
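
For local debugging, a minimal sketch of how the new integration configs can be composed and inspected outside pytest. It assumes hydra-core and omegaconf are installed and that the snippet is run from tests/integration/; the override list mirrors one case of the parametrised architecture_config fixture in conftest.py, and the job_name "inspect_config" is an arbitrary label.

from hydra import compose
from hydra import initialize
from omegaconf import OmegaConf

# Compose basic_config.yaml with one of the architecture overrides exercised
# by the fixture, resolve interpolations, and print the resulting config tree.
with initialize(version_base=None, config_path="", job_name="inspect_config"):
    cfg = compose(
        config_name="basic_config",
        overrides=[
            "model=transformer",
            "graph=encoder_decoder_only",
            "model.processor.attention_implementation=scaled_dot_product_attention",
        ],
    )
    OmegaConf.resolve(cfg)
    print(OmegaConf.to_yaml(cfg))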