Merge pull request #2 from NCAR/djgagne
Updated dependencies and added scaler application
djgagne authored Feb 14, 2024
2 parents 2584fc9 + a7eb170 commit b9de9b5
Showing 11 changed files with 1,767 additions and 16 deletions.
30 changes: 29 additions & 1 deletion README.md
@@ -1 +1,29 @@
# miles-credit
# NSF NCAR MILES Community Runnable Earth Digital Intelligence Twin (CREDIT)

## About
CREDIT is a package for training and running neural networks that emulate full numerical weather prediction (NWP) models by predicting the next state of the atmosphere given the current state.
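
As a rough illustration of the idea only (hypothetical names, not the CREDIT API), such an emulator is applied autoregressively, feeding each predicted state back in as the next input:

```python
# Sketch of autoregressive rollout with a hypothetical trained emulator `model`
# and initial state tensor `state`; this is not the CREDIT interface.
import torch

def rollout(model, state, n_steps):
    states = [state]
    with torch.no_grad():
        for _ in range(n_steps):
            state = model(state)  # predict the next atmospheric state from the current one
            states.append(state)
    return torch.stack(states)
```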

## Installation
Clone from the miles-credit GitHub repository:
```bash
git clone git@github.com:NCAR/miles-credit.git
cd miles-credit
```

Install dependencies using the environment.yml file:
```bash
mamba env create -f environment.yml
conda activate credit
```

To enable GPU support, install pytorch-cuda:
```bash
mamba install pytorch-cuda=12.1 -c pytorch -c nvidia
```

Install miles-credit with the following command:
```bash
pip install .
```
71 changes: 71 additions & 0 deletions applications/scaler.py
@@ -0,0 +1,71 @@
import os
import numpy as np
import xarray as xr
import pandas as pd
import yaml
import argparse
from glob import glob
from multiprocessing import Pool
from bridgescaler.distributed import DQuantileTransformer
from bridgescaler.backend import print_scaler
from os.path import exists, join


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", help="Path to config file")
    parser.add_argument("-o", "--out", help="Path to save scaler files.")
    parser.add_argument("-p", "--procs", type=int, help="Number of processors")
    args = parser.parse_args()
    args_dict = vars(args)
    config = args_dict.pop("config")
    with open(config) as cf:
        conf = yaml.load(cf, Loader=yaml.FullLoader)
    all_era5_files = sorted(glob(conf["data"]["save_loc"]))
    # Exclude the "_small_" subset files from scaler fitting.
    all_era5_files = [e5 for e5 in all_era5_files if "_small_" not in e5]
    all_era5_filenames = [f.split("/")[-1] for f in all_era5_files]
    # Fit one pair of quantile scalers per yearly ERA5 file in parallel.
    with Pool(args.procs) as p:
        all_scalers = p.map(fit_era5_scaler_year, all_era5_files)
    all_scalers_df = pd.DataFrame(all_scalers, columns=["scaler_3d", "scaler_surface"],
                                  index=all_era5_filenames)
    if not exists(args.out):
        os.makedirs(args.out)
    now = pd.Timestamp.utcnow().strftime("%Y-%m-%d_%H:%M")
    all_scalers_df.to_parquet(join(args.out, f"era5_quantile_scalers_{now}.parquet"))
    return


def fit_era5_scaler_year(era5_file):
    """Fit quantile scalers to a random sample of times from a single yearly ERA5 zarr store."""
    n_times = 300  # number of randomly sampled times used to fit the scalers
    eds = xr.open_zarr(era5_file)
    vars_3d = ['U', 'V', 'T', 'Q']
    vars_surf = ['SP', 't2m', 'V500', 'U500', 'T500', 'Z500', 'Q500']
    levels = eds.level.values
    var_levels = []
    for var in vars_3d:
        for level in levels:
            var_levels.append(f"{var}_{level:d}")
    dqs_3d = DQuantileTransformer(distribution="normal")
    dqs_surf = DQuantileTransformer(distribution="normal")
    rand_times = np.sort(np.random.choice(eds["time"].values, size=n_times, replace=False))
    for time in rand_times:
        print(time)
        # Stack every (variable, level) pair into a single "variable" dimension.
        var_slices = []
        for var in vars_3d:
            for level in levels:
                var_slices.append(eds[var].sel(time=time, level=level))
        e3d = xr.concat(var_slices, pd.Index(var_levels, name="variable")
                        ).transpose("latitude", "longitude", "variable")
        dqs_3d.fit(e3d)
        e_surf = xr.concat([eds[v].sel(time=time) for v in vars_surf], pd.Index(vars_surf, name="variable")
                           ).transpose("latitude", "longitude", "variable")
        dqs_surf.fit(e_surf)
    dqs_3d_json = print_scaler(dqs_3d)
    dqs_surf_json = print_scaler(dqs_surf)
    return dqs_3d_json, dqs_surf_json


if __name__ == '__main__':
    main()
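
For reference, a hypothetical invocation of the new scaler application, assuming a config file whose `data: save_loc` entry is a glob matching the yearly ERA5 zarr stores (all paths below are placeholders):

```bash
# Hypothetical paths; adjust the config file, output directory, and worker count as needed.
python applications/scaler.py -c config/era5.yml -o scalers/ -p 8
```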
4 changes: 2 additions & 2 deletions credit/data.py
@@ -362,7 +362,7 @@ def __init__(
        history_len: int = 1,
        forecast_len: int = 2,
        transform: Optional[Callable] = None,
        SEED=42,
        seed=42,
        skip_periods=None,
    ):
        self.history_len = history_len
@@ -376,7 +376,7 @@ def __init__(
            all_fils.append(get_forward_data(filename=fn))
        self.all_fils = all_fils
        self.data_array = all_fils[0]
        self.rng = np.random.default_rng(seed=SEED)
        self.rng = np.random.default_rng(seed=seed)

        # set data places:
        indo = 0
4 changes: 2 additions & 2 deletions credit/scheduler.py
@@ -1,7 +1,7 @@
import copy
import math
import torch
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import LRScheduler
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau


@@ -65,7 +65,7 @@ def lr_lambda_phase1(epoch, num_epochs=100, warmup_epochs=10):
        return 0.5 * (1 + math.cos(math.pi * progress))


class CosineAnnealingWarmupRestarts(_LRScheduler):
class CosineAnnealingWarmupRestarts(LRScheduler):
    """
    optimizer (Optimizer): Wrapped optimizer.
    first_cycle_steps (int): First cycle step size.
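
The change above swaps the private `_LRScheduler` base class for the public `LRScheduler` name exported by recent PyTorch releases (2.0+); the class body is otherwise unchanged. As a minimal, hedged sketch of the subclassing pattern (an illustrative warmup schedule, not CREDIT's `CosineAnnealingWarmupRestarts`):

```python
# Minimal sketch of subclassing the public LRScheduler base (PyTorch >= 2.0).
from torch.optim.lr_scheduler import LRScheduler


class LinearWarmup(LRScheduler):
    def __init__(self, optimizer, warmup_steps=10, last_epoch=-1):
        # Attributes must be set before super().__init__, which triggers the first get_lr() call.
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        scale = min(1.0, (self.last_epoch + 1) / self.warmup_steps)
        return [base_lr * scale for base_lr in self.base_lrs]
```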
6 changes: 6 additions & 0 deletions credit/tests/data_test.py
@@ -0,0 +1,6 @@
from credit.data import ERA5Dataset


def test_data():
    assert True
    return
19 changes: 16 additions & 3 deletions environment.yml
@@ -1,10 +1,11 @@
name: credit
channels:
  - conda-forge
  - pytorch
dependencies:
  - python
  - python=3.11
  - pip
  - numpy
  - numpy<1.24
  - pandas
  - matplotlib
  - cartopy
@@ -14,9 +15,21 @@ dependencies:
  - xarray
  - netcdf4
  - pytorch
  - torchvision
  - wandb
  - pyyaml
  - cartopy
  - dask
  - distributed
  - dask-jobqueue
  - zarr
  - jupyter
  - pip:
    - einops
    - echo-opt
    - bridgescaler
    - rotary-embedding-torch
    - cartopy
    - segmentation-models-pytorch
    - vector_quantize_pytorch
    - .
