Merge pull request #2 from NCAR/djgagne
Updated dependencies and added scaler application
djgagne authored Feb 14, 2024
2 parents 2584fc9 + a7eb170 commit b9de9b5
Showing 11 changed files with 1,767 additions and 16 deletions.
30 changes: 29 additions & 1 deletion README.md
@@ -1 +1,29 @@
# miles-credit
# NSF NCAR MILES Community Runnable Earth Digital Intelligence Twin (CREDIT)

## About
CREDIT is a package for training and running neural networks that emulate full numerical weather prediction (NWP) models by predicting the next state of the atmosphere given the current state.
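
As a rough illustration of the idea only (hypothetical names, not the CREDIT API), such an emulator is applied autoregressively, feeding each predicted state back in as the next input:

```python
# Sketch of autoregressive rollout with a hypothetical trained emulator `model`
# and initial state tensor `state`; this is not the CREDIT interface.
import torch

def rollout(model, state, n_steps):
    states = [state]
    with torch.no_grad():
        for _ in range(n_steps):
            state = model(state)  # predict the next atmospheric state from the current one
            states.append(state)
    return torch.stack(states)
```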

## Installation
Clone from the miles-credit GitHub repository:
```bash
git clone git@github.com:NCAR/miles-credit.git
cd miles-credit
```

Install dependencies using the environment.yml file:
```bash
mamba env create -f environment.yml
conda activate credit
```

To enable GPU support, install pytorch-cuda:
```bash
mamba install pytorch-cuda=12.1 -c pytorch -c nvidia
```

Install miles-credit with the following command:
```bash
pip install .
```
71 changes: 71 additions & 0 deletions applications/scaler.py
@@ -0,0 +1,71 @@
import os
import numpy as np
import xarray as xr
import pandas as pd
import yaml
import argparse
from glob import glob
from multiprocessing import Pool
from bridgescaler.distributed import DQuantileTransformer
from bridgescaler.backend import print_scaler
from os.path import exists, join


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", help="Path to config file")
    parser.add_argument("-o", "--out", help="Path to save scaler files.")
    parser.add_argument("-p", "--procs", type=int, help="Number of processors")
    args = parser.parse_args()
    args_dict = vars(args)
    config = args_dict.pop("config")
    with open(config) as cf:
        conf = yaml.load(cf, Loader=yaml.FullLoader)
    all_era5_files = sorted(glob(conf["data"]["save_loc"]))
    # Exclude the "_small_" subset files from scaler fitting.
    all_era5_files = [e5 for e5 in all_era5_files if "_small_" not in e5]
    all_era5_filenames = [f.split("/")[-1] for f in all_era5_files]
    # Fit one pair of quantile scalers per yearly ERA5 file in parallel.
    with Pool(args.procs) as p:
        all_scalers = p.map(fit_era5_scaler_year, all_era5_files)
    all_scalers_df = pd.DataFrame(all_scalers, columns=["scaler_3d", "scaler_surface"],
                                  index=all_era5_filenames)
    if not exists(args.out):
        os.makedirs(args.out)
    now = pd.Timestamp.utcnow().strftime("%Y-%m-%d_%H:%M")
    all_scalers_df.to_parquet(join(args.out, f"era5_quantile_scalers_{now}.parquet"))
    return


def fit_era5_scaler_year(era5_file):
    """Fit quantile scalers to a random sample of times from a single yearly ERA5 zarr store."""
    n_times = 300  # number of randomly sampled times used to fit the scalers
    eds = xr.open_zarr(era5_file)
    vars_3d = ['U', 'V', 'T', 'Q']
    vars_surf = ['SP', 't2m', 'V500', 'U500', 'T500', 'Z500', 'Q500']
    levels = eds.level.values
    var_levels = []
    for var in vars_3d:
        for level in levels:
            var_levels.append(f"{var}_{level:d}")
    dqs_3d = DQuantileTransformer(distribution="normal")
    dqs_surf = DQuantileTransformer(distribution="normal")
    rand_times = np.sort(np.random.choice(eds["time"].values, size=n_times, replace=False))
    for time in rand_times:
        print(time)
        # Stack every (variable, level) pair into a single "variable" dimension.
        var_slices = []
        for var in vars_3d:
            for level in levels:
                var_slices.append(eds[var].sel(time=time, level=level))
        e3d = xr.concat(var_slices, pd.Index(var_levels, name="variable")
                        ).transpose("latitude", "longitude", "variable")
        dqs_3d.fit(e3d)
        e_surf = xr.concat([eds[v].sel(time=time) for v in vars_surf], pd.Index(vars_surf, name="variable")
                           ).transpose("latitude", "longitude", "variable")
        dqs_surf.fit(e_surf)
    dqs_3d_json = print_scaler(dqs_3d)
    dqs_surf_json = print_scaler(dqs_surf)
    return dqs_3d_json, dqs_surf_json


if __name__ == '__main__':
    main()
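
For reference, a hypothetical invocation of the new scaler application, assuming a config file whose `data: save_loc` entry is a glob matching the yearly ERA5 zarr stores (all paths below are placeholders):

```bash
# Hypothetical paths; adjust the config file, output directory, and worker count as needed.
python applications/scaler.py -c config/era5.yml -o scalers/ -p 8
```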
4 changes: 2 additions & 2 deletions credit/data.py
@@ -362,7 +362,7 @@ def __init__(
        history_len: int = 1,
        forecast_len: int = 2,
        transform: Optional[Callable] = None,
        SEED=42,
        seed=42,
        skip_periods=None,
    ):
        self.history_len = history_len
@@ -376,7 +376,7 @@ def __init__(
            all_fils.append(get_forward_data(filename=fn))
        self.all_fils = all_fils
        self.data_array = all_fils[0]
        self.rng = np.random.default_rng(seed=SEED)
        self.rng = np.random.default_rng(seed=seed)

        # set data places:
        indo = 0
4 changes: 2 additions & 2 deletions credit/scheduler.py
@@ -1,7 +1,7 @@
import copy
import math
import torch
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import LRScheduler
from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau


@@ -65,7 +65,7 @@ def lr_lambda_phase1(epoch, num_epochs=100, warmup_epochs=10):
        return 0.5 * (1 + math.cos(math.pi * progress))


class CosineAnnealingWarmupRestarts(_LRScheduler):
class CosineAnnealingWarmupRestarts(LRScheduler):
    """
    optimizer (Optimizer): Wrapped optimizer.
    first_cycle_steps (int): First cycle step size.
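
The change above swaps the private `_LRScheduler` base class for the public `LRScheduler` name exported by recent PyTorch releases (2.0+); the class body is otherwise unchanged. As a minimal, hedged sketch of the subclassing pattern (an illustrative warmup schedule, not CREDIT's `CosineAnnealingWarmupRestarts`):

```python
# Minimal sketch of subclassing the public LRScheduler base (PyTorch >= 2.0).
from torch.optim.lr_scheduler import LRScheduler


class LinearWarmup(LRScheduler):
    def __init__(self, optimizer, warmup_steps=10, last_epoch=-1):
        # Attributes must be set before super().__init__, which triggers the first get_lr() call.
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        scale = min(1.0, (self.last_epoch + 1) / self.warmup_steps)
        return [base_lr * scale for base_lr in self.base_lrs]
```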
6 changes: 6 additions & 0 deletions credit/tests/data_test.py
@@ -0,0 +1,6 @@
from credit.data import ERA5Dataset


def test_data():
    assert True
    return
19 changes: 16 additions & 3 deletions environment.yml
@@ -1,10 +1,11 @@
name: credit
channels:
  - conda-forge
  - pytorch
dependencies:
  - python
  - python=3.11
  - pip
  - numpy
  - numpy<1.24
  - pandas
  - matplotlib
  - cartopy
@@ -14,9 +15,21 @@ dependencies:
  - xarray
  - netcdf4
  - pytorch
  - torchvision
  - wandb
  - pyyaml
  - cartopy
  - dask
  - distributed
  - dask-jobqueue
  - zarr
  - jupyter
  - pip:
    - einops
    - echo-opt
    - bridgescaler
    - rotary-embedding-torch
    - cartopy
    - segmentation-models-pytorch
    - vector_quantize_pytorch
    - .
