diff --git a/LICENSE.md b/LICENSE.md index 6224393..0daaf65 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2022 Peter Van Katwyk +Copyright (c) 2024 Peter Van Katwyk Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 691b616..d93f19b 100644 --- a/README.md +++ b/README.md @@ -9,4 +9,6 @@ This repository contains source code for processing climate forcings from [ISMIP Documentation can be found at here: . +To access code for exact replication of "A Variational LSTM Emulator of Sea Level Contribution From the Antarctic Ice Sheet", see the release [https://github.com/Brown-SciML/ise/releases/tag/v1.0.0](https://github.com/Brown-SciML/ise/releases/tag/v1.0.0). + *This repository is a work in progress that is actively being updated and improved. Feel free to contact Peter Van Katwyk, Ph.D. student @ Brown University at peter_van_katwyk@brown.edu with further questions.* diff --git a/compute_ivaf_minus_control.py b/compute_ivaf_minus_control.py deleted file mode 100644 index 312661e..0000000 --- a/compute_ivaf_minus_control.py +++ /dev/null @@ -1,257 +0,0 @@ -import xarray as xr -import os -import pandas as pd -import numpy as np -import netCDF4 as nc -import warnings -warnings.simplefilter("ignore") -# warnings.simplefilter("ignore", category=SerializationWarning) - -# # Goelzer et al., 2020 -- https://doi.org/10.5194/tc-14-833-2020 -# thif = -(rhow/rhoi)*topg; where (thif<0) thif=0 -# af=(lithk-thif)*sftgif*maxmask1*af2; where(af<0) af=0 -# ivaf=af.total($x,$y)*dx^2 - -# thif = ocean_density / ice_density * min(bed_i,0) -# ivaf = (thickness_i + thif) * groundmask_i * mask_i * scalefac_model * (resolution*1000)^2 - -# thickness_i = lithk -# groundmask_i = sftgrf -# mask_i = sftgif -# scalefac_model = af2 - -data_directory = r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Projection-GrIS/" -densities_fp = r'/users/pvankatw/research/current/ise/utils/gris_model_densities.csv' -scalefac_fp = r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Projection-GrIS/af2_ISMIP6_GrIS_05000m.nc" - -def interpolate_values(data): - y = pd.Series(data.y.values) - y = y.replace(0, np.NaN) - y = np.array(y.interpolate()) - - # first and last are NaNs, replace with correct values - y[0] = y[1] - (y[2]-y[1]) - y[-1] = y[-2] + (y[-2]-y[-3]) - - x = pd.Series(data.x.values) - x = x.replace(0, np.NaN) - x = np.array(x.interpolate()) - - # first and last are NaNs, replace with correct values - x[0] = x[1] - (x[2]-x[1]) - x[-1] = x[-2] + (x[-2]-x[-3]) - - return x, y - -def get_gris_model_densities(zenodo_directory: str, output_path: str=None): - """Used for getting rhoi and rhow values from the GrIS models outputs. 
- - Args: - zenodo_directory (_type_): _description_ - output_path (_type_): _description_ - - Returns: - _type_: _description_ - """ - results = [] - for root, dirs, files in os.walk(zenodo_directory): - for file in files: - if file.endswith(".nc"): # Check if the file is a NetCDF file - file_path = os.path.join(root, file) - try: - # Open the NetCDF file using xarray - dataset = xr.open_dataset(file_path) - - # Extract values for rhoi and rhow - if 'rhoi' in dataset and 'rhow' in dataset: - rhoi_values = dataset['rhoi'].values - rhow_values = dataset['rhow'].values - - # Append the filename and values to the results list - results.append({ - 'filename': file, - 'rhoi': rhoi_values, - 'rhow': rhow_values - }) - - # Close the dataset - dataset.close() - except Exception as e: - print(f"Error processing {file}: {e}") - - densities = [] - for file in results: - if 'ctrl_proj' in file['filename']: - continue - elif 'ILTS' in file['filename']: - fp = file['filename'].split('_') - group = 'ILTS_PIK' - model = fp[-2] - else: - fp = file['filename'].split('_') - group = fp[-3] - model = fp[-2] - densities.append([group, model, file['rhoi'], file['rhow']]) - - df = pd.DataFrame(densities, columns=['group', 'model', 'rhoi', 'rhow']) - df['rhoi'], df['rhow'] = df.rhoi.astype('float'), df.rhow.astype('float') - df = df.drop_duplicates() - - if output_path is not None: - df.to_csv(output_path, index=False) - - return df - - -def calculate_ivaf_single_file(directory, densities, scalefac_model, ctrl_proj=False): - - resolution = 5 #km - - path = directory.split('/') - exp = path[-1] - model = path[-2] - group = path[-3] - - # MUN_GISM1 is corrupted, skip - if group == 'MUN' and model == 'GSM1': - return -1 - - # exp = 'expd08' - # model = 'ISSM2' - # group = 'AWI' - - - # lookup densities from csv - subset_densities = densities[(densities.group == group) & (densities.model == model)] - rhoi = subset_densities.rhoi.values[0] - rhow = subset_densities.rhow.values[0] - - # load data - try: # error with MUN_GSM1 (HDF Error), maybe corrupted? Doesn't work in Jupyter either. - bed = xr.open_dataset(os.path.join(directory, f'topg_GIS_{group}_{model}_{exp}.nc')) - except OSError: - return 0 - thickness = xr.open_dataset(os.path.join(directory, f'lithk_GIS_{group}_{model}_{exp}.nc')) - mask = xr.open_dataset(os.path.join(directory, f'sftgif_GIS_{group}_{model}_{exp}.nc')) - ground_mask = xr.open_dataset(os.path.join(directory, f'sftgrf_GIS_{group}_{model}_{exp}.nc')) - length_time = len(thickness.time) - - # fill na values with zero - - if np.any(thickness.lithk.isnull()) or np.any(mask.sftgif.isnull()) or np.any(ground_mask.sftgrf.isnull()): - thickness = thickness.fillna(0) - mask = mask.fillna(0) - ground_mask = ground_mask.fillna(0) - - # na_values = [np.any(thickness.lithk.isnull()), np.any(mask.sftgif.isnull()), np.any(ground_mask.sftgrf.isnull())] - # labels = ['thickness', 'mask', 'ground_mask'] - # nas = [labels[i] for i, x in enumerate(na_values) if x] - # print(f"{group}_{model}_{exp}: Null values found in {nas}, processing unsuccessful.") - # continue - - #! 
TODO: Ask about this - if len(set(thickness.y.values)) != len(scalefac_model.y.values): - bed['x'], bed['y'] = interpolate_values(bed) - thickness['x'], thickness['y'] = interpolate_values(thickness) - mask['x'], mask['y'] = interpolate_values(mask) - ground_mask['x'], ground_mask['y'] = interpolate_values(ground_mask) - # print(f"{group}_{model}_{exp}: y dimensions do not match scalefac_model, processing unsuccessful.") - # continue - - # clip masks if they are below 0 or above 1 - if np.min(mask.sftgif.values) < 0 or np.max(mask.sftgif.values) > 1: - mask['sftgif'] = np.clip(mask.sftgif, 0., 1.) - if np.min(ground_mask.sftgrf.values) < 0 or np.max(ground_mask.sftgrf.values) > 1: - ground_mask['sftgrf'] = np.clip(ground_mask.sftgrf, 0., 1.) - - # if time is not a dimension, add copies for each time step - if 'time' not in bed.dims or bed.dims['time'] == 1: - try: - bed = bed.drop_vars(['time',]) - except ValueError: - pass - bed = bed.expand_dims(dim={'time': length_time}) - - ivaf = np.zeros(bed.topg.values.shape) - for i in range(length_time): - # bed_values = bed.topg.values[i,:,:] if len(bed.topg.dims) == 3 else bed.topg.values # sometimes time is missing for dims, so just use x,y - thif = rhow / rhoi * np.min(bed.topg.values[i,:,:],0) - masked_output = (thickness.lithk[i, :, :] + thif) * ground_mask.sftgrf[i, :, :] * mask.sftgif[i, :, :] - ivaf[i, :, :] = masked_output * scalefac_model.af2.values * (resolution*1000)**2 - - # subtract out control if for an experment - ivaf_nc = bed.copy() # copy file structure and metadata for ivaf file - if not ctrl_proj: - # open control dataset - ivaf_ctrl = xr.open_dataset(os.path.join("/".join(path[:-1]), f'ctrl_proj/ivaf_GIS_{group}_{model}_ctrl_proj.nc')) - - # if the time lengths don't match (one goes for 85 years and the other 86) select only time frames that match - if ivaf_ctrl.time.values.shape[0] > ivaf.shape[0]: - ivaf_ctrl = ivaf_ctrl.isel(time=slice(0,ivaf.shape[0])) - ivaf_nc = ivaf_nc.drop_sel(time=ivaf_nc.time.values[:(ivaf.shape[0]-ivaf_ctrl.time.values.shape[0])]) # drop extra time steps - elif ivaf_ctrl.time.values.shape[0] < ivaf.shape[0]: - ivaf_nc = ivaf_nc.drop_sel(time=ivaf_nc.time.values[ivaf_ctrl.time.values.shape[0]-ivaf.shape[0]:]) # drop extra time steps - ivaf = ivaf[0:ivaf_ctrl.time.values.shape[0],:,:] - - else: - pass - - ivaf = ivaf_ctrl.ivaf.values - ivaf - else: - pass - - # save ivaf file - ivaf_nc['ivaf'] = (('time', 'y', 'x'), ivaf) - ivaf_nc = ivaf_nc.drop_vars(['topg',]) - ivaf_nc.to_netcdf(os.path.join(directory, f'ivaf_GIS_{group}_{model}_{exp}.nc')) - - print(f"{group}_{model}_{exp}: Processing successful.") - - return 1 - -def calculate_ivaf_minus_control(data_directory, densities, scalefac_path): - - # error handling for densities argument (must be str filepath or dataframe) - if densities_fp is None: - raise ValueError("densities_fp must be specified. 
Run get_model_densities() to get density data.") - if isinstance(densities_fp, str): - densities = pd.read_csv(densities) - elif isinstance(densities_fp, pd.DataFrame): - pass - else: - raise ValueError("densities argument must be a string or a pandas DataFrame.") - - scalefac_model = xr.open_dataset(scalefac_path) - - ctrl_proj_dirs = [] - exp_dirs = [] - for root, dirs, files in os.walk(data_directory): - for directory in dirs: - if "ctrl_proj" in directory: - ctrl_proj_dirs.append(os.path.join(root, directory)) - elif 'exp' in directory: - exp_dirs.append(os.path.join(root, directory)) - else: - pass - - - # first calculate ivaf for control projections - for directory in ctrl_proj_dirs: - calculate_ivaf_single_file(directory, densities, scalefac_model, ctrl_proj=True) - - # then, for each experiment, calculate ivaf and subtract out control - for directory in exp_dirs: - calculate_ivaf_single_file(directory, densities, scalefac_model, ctrl_proj=False) - - - return 1 - -calculate_ivaf_minus_control(data_directory, densities_fp, scalefac_fp) - - - - -stop = '' - - - \ No newline at end of file diff --git a/examples/gp.py b/examples/gp.py deleted file mode 100644 index 88f4552..0000000 --- a/examples/gp.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys -sys.path.append("..") -from ise.models.gp import kernels -from ise.pipelines import training -import numpy as np -import pandas as pd - -processed_data = r"/users/pvankatw/emulator/untracked_folder/ml_data_directory" -gp_save_dir = r'/users/pvankatw/emulator/untracked_folder/gp/year_temp_salinity_n10000/' -kernel = kernels.PowerExponentialKernel() + kernels.NuggetKernel() -training.train_gaussian_process(data_directory=processed_data, n=10000, - features=['year', 'temperature', 'salinity'], sampling_method='first_n', - kernel=kernel, verbose=True, save_directory=gp_save_dir) \ No newline at end of file diff --git a/examples/grids/process.py b/examples/grids/process.py new file mode 100644 index 0000000..73e067f --- /dev/null +++ b/examples/grids/process.py @@ -0,0 +1,49 @@ +from ise.data.process import DatasetMerger, DimensionalityReducer, ProjectionProcessor + +ice_sheet = "AIS" +print(f"ice sheet: {ice_sheet}") + +# all filepaths... 
+forcing_directory = r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/" +projections_directory = ( + r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Projection-AIS/" + if ice_sheet == "AIS" + else r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Projection-GrIS/" +) +scalefac_fp = ( + r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/af2_el_ismip6_ant_01.nc" + if ice_sheet == "AIS" + else r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Projection-GrIS/af2_ISMIP6_GrIS_05000m.nc" +) +densities_path = ( + r"/users/pvankatw/research/current/supplemental/AIS_densities.csv" + if ice_sheet == "AIS" + else r"/users/pvankatw/research/current/supplemental/GIS_densities.csv" +) + +scaler = 'robust' +output_dir = f"/oscar/scratch/pvankatw/datasets/pca_full_{scaler}/{ice_sheet}" +converted_forcing_dir = f"{output_dir}/forcings/" +converted_projection_dir = f"{output_dir}/projections/" +experiment_file = r"/users/pvankatw/research/current/ise/utils/ismip6_experiments_updated.csv" +# df = get_model_densities(r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/v7_CMIP5_pub", r"/users/pvankatw/research/current/supplemental/") + + +# Take both the forcing files and the projections, train PCA models, and convert forcings and projections to PCA space +pca = DimensionalityReducer( + forcing_dir=forcing_directory, + projection_dir=projections_directory, + output_dir=output_dir, + ice_sheet=ice_sheet, +) +pca.generate_pca_models(num_forcing_pcs='99%', num_projection_pcs='99.99%', scaling_method=scaler) +pca.convert_forcings(pca_model_directory=f"{output_dir}/pca_models/") +pca.convert_projections(pca_model_directory=f"{output_dir}/pca_models/") + +# Merge the converted forcings and projections into a single dataset +merger = DatasetMerger( + ice_sheet, converted_forcing_dir, converted_projection_dir, experiment_file, output_dir +) +merger.merge_dataset() + +print("Done!") diff --git a/examples/analyze_model.py b/examples/sectors/analyze_model.py similarity index 59% rename from examples/analyze_model.py rename to examples/sectors/analyze_model.py index 9cdb145..1608485 100644 --- a/examples/analyze_model.py +++ b/examples/sectors/analyze_model.py @@ -1,25 +1,26 @@ import sys -sys.path.append("..") -from ise.models.timeseries import TimeSeriesEmulator -from ise.pipelines.testing import analyze_model + import pandas as pd +sys.path.append("../..") +from ise.models.timeseries import TimeSeriesEmulator +from ise.pipelines.testing import analyze_model DATA_DIRECTORY = r"/users/pvankatw/emulator/untracked_folder/ml_data_directory" -PRETRAINED_MODELS = r'/users/pvankatw/emulator/ise/models/pretrained/' -UNTRACKED = r'/users/pvankatw/emulator/untracked_folder' +PRETRAINED_MODELS = r"/users/pvankatw/emulator/ise/models/pretrained/" +UNTRACKED = r"/users/pvankatw/emulator/untracked_folder" train_features = pd.read_csv(f"{DATA_DIRECTORY}/ts_train_features.csv") architecture = { - 'num_rnn_layers': 4, - 'num_rnn_hidden': 256, - 'input_layer_size': train_features.shape[1] + "num_rnn_layers": 4, + "num_rnn_hidden": 256, + "input_layer_size": train_features.shape[1], } -model_path = f'{PRETRAINED_MODELS}/Emulator.pt' +model_path = f"{PRETRAINED_MODELS}/Emulator.pt" -print('\nAnalyzing') +print("\nAnalyzing") analyze_model( data_directory=DATA_DIRECTORY, model_path=model_path, @@ -30,5 +31,5 @@ dropout_prob=0.3, mc_iterations=100, verbose=False, - save_directory=f'{UNTRACKED}/analyze_model' -) \ No newline at end of file + save_directory=f"{UNTRACKED}/analyze_model", +) diff --git 
a/examples/sectors/gp.py b/examples/sectors/gp.py new file mode 100644 index 0000000..bd3e149 --- /dev/null +++ b/examples/sectors/gp.py @@ -0,0 +1,18 @@ +import sys + +sys.path.append("../..") +from ise.models.gp import kernels +from ise.pipelines import training + +GP_SAVE_DIR = r"/users/pvankatw/emulator/untracked_folder/gp/year_temp_salinity_n10000/" +PROCESSED_DATA = r"/users/pvankatw/emulator/untracked_folder/ml_data_directory" +kernel = kernels.PowerExponentialKernel() + kernels.NuggetKernel() +training.train_gaussian_process( + data_directory=PROCESSED_DATA, + n=10000, + features=["year", "temperature", "salinity"], + sampling_method="first_n", + kernel=kernel, + verbose=True, + save_directory=GP_SAVE_DIR, +) diff --git a/examples/run.py b/examples/sectors/run.py similarity index 59% rename from examples/run.py rename to examples/sectors/run.py index e3b2d03..0226ce8 100644 --- a/examples/run.py +++ b/examples/sectors/run.py @@ -1,32 +1,37 @@ import sys -sys.path.append("..") -from ise.pipelines import processing, feature_engineering, training +sys.path.append("../..") + from ise.models.gp import kernels +from ise.pipelines import feature_engineering, processing, training FORCING_DIRECTORY = r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/" -ISMIP6_OUTPUT_DIRECTORY = r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/Zenodo_Outputs/" -GRIDS_DIRECTORY = r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/ISMIP6_sectors/" +ISMIP6_OUTPUT_DIRECTORY = ( + r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/Zenodo_Outputs/" +) +GRIDS_DIRECTORY = ( + r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/ISMIP6_sectors/" +) PROCESSED_FORCING_OUTPUTS = r"/users/pvankatw/emulator/untracked_folder/processed_forcing_outputs/" ML_DATA_DIRECTORY = r"/users/pvankatw/emulator/untracked_folder/ml_data_directory/" SAVED_MODEL_PATH = r"/users/pvankatw/emulator/untracked_folder/saved_models/" -print('1/4: Processing Data') +print("1/4: Processing Data") master = processing.process_data( - forcing_directory=FORCING_DIRECTORY, + forcing_directory=FORCING_DIRECTORY, grids_directory=GRIDS_DIRECTORY, ismip6_output_directory=ISMIP6_OUTPUT_DIRECTORY, export_directory=PROCESSED_FORCING_OUTPUTS, ) -print('2/4: Feature Engineering') +print("2/4: Feature Engineering") feature_engineering.feature_engineer( data_directory=PROCESSED_FORCING_OUTPUTS, time_series=True, export_directory=ML_DATA_DIRECTORY, ) -print('3/4: Training Neural Network Model') +print("3/4: Training Neural Network Model") model, metrics, test_preds = training.train_timeseries_network( data_directory=ML_DATA_DIRECTORY, save_model=SAVED_MODEL_PATH, @@ -34,12 +39,17 @@ epochs=10, ) -print('4/4: Training Gaussian Process Model') -kernel = kernels.PowerExponentialKernel(exponential=1.9, ) + kernels.NuggetKernel() +print("4/4: Training Gaussian Process Model") +kernel = ( + kernels.PowerExponentialKernel( + exponential=1.9, + ) + + kernels.NuggetKernel() +) preds, std_prediction, metrics = training.train_gaussian_process( data_directory=ML_DATA_DIRECTORY, n=1000, kernel=kernel, - features=['temperature', 'salinity'], - sampling_method='first_n', + features=["temperature", "salinity"], + sampling_method="first_n", ) diff --git a/ise/__init__.py b/ise/__init__.py index 39d97f6..188b0a2 100644 --- a/ise/__init__.py +++ b/ise/__init__.py @@ -6,12 +6,15 @@ The main features of ISE include loading and processing of ISMIP6 sea level contribution 
simulations, data preparation and feature engineering for machine learning, and training and testing of trained neural network emulators. +The package is divided into two sections: `sectors` and `grids`. The sectors module provides all necessary functions for +creating and training emulators based on the 18 ISMIP6 sectors, while the grids module provides the same functionality +for smaller kilometer-scale grids. # Quickstart To get started, you must first have access to the Globus Archive containing the ISMIP6 climate -forcings and ice-sheet model outputs located at [GHub-ISMIP6-Forcing](https://app.globus.org/file-manager?origin_id=ad1a6ed8-4de0-4490-93a9-8258931766c7&origin_path=%2F). -Do not change the file structure or directory tree. +forcings and ISMIP6 model outputs. For information on gaining access to these datasets, see the [ISMIP +wiki page](https://theghub.org/groups/ismip6/wiki). Next, clone the repository by running the following command in your terminal: ```shell @@ -23,6 +26,6 @@ pip install -e . ``` -*This repository is a work in progress that is actively being updated and improved. Feel free to contact Peter Van Katwyk, Ph.D. student @ Brown University at peter_van_katwyk@brown.edu with further questions.* +*This repository is a work in progress that is actively being updated and improved. Feel free to contact Peter Van Katwyk, Ph.D. Candidate @ Brown University at peter_van_katwyk@brown.edu with further questions.* """ diff --git a/ise/data/EmulatorData.py b/ise/data/EmulatorData.py deleted file mode 100644 index aef26e3..0000000 --- a/ise/data/EmulatorData.py +++ /dev/null @@ -1,537 +0,0 @@ -"""Module containing EmulatorData class with all associated methods and attributes. Primarily carries out data loading, feature engineering & processing of formatted data.""" -import random -import pandas as pd -import numpy as np -from sklearn import preprocessing as sp -from typing import List - -np.random.seed(10) - - -class EmulatorData: - """Class containing attributes and methods for storing and handling ISMIP6 ice sheet data.""" - - def __init__(self, directory: str, spatial_grouping: str = "sectors"): - """Initializes class and opens/stores data. Includes loading processed files from - ise.data.processors functions, converting IVAF to SLE, and saving initial condition data - for later use. - - Args: - directory (str): Directory containing processed files from ise.data.processors functions. Should contain master.csv. - spatial_grouping (str, optional): Spatial grouping to be used for data aggregation. Must be in [sectors, regions]. Defaults to 'sectors'. - - """ - - self.directory = directory - - if spatial_grouping not in ["sectors", "regions"]: - raise ValueError( - "spatial_grouping must be in ['sectors', 'regions']." - ) - self.spatial_grouping = spatial_grouping - - try: - self.data = pd.read_csv(f"{self.directory}/master.csv", low_memory=False) - except FileNotFoundError: - try: - self.inputs = pd.read_csv(f"{self.directory}/inputs.csv") - self.outputs = pd.read_csv(f"{self.directory}/outputs.csv") - - # TODO: merge self.inputs and self.ouputs to make self.data - except FileNotFoundError: - raise FileNotFoundError( - "Files not found, make sure to run all processing functions." 
- ) - - # convert to SLE - self.data["sle"] = ( - self.data.ivaf / 1e9 / 362.5 - ) # m^3 / 1e9 / 362.5 = Gt / 1 mm --> mm SLE - self.data["modelname"] = self.data.groupname + "_" + self.data.modelname - - # Save data on initial conditions for use in data splitting - unique_batches = ( - self.data.groupby(["modelname", self.spatial_grouping, "exp_id"]) - .size() - .reset_index() - .rename(columns={0: "count"}) - .drop(columns="count") - ) - self.batches = unique_batches.values.tolist() - self.output_columns = [ - "icearea", - "iareafl", - "iareagr", - "ivol", - "ivaf", - "smb", - "smbgr", - "bmbfl", - "sle", - ] - - for col in [ - "icearea", - "iareafl", - "iareagr", - "ivol", - "smb", - "smbgr", - "bmbfl", - "sle", - ]: - self.data[col] = self.data[col].fillna(self.data[col].mean()) - - self.X = None - self.y = None - self.scaler_X = None - self.scaler_y = None - - def process( - self, - target_column: str = "sle", - drop_missing: bool = True, - drop_columns: List[str] = True, - boolean_indices: bool = True, - scale: bool = True, - split_type: str = "batch", - drop_outliers: str = False, - drop_expression: List[tuple] = None, - time_series: bool = False, - lag: int = None, - ): - """Carries out feature engineering & processing of formatted data. Includes dropping missing - values, eliminating columns, creating boolean indices of categorical variables, scaling, - dataset splitting, and more. Refer to the individual functions contained in the source - code for more information on each process. - - Args: - target_column (str, optional): Column to be predicted. Defaults to 'sle'. - drop_missing (bool, optional): Flag denoting whether to drop missing values. Defaults to True. - drop_columns (List[str], optional): List containing which columns (variables) to be dropped. Should be List[str] or boolean. If True is chosen, columns are dropped that will result in optimal performance. Defaults to True. - boolean_indices (bool, optional): Flag denoting whether to create boolean indices for all categorical variables left after dropping columns. Defaults to True. - scale (bool, optional): Flag denoting whether to scale data between zero and 1. Sklearn's MinMaxScaler is used. Defaults to True. - split_type (str, optional): Method to split data into training and testing set, must be in [random, batch]. Random is not recommended but is included for completeness. Defaults to 'batch'. - drop_outliers (str, optional): Method by which outliers will be dropped, must be in [quantile, explicit]. Defaults to False. - drop_expression (list[tuple], optional): Expressions by which to drop outliers, see EmulatorData.drop_outliers. If drop_outliers==quantile, drop_expression must be a list[float] containing quantile bounds. Defaults to None. - time_series (bool, optional): Flag denoting whether to process the data as a time-series dataset or traditional non-time dataset. Defaults to False. - lag (int, optional): Lag variable for time-series processing. Defaults to None. 
- - Returns: - tuple: Multi-output returning [EmulatorData, train_features, test_features, train_labels, test_labels] - """ - - if time_series: - if lag is None: - raise ValueError("If time_series == True, lag cannot be None") - time_dependent_columns = [ - "salinity", - "temperature", - "thermal_forcing", - "pr_anomaly", - "evspsbl_anomaly", - "mrro_anomaly", - "smb_anomaly", - "ts_anomaly", - ] - separated_dfs = [ - y for x, y in self.data.groupby([self.spatial_grouping, "exp_id", "modelname"]) - ] - - for df in separated_dfs: - for shift in range(1, lag + 1): - for column in time_dependent_columns: - df[f"{column}.lag{shift}"] = df[column].shift( - shift, fill_value=np.nan - ).fillna(method='bfill') - - self.data = pd.concat(separated_dfs) - - if drop_columns: - if drop_columns is True: - secondary_spatial_grouping = 'regions' if self.spatial_grouping == 'sectors' else 'sectors' - self.drop_columns( - columns=["experiment", "exp_id", "groupname", secondary_spatial_grouping] - ) - elif isinstance(drop_columns, list): - self.drop_columns(columns=drop_columns) - else: - raise ValueError( - f"drop_columns argument must be of type boolean|list, received {type(drop_columns)}" - ) - print('drop_missing') - if drop_missing: - self = self.drop_missing() - - if self.spatial_grouping == 'regions': - self = self.group_by_region() - - if boolean_indices: - self = self.create_boolean_indices() - - print('drop_outliers') - if drop_outliers: - if drop_outliers.lower() == "quantile": - self = self.drop_outliers( - method=drop_outliers, quantiles=drop_expression - ) - elif drop_outliers.lower() == "explicit": - self = self.drop_outliers( - method=drop_outliers, expression=drop_expression - ) - else: - raise ValueError( - "drop_outliers argument must be in [quantile, explicit]" - ) - - print('split_data') - self = self.split_data(target_column=target_column) - - if scale: - self.X = self.scale(self.X, "inputs", scaler="MinMaxScaler") - if target_column == "sle": - self.y = np.array(self.y) - else: - self.y = self.scale(self.y, "outputs", scaler="MinMaxScaler") - - print('train_test_split') - self = self.train_test_split(split_type=split_type) - - return ( - self, - self.train_features, - self.test_features, - self.train_labels, - self.test_labels, - ) - - def drop_outliers( - self, - method: str, - expression: List[tuple] = None, - quantiles: List[float] = [0.01, 0.99], - ): - """Drops simulations that are outliers based on the provided method and expression. - Extra complexity is handled due to the necessity of removing the entire 85 row series from - the dataset rather than simply removing the rows with given conditions. Note that the - condition indicates rows to be DROPPED, not kept (e.g. 'sle', '>', '20' would drop all - simulations containing sle values over 20). If quantile method is used, outliers are dropped - from the SLE column based on the provided quantile in the quantiles argument. If explicit is - chosen, expression must contain a list of tuples such that the tuple contains - [(column, operator, expression)] of the subset, e.g. [("sle", ">", 20), ("sle", "<", -20)]. - - Args: - method (str): Method of outlier deletion, must be in [quantile, explicit] - expression (list[tuple]): List of subset expressions in the form [column, operator, value], defaults to None. - quantiles (list[float]): , defaults to [0.01, 0.99]. - - Returns: - EmulatorData: self, with self.data having outliers dropped. 
- """ - - if method.lower() == "quantile": - if quantiles is None: - raise AttributeError( - "If method == quantile, quantiles argument cannot be None" - ) - lower_sle, upper_sle = np.quantile(np.array(self.data.sle), quantiles) - outlier_data = self.data[ - (self.data["sle"] <= lower_sle) | (self.data["sle"] >= upper_sle) - ] - elif method.lower() == "explicit": - - if expression is None: - raise AttributeError( - "If method == explicit, expression argument cannot be None" - ) - elif not isinstance(expression, list) or not isinstance( - expression[0], tuple - ): - raise AttributeError( - 'Expression argument must be a list of tuples, e.g. [("sle", ">", 20), ("sle", "<", -20)]' - ) - - outlier_data = self.data - for subset_expression in expression: - column, operator, value = subset_expression - - if operator.lower() in ("equal", "equals", "=", "=="): - outlier_data = outlier_data[outlier_data[column] == value] - elif operator.lower() in ("not equal", "not equals", "!=", "~="): - outlier_data = outlier_data[outlier_data[column] != value] - elif operator.lower() in ("greather than", "greater", ">=", ">"): - outlier_data = outlier_data[outlier_data[column] > value] - elif operator.lower() in ("less than", "less", "<=", "<"): - outlier_data = outlier_data[outlier_data[column] < value] - else: - raise ValueError( - f'Operator must be in ["==", "!=", ">", "<"], received {operator}' - ) - - if outlier_data.empty: - return self - - cols = outlier_data.columns - nonzero_columns = outlier_data.apply(lambda x: x > 0).apply( - lambda x: list(cols[x.values]), axis=1 - ) - - # Create dataframe of experiments with outliers (want to delete the entire 85 rows) - outlier_runs = pd.DataFrame() - outlier_runs["modelname"] = nonzero_columns.apply(lambda x: x[-6]) - outlier_runs["exp_id"] = nonzero_columns.apply(lambda x: x[-5]) - outlier_runs[self.spatial_grouping] = outlier_data[self.spatial_grouping] - outlier_runs_list = outlier_runs.values.tolist() - unique_outliers = [list(x) for x in set(tuple(x) for x in outlier_runs_list)] - - # Drop those runs - for i in unique_outliers: - modelname = i[0] - exp_id = i[1] - sector = i[2] - self.data = self.data.drop( - self.data[ - (self.data[modelname] == 1) - & (self.data[exp_id] == 1) - & (self.data[self.spatial_grouping] == sector) - ].index - ) - - return self - - - def group_by_region(self): - # self.data = self.data.groupby(by=[self.data.regions, self.data.modelname, self.data.exp_id, self.data.year]).mean() - agg_dict = {} - cols = [x for x in self.data.columns if self.data[x].dtype not in ['object', 'str']] - for x in cols: - if x == 'sle': - agg_dict['sle'] = 'sum' - elif x in ('regions', 'modelname', 'exp_id', 'year'): - pass - else: - agg_dict[x] = 'mean' - - self.data = self.data.groupby(by=[self.data.regions, self.data.modelname, self.data.exp_id, self.data.year]).agg(agg_dict) - - self.data = self.data.reset_index() - self.data = self.data.drop(columns=['sectors']) - return self - - def split_data( - self, - target_column: str, - ): - """Splits data into features and labels based on target column. - - Args: - target_column (str): Output column to be predicted. - - Returns: - EmulatorData: self, with self.X and self.y as attributes. - """ - self.target_column = target_column - self.X = self.data.drop(columns=self.output_columns) - self.y = self.data[target_column] - self.input_columns = self.X.columns - return self - - def train_test_split(self, train_size: float = 0.7, split_type: str = "batch"): - """Splits dataset into training set and testing set. 
Can be split using two different - methods: random and batch. The random method splits by randomly sampling rows, whereas - batch method randomly samples entire simulation series (85 rows) in order to keep simulations - together during testing. Random method is included for completeness but is not recommended - for use in emulator creation. - - Args: - train_size (float, optional): Proportion of data in training set, between 0 and 1. Defaults to 0.7. - split_type (str, optional): Splitting method, must be in [random, batch]. Defaults to 'batch'. - - Returns: - EmulatorData: self, with self.train_features, self.test_features, self.train_labels, self.test_labels as attributes. - """ - - if not isinstance(self.X, pd.DataFrame): - self.X = pd.DataFrame(self.X, columns=self.input_columns) - - if "random" in split_type.lower(): - self.train_features = self.X.sample(frac=train_size, random_state=0) - training_indices = self.train_features.index - self.train_labels = self.y[training_indices].squeeze() - - self.test_features = self.X.drop(training_indices) - self.test_labels = pd.Series(self.y.squeeze()).drop(training_indices) - - elif split_type.lower() == "batch": - # batch -- grouping of 85 years of a particular model, experiment, and sector - # Calculate how many batches you'll need (roughly) for train/test proportion - test_num_rows = len(self.X) * (1 - train_size) - num_years = len(set(self.data.year)) - num_test_batches = test_num_rows // num_years - - # Get all possible values for sector, experiment, and model - all_sectors = list(set(self.X[self.spatial_grouping])) - all_experiments = [col for col in self.X.columns if "exp_id" in col] - all_modelnames = [col for col in self.X.columns if "modelname" in col] - - # Set up concatenation of test data scenarios... - test_scenarios = [] - test_dataset = pd.DataFrame() - - # Keep this running until you have enough samples - np.random.seed(10) - while len(test_scenarios) < num_test_batches: - print(len(test_scenarios), '/', num_test_batches, end='\r') - # Get a random - random_model = np.random.choice(all_modelnames) - random_sector = np.random.choice(all_sectors) - random_experiment = np.random.choice(all_experiments) - test_scenario = [random_model, random_sector, random_experiment] - if test_scenario not in test_scenarios: - scenario_df = self.X[ - (self.X[random_model] == 1) - & (self.X[self.spatial_grouping] == random_sector) - & (self.X[random_experiment] == 1) - ] - if not scenario_df.empty: - test_scenarios.append(test_scenario) - test_dataset = pd.concat([test_dataset, scenario_df]) - self.test_features = test_dataset - testing_indices = self.test_features.index - self.test_labels = self.y[testing_indices].squeeze() - - self.train_features = self.X.drop(testing_indices) - self.train_labels = pd.Series(self.y.squeeze()).drop(testing_indices) - - self.test_scenarios = test_scenarios - - else: - raise (f"split_type must be in [random, batch], received {split_type}") - - return self - - def drop_missing(self): - """Drops rows with missing values (wrapper for pandas.DataFrame.dropna()). - - Returns: - EmulatorData: self, with NA values dropped from self.data. - """ - self.data = self.data.dropna() - return self - - def create_boolean_indices(self, columns: str = "all"): - """Creates boolean indices (one hot encoding) for categoritcal variables in columns - argument. Wrapper for pandas.get_dummies() with added functionality for prefix separation. - - Args: - columns (str, optional): Categorical variables to be encoded. Defaults to 'all'. 
- - Returns: - EmulatorData: self, with boolean indices in self.data. - """ - if columns == "all": - self.data = pd.get_dummies(self.data, prefix_sep="-") - else: - if not isinstance(columns, list): - raise ValueError( - f"Columns argument must be of type: list, received {type(columns)}." - ) - - self.data = pd.get_dummies(self.data, columns=columns, prefix_sep="-") - - for col in self.data.columns: - self.data[col] = self.data[col].astype(float) - return self - - def drop_columns(self, columns: List[str]): - """Drops columns in columns argument from the dataset. Wrapper for pandas.DataFrame.drop() - with error checking. - - Args: - columns (List[str]): List of columns (or singular string column) to be dropped from the dataset. - - Returns: - EmulatorData: self, with desired columns dropped from self.data. - """ - if not isinstance(columns, list) and not isinstance(columns, str): - raise ValueError( - f"Columns argument must be of type: str|list, received {type(columns)}." - ) - columns = list(columns) - - self.data = self.data.drop(columns=columns) - - return self - - def scale( - self, values: pd.DataFrame, values_type: str, scaler: str = "MinMaxScaler" - ): - """Scales dataframe and saves scaler for future use in unscaling. Sklearn's scaling API is - used. MinMaxScaler is recommended but StandardScaler is also supported. - - Args: - values (pd.DataFrame): Dataframe to be scaled. - values_type (str): Whether the dataframe to be scaled is a feature or labels dataframe, must be in [inputs, outputs] - scaler (str, optional): Type of scaler to be used, must be in [MinMaxScaler, StandardScaler]. Defaults to "MinMaxScaler". - - Returns: - pd.DataFrame: scaled dataset with self.scaler_X and self.scaler_y as attributes in the EmulatorData class. - """ - if self.X is None and self.y is None: - raise AttributeError( - "Data must be split before scaling using model.split_data method." - ) - - if "minmax" in scaler.lower(): - if "input" in values_type.lower(): - self.scaler_X = sp.MinMaxScaler() - else: - self.scaler_y = sp.MinMaxScaler() - elif "standard" in scaler.lower(): - if "input" in values_type.lower(): - self.scaler_X = sp.StandardScaler() - else: - self.scaler_y = sp.StandardScaler() - else: - raise ValueError( - f"scaler argument must be in ['MinMaxScaler', 'StandardScaler'], received {scaler}" - ) - - if "input" in values_type.lower(): - self.input_columns = self.X.columns - self.scaler_X.fit(self.X) - return pd.DataFrame(self.scaler_X.transform(values), columns=self.X.columns) - - # TODO: Don't need this anymore with SLE as the prediction - elif "output" in values_type.lower(): - self.scaler_y.fit(np.array(self.y).reshape(-1, 1)) - return self.scaler_y.transform(np.array(values).reshape(-1, 1)) - - else: - raise ValueError( - f"values_type must be in ['inputs', 'outputs'], received {values_type}" - ) - - def unscale(self, values: pd.DataFrame, values_type: str): - """Unscales data based on scalers trained in EmulatorData.scale(). - - Args: - values (pd.DataFrame): Dataframe to be unscaled. - values_type (str): Whether the dataframe to be unscaled is a feature or labels dataframe, must be in [inputs, outputs] - - Returns: - pd.DataFrame: unscaled dataset. 
- """ - - if "input" in values_type.lower(): - return pd.DataFrame( - self.scaler_X.inverse_transform(values), columns=self.input_columns - ) - - elif "output" in values_type.lower(): - return self.scaler_y.inverse_transform(values.reshape(-1, 1)) - - else: - raise ValueError( - f"values_type must be in ['inputs', 'outputs'], received {values_type}" - ) diff --git a/ise/data/__init__.py b/ise/data/__init__.py index 755638d..e69de29 100644 --- a/ise/data/__init__.py +++ b/ise/data/__init__.py @@ -1,8 +0,0 @@ -r""" -# [EmulatorData](https://brown-sciml.github.io/ise/ise/data/EmulatorData.html) -Module containing EmulatorData class with all associated methods and attributes. Primarily carries out data loading, feature engineering & processing of formatted data. - -# [processors](https://brown-sciml.github.io/ise/ise/data/processors.html) -""" - -from ise.data.EmulatorData import EmulatorData diff --git a/ise/data/dataclasses.py b/ise/data/dataclasses.py new file mode 100644 index 0000000..71c02b1 --- /dev/null +++ b/ise/data/dataclasses.py @@ -0,0 +1,214 @@ +import warnings + +import numpy as np +import pandas as pd +import torch +from torch.utils.data import Dataset + + +class EmulatorDataset(Dataset): + """ + A PyTorch dataset for loading emulator data. + + Args: + X (pandas.DataFrame, numpy.ndarray, or torch.Tensor): The input data. + y (pandas.DataFrame, numpy.ndarray, or torch.Tensor): The target data. + sequence_length (int): The length of the input sequence. + + Attributes: + X (torch.Tensor): The input data as a PyTorch tensor. + y (torch.Tensor): The target data as a PyTorch tensor. + sequence_length (int): The length of the input sequence. + + Methods: + __to_tensor(x): Converts input data to a PyTorch tensor. + __len__(): Returns the length of the dataset. + __getitem__(i): Returns the i-th item in the dataset. + """ + + def __init__(self, X, y, sequence_length=5): + super().__init__() + + if X.shape[0] < 86: + warnings.warn( + "Full projections of 86 timesteps are not present in the dataset. This may lead to unexpected behavior." + ) + self.X = self._to_tensor(X) + self.y = self._to_tensor(y) + self.sequence_length = sequence_length + self.xdim = len(X.shape) + + if self.xdim == 3: # batched by projection + self.num_projections, self.num_timesteps, self.num_features = X.shape + elif self.xdim == 2: # unbatched (rows of projections*timestamps) + self.projections_and_timesteps, self.features = X.shape + self.num_timesteps = 86 + self.num_projections = self.projections_and_timesteps // self.num_timesteps + # self.num_sequences = self.timesteps - sequence_length + 1 + + def _to_tensor(self, x): + """ + Converts input data to a PyTorch tensor of type float. + + Args: + x: Input data to be converted. Must be a pandas dataframe, numpy array, or PyTorch tensor. + + Returns: + A PyTorch tensor of type float. + """ + if x is None: + return None + if isinstance(x, pd.DataFrame): + x = torch.tensor(x.values) + elif isinstance(x, np.ndarray): + x = torch.tensor(x) + elif isinstance(x, torch.Tensor): + pass + else: + raise ValueError("Data must be a pandas dataframe, numpy array, or PyTorch tensor") + return x.float() + + def __len__(self): + if self.xdim == 2: + return self.X.shape[0] + else: + return self.X.shape[0] * self.X.shape[1] + + def __getitem__(self, i): + """ + Returns the i-th item in the dataset. + + Args: + i (int): Index of the item to retrieve. + + Returns: + If `y` is None, returns the input sequence at index `i` as a PyTorch tensor. 
+ Otherwise, returns a tuple containing the input sequence at index `i` and the corresponding target value. + """ + # Calculate projection index and timestep index + projection_index = i // self.num_timesteps + time_step_index = i % self.num_timesteps + + # Initialize a sequence with zeros for padding + sequence = torch.zeros((self.sequence_length, self.features)) + + # Calculate start and end points for the data to copy from the original dataset + start_point = max(0, time_step_index - self.sequence_length + 1) + end_point = time_step_index + 1 + length_of_data = end_point - start_point + + # Copy the data from the dataset to the end of the sequence to preserve recent data at the end + if self.xdim == 3: + sequence[-length_of_data:] = self.X[projection_index, start_point:end_point] + elif self.xdim == 2: + sequence[-length_of_data:] = self.X[ + projection_index * self.num_timesteps + + start_point : projection_index * self.num_timesteps + + end_point + ] + + if self.y is None: + return sequence + + return sequence, self.y[i] + + +class PyTorchDataset(Dataset): + """ + A PyTorch dataset for general data loading. + + Args: + X (pandas.DataFrame, numpy.ndarray, or torch.Tensor): The input data. + y (pandas.DataFrame, numpy.ndarray, or torch.Tensor): The target data. + + Methods: + __getitem__(index): Returns the item at the given index. + __len__(): Returns the length of the dataset. + """ + + def __init__(self, X, y): + self.X_data = X + self.y_data = y + + def __getitem__(self, index): + """ + Returns the item at the given index. + + Args: + index (int): Index of the item to retrieve. + + Returns: + If `y` is None, returns the input data at index `index`. + Otherwise, returns a tuple containing the input data at index `index` and the corresponding target value. + """ + if self.y_data is None: + return self.X_data[index] + return self.X_data[index], self.y_data[index] + + def __len__(self): + """ + Returns the length of the dataset. + + Returns: + The length of the dataset. + """ + return len(self.X_data) + + +class TSDataset(Dataset): + """ + A PyTorch dataset for time series data. + + Args: + X (pandas.DataFrame, numpy.ndarray, or torch.Tensor): The input data. + y (pandas.DataFrame, numpy.ndarray, or torch.Tensor): The target data. + sequence_length (int): The length of the input sequence. + + Attributes: + X (torch.Tensor): The input data as a PyTorch tensor. + y (torch.Tensor): The target data as a PyTorch tensor. + sequence_length (int): The length of the input sequence. + + Methods: + __len__(): Returns the length of the dataset. + __getitem__(i): Returns the i-th item in the dataset. + """ + + def __init__(self, X, y, sequence_length=5): + super().__init__() + self.X = X + self.y = y + self.sequence_length = sequence_length + + def __len__(self): + """ + Returns the length of the dataset. + + Returns: + The length of the dataset. + """ + return len(self.X) + + def __getitem__(self, i): + """ + Returns the i-th item in the dataset. + + Args: + i (int): Index of the item to retrieve. + + Returns: + If `y` is None, returns the input sequence at index `i` as a PyTorch tensor. + Otherwise, returns a tuple containing the input sequence at index `i` and the corresponding target value. 
+ """ + if i >= self.sequence_length - 1: + i_start = i - self.sequence_length + 1 + x = self.X[i_start : (i + 1), :] + else: + padding = self.X[0].repeat(self.sequence_length - i - 1, 1) + x = self.X[0 : (i + 1), :] + x = torch.cat((padding, x), 0) + + if self.y is None: + return x + + return x, self.y[i] diff --git a/ise/data/feature_engineer.py b/ise/data/feature_engineer.py new file mode 100644 index 0000000..bb050c2 --- /dev/null +++ b/ise/data/feature_engineer.py @@ -0,0 +1,612 @@ +import os +import pickle +from typing import List +import torch + +import numpy as np +import pandas as pd +from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler +from tqdm import tqdm + + +class FeatureEngineer: + """ + A class for feature engineering operations on a given dataset. + + Args: + data (pd.DataFrame): The input dataset. + fill_mrro_nans (bool, optional): Flag indicating whether to fill missing values in the 'mrro' column. Defaults to False. + split_dataset (bool, optional): Flag indicating whether to split the dataset into training, validation, and test sets. Defaults to False. + train_size (float, optional): The proportion of the dataset to be used for training. Defaults to 0.7. + val_size (float, optional): The proportion of the dataset to be used for validation. Defaults to 0.15. + test_size (float, optional): The proportion of the dataset to be used for testing. Defaults to 0.15. + output_directory (str, optional): The directory to save the split datasets. Defaults to None. + """ + + def __init__( + self, + ice_sheet, + data: pd.DataFrame, + fill_mrro_nans: bool = False, + split_dataset: bool = False, + train_size: float = 0.7, + val_size: float = 0.15, + test_size: float = 0.15, + output_directory: str = None, + ): + self.data = data + try: + self.data = self.data.sort_values(by=["model", "exp", "sector", "year"]) + except: + pass + self.train_size = train_size + self.val_size = val_size + self.test_size = test_size + self.output_directory = output_directory + + self.scaler_X_path = None + self.scaler_y_path = None + self.scaler_X = None + self.scaler_y = None + + if fill_mrro_nans: + self.data = self.fill_mrro_nans(method="zero") + + if split_dataset: + self.train, self.val, self.test = self.split_data( + data, train_size, val_size, test_size, output_directory, random_state=42 + ) + self._including_model_characteristics = False + + + self.train = None + self.val = None + self.test = None + + def split_data( + self, + data=None, + train_size=None, + val_size=None, + test_size=None, + output_directory=None, + random_state=42, + ): + """ + Splits the dataset into training, validation, and test sets. + + Args: + data (pd.DataFrame, optional): The input dataset. If not provided, the class attribute 'data' will be used. Defaults to None. + train_size (float, optional): The proportion of the dataset to be used for training. If not provided, the class attribute 'train_size' will be used. Defaults to None. + val_size (float, optional): The proportion of the dataset to be used for validation. If not provided, the class attribute 'val_size' will be used. Defaults to None. + test_size (float, optional): The proportion of the dataset to be used for testing. If not provided, the class attribute 'test_size' will be used. Defaults to None. + output_directory (str, optional): The directory to save the split datasets. If not provided, the class attribute 'output_directory' will be used. Defaults to None. + random_state (int, optional): The random seed for reproducibility. 
Defaults to 42. + + Returns: + tuple: A tuple containing the training, validation, and test sets. + """ + if data is not None: + self.data = data + if train_size is not None: + self.train_size = train_size + if val_size is not None: + self.val_size = val_size + if output_directory is not None: + self.output_directory = output_directory + + self.train, self.val, self.test = split_training_data( + self.data, + self.train_size, + self.val_size, + self.test_size, + self.output_directory, + random_state, + ) + return self.train, self.val, self.test + + def fill_mrro_nans(self, method, data=None): + """ + Fills missing values in the 'mrro' column of the dataset. + + Args: + method (str): The method to use for filling missing values. + data (pd.DataFrame, optional): The input dataset. If not provided, the class attribute 'data' will be used. Defaults to None. + + Returns: + pd.DataFrame: The dataset with missing values in the 'mrro' column filled. + """ + if data is not None: + self.data = data + self.data = fill_mrro_nans(self.data, method) + + return self.data + + def scale_data(self, X=None, y=None, method="standard", save_dir=None): + if X is not None: + self.X = X + else: + if self._including_model_characteristics: + dropped_columns = [ + "id", + "cmip_model", + "pathway", + "exp", + "ice_sheet", + "Scenario", + "Tier", + "aogcm", + "id", + "exp", + "model", + "ivaf", + ] + else: + dropped_columns = [ + "id", + "cmip_model", + "pathway", + "exp", + "ice_sheet", + "Scenario", + "Ocean forcing", + "Ocean sensitivity", + "Ice shelf fracture", + "Tier", + "aogcm", + "id", + "exp", + "model", + "ivaf", + ] + dropped_columns = [x for x in self.data.columns if x in dropped_columns] + dropped_data = self.data[dropped_columns] + self.X = self.data.drop( + columns=[x for x in self.data.columns if "sle" in x] + dropped_columns + ) + + if y is not None: + self.y = y + else: + self.y = self.data[[x for x in self.data.columns if "sle" in x]] + + if self.scaler_X_path is not None and self.scaler_y_path is not None: + scaler_X = pickle.load(open(self.scaler_X_path, "rb")) + scaler_y = pickle.load(open(self.scaler_y_path, "rb")) + + return scaler_X.transform(self.X), scaler_y.transform(self.y) + elif self.scaler_X is not None and self.scaler_y is not None: + return self.scaler_X.transform(self.X), self.scaler_y.transform(self.y) + + if (self.X is None and X is None) or (self.y is None and y is None): + raise ValueError( + "X and y must be provided if they are not already stored in the class instance." 
+ ) + + # Initialize the scalers based on the method + if method == "standard": + scaler_X = StandardScaler() + scaler_y = StandardScaler() + elif method == "minmax": + scaler_X = MinMaxScaler() + scaler_y = MinMaxScaler() + elif method == "robust": + scaler_X = RobustScaler() + scaler_y = RobustScaler() + else: + raise ValueError("method must be 'standard', 'minmax', or 'robust'") + + # Store scalers in the class instance for potential future use + self.scaler_X, self.scaler_y = scaler_X, scaler_y + + # Fit and transform X + if isinstance(self.X, pd.DataFrame): + X_data = self.X.values + elif isinstance(self.X, np.ndarray): + X_data = self.X + else: + raise TypeError("X must be either a pandas DataFrame or a NumPy array.") + + scaler_X.fit(X_data) + X_scaled = scaler_X.transform(X_data) + + # Fit and transform X + if isinstance(self.y, pd.DataFrame): + y_data = self.y.values + elif isinstance(self.y, np.ndarray): + y_data = self.y + else: + raise TypeError("X must be either a pandas DataFrame or a NumPy array.") + + scaler_y.fit(y_data) + y_scaled = scaler_y.transform(y_data) + self.scaler_X, self.scaler_y = scaler_X, scaler_y + + # Optionally save the scalers + if save_dir is not None: + if os.path.exists(f'{save_dir}/scalers/'): + self.scaler_X_path = f"{save_dir}/scalers/scaler_X.pkl" + self.scaler_y_path = f"{save_dir}/scalers/scaler_y.pkl" + else: + self.scaler_X_path = f"{save_dir}/scaler_X.pkl" + self.scaler_y_path = f"{save_dir}/scaler_y.pkl" + with open(self.scaler_X_path, "wb") as f: + pickle.dump(scaler_X, f) + with open(self.scaler_y_path, "wb") as f: + pickle.dump(scaler_y, f) + + self.data = pd.concat( + [ + pd.DataFrame(X_scaled, columns=self.X.columns, index=self.X.index), + pd.DataFrame(y_scaled, columns=self.y.columns, index=self.y.index), + dropped_data, + ], + axis=1, + ) + + return X_scaled, y_scaled + + def unscale_data(self, X=None, y=None, scaler_X_path=None, scaler_y_path=None): + + if scaler_X_path is not None: + self.scaler_X_path = scaler_X_path + if scaler_y_path is not None: + self.scaler_y_path = scaler_y_path + + if isinstance(y, torch.Tensor): + y = y.detach().cpu().numpy() + + # Load scaler for X + if X is not None: + if self.scaler_X_path is None: + raise ValueError("scaler_X_path must be provided if X is not None.") + with open(self.scaler_X_path, "rb") as f: + scaler_X = pickle.load(f) + X_unscaled = scaler_X.inverse_transform(X) + if isinstance(X, pd.DataFrame): + X_unscaled = pd.DataFrame(X_unscaled, columns=X.columns, index=X.index) + else: + X_unscaled = None + + # Load scaler for y + if y is not None: + if self.scaler_y_path is None: + raise ValueError("scaler_y_path must be provided if y is not None.") + with open(self.scaler_y_path, "rb") as f: + scaler_y = pickle.load(f) + y_unscaled = scaler_y.inverse_transform(y) + if isinstance(y, pd.DataFrame): + y_unscaled = pd.DataFrame(y_unscaled, columns=y.columns, index=y.index) + else: + y_unscaled = None + + return X_unscaled, y_unscaled + + def add_lag_variables(self, lag, data=None): + if data is not None: + self.data = data + self.data = add_lag_variables(self.data, lag) + return self + + def backfill_outliers(self, percentile=99.999, data=None): + if data is not None: + self.data = data + self.data = backfill_outliers(self.data, percentile=percentile) + return self + + def drop_outliers(self, method, column, expression=None, quantiles=[0.01, 0.99], data=None): + if data is not None: + self.data = data + self.data = drop_outliers(self.data, column, method, expression, quantiles) + return self + + 
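For orientation, here is a minimal usage sketch of the `FeatureEngineer` API added in this file (constructor, `add_lag_variables`, `drop_outliers`, `scale_data`, `split_data`). It is illustrative only and not part of the diff: the CSV path is a placeholder, and it assumes a merged dataset of complete 86-year projections with `id`, `model`, `exp`, `sector`, `year`, and `sle` columns, which is what the surrounding code expects.

```python
import pandas as pd

from ise.data.feature_engineer import FeatureEngineer

# Placeholder path; the frame is assumed to hold complete 86-year projections
# with 'id', 'model', 'exp', 'sector', 'year', and 'sle' columns.
df = pd.read_csv("/path/to/AIS_merged_dataset.csv")

# fill_mrro_nans=True zero-fills missing runoff ('mrro') columns at load time.
fe = FeatureEngineer(ice_sheet="AIS", data=df, fill_mrro_nans=True)

# Append lagged copies of the forcing variables (here, 5 prior timesteps).
fe.add_lag_variables(lag=5)

# Drop whole simulations whose target values fall outside the 1st/99th percentiles.
fe.drop_outliers(method="quantile", column="sle", quantiles=[0.01, 0.99])

# Scale features/targets, then split by projection id so each 86-year
# series lands entirely in one of train/val/test.
X_scaled, y_scaled = fe.scale_data(method="robust")
train, val, test = fe.split_data(train_size=0.7, val_size=0.15, test_size=0.15)
```

The constructor's `split_dataset=True` option is meant to perform the same split up front, but `__init__` as written re-initializes `self.train`, `self.val`, and `self.test` to `None` immediately afterwards, so calling `split_data` explicitly is the more predictable route.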
def add_model_characteristics(self, data=None, model_char_path=r'./ise/utils/model_characteristics.csv', encode=True): + if data is not None: + self.data = data + self.data = add_model_characteristics(self.data, model_char_path, encode) + self._including_model_characteristics = True + + return self + + +def add_model_characteristics(data, model_char_path=r'./ise/utils/model_characteristics.csv', encode=True) -> pd.DataFrame: + model_chars = pd.read_csv(model_char_path) + all_data = pd.merge(data, model_chars, on='model', how='left') + existing_char_columns = ['Ocean forcing', 'Ocean sensitivity', 'Ice shelf fracture'] # These are the columns that are already in the data and should not be encoded + if encode: + all_data = pd.get_dummies(all_data, columns=[x for x in model_chars.columns if x not in ['initial_year', 'model', 'Scenario', ]] + existing_char_columns) + + return all_data + +def backfill_outliers(data, percentile=99.999): + """ + Replaces extreme values in y-values (above the specified percentile and below the 1-percentile across all y-values) + with the value from the previous row. + + Args: + percentile (float): The percentile to use for defining upper extreme values across all y-values. Defaults to 99.999. + """ + # Assuming y-values are in columns named with 'sle' as mentioned in other methods + y_columns = [col for col in data.columns if "sle" in col] + + # Concatenate all y-values to compute the overall upper and lower percentile thresholds + all_y_values = pd.concat([data[col].dropna() for col in y_columns]) + upper_threshold = np.percentile(all_y_values, percentile) + lower_threshold = np.percentile(all_y_values, 100 - percentile) + + # Iterate over each y-column to backfill outliers based on the overall upper and lower thresholds + for col in y_columns: + upper_extreme_value_mask = data[col] > upper_threshold + lower_extreme_value_mask = data[col] < lower_threshold + + # Temporarily replace upper and lower extreme values with NaN + data.loc[upper_extreme_value_mask, col] = np.nan + data.loc[lower_extreme_value_mask, col] = np.nan + + # Use backfill method to fill NaN values + data[col] = data[col].fillna(method="bfill") + + return data + + +def add_lag_variables(data: pd.DataFrame, lag: int) -> pd.DataFrame: + """ + Adds lag variables to the input DataFrame. + + Args: + data (pd.DataFrame): The input DataFrame. + lag (int): The number of time steps to lag the variables. + + Returns: + pd.DataFrame: The DataFrame with lag variables added. 
+ """ + + # Separate columns that won't be lagged and shouldn't be dropped + cols_to_exclude = [ + "id", + "cmip_model", + "pathway", + "exp", + "ice_sheet", + "Scenario", + "Ocean forcing", + "Ocean sensitivity", + "Ice shelf fracture", + "Tier", + "aogcm", + "id", + "exp", + "model", + "ivaf", + "sector", + ] + cols_to_exclude = [x for x in cols_to_exclude if x in data.columns] + temporal_indicator = "time" if "time" in data.columns else "year" + non_lagged_cols = [temporal_indicator] + [ + x for x in data.columns if "sle" in x or x in cols_to_exclude + ] + projection_length = 86 + + # Initialize a list to collect the processed DataFrames + processed_segments = [] + + # Calculate the number of segments + num_segments = len(data) // projection_length + + for segment_idx in tqdm(range(num_segments), total=num_segments, desc="Adding lag variables"): + # Extract the segment + segment_start = segment_idx * projection_length + segment_end = (segment_idx + 1) * projection_length + segment = data.iloc[segment_start:segment_end, :] + + # Separate the segment into lagged and non-lagged parts + non_lagged_data = segment[non_lagged_cols] + lagged_data = segment.drop(columns=non_lagged_cols) + + # Generate lagged variables for the segment + for shift in range(1, lag + 1): + lagged_segment = lagged_data.shift(shift).add_suffix(f".lag{shift}") + # Fill missing values caused by shifting + lagged_segment.fillna(method="bfill", inplace=True) + non_lagged_data = pd.concat( + [non_lagged_data.reset_index(drop=True), lagged_segment.reset_index(drop=True)], + axis=1, + ) + + # Store the processed segment + processed_segments.append(non_lagged_data) + + # Concatenate all processed segments into a single DataFrame + final_data = pd.concat(processed_segments, axis=0) + + return final_data + + +def fill_mrro_nans(data: pd.DataFrame, method) -> pd.DataFrame: + """ + Fills the NaN values in the specified columns with the given method. + + Args: + data (pd.DataFrame): The input DataFrame. + method (str or int): The method to fill NaN values. Must be one of 'zero', 'mean', 'median', or 'drop'. + + Returns: + pd.DataFrame: The DataFrame with NaN values filled according to the specified method. + + Raises: + ValueError: If the method is not one of 'zero', 'mean', 'median', or 'drop'. + """ + mrro_columns = [x for x in data.columns if "mrro" in x] + + if method.lower() == "zero" or method.lower() == "0" or method == 0: + for col in mrro_columns: + data[col] = data[col].fillna(0) + elif method.lower() == "mean": + for col in mrro_columns: + data[col] = data[col].fillna(data[col].mean()) + elif method.lower() == "median": + for col in mrro_columns: + data[col] = data[col].fillna(data[col].median()) + elif method.lower() == "drop": + data = data.dropna(subset=mrro_columns) + else: + raise ValueError("method must be 'zero', 'mean', 'median', or 'drop'") + return data + + +def split_training_data( + data, train_size, val_size, test_size=None, output_directory=None, random_state=42 +): + """ + Splits the input data into training, validation, and test sets based on the specified sizes. + + Args: + data (str or pandas.DataFrame): The input data to be split. It can be either a file path (str) or a pandas DataFrame. + train_size (float): The proportion of data to be used for training. + val_size (float): The proportion of data to be used for validation. + test_size (float, optional): The proportion of data to be used for testing. If not provided, the remaining data after training and validation will be used for testing. 
Defaults to None. + output_directory (str, optional): The directory where the split data will be saved as CSV files. Defaults to None. + random_state (int, optional): The random seed for shuffling the data. Defaults to 42. + + Returns: + tuple: A tuple containing the training, validation, and test sets as pandas DataFrames. + + Raises: + ValueError: If the length of data is not divisible by 86, indicating incomplete projections. + ValueError: If the data does not have a column named 'id'. + + """ + + if isinstance(data, str): + data = pd.read_csv(data) + elif not isinstance(data, pd.DataFrame): + raise ValueError("data must be a path (str) or a pandas DataFrame") + + if not len(data) % 86 == 0: + raise ValueError( + "Length of data must be divisible by 86, if not there are incomplete projections." + ) + + if "id" not in data.columns: + raise ValueError("data must have a column named 'id'") + + total_ids = data["id"].unique() + np.random.shuffle(total_ids) + train_ids = total_ids[: int(len(total_ids) * train_size)] + val_ids = total_ids[ + int(len(total_ids) * train_size) : int(len(total_ids) * (train_size + val_size)) + ] + test_ids = total_ids[int(len(total_ids) * (train_size + val_size)) :] + + train = data[data["id"].isin(train_ids)] + val = data[data["id"].isin(val_ids)] + test = data[data["id"].isin(test_ids)] + + if output_directory is not None: + train.to_csv(f"{output_directory}/train.csv", index=False) + val.to_csv(f"{output_directory}/val.csv", index=False) + test.to_csv(f"{output_directory}/test.csv", index=False) + + return train, val, test + + +def drop_outliers( + data: pd.DataFrame, + column: str, + method: str, + expression: List[tuple] = None, + quantiles: List[float] = [0.01, 0.99], +): + """ + Drops simulations that are outliers based on the provided method and expression. + Extra complexity is handled due to the necessity of removing the entire 86 row series from + the dataset rather than simply removing the rows with given conditions. Note that the + condition indicates rows to be DROPPED, not kept (e.g. 'sle', '>', '20' would drop all + simulations containing sle values over 20). If quantile method is used, outliers are dropped + from the SLE column based on the provided quantile in the quantiles argument. If explicit is + chosen, expression must contain a list of tuples such that the tuple contains + [(column, operator, expression)] of the subset, e.g. [("sle", ">", 20), ("sle", "<", -20)]. + + Args: + data (pd.DataFrame): The input DataFrame. + method (str): Method of outlier deletion, must be in [quantile, explicit] + expression (list[tuple]): List of subset expressions in the form [column, operator, value], defaults to None. + quantiles (list[float]): List of quantiles for quantile method, defaults to [0.01, 0.99]. + + Returns: + data (pd.DataFrame): having outliers dropped. 
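    Example (illustrative call patterns only):

        cleaned = drop_outliers(data, column="sle", method="quantile", quantiles=[0.01, 0.99])
        cleaned = drop_outliers(
            data, column="sle", method="explicit", expression=[("sle", ">", 20), ("sle", "<", -20)]
        )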
+ """ + + # Check if method is quantile + if method.lower() == "quantile": + if quantiles is None: + raise AttributeError("If method == quantile, quantiles argument cannot be None") + + # Calculate lower and upper quantiles + lower_sle, upper_sle = np.quantile(np.array(data[column]), quantiles) + + # Filter outlier data based on quantiles + outlier_data = data[(data[column] <= lower_sle) | (data[column] >= upper_sle)] + + # Check if method is explicit + elif method.lower() == "explicit": + if expression is None: + raise AttributeError("If method == explicit, expression argument cannot be None") + elif not isinstance(expression, list) or not isinstance(expression[0], tuple): + raise AttributeError( + 'Expression argument must be a list of tuples, e.g. [("sle", ">", 20), ("sle", "<", -20)]' + ) + + outlier_data = data.copy() + + # Apply subset expressions to filter outlier data + subset_dfs = [] + for subset_expression in expression: + column, operator, value = subset_expression + + if operator.lower() in ("equal", "equals", "=", "=="): + outlier_dataframe = outlier_data[outlier_data[column] == value] + elif operator.lower() in ("not equal", "not equals", "!=", "~="): + outlier_dataframe = outlier_data[outlier_data[column] != value] + elif operator.lower() in ("greater than", "greater", ">=", ">"): + outlier_dataframe = outlier_data[outlier_data[column] > value] + elif operator.lower() in ("less than", "less", "<=", "<"): + outlier_dataframe = outlier_data[outlier_data[column] < value] + else: + raise ValueError(f'Operator must be in ["==", "!=", ">", "<"], received {operator}') + subset_dfs.append(outlier_dataframe) + outlier_data = pd.concat(subset_dfs) + + # Check if outlier_data is empty + if outlier_data.empty: + return data + + # Create dataframe of experiments with outliers (want to delete the entire 86 rows) + outlier_runs = pd.DataFrame() + # TODO: Check to see if this works + outlier_runs["modelname"] = outlier_data["model"] + outlier_runs["exp_id"] = outlier_data["exp"] + try: + outlier_runs["sector"] = outlier_data["sector"] + sectors = True + except KeyError: + sectors = False + outlier_runs_list = outlier_runs.values.tolist() + unique_outliers = [list(x) for x in set(tuple(x) for x in outlier_runs_list)] + + data["outlier"] = False + + # Drop those runs + for i in tqdm(unique_outliers, total=len(unique_outliers), desc="Dropping outliers"): + modelname = i[0] + exp_id = i[1] + + if sectors: + sector = i[2] + data.loc[(data.model == modelname) & (data.exp == exp_id) & (data.sector == sector), "outlier"] = True + else: + data.loc[(data.model == modelname) & (data.exp == exp_id), "outlier"] = True + + data = data[data["outlier"] == False] + + return data diff --git a/ise/data/process.py b/ise/data/process.py new file mode 100644 index 0000000..5471712 --- /dev/null +++ b/ise/data/process.py @@ -0,0 +1,3065 @@ +import os +import time +import warnings +from datetime import datetime + +import cftime +import numpy as np +import pandas as pd +import xarray as xr +from tqdm import tqdm + +from ise.data.scaler import LogScaler, RobustScaler, StandardScaler +from ise.models.grid import PCA +from ise.utils.functions import get_all_filepaths + + +class GridProcessor: + pass + + +class SectorProcessor: + pass + + +class ProjectionProcessor: + """ + A class for processing ice sheet data. + + Attributes: + - ice_sheet (str): Ice sheet to be processed. Must be 'AIS' or 'GIS'. + - forcings_directory (str): The path to the directory containing the forcings data. 
+ - projections_directory (str): The path to the directory containing the projections data. + - scalefac_path (str): The path to the netCDF file containing scaling factors for each grid cell. + - densities_path (str): The path to the CSV file containing ice and ocean density (rhow/rhoi) data for each experiment. + + Methods: + - __init__(self, ice_sheet, forcings_directory, projections_directory, scalefac_path=None, densities_path=None): Initializes the Processor object. + - process_forcings(self): Processes the forcings data. + - process_projections(self, output_directory): Processes the projections data. + - _calculate_ivaf_minus_control(self, data_directory, densities_fp, scalefac_path): Calculates the ice volume above flotation (IVAF) for each file in the given data directory, subtracting out the control projection IVAF if applicable. + - _calculate_ivaf_single_file(self, directory, densities, scalefac_model, ctrl_proj=False): Calculates the ice volume above flotation (IVAF) for a single file. + """ + + def __init__( + self, + ice_sheet, + forcings_directory, + projections_directory, + scalefac_path=None, + densities_path=None, + ): + self.forcings_directory = forcings_directory + self.projections_directory = projections_directory + self.densities_path = densities_path + self.scalefac_path = scalefac_path + self.ice_sheet = ice_sheet.upper() + if self.ice_sheet.lower() in ("gris", "gis"): + self.ice_sheet = "GIS" + self.resolution = 5 if self.ice_sheet == "GIS" else 8 + + def process( + self, + ): + """ + Process the ISMIP6 projections by calculating IVAF for both control + and experiments, subtracting out the control IVAF from experiments, + and exporting ivaf files. + + Args: + output_directory (str): The directory to save the processed projections. + + Raises: + ValueError: If projections_directory or output_directory is not specified. + + Returns: + int: 1 indicating successful processing. + """ + if self.projections_directory is None: + raise ValueError("Projections path must be specified") + + # if the last ivaf file is missing, assume none of them are and calculate and export all ivaf files + if ( + self.ice_sheet == "AIS" + ): # and not os.path.exists(f"{self.projections_directory}/VUW/PISM/exp08/ivaf_GIS_VUW_PISM_exp08.nc"): + self._calculate_ivaf_minus_control( + self.projections_directory, self.densities_path, self.scalefac_path + ) + elif ( + self.ice_sheet == "GIS" + ): # and not os.path.exists(f"{self.projections_directory}/VUW/PISM/exp04/ivaf_AIS_VUW_PISM_exp04.nc"): + self._calculate_ivaf_minus_control( + self.projections_directory, self.densities_path, self.scalefac_path + ) + + return 1 + + def _calculate_ivaf_minus_control( + self, data_directory: str, densities_fp: str, scalefac_path: str + ): + """ + Calculates the ice volume above flotation (IVAF) for each file in the given data directory, + subtracting out the control projection IVAF if applicable. + + Args: + - data_directory (str): path to directory containing the data files to process + - densities_fp (str or pd.DataFrame): filepath to CSV file containing density data, or a pandas DataFrame + - scalefac_path (str): path to netCDF file containing scaling factors for each grid cell + + Returns: + - int: 1 indicating successful calculation. + + Raises: + - ValueError: if densities_fp is None or not a string or pandas DataFrame + + """ + + # error handling for densities argument (must be str filepath or dataframe) + if densities_fp is None: + raise ValueError( + "densities_fp must be specified. 
Run get_model_densities() to get density data." + ) + if isinstance(densities_fp, str): + densities = pd.read_csv(densities_fp) + elif isinstance(densities_fp, pd.DataFrame): + pass + else: + raise ValueError("densities argument must be a string or a pandas DataFrame.") + + # open scaling model + scalefac_model = xr.open_dataset(scalefac_path) + scalefac_model = np.transpose(scalefac_model.af2.values, (1, 0)) + + # adjust scaling model based on desired resolution + if self.ice_sheet == "AIS": + scalefac_model = scalefac_model[:: self.resolution, :: self.resolution] + elif self.ice_sheet == "GIS" and scalefac_model.shape != (337, 577): + if scalefac_model.shape[0] == 6081: + raise ValueError( + f"Scalefac model must be 337x577 for GIS, received {scalefac_model.shape}. Make sure you are using the GIS scaling model and not the AIS." + ) + raise ValueError( + f"Scalefac model must be 337x577 for GIS, received {scalefac_model.shape}." + ) + + # get all files in directory with "ctrl_proj" and "exp" in them and store separately + ctrl_proj_dirs = [] + exp_dirs = [] + for root, dirs, _ in os.walk(data_directory): + for directory in dirs: + if "ctrl_proj" in directory: + ctrl_proj_dirs.append(os.path.join(root, directory)) + elif "exp" in directory: + exp_dirs.append(os.path.join(root, directory)) + else: + pass + + # first calculate ivaf for control projections + for directory in ctrl_proj_dirs: + self._calculate_ivaf_single_file(directory, densities, scalefac_model, ctrl_proj=True) + + # then, for each experiment, calculate ivaf and subtract out control + # exp_dirs = exp_dirs[65:] + for directory in exp_dirs: + self._calculate_ivaf_single_file(directory, densities, scalefac_model, ctrl_proj=False) + + return 1 + + def _calculate_ivaf_single_file(self, directory, densities, scalefac_model, ctrl_proj=False): + """ + Calculate the Ice Volume Above Floatation (IVAF) for a single file. + + Args: + directory (str): The directory path of the file. + densities (pandas.DataFrame): A DataFrame containing density values for different groups and models. + scalefac_model (float): The scale factor for the model. + ctrl_proj (bool, optional): Flag indicating whether the projection is a control projection. Defaults to False. + + Returns: + int: 1 if the processing is successful, -1 otherwise. 
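        Notes:
            Per grid cell and time step, the body below computes (following the MATLAB
            processing scripts from Seroussi, 2021):

                hf   = lithk + (rhow / rhoi) * min(topg, 0)
                ivaf = hf * sftgrf * sftgif * scalefac_model * (resolution * 1000) ** 2

            with NaNs and masked-out cells zeroed beforehand.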
+ + + """ + + # directory = r"/gpfs/data/kbergen/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Projection-GrIS/AWI/ISSM1/exp09" + # get metadata from path + + path = directory.split("/") + exp = path[-1] + model = path[-2] + group = path[-3] + + # Determine which control to use based on experiment (only applies to AIS) per Nowicki, 2020 + if not ctrl_proj: + if self.ice_sheet == "AIS": + if exp in ( + "exp01", + "exp02", + "exp03", + "exp04", + "exp11", + "expA1", + "expA2", + "expA3", + "expA4", + "expB1", + "expB2", + "expB3", + "expB4", + "expB5", + "expC2", + "expC5", + "expC8", + "expC11", + "expE1", + "expE2", + "expE3", + "expE4", + "expE5", + "expE11", + "expE12", + "expE13", + "expE14", + ): + ctrl_path = os.path.join( + "/".join(path[:-1]), + f"ctrl_proj_open/ivaf_{self.ice_sheet}_{group}_{model}_ctrl_proj_open.nc", + ) + elif ( + exp + in ( + "exp05", + "exp06", + "exp07", + "exp08", + "exp09", + "exp10", + "exp12", + "exp13", + "expA5", + "expA6", + "expA7", + "expA8", + "expB6", + "expB7", + "expB8", + "expB9", + "expB10", + "expC3", + "expC6", + "expC9", + "expC12", + "expE6", + "expE7", + "expE8", + "expE9", + "expE10", + "expE15", + "expE16", + "expE17", + "expE18", + ) + or "expD" in exp + ): + ctrl_path = os.path.join( + "/".join(path[:-1]), + f"ctrl_proj_std/ivaf_{self.ice_sheet}_{group}_{model}_ctrl_proj_std.nc", + ) + elif exp in ( + "expC1", + "expC4", + "expC7", + "expC10", + ): # N/A value for ocean_forcing in Nowicki, 2020 table A2 + return -1 + else: + print(f"Experiment {exp} not recognized. Skipped.") + return -1 + + else: + # GrIS doesn't have ctrl_proj_open vs ctrl_proj_std + ctrl_path = os.path.join( + "/".join(path[:-1]), + f"ctrl_proj/ivaf_{self.ice_sheet}_{group}_{model}_ctrl_proj.nc", + ) + + # for some reason there is no ctrl_proj_open for AWI and JPL1, skip + if group == "AWI" and "ctrl_proj_open" in ctrl_path: + return -1 + if group == "JPL1" and "ctrl_proj_open" in ctrl_path: + return -1 + + # MUN_GISM1 is corrupted, skip + if group == "MUN" and model == "GSM1": + return -1 + # folder is empty, skip + elif group == "IMAU" and exp == "exp11": + return -1 + # bed file in NCAR_CISM/expD10 is empty, skip + elif group == "NCAR" and exp in ("expD10", "expD11"): + return -1 + + # lookup densities from csv + subset_densities = densities[(densities.group == group) & (densities.model == model)] + rhoi = subset_densities.rhoi.values[0] + rhow = subset_densities.rhow.values[0] + + # load data + if self.ice_sheet == "AIS" and group == "ULB": + # ULB uses fETISh for AIS naming, not actual model name (fETISh_16km or fETISh_32km) + naming_convention = f"{self.ice_sheet}_{group}_fETISh_{exp}.nc" + + else: + naming_convention = f"{self.ice_sheet}_{group}_{model}_{exp}.nc" + + # load data + bed = get_xarray_data( + os.path.join(directory, f"topg_{naming_convention}"), ice_sheet=self.ice_sheet + ) + thickness = get_xarray_data( + os.path.join(directory, f"lithk_{naming_convention}"), ice_sheet=self.ice_sheet + ) + mask = get_xarray_data( + os.path.join(directory, f"sftgif_{naming_convention}"), ice_sheet=self.ice_sheet + ) + ground_mask = get_xarray_data( + os.path.join(directory, f"sftgrf_{naming_convention}"), ice_sheet=self.ice_sheet + ) + + # bed = xr.open_dataset(os.path.join(directory, f'topg_{naming_convention}'), decode_times=False) + # thickness = xr.open_dataset(os.path.join(directory, f'lithk_{naming_convention}'), decode_times=False) + # mask = xr.open_dataset(os.path.join(directory, f'sftgif_{naming_convention}'), decode_times=False) + # ground_mask = 
xr.open_dataset(os.path.join(directory, f'sftgrf_{naming_convention}'), decode_times=False) + length_time = len(thickness.time) + # note on decode_times=False -- by doing so, it stays in "days from" rather than trying to infer a type. Makes handling much more predictable. + + try: + bed = bed.transpose("x", "y", "time", ...) + thickness = thickness.transpose("x", "y", "time", ...) + mask = mask.transpose("x", "y", "time", ...) + ground_mask = ground_mask.transpose("x", "y", "time", ...) + except ValueError: + bed = bed.transpose("x", "y", ...) + thickness = thickness.transpose("x", "y", ...) + mask = mask.transpose("x", "y", ...) + ground_mask = ground_mask.transpose("x", "y", ...) + + # if time is not a dimension, add copies for each time step + if "time" not in bed.dims or bed.dims["time"] == 1: + try: + bed = bed.drop_vars( + [ + "time", + ] + ) + except ValueError: + pass + bed = bed.expand_dims(dim={"time": length_time}) + + if length_time == 86: + bed["time"] = thickness[ + "time" + ] # most times just the bed file is missing the time index + elif length_time > 86: + if len(thickness.time.values) != len(set(thickness.time.values)): # has duplicates + keep_indices = np.unique(thickness["time"], return_index=True)[ + 1 + ] # find non-duplicates + bed = bed.isel(time=keep_indices) # only select non-duplicates + thickness = thickness.isel(time=keep_indices) + mask = mask.isel(time=keep_indices) + ground_mask = ground_mask.isel(time=keep_indices) + else: + warnings.warn( + f"At least one file in {exp} does not have a time index formatted correctly. Attempting to fix." + ) + start_idx = len(bed.time) - 86 + bed = bed.sel(time=slice(bed.time.values[start_idx], len(bed.time))) + thickness = thickness.sel( + time=slice(thickness.time[start_idx], thickness.time[-1]) + ) + mask = mask.sel(time=slice(mask.time[start_idx], mask.time[-1])) + ground_mask = ground_mask.sel( + time=slice(ground_mask.time[start_idx], ground_mask.time[-1]) + ) + + try: + bed["time"] = thickness["time"].copy() + except ValueError: + print( + f"Cannot fix time index for {exp} due to duplicate index values. Skipped." + ) + return -1 + + else: + print(f"Only {len(bed.time)} time points for {exp}. Skipped.") + return -1 + + # if -9999 instead of np.nan, replace (come back and optimize? couldn't figure out with xarray) + if bed.topg[0, 0, 0] <= -9999.0 or bed.topg[0, 0, 0] >= 9999: + topg = bed.topg.values + topg[(np.where((topg <= -9999.0) | (topg >= 9999)))] = np.nan + bed["topg"].values = topg + del topg + + lithk = thickness.lithk.values + lithk[(np.where((lithk <= -9999.0) | (lithk >= 9999)))] = np.nan + thickness["lithk"].values = lithk + del lithk + + sftgif = mask.sftgif.values + sftgif[(np.where((sftgif <= -9999.0) | (sftgif >= 9999)))] = np.nan + mask["sftgif"].values = sftgif + del sftgif + + sftgrf = ground_mask.sftgrf.values + sftgrf[(np.where((sftgrf <= -9999.0) | (sftgrf >= 9999)))] = np.nan + ground_mask["sftgrf"].values = sftgrf + del sftgrf + + # converts time (in "days from X" to numpy.datetime64) and subsets time from 2015 to 2100 + + # a few datasets do not have the time index formatted correctly + if len(bed.time.attrs) == 0: + + if len(bed.time) == 86: + bed["time"] = thickness[ + "time" + ] # most times just the bed file is missing the time index + elif len(bed.time) > 86: + # bed['time'] = thickness['time'].copy() + warnings.warn( + f"At least one file in {exp} does not have a time index formatted correctly. Attempting to fix." 
+ ) + start_idx = len(bed.time) - 86 + bed = bed.sel(time=slice(bed.time.values[start_idx], len(bed.time))) + thickness = thickness.sel(time=slice(thickness.time[start_idx], thickness.time[-1])) + mask = mask.sel(time=slice(mask.time[start_idx], mask.time[-1])) + ground_mask = ground_mask.sel( + time=slice(ground_mask.time[start_idx], ground_mask.time[-1]) + ) + + try: + bed["time"] = thickness["time"] + except ValueError: + print( + f"Cannot fix time index for {exp} due to duplicate index values. Skipped." + ) + return -1 + + else: + print(f"Only {len(bed.time)} time points for {exp}. Skipped.") + return -1 + + bed = convert_and_subset_times(bed) + thickness = convert_and_subset_times(thickness) + mask = convert_and_subset_times(mask) + ground_mask = convert_and_subset_times(ground_mask) + length_time = len(thickness.time) + + # Interpolate values for x & y, for formatting purposes only, does not get used + if len(set(thickness.y.values)) != len(scalefac_model): + bed["x"], bed["y"] = interpolate_values(bed) + thickness["x"], thickness["y"] = interpolate_values(thickness) + mask["x"], mask["y"] = interpolate_values(mask) + ground_mask["x"], ground_mask["y"] = interpolate_values(ground_mask) + + # clip masks if they are below 0 or above 1 + if np.min(mask.sftgif.values) < 0 or np.max(mask.sftgif.values) > 1: + mask["sftgif"] = np.clip(mask.sftgif, 0.0, 1.0) + if np.min(ground_mask.sftgrf.values) < 0 or np.max(ground_mask.sftgrf.values) > 1: + ground_mask["sftgrf"] = np.clip(ground_mask.sftgrf, 0.0, 1.0) + + # if time is not a dimension, add copies for each time step + # if 'time' not in bed.dims or bed.dims['time'] == 1: + # try: + # bed = bed.drop_vars(['time',]) + # except ValueError: + # pass + # bed = bed.expand_dims(dim={'time': length_time}) + + # flip around axes so the order is (x, y, time) + bed = bed.transpose("x", "y", "time", ...) + bed_data = bed.topg.values + + thickness = thickness.transpose("x", "y", "time", ...) + thickness_data = thickness.lithk.values + + mask = mask.transpose("x", "y", "time", ...) + mask_data = mask.sftgif.values + + ground_mask = ground_mask.transpose("x", "y", "time", ...) 
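            # Editorial note (suggestion only, not in the original diff): the -9999 sentinel
            # replacement above could likely be vectorized with xarray directly, e.g.
            #   bed["topg"] = bed.topg.where(np.abs(bed.topg) < 9999)
            # which keeps values where the condition holds and fills NaN elsewhere.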
+ ground_mask_data = ground_mask.sftgrf.values + + # for each time step, calculate ivaf + ivaf = np.zeros(bed_data.shape) + for i in range(length_time): + + # get data slices for current time + thickness_i = thickness_data[:, :, i].copy() + bed_i = bed_data[:, :, i].copy() + mask_i = mask_data[:, :, i].copy() + ground_mask_i = ground_mask_data[:, :, i].copy() + + # set data slices to zero where mask = 0 or any value is NaN + thickness_i[ + (mask_i == 0) + | (np.isnan(mask_i)) + | (np.isnan(thickness_i)) + | (np.isnan(ground_mask_i)) + | (np.isnan(bed_i)) + ] = 0 + bed_i[ + (mask_i == 0) + | (np.isnan(mask_i)) + | (np.isnan(thickness_i)) + | (np.isnan(ground_mask_i)) + | (np.isnan(bed_i)) + ] = 0 + ground_mask_i[ + (mask_i == 0) + | np.isnan(mask_i) + | np.isnan(thickness_i) + | np.isnan(ground_mask_i) + | np.isnan(bed_i) + ] = 0 + mask_i[ + (mask_i == 0) + | (np.isnan(mask_i)) + | (np.isnan(thickness_i)) + | (np.isnan(ground_mask_i)) + | (np.isnan(bed_i)) + ] = 0 + + # take min(bed_i, 0) + bed_i[bed_i > 0] = 0 + + # calculate IVAF (based on MATLAB processing scripts from Seroussi, 2021) + hf_i = thickness_i + ((rhow / rhoi) * bed_i) + masked_output = hf_i * ground_mask_data[:, :, i] * mask_data[:, :, i] + ivaf[:, :, i] = masked_output * scalefac_model * (self.resolution * 1000) ** 2 + + # subtract out control if for an experment + ivaf_nc = bed.copy() # copy file structure and metadata for ivaf file + if not ctrl_proj: + # open control dataset + ivaf_ctrl = xr.open_dataset( + ctrl_path, + ).transpose("x", "y", "time", ...) + + # subtract out control + ivaf = ivaf_ctrl.ivaf.values - ivaf + + # save ivaf file (copied format from bed_data, change accordingly.) + ivaf_nc["ivaf"] = (("x", "y", "time"), ivaf) + ivaf_nc = ivaf_nc.drop_vars( + [ + "topg", + ] + ) + ivaf_nc["sle"] = ivaf_nc.ivaf / 1e9 / 362.5 + ivaf_nc.to_netcdf( + os.path.join(directory, f"ivaf_{self.ice_sheet}_{group}_{model}_{exp}.nc") + ) + + print(f"{group}_{model}_{exp}: Processing successful.") + + return 1 + + +def convert_and_subset_times( + dataset, +): + if isinstance(dataset.time.values[0], cftime._cftime.DatetimeNoLeap) or isinstance( + dataset.time.values[0], cftime._cftime.Datetime360Day + ): + datetimeindex = dataset.indexes["time"].to_datetimeindex() + dataset["time"] = datetimeindex + + elif ( + isinstance(dataset.time.values[0], np.float32) + or isinstance(dataset.time.values[0], np.float64) + or isinstance(dataset.time.values[0], np.int32) + or isinstance(dataset.time.values[0], np.int64) + ): + try: + units = dataset.time.attrs["units"] + except KeyError: + units = dataset.time.attrs["unit"] + units = units.replace("days since ", "").split(" ")[0] + + if units == "2000-1-0": # VUB AISMPALEO + units = "2000-1-1" + elif units == "day": # NCAR CISM exp7 - "day as %Y%m%d.%f"? + units = "2014-1-1" + + if units == "seconds": # VUW PISM -- seconds since 1-1-1 00:00:00 + start_date = np.datetime64( + datetime.strptime("0001-01-01 00:00:00", "%Y-%m-%d %H:%M:%S") + ) + dataset["time"] = np.array( + [start_date + np.timedelta64(int(x), "s") for x in dataset.time.values] + ) + elif units == "2008-1-1" and dataset.time[-1] == 157785.0: # UAF? + # every 5 years but still len(time) == 86.. 
assume we keep them all for 2015-2100 + dataset["time"] = np.array( + [ + np.datetime64(datetime.strptime(f"{x}-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")) + for x in range(2015, 2101) + ] + ) + else: + try: + start_date = np.datetime64( + datetime.strptime(units.replace("days since ", ""), "%Y-%m-%d") + ) + except ValueError: + start_date = np.datetime64( + datetime.strptime(units.replace("days since ", ""), "%d-%m-%Y") + ) + + dataset["time"] = np.array( + [start_date + np.timedelta64(int(x), "D") for x in dataset.time.values] + ) + else: + raise ValueError(f"Time values are not recognized: {type(dataset.time.values[0])}") + + if len(dataset.time) > 86: + # make sure the max date is 2100 + # dataset = dataset.sel(time=slice(np.datetime64('2014-01-01'), np.datetime64('2101-01-01'))) + dataset = dataset.sel(time=slice("2012-01-01", "2101-01-01")) + + # if you still have more than 86, take the previous 86 values from 2100 + if len(dataset.time) > 86: + # LSCE GRISLI has two 2015 measurements + + # dataset = dataset.sel(time=slice(dataset.time.values[len(dataset.time) - 86], dataset.time.values[-1])) + start_idx = len(dataset.time) - 86 + dataset = dataset.isel(time=slice(start_idx, len(dataset.time))) + + if len(dataset.time) != 86: + warnings.warn( + "After subsetting there are still not 86 time points. Go back and check logs." + ) + print(f"dataset_length={len(dataset.time)} -- {dataset.attrs}") + + return dataset + + +def get_model_densities(zenodo_directory: str, output_path: str = None): + """ + Extracts values for rhoi and rhow from NetCDF files in the specified directory and returns a pandas DataFrame + containing the group, model, rhoi, and rhow values for each file. + + Args: + zenodo_directory (str): The path to the directory containing the NetCDF files. + output_path (str, optional): The path to save the resulting DataFrame as a CSV file. + + Returns: + pandas.DataFrame: A DataFrame containing the group, model, rhoi, and rhow values for each file. + """ + results = [] + for root, dirs, files in os.walk(zenodo_directory): + for file in files: + if file.endswith(".nc"): # Check if the file is a NetCDF file + file_path = os.path.join(root, file) + try: + # Open the NetCDF file using xarray + dataset = xr.open_dataset(file_path, decode_times=False).transpose( + "x", "y", "time", ... 
+ ) + + # Extract values for rhoi and rhow + if "rhoi" in dataset and "rhow" in dataset: + rhoi_values = dataset["rhoi"].values + rhow_values = dataset["rhow"].values + + # Append the filename and values to the results list + results.append({"filename": file, "rhoi": rhoi_values, "rhow": rhow_values}) + + # Close the dataset + dataset.close() + except Exception as e: + print(f"Error processing {file}: {e}") + + densities = [] + for file in results: + if "ctrl_proj" in file["filename"] or "hist" in file["filename"]: + continue + + elif "ILTS" in file["filename"]: + fp = file["filename"].split("_") + group = "ILTS_PIK" + model = fp[-2] + + elif "ULB_fETISh" in file["filename"]: + fp = file["filename"].split("_") + group = "ULB" + model = "fETISh_32km" if "32km" in file["filename"] else "fETISh_16km" + + else: + fp = file["filename"].split("_") + group = fp[-3] + model = fp[-2] + densities.append([group, model, file["rhoi"], file["rhow"]]) + + df = pd.DataFrame(densities, columns=["group", "model", "rhoi", "rhow"]) + df["rhoi"], df["rhow"] = df.rhoi.astype("float"), df.rhow.astype("float") + df = df.drop_duplicates() + + ice_sheet = "AIS" if "AIS" in file["filename"] else "GIS" + + if output_path is not None: + if output_path.endswith("/"): + df.to_csv(f"{output_path}/{ice_sheet}_densities.csv", index=False) + else: + df.to_csv(output_path, index=False) + + return df + + +def interpolate_values(data): + """ + Interpolates missing values in the x and y dimensions of the input NetCDF data using linear interpolation. + + Args: + data: A NetCDF file containing x and y dimensions with missing values. + + Returns: + A tuple containing the interpolated x and y arrays. + """ + y = pd.Series(data.y.values) + y = y.replace(0, np.NaN) + y = np.array(y.interpolate()) + + # first and last are NaNs, replace with correct values + y[0] = y[1] - (y[2] - y[1]) + y[-1] = y[-2] + (y[-2] - y[-3]) + + x = pd.Series(data.x.values) + x = x.replace(0, np.NaN) + x = np.array(x.interpolate()) + + # first and last are NaNs, replace with correct values + x[0] = x[1] - (x[2] - x[1]) + x[-1] = x[-2] + (x[-2] - x[-3]) + + return x, y + + +class DimensionalityReducer: + def __init__( + self, forcing_dir, projection_dir, output_dir, ice_sheet=None, scaling_method=None + ): + super().__init__() + if forcing_dir is None: + raise ValueError("Forcing directory must be specified.") + if output_dir is None: + raise ValueError("Output directory must be specified.") + self.forcing_dir = forcing_dir + self.projection_dir = projection_dir + self.output_dir = output_dir + self.forcing_paths = {"all": None, "atmosphere": None, "ocean": None} + + # check inputs + if os.path.exists(f"{self.output_dir}/pca_models/"): + self.pca_model_directory = f"{self.output_dir}/pca_models/" + else: + self.pca_model_directory = None + + if os.path.exists(f"{self.output_dir}/scalers/"): + self.scaler_directory = f"{self.output_dir}/scalers/" + else: + self.scaler_directory = None + self.scaling_method = scaling_method + + if ice_sheet not in ("AIS", "GrIS"): + raise ValueError("Ice sheet must be specified and must be 'AIS' or 'GrIS'.") + else: + self.ice_sheet = ice_sheet + + if self.ice_sheet.lower() == "gris": + atmospheric_files = get_all_filepaths( + path=self.forcing_dir, + filetype="nc", + contains="Atmosphere_Forcing/aSMB_observed/v1", + ) + atmospheric_files = [x for x in atmospheric_files if "combined" in x] + + # files in atmopheric directory are separated by year, needs to be combined + if not atmospheric_files: + 
combine_gris_forcings(self.forcing_dir) + + oceanic_files = get_all_filepaths( + path=self.forcing_dir, + filetype="nc", + contains="Ocean_Forcing/Melt_Implementation/v4", + ) + self.forcing_paths["all"] = atmospheric_files + oceanic_files + self.forcing_paths["atmosphere"] = atmospheric_files + self.forcing_paths["ocean"] = oceanic_files + else: + all_forcing_fps = get_all_filepaths( + path=self.forcing_dir, + filetype="nc", + contains="1995-2100", + not_contains="Ice_Shelf_Fracture", + ) + self.forcing_paths["all"] = [x for x in all_forcing_fps if "8km" in x and "v1" not in x] + self.forcing_paths["atmosphere"] = [ + x for x in self.forcing_paths["all"] if "Atmosphere_Forcing" in x + ] + self.forcing_paths["ocean"] = [ + x for x in self.forcing_paths["all"] if "Ocean_Forcing" in x + ] + + all_projection_fps = get_all_filepaths( + path=self.projection_dir, filetype="nc", contains="ivaf", not_contains="ctrl_proj" + ) + self.projection_paths = all_projection_fps + + # def reduce_dimensionlity(self, forcing_dir: str=None, output_dir: str=None): + # generate pca models + # convert each forcing file to pca space + + def generate_pca_models(self, num_forcing_pcs, num_projection_pcs, scaling_method="standard"): + """ + Generate principal component analysis (PCA) models for atmosphere and ocean variables. + + Parameters: + - atmosphere_fps (list): List of file paths for atmosphere data. + - ocean_fps (list): List of file paths for ocean data. + - save_dir (str): Directory to save the generated PCA models and results. + + Returns: + int: 0 if successful. + """ + + # check inputs and make directories for outputted models + if not os.path.exists(f"{self.output_dir}/pca_models/"): + os.mkdir(f"{self.output_dir}/pca_models/") + self.pca_model_directory = f"{self.output_dir}/pca_models/" + if not os.path.exists(f"{self.output_dir}/scalers/"): + os.mkdir(f"{self.output_dir}/scalers/") + self.scaler_directory = f"{self.output_dir}/scalers/" + self.scaling_method = scaling_method + + # Train PCA models for each atmospheric and oceanic forcing variable and save + if self.ice_sheet == "AIS": + self._generate_ais_atmosphere_pcas( + self.forcing_paths["atmosphere"], + self.pca_model_directory, + num_pcs=num_forcing_pcs, + scaler_dir=self.scaler_directory, + scaling_method=scaling_method, + ) + self._generate_ais_ocean_pcas( + self.forcing_paths["ocean"], + self.pca_model_directory, + num_pcs=num_forcing_pcs, + scaler_dir=self.scaler_directory, + scaling_method=scaling_method, + ) + else: + self._generate_gris_atmosphere_pcas( + self.forcing_paths["atmosphere"], + self.pca_model_directory, + num_pcs=num_forcing_pcs, + scaler_dir=self.scaler_directory, + scaling_method=scaling_method, + ) + self._generate_gris_ocean_pcas( + self.forcing_paths["ocean"], + self.pca_model_directory, + num_pcs=num_forcing_pcs, + scaler_dir=self.scaler_directory, + scaling_method=scaling_method, + ) + + # Train PCA model for SLE and save + sle_paths = get_all_filepaths( + path=self.projection_dir, filetype="nc", contains="ivaf", not_contains="ctrl" + ) + self._generate_sle_pca( + sle_paths, + save_dir=self.pca_model_directory, + num_pcs=num_projection_pcs, + scaler_dir=self.scaler_directory, + scaling_method=scaling_method, + ) + + return 0 + + def convert_forcings( + self, + forcing_files: list = None, + pca_model_directory: str = None, + output_dir: str = None, + scaling_method=None, + ): + """ + Converts atmospheric and oceanic forcing files to PCA space using pretrained PCA models. 
+ + Args: + forcing_files (list, optional): List of specific forcing files to convert. If not provided, all files in the directory will be used. Default is None. + pca_model_directory (str, optional): Directory containing the pretrained PCA models. If not provided, the directory specified during object initialization will be used. Default is None. + output_dir (str, optional): Directory to save the converted files. If not provided, the directory specified during object initialization will be used. Default is None. + + Returns: + int: 0 indicating successful conversion. + """ + + # check inputs for validity + output_dir = self.output_dir if output_dir is None else output_dir + if self.pca_model_directory is None and pca_model_directory is None: + raise ValueError( + "PCA model directory must be specified, or DimensionalityReducer.generate_pca_models must be run first." + ) + if self.scaling_method is None and scaling_method is None: + raise ValueError( + "Scalers must be generated first, or scaling_method must be identified if they already exist. Run DimensionalityReducer.generate_pca_models first." + ) + + if scaling_method is not None: + self.scaling_method = scaling_method + + if pca_model_directory is not None: + self.pca_model_directory = pca_model_directory + + # if user supplies specific forcing files (rather than entire directory), use that instead + # TODO: test this.. + if forcing_files is not None: + warnings.warn( + "By using specific forcing files, forcing_paths attribute will be overwritten." + ) + self.forcing_paths["all"] = forcing_files + self.forcing_paths["atmosphere"] = [ + x for x in self.forcing_paths["all"] if "Atmosphere_Forcing" in x + ] + self.forcing_paths["ocean"] = [ + x for x in self.forcing_paths["all"] if "Ocean_Forcing" in x + ] + + # ATMOSPHERIC FORCINGS + + if not os.path.exists(f"{output_dir}/forcings/"): + os.mkdir(f"{output_dir}/forcings/") + + # for each atmospheric forcing file, convert each variable to PCA space with pretrained PCA model + for i, path in tqdm( + enumerate(self.forcing_paths["atmosphere"]), + total=len(self.forcing_paths["atmosphere"]), + desc="Converting atmospheric forcing files to PCA space", + ): + # dataset = xr.open_dataset(path, decode_times=False, engine='netcdf4', ).transpose('time', 'y', 'x', ...) # open the dataset + # if len(dataset.dims) > 3: + # drop_dims = [x for x in list(dataset.dims) if x not in ('time', 'x', 'y')] + # dataset = dataset.drop_dims(drop_dims) + dataset = get_xarray_data(path, ice_sheet=self.ice_sheet, convert_and_subset=True) + forcing_name = path.replace(".nc", "").split("/")[-1] # get metadata (model, ssp, etc.) + + # transform each variable in the dataset with their respective trained PCA model + transformed_data = {} + if self.ice_sheet == "AIS": + + for var in [ + "evspsbl_anomaly", + "mrro_anomaly", + "pr_anomaly", + "smb_anomaly", + "ts_anomaly", + ]: + try: + transformed = self.transform( + dataset[var].values, + var_name=var, + pca_model_directory=self.pca_model_directory, + scaler_directory=self.scaler_directory, + scaling_method=self.scaling_method, + ) + except KeyError: # if a variable is missing (usually mrro_anomaly), skip it + warnings.warn(f"Variable {var} not found in {forcing_name}. 
Skipped.") + continue + transformed_data[ + var + ] = transformed # store in dict with structure {'var_name': transformed_var} + else: + var = path.split("_")[-2] + try: + transformed = self.transform( + dataset[var].values, + var_name=var, + pca_model_directory=self.pca_model_directory, + scaler_directory=self.scaler_directory, + scaling_method=self.scaling_method, + ) + except KeyError: + warnings.warn(f"Variable {var} not found in {forcing_name}. Skipped.") + transformed_data[ + var + ] = transformed # store in dict with structure {'var_name': transformed_var} + + if transformed.isnan().any() or transformed.isinf().any(): + warnings.warn(f"NaN or inf values found in converted {forcing_name}.") + + # create a dataframe with rows corresponding to time (106 total) and columns corresponding to each variables principal components + compiled_transformed_forcings = pd.DataFrame() + for var in transformed_data.keys(): + var_df = pd.DataFrame( + transformed_data[var], + columns=[f"{var}_pc{i+1}" for i in range(transformed_data[var].shape[1])], + ) + compiled_transformed_forcings = pd.DataFrame( + pd.concat([compiled_transformed_forcings, var_df], axis=1) + ) + + pd.DataFrame(compiled_transformed_forcings).to_csv( + f"{output_dir}/forcings/PCA_{forcing_name}.csv", index=False + ) + + print( + f"{len(self.forcing_paths['atmosphere'])}/{len(self.forcing_paths['atmosphere'])} atmospheric forcing files converted to PCA space." + ) + print( + f"Finished converting atmospheric forcings to PCA space, files outputted to {output_dir}." + ) + + # OCEANIC FORCINGS + + # for each ocean forcing file, convert each variable to PCA space with pretrained PCA model + for i, path in tqdm( + enumerate(self.forcing_paths["ocean"]), + total=len(self.forcing_paths["ocean"]), + desc="Converting oceanic forcing files", + ): + + # open the dataset + forcing_name = path.replace(".nc", "").split("/")[-1] # get metadata (model, ssp, etc.) + + # get variable name by splitting the filepath name + if self.ice_sheet == "AIS": + var = self.forcing_paths["ocean"][i].split("/")[-1].split("_")[-4] + else: + metadata = self.forcing_paths["ocean"][i].split("/")[-1].split("_") + if "basinRunoff" in metadata: + var = "basin_runoff" + elif "oceanThermalForcing" in metadata: + var = "thermal_forcing" + else: + var = self.forcing_paths["ocean"][i].split("/")[-1].split("_")[-2] + if var == "forcing" or var == "thermal": + var = "thermal_forcing" + + # get forcing array (requires mean value over z dimensions, see get_xarray_data()) + forcing_array = get_xarray_data( + path, var_name=var, ice_sheet=self.ice_sheet, convert_and_subset=True + ) + + # transform each variable in the dataset with their respective trained PCA model + transformed_data = {} + transformed = self.transform( + forcing_array, + var_name=var, + pca_model_directory=self.pca_model_directory, + scaler_directory=self.scaler_directory, + scaling_method=self.scaling_method, + ) + transformed_data[ + var + ] = transformed # store in dict with structure {'var_name': transformed_var} + + # create a dataframe with rows corresponding to time (86 total) and columns corresponding to each variables principal components + variable_df = pd.DataFrame( + transformed_data[var], + columns=[f"{var}_pc{i+1}" for i in range(transformed_data[var].shape[1])], + ) + variable_df.to_csv(f"{output_dir}/forcings/PCA_{forcing_name}.csv", index=False) + + print( + f"{len(self.forcing_paths['ocean'])}/{len(self.forcing_paths['ocean'])} oceanic forcing files converted to PCA space." 
+ ) + print( + f"Finished converting oceanic forcings to PCA space, files outputted to {output_dir}." + ) + + return 0 + + def convert_projections( + self, + projection_files: list = None, + pca_model_directory: str = None, + output_dir: str = None, + scaling_method=None, + ): + + # check inputs for validity + output_dir = self.output_dir if output_dir is None else output_dir + if self.pca_model_directory is None and pca_model_directory is None: + raise ValueError( + "PCA model directory must be specified, or DimensionalityReducer.generate_pca_models must be run first." + ) + + if self.scaling_method is None and scaling_method is None: + raise ValueError( + "Scalers must be generated first, or scaling_method must be identified if they already exist. Run DimensionalityReducer.generate_pca_models first." + ) + + if scaling_method is not None: + self.scaling_method = scaling_method + + if pca_model_directory is not None: + self.pca_model_directory = pca_model_directory + + # if user supplies specific projection files (rather than entire directory), use that instead + if projection_files is not None: + warnings.warn( + "By using specific projection files, projection_paths attribute will be overwritten." + ) + self.projection_paths = projection_files + + # make a folder in output directory for converted projections + if not os.path.exists(f"{output_dir}/projections/"): + os.mkdir(f"{output_dir}/projections/") + + # for each projection file, convert ivaf to PCA space with pretrained PCA model + for i, path in tqdm( + enumerate(self.projection_paths), + total=len(self.projection_paths), + desc="Converting projection files to PCA space", + ): + # get forcing array (requires mean value over z dimensions, see get_xarray_data()) + try: + projection_array = get_xarray_data(path, var_name="sle", ice_sheet=self.ice_sheet) + except: + projection_array = get_xarray_data(path, var_name="ivaf", ice_sheet=self.ice_sheet) + projection_array = projection_array / 1e9 / 362.5 + + # nan_indices = np.argwhere(np.isnan(projection_array)) + # print(len(nan_indices)) + # continue + + # projection_array = np.nan_to_num(projection_array) # deal with np.nans + var = "sle" + # projection_array = np.nan_to_num(projection_array) # there shouldn't be nans... + projection_name = path.replace(".nc", "").split("/")[ + -1 + ] # get metadata (model, ssp, etc.) + + # transform each variable in the dataset with their respective trained PCA model + transformed_data = {} + transformed = self.transform( + projection_array, + var_name=var, + pca_model_directory=self.pca_model_directory, + scaler_directory=self.scaler_directory, + scaling_method=self.scaling_method, + ) + transformed_data[ + var + ] = transformed # store in dict with structure {'var_name': transformed_var} + + # create a dataframe with rows corresponding to time (86 total) and columns corresponding to each variables principal components + variable_df = pd.DataFrame( + transformed_data[var], + columns=[f"{var}_pc{i+1}" for i in range(transformed_data[var].shape[1])], + ) + variable_df['model'] = "_".join(path.split('/')[-1].split('_')[2:4]) + variable_df['exp'] = path.replace('.nc', '').split('/')[-1].split('_')[-1] + variable_df.to_csv(f"{output_dir}/projections/PCA_{projection_name}.csv", index=False) + + print( + f"{len(self.projection_paths)}/{len(self.projection_paths)} projection files converted to PCA space." 
+ ) + print(f"Finished converting projections to PCA space, files outputted to {output_dir}.") + + def _generate_ais_atmosphere_pcas( + self, + atmosphere_fps: list, + save_dir: str, + num_pcs="95%", + scaler_dir: str = None, + scaling_method="standard", + ): + """ + Generate principal component analysis (PCA) for atmospheric variables. + + Args: + atmosphere_fps (list): List of file paths to atmospheric CMIP files. + save_dir (str): Directory to save the PCA results. + + Returns: + int: 0 if successful. + """ + + # if no separate directory for saving scalers is specified, use the pca save_dir + if scaler_dir is None: + scaler_dir = save_dir + + # for each variable + + var_names = ["pr_anomaly", "evspsbl_anomaly", "mrro_anomaly", "smb_anomaly", "ts_anomaly"] + for i, var in tqdm( + enumerate(var_names), total=len(var_names), desc="Processing atmospheric PCA" + ): + variable_array = np.zeros([len(atmosphere_fps), 86, 761 * 761]) + + # loop through each atmospheric CMIP file and combine them into one big array + for i, fp in enumerate(atmosphere_fps): + + # get the variable you need (rather than the entire dataset) + dataset = get_xarray_data(fp, ice_sheet=self.ice_sheet, convert_and_subset=True) + # data_array = convert_and_subset_times(dataset) + try: + data_flattened = dataset[var].values.reshape(86, 761 * 761) + except KeyError: + data_flattened = np.nan + # store it in the total array + variable_array[i, :, :] = data_flattened + + # deal with np.nans -- since it's an anomaly, replace with 0 + variable_array = np.nan_to_num(variable_array) + + # reshape variable_array (num_files, num_timestamps, num_gridpoints) --> (num_files*num_timestamps, num_gridpoints) + variable_array = variable_array.reshape(len(atmosphere_fps) * 86, 761 * 761) + + # scale data + if scaling_method.lower() == "standard": + variable_scaler = StandardScaler() + elif scaling_method.lower() == "robust": + variable_scaler = RobustScaler() + elif scaling_method.lower() == "log": + variable_scaler = LogScaler() + variable_scaler.fit(variable_array) + variable_array = variable_scaler.transform(variable_array) + + # run PCA + pca, _ = self._run_PCA(variable_array, num_pcs=num_pcs) + + # output pca object + save_path = f"{save_dir}/AIS_pca_{var}.pth" + pca.save(save_path) + # and scaler + save_path = f"{scaler_dir}/AIS_{var}_scaler.pth" + variable_scaler.save(save_path) + + return 0 + + def _generate_ais_ocean_pcas( + self, + ocean_fps: list, + save_dir: str, + num_pcs="95%", + scaler_dir: str = None, + scaling_method="standard", + ): + """ + Generate principal component analysis (PCA) for ocean variables. + + Args: + ocean_fps (list): List of file paths for ocean variables. + save_dir (str): Directory to save the PCA results. + + Returns: + int: 0 if PCA generation is successful, -1 otherwise. 
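        Example (illustrative sketch only, using scikit-learn stand-ins for the project's
        own StandardScaler and PCA classes; the grid size is shrunk for readability,
        real AIS grids are 761 x 761 points):

            import numpy as np
            from sklearn.decomposition import PCA
            from sklearn.preprocessing import StandardScaler

            n_files, n_times, n_grid = 4, 86, 100
            stacked = np.random.rand(n_files, n_times, n_grid)
            flat = stacked.reshape(n_files * n_times, n_grid)    # (files * timesteps, gridpoints)
            scaled = StandardScaler().fit_transform(flat)
            pcs = PCA(n_components=0.95).fit_transform(scaled)   # retain ~95% of the variance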
+ """ + + if scaler_dir is None: + scaler_dir = save_dir + + thermal_forcing_fps = [x for x in ocean_fps if "thermal_forcing" in x] + salinity_fps = [x for x in ocean_fps if "salinity" in x] + temperature_fps = [x for x in ocean_fps if "temperature" in x] + + thermal_forcing_array = np.zeros([len(thermal_forcing_fps), 86, 761 * 761]) + salinity_array = np.zeros([len(salinity_fps), 86, 761 * 761]) + temperature_array = np.zeros([len(temperature_fps), 86, 761 * 761]) + + # get the variables you need (rather than the entire dataset) + print("Processing thermal_forcing PCA model.") + for i, fp in enumerate(thermal_forcing_fps): + dataset = get_xarray_data(fp, ice_sheet=self.ice_sheet, convert_and_subset=True) + # data_array = convert_and_subset_times(dataset) + thermal_forcing_array[i, :, :] = dataset["thermal_forcing"].values.reshape( + 86, 761 * 761 + ) # store + print("Processing salinity PCA model.") + for i, fp in enumerate(salinity_fps): + dataset = get_xarray_data(fp, ice_sheet=self.ice_sheet, convert_and_subset=True) + # data_array = convert_and_subset_times(dataset) + salinity_array[i, :, :] = dataset["salinity"].values.reshape(86, 761 * 761) # store + print("Processing temperature PCA model.") + for i, fp in enumerate(temperature_fps): + dataset = get_xarray_data(fp, ice_sheet=self.ice_sheet, convert_and_subset=True) + # data_array = convert_and_subset_times(dataset) + temperature_array[i, :, :] = dataset["temperature"].values.reshape(86, 761 * 761) + + # reshape variable_array (num_files, num_timestamps, num_gridpoints) --> (num_files*num_timestamps, num_gridpoints) + thermal_forcing_array = thermal_forcing_array.reshape( + len(thermal_forcing_fps) * 86, 761 * 761 + ) + salinity_array = salinity_array.reshape(len(salinity_fps) * 86, 761 * 761) + temperature_array = temperature_array.reshape(len(temperature_fps) * 86, 761 * 761) + + # remove nans + thermal_forcing_array = np.nan_to_num(thermal_forcing_array) + salinity_array = np.nan_to_num(salinity_array) + temperature_array = np.nan_to_num(temperature_array) + + # scale data + if scaling_method.lower() == "standard": + therm_scaler = StandardScaler() + elif scaling_method.lower() == "robust": + therm_scaler = RobustScaler() + elif scaling_method.lower() == "log": + therm_scaler = LogScaler() + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + therm_scaler.fit(thermal_forcing_array) + thermal_forcing_array = therm_scaler.transform(thermal_forcing_array) + + if scaling_method.lower() == "standard": + salinity_scaler = StandardScaler() + elif scaling_method.lower() == "robust": + salinity_scaler = RobustScaler() + elif scaling_method.lower() == "log": + salinity_scaler = LogScaler() + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + salinity_scaler.fit(salinity_array) + salinity_array = salinity_scaler.transform(salinity_array) + + if scaling_method.lower() == "standard": + temp_scaler = StandardScaler() + elif scaling_method.lower() == "robust": + temp_scaler = RobustScaler() + elif scaling_method.lower() == "log": + temp_scaler = LogScaler() + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + temp_scaler.fit(temperature_array) + temperature_array = temp_scaler.transform(temperature_array) + + # run PCA + pca_tf, _ = self._run_PCA(thermal_forcing_array, num_pcs=num_pcs) + pca_sal, _ = self._run_PCA(salinity_array, num_pcs=num_pcs) + pca_temp, _ = self._run_PCA(temperature_array, num_pcs=num_pcs) + + # get percent explained + save_path = 
f"{save_dir}/AIS_pca_thermal_forcing.pth" + pca_tf.save(save_path) + + save_path = f"{save_dir}/AIS_pca_salinity.pth" + pca_sal.save(save_path) + + save_path = f"{save_dir}/AIS_pca_temperature.pth" + pca_temp.save(save_path) + + # save scalers + save_path = f"{scaler_dir}/AIS_scaler_thermal_forcing.pth" + therm_scaler.save(save_path) + + save_path = f"{scaler_dir}/AIS_scaler_temperature.pth" + temp_scaler.save(save_path) + + save_path = f"{scaler_dir}/AIS_scaler_salinity.pth" + salinity_scaler.save(save_path) + + return 0 + + def _generate_gris_atmosphere_pcas( + self, + atmosphere_fps: list, + save_dir: str, + num_pcs="95%", + scaler_dir: str = None, + scaling_method="standard", + ): + + # if no separate directory for saving scalers is specified, use the pca save_dir + if scaler_dir is None: + scaler_dir = save_dir + + # get SMB and ST paths + test_num = 5 + aSMB_fps = [x for x in atmosphere_fps if "aSMB_combined" in x][0:test_num] + aST_fps = [x for x in atmosphere_fps if "aST_combined" in x][0:test_num] + + # allocate memory + flattened_xy_dim = 337 * 577 + + smb_forcing_array = np.zeros([len(aSMB_fps), 86, flattened_xy_dim]) + st_forcing_array = np.zeros([len(aST_fps), 86, flattened_xy_dim]) + # get xarray dataset, format it, and put it in preallocated array + print("Processing aSMB PCA model.") + for i, fp in enumerate(aSMB_fps): + dataset = get_xarray_data(fp, ice_sheet=self.ice_sheet, convert_and_subset=True) + smb_forcing_array[i, :, :] = dataset["aSMB"].values.reshape( + 86, flattened_xy_dim + ) # store + print("Processing aST PCA model.") + for i, fp in enumerate(aST_fps): + dataset = get_xarray_data(fp, ice_sheet=self.ice_sheet, convert_and_subset=True) + st_forcing_array[i, :, :] = dataset["aST"].values.reshape(86, flattened_xy_dim) # store + + # reshape variable_array (num_files, num_timestamps, num_gridpoints) --> (num_files*num_timestamps, num_gridpoints) + smb_forcing_array = smb_forcing_array.reshape( + len(aSMB_fps) * len(dataset.time), flattened_xy_dim + ) + st_forcing_array = st_forcing_array.reshape( + len(aST_fps) * len(dataset.time), flattened_xy_dim + ) + + # remove nans + smb_forcing_array = np.nan_to_num(smb_forcing_array) + st_forcing_array = np.nan_to_num(st_forcing_array) + + # scale data + if scaling_method.lower() == "standard": + smb_scaler = StandardScaler() + elif scaling_method.lower() == "robust": + smb_scaler = RobustScaler() + elif scaling_method.lower() == "log": + smb_scaler = LogScaler() + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + smb_scaler.fit(smb_forcing_array) + smb_forcing_array = smb_scaler.transform(smb_forcing_array) + + if scaling_method.lower() == "standard": + st_scaler = StandardScaler() + elif scaling_method.lower() == "robust": + st_scaler = RobustScaler() + elif scaling_method.lower() == "log": + st_scaler = LogScaler() + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + st_scaler.fit(st_forcing_array) + st_forcing_array = st_scaler.transform(st_forcing_array) + + # run PCA + pca_smb, _ = self._run_PCA(smb_forcing_array, num_pcs=num_pcs) + pca_st, _ = self._run_PCA(st_forcing_array, num_pcs=num_pcs) + + # save pca models + save_path = f"{save_dir}/GrIS_pca_aSMB.pth" + pca_smb.save(save_path) + + save_path = f"{save_dir}/GrIS_pca_aST.pth" + pca_st.save(save_path) + + # save scalers + save_path = f"{scaler_dir}/GrIS_aSMB_scaler.pth" + smb_scaler.save(save_path) + + save_path = f"{scaler_dir}/GrIS_aST_scaler.pth" + st_scaler.save(save_path) + + return 0 + + def 
_generate_gris_ocean_pcas( + self, + ocean_fps: list, + save_dir: str, + num_pcs="95%", + scaler_dir: str = None, + scaling_method="standard", + ): + + # if no separate directory for saving scalers is specified, use the pca save_dir + if scaler_dir is None: + scaler_dir = save_dir + + basin_runoff_fps = [x for x in ocean_fps if "basinRunoff" in x] + thermal_forcing_fps = [x for x in ocean_fps if "oceanThermalForcing" in x] + + # allocate memory + flattened_xy_dim = 337 * 577 + basin_runoff_array = np.zeros([len(basin_runoff_fps), 86, flattened_xy_dim]) + thermal_forcing_array = np.zeros([len(thermal_forcing_fps), 86, flattened_xy_dim]) + + # get xarray dataset, format it, and put it in preallocated array + print("Processing basin_runoff PCA model.") + for i, fp in enumerate(basin_runoff_fps): + dataset = get_xarray_data(fp, ice_sheet=self.ice_sheet, convert_and_subset=True) + basin_runoff_array[i, :, :] = dataset["basin_runoff"].values.reshape( + 86, flattened_xy_dim + ) + print("Processing thermal_forcing PCA model.") + for i, fp in enumerate(thermal_forcing_fps): + dataset = get_xarray_data(fp, ice_sheet=self.ice_sheet, convert_and_subset=True) + thermal_forcing_array[i, :, :] = dataset["thermal_forcing"].values.reshape( + 86, flattened_xy_dim + ) + + # reshape variable_array (num_files, num_timestamps, num_gridpoints) --> (num_files*num_timestamps, num_gridpoints) + basin_runoff_array = basin_runoff_array.reshape( + len(basin_runoff_fps) * len(dataset.time), flattened_xy_dim + ) + thermal_forcing_array = thermal_forcing_array.reshape( + len(thermal_forcing_fps) * len(dataset.time), flattened_xy_dim + ) + + # remove nans + basin_runoff_array = np.nan_to_num(basin_runoff_array) + thermal_forcing_array = np.nan_to_num(thermal_forcing_array) + + # scale data + if scaling_method.lower() == "standard": + basin_runoff_scaler = StandardScaler() + elif scaling_method.lower() == "robust": + basin_runoff_scaler = RobustScaler() + elif scaling_method.lower() == "log": + basin_runoff_scaler = LogScaler() + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + basin_runoff_scaler.fit(basin_runoff_array) + basin_runoff_array = basin_runoff_scaler.transform(basin_runoff_array) + + if scaling_method.lower() == "standard": + thermal_forcing_scaler = StandardScaler() + elif scaling_method.lower() == "robust": + thermal_forcing_scaler = RobustScaler() + elif scaling_method.lower() == "log": + thermal_forcing_scaler = LogScaler() + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + thermal_forcing_scaler.fit(thermal_forcing_array) + thermal_forcing_array = thermal_forcing_scaler.transform(thermal_forcing_array) + + # run PCA + pca_br, _ = self._run_PCA(basin_runoff_array, num_pcs=num_pcs) + pca_tf, _ = self._run_PCA(thermal_forcing_array, num_pcs=num_pcs) + + # save PCA + save_path = f"{save_dir}/GrIS_pca_basin_runoff.pth" + pca_br.save(save_path) + + save_path = f"{save_dir}/GrIS_pca_thermal_forcing.pth" + pca_tf.save(save_path) + + # save scalers + save_path = f"{scaler_dir}/GrIS_basin_runoff_scaler.pth" + basin_runoff_scaler.save(save_path) + + save_path = f"{scaler_dir}/GrIS_thermal_forcing_scaler.pth" + thermal_forcing_scaler.save(save_path) + + return 0 + + def _generate_sle_pca( + self, + sle_fps: list, + save_dir: str, + num_pcs="99%", + scaler_dir=None, + scaling_method="standard", + ): + """ + Generate principal component analysis (PCA) for sea level equivalent (SLE) variables. + + Args: + sle_fps (list): List of file paths for SLE variables. 
+ save_dir (str): Directory to save the PCA results. + + Returns: + int: 0 if PCA generation is successful, -1 otherwise. + """ + + if scaler_dir is None: + scaler_dir = save_dir + + # get the flattened xy dimension + if self.ice_sheet == "AIS": + flattened_xy_dim = 761 * 761 + else: + flattened_xy_dim = 337 * 577 + + # allocate memory + sle_array = np.zeros([len(sle_fps), 86, flattened_xy_dim]) + + # loop through each SLE (IVAF) projection file + for i, fp in tqdm(enumerate(sle_fps), total=len(sle_fps), desc="Aggregating SLE files"): + # get the variable + try: + data_flattened = get_xarray_data(fp, var_name="sle", ice_sheet=self.ice_sheet) + except: + data_flattened = get_xarray_data(fp, var_name="ivaf", ice_sheet=self.ice_sheet) + data_flattened = data_flattened / 1e9 / 362.5 + + # store it in the total array + sle_array[i, :, :] = data_flattened + + # reshape variable_array (num_files, num_timestamps, num_gridpoints) --> (num_files*num_timestamps, num_gridpoints) + sle_array = sle_array.reshape(len(sle_fps) * 86, flattened_xy_dim) + + # since the array is so large (350*85, 761*761) = (29750, 579121), randomly sample N rows and run PCA + sle_array = sle_array[np.random.choice(sle_array.shape[0], 1590, replace=False), :] + + # deal with np.nans + sle_array = np.nan_to_num(sle_array) + + # scale sle + if scaling_method.lower() == "standard": + scaler = StandardScaler() + elif scaling_method.lower() == "robust": + scaler = RobustScaler() + elif scaling_method.lower() == "log": + scaler = LogScaler() + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + scaler.fit(sle_array) + sle_array = scaler.transform(sle_array) + + # run pca + pca, _ = self._run_PCA( + sle_array, + num_pcs=num_pcs, + ) + + # output pca object + save_path = f"{save_dir}/{self.ice_sheet}_pca_sle.pth" + pca.save(save_path) + + # and scaler + save_path = f"{scaler_dir}/{self.ice_sheet}_scaler_sle.pth" + scaler.save(save_path) + + return 0 + + def _run_PCA( + self, + variable_array, + num_pcs, + ): + """ + Runs Principal Component Analysis (PCA) on the given variable array. + + Args: + variable_array (array-like): The input array containing the variables. + num_pcs (int, optional): The number of principal components to keep. + If not specified, all components will be kept. + + Returns: + tuple: A tuple containing the fitted PCA model and the transformed array. + + """ + if isinstance(num_pcs, str) and not num_pcs.endswith("%"): + raise ValueError("num_pcs must be an integer, float, or string ending with '%'") + + # if num_pcs is a string, convert it to a float + if isinstance(num_pcs, str) and num_pcs.endswith("%"): + num_pcs = float(num_pcs.replace("%", "")) + if num_pcs > 1: + num_pcs /= 100 + + # run PCA + pca = PCA( + n_components=num_pcs, + ) + + # fit and transform the variable array + pca = pca.fit(variable_array) + pca_array = pca.transform(variable_array) + return pca, pca_array + + def _load_pca_models(self, pca_model_directory, var_name="all"): + if self.pca_model_directory is None and pca_model_directory is None: + raise ValueError( + "PCA model directory must be specified, or DimensionalityReducer.generate_pca_models must be run first." 
+ ) + if pca_model_directory is not None: + self.pca_model_directory = pca_model_directory + + # get all pca model paths + pca_models_paths = os.listdir(self.pca_model_directory) + pca_models_paths = [x for x in pca_models_paths if "pca" in x and self.ice_sheet in x] + + # load pca models + if self.ice_sheet == "AIS": + + if var_name not in [ + "all", + "evspsbl_anomaly", + "mrro_anomaly", + "pr_anomaly", + "smb_anomaly", + "ts_anomaly", + "thermal_forcing", + "salinity", + "temperature", + "sle", + None, + ]: + raise ValueError(f"Variable name {var_name} not recognized.") + + if var_name == "all" or var_name is None: + evspsbl_model = [x for x in pca_models_paths if "evspsbl" in x][0] + mrro_model = [x for x in pca_models_paths if "mrro" in x][0] + pr_model = [x for x in pca_models_paths if "pr" in x][0] + smb_model = [x for x in pca_models_paths if "smb" in x][0] + ts_model = [x for x in pca_models_paths if "ts" in x][0] + thermal_forcing_model = [x for x in pca_models_paths if "thermal_forcing" in x][0] + salinity_model = [x for x in pca_models_paths if "salinity" in x][0] + temperature_model = [x for x in pca_models_paths if "temperature" in x][0] + sle_model = [x for x in pca_models_paths if "sle" in x][0] + + pca_models = dict( + evspsbl_anomaly=PCA.load(f"{self.pca_model_directory}/{evspsbl_model}"), + mrro_anomaly=PCA.load(f"{self.pca_model_directory}/{mrro_model}"), + pr_anomaly=PCA.load(f"{self.pca_model_directory}/{pr_model}"), + smb_anomaly=PCA.load(f"{self.pca_model_directory}/{smb_model}"), + ts_anomaly=PCA.load(f"{self.pca_model_directory}/{ts_model}"), + thermal_forcing=PCA.load(f"{self.pca_model_directory}/{thermal_forcing_model}"), + salinity=PCA.load(f"{self.pca_model_directory}/{salinity_model}"), + temperature=PCA.load(f"{self.pca_model_directory}/{temperature_model}"), + sle=PCA.load(f"{self.pca_model_directory}/{sle_model}"), + ) + else: + pca_models = {} + model_path = [x for x in pca_models_paths if var_name in x][0] + pca_models[var_name] = PCA.load( + f"{self.pca_model_directory}/{model_path}", + ) + else: + if var_name not in [ + "all", + "aST", + "aSMB", + "basin_runoff", + "thermal_forcing", + "sle", + None, + ]: + raise ValueError(f"Variable name {var_name} not recognized.") + + if var_name == "all" or var_name is None: + aSMB_model = [x for x in pca_models_paths if "aSMB" in x][0] + aST_model = [x for x in pca_models_paths if "aST" in x][0] + basin_runoff_model = [x for x in pca_models_paths if "basin_runoff" in x][0] + thermal_forcing_model = [x for x in pca_models_paths if "thermal_forcing" in x][0] + + pca_models = dict( + aSMB=PCA.load(f"{self.pca_model_directory}/{aSMB_model}"), + aST=PCA.load(f"{self.pca_model_directory}/{aST_model}"), + basin_runoff=PCA.load(f"{self.pca_model_directory}/{basin_runoff_model}"), + thermal_forcing=PCA.load(f"{self.pca_model_directory}/{thermal_forcing_model}"), + sle=PCA.load(f"{self.pca_model_directory}/{sle_model}"), + ) + else: + pca_models = {} + model_path = [x for x in pca_models_paths if var_name in x][0] + pca_models[var_name] = PCA.load(f"{self.pca_model_directory}/{model_path}") + + return pca_models + + def _load_scalers(self, scaler_directory, var_name="all", scaling_method="standard"): + + if scaling_method.lower() == "standard": + scaler_class = StandardScaler + elif scaling_method.lower() == "robust": + scaler_class = RobustScaler + elif scaling_method.lower() == "log": + scaler_class = LogScaler + else: + raise ValueError(f"Scaler method {scaling_method} not recognized.") + if self.scaler_directory is 
None and scaler_directory is None: + warnings.warn( + "self.scaler_directory is None, resorting to using self.pca_model_directory" + ) + if self.pca_model_directory is None: + raise ValueError( + "Scaler directory must be specified, or DimensionalityReducer.generate_pca_models must be run first." + ) + self.scaler_directory = self.pca_model_directory + if scaler_directory is not None: + self.scaler_directory = scaler_directory + + # get all scaler model paths + scaler_paths = os.listdir(self.scaler_directory) + scaler_paths = [x for x in scaler_paths if "scaler" in x and self.ice_sheet in x] + + # load scaler models + if self.ice_sheet == "AIS": + + if var_name not in [ + "all", + "evspsbl_anomaly", + "mrro_anomaly", + "pr_anomaly", + "smb_anomaly", + "ts_anomaly", + "thermal_forcing", + "salinity", + "temperature", + "sle", + None, + ]: + raise ValueError(f"Variable name {var_name} not recognized.") + + if var_name == "all" or var_name is None: + evspsbl_model = [x for x in scaler_paths if "evspsbl" in x][0] + mrro_model = [x for x in scaler_paths if "mrro" in x][0] + pr_model = [x for x in scaler_paths if "pr" in x][0] + smb_model = [x for x in scaler_paths if "smb" in x][0] + ts_model = [x for x in scaler_paths if "ts" in x][0] + thermal_forcing_model = [x for x in scaler_paths if "thermal_forcing" in x][0] + salinity_model = [x for x in scaler_paths if "salinity" in x][0] + temperature_model = [x for x in scaler_paths if "temperature" in x][0] + sle_model = [x for x in scaler_paths if "sle" in x][0] + + scalers = dict( + evspsbl_anomaly=scaler_class.load(f"{self.scaler_directory}/{evspsbl_model}"), + mrro_anomaly=scaler_class.load(f"{self.scaler_directory}/{mrro_model}"), + pr_anomaly=scaler_class.load(f"{self.scaler_directory}/{pr_model}"), + smb_anomaly=scaler_class.load(f"{self.scaler_directory}/{smb_model}"), + ts_anomaly=scaler_class.load(f"{self.scaler_directory}/{ts_model}"), + thermal_forcing=scaler_class.load( + f"{self.scaler_directory}/{thermal_forcing_model}" + ), + salinity=scaler_class.load(f"{self.scaler_directory}/{salinity_model}"), + temperature=scaler_class.load(f"{self.scaler_directory}/{temperature_model}"), + sle=scaler_class.load(f"{self.scaler_directory}/{sle_model}"), + ) + else: + scalers = {} + scaler_path = [x for x in scaler_paths if var_name in x][0] + scalers[var_name] = scaler_class.load(f"{self.scaler_directory}/{scaler_path}") + + else: # GrIS + if var_name not in [ + "all", + "aST", + "aSMB", + "basin_runoff", + "thermal_forcing", + "sle", + None, + ]: + raise ValueError(f"Variable name {var_name} not recognized.") + + if var_name == "all" or var_name is None: + aSMB_model = [x for x in scaler_paths if "aSMB" in x][0] + aST_model = [x for x in scaler_paths if "aST" in x][0] + basin_runoff_model = [x for x in scaler_paths if "basin_runoff" in x][0] + thermal_forcing_model = [x for x in scaler_paths if "thermal_forcing" in x][0] + sle_model = [x for x in scaler_paths if "sle" in x][0] + + scalers = dict( + aSMB=scaler_class.load( + f"{self.scaler_directory}/{aSMB_model}", + ), + aST=scaler_class.load( + f"{self.scaler_directory}/{aST_model}", + ), + basin_runoff=scaler_class.load( + f"{self.scaler_directory}/{basin_runoff_model}", + ), + thermal_forcing=scaler_class.load( + f"{self.scaler_directory}/{thermal_forcing_model}", + ), + sle=scaler_class.load( + f"{self.scaler_directory}/{sle_model}", + ), + ) + else: + scalers = {} + scaler_path = [x for x in scaler_paths if var_name in x][0] + scalers[var_name] = 
scaler_class.load(f"{self.scaler_directory}/{scaler_path}") + + return scalers + + def transform( + self, + x, + var_name, + num_pcs=None, + pca_model_directory=None, + scaler_directory=None, + scaling_method="standard", + ): + """ + Transform the given variable into PCA space. + + Args: + x (array-like): The input array containing the variables. + variable (str): The name of the variable to transform. + pca_models_paths (dict): A dictionary containing the filepaths for the PCA models. + + Returns: + array-like: The transformed array. + """ + # + if pca_model_directory is None and self.pca_model_directory is None: + raise ValueError( + "PCA model directory must be specified, or DimensionalityReducer.generate_pca_models must be run first." + ) + + if pca_model_directory is not None: + self.pca_model_directory = pca_model_directory + + if scaler_directory is None and self.scaler_directory is None: + raise ValueError( + "PCA model directory must be specified, or DimensionalityReducer.generate_pca_models must be run first." + ) + + if scaler_directory is not None: + self.scaler_directory = scaler_directory + + if len(x.shape) == 3: + x = x.reshape(x.shape[0], -1) + + # load pca and scaler models + pca_models = self._load_pca_models(self.pca_model_directory, var_name=var_name) + scalers = self._load_scalers( + self.scaler_directory, var_name=var_name, scaling_method=scaling_method + ) + pca = pca_models[var_name] + scaler = scalers[var_name] + x = np.nan_to_num(x) + + # scale and transform + scaled = scaler.transform(x) + transformed = pca.transform(scaled) + + # if num_pcs is a string, convert it to a float + if num_pcs is not None and num_pcs.endswith("%"): + exp_var_pca = pca.explained_variance_ratio_ + cum_sum_eigenvalues = np.cumsum(exp_var_pca) + num_pcs_cutoff = cum_sum_eigenvalues > float(num_pcs.replace("%", "")) / 100 + if ~num_pcs_cutoff.any(): + warnings.warn( + f"Explained variance cutoff ({num_pcs}) not reached, using all PCs available ({len(cum_sum_eigenvalues)})." + ) + num_pcs = len(cum_sum_eigenvalues) + else: + num_pcs = np.argmax(num_pcs_cutoff) + 1 + + return transformed[:, :num_pcs] + + def invert(self, pca_x, var_name, pca_model_directory=None, scaler_directory=None): + """ + Invert the given variable from PCA space. + + Args: + pca_x (array-like): The input array containing the variables in PCA space. + variable (str): The name of the variable to transform. + pca_models_paths (dict): A dictionary containing the filepaths for the PCA models. + + Returns: + array-like: The inverted array. + """ + if pca_model_directory is None and self.pca_model_directory is None: + raise ValueError( + "PCA model directory must be specified, or DimensionalityReducer.generate_pca_models must be run first." + ) + + if pca_model_directory is not None: + self.pca_model_directory = pca_model_directory + + # load pca and calculate inverse + pca_models = self._load_pca_models(pca_model_directory, var_name=var_name) + pca = pca_models[var_name] + inverted = pca.inverse_transform(pca_x) + + # unscale pca inverse + scalers = self._load_scalers(scaler_directory, var_name=var_name) + scaler = scalers[var_name] + unscaled = scaler.inverse_transform(inverted) + return unscaled + + +def get_xarray_data(dataset_fp, var_name=None, ice_sheet="AIS", convert_and_subset=False): + """ + Retrieves data from an xarray dataset. + + Args: + dataset_fp (str): The file path to the xarray dataset. + var_name (str, optional): The name of the variable to retrieve from the dataset. Defaults to None. 
+ ice_sheet (str, optional): The ice sheet type. Defaults to 'AIS'. + convert_and_subset (bool, optional): Flag indicating whether to convert and subset the dataset. Defaults to False. + + Returns: + np.ndarray or xr.Dataset: The retrieved data from the dataset. + """ + + dataset = xr.open_dataset( + dataset_fp, + decode_times=False, + engine="netcdf4", + ) + try: + dataset = dataset.transpose("time", "x", "y", ...) + except: + pass + + if "ivaf" in dataset.variables: + pass + + else: + + # handle extra dimensions and variables + try: + dataset = dataset.drop_dims("nv4") + except ValueError: + pass + + for var in [ + "z_bnds", + "lat", + "lon", + "mapping", + "time_bounds", + "lat2d", + "lon2d", + "polar_stereographic", + ]: + try: + dataset = dataset.drop(labels=[var]) + except ValueError: + pass + if "z" in dataset.dims: + dataset = dataset.mean(dim="z", skipna=True) + + # subset the dataset for 5km resolution (GrIS) + if dataset.dims["x"] == 1681 and dataset.dims["y"] == 2881: + dataset = dataset.sel(x=dataset.x.values[::5], y=dataset.y.values[::5]) + + if convert_and_subset: + dataset = convert_and_subset_times(dataset) + + if var_name is not None: + try: + data = dataset[var_name].values + except KeyError: + return np.nan, np.nan + + x_dim = 761 if ice_sheet.lower() == "ais" else 337 + y_dim = 761 if ice_sheet.lower() == "ais" else 577 + if ( + "time" not in dataset.dims + or dataset.dims["time"] == 1 + or (data.shape[1] == y_dim and data.shape[2] == x_dim) + ): + pass + else: + # TODO: fix this. this is just a weird way of tranposing, not sure if it even happens. + grid_indices = np.array([0, 1, 2])[ + (np.array(data.shape) == x_dim) | (np.array(data.shape) == y_dim) + ] + data = np.moveaxis(data, list(grid_indices), [1, 2]) + + if "time" not in dataset.dims: + data_flattened = data.reshape( + -1, + ) + else: + data_flattened = data.reshape(len(dataset.time), -1) + return data_flattened + + return dataset + + +class DatasetMerger: + """ + A class for merging datasets from forcing and projection files. + """ + + def __init__(self, ice_sheet, forcings, projections, experiment_file, output_dir): + """ + Initializes a DatasetMerger object. + + Args: + ice_sheet (str): The ice sheet name. + forcing_dir (str): The directory path for forcing files. + projection_dir (str): The directory path for projection files. + experiment_file (str): The path to the experiment file (CSV or JSON). + output_dir (str): The directory path to save the merged dataset. + """ + self.ice_sheet = ice_sheet + self.forcings = forcings + self.projections = projections + self.experiment_file = experiment_file + self.output_dir = output_dir + + if self.experiment_file.endswith(".csv"): + self.experiments = pd.read_csv(experiment_file) + self.experiments.ice_sheet = self.experiments.ice_sheet.apply(lambda x: x.lower()) + elif self.experiment_file.endswith(".json"): + self.experiments = pd.read_json(experiment_file).T + else: + raise ValueError("Experiment file must be a CSV or JSON file.") + + self.forcing_paths = get_all_filepaths( + path=self.forcings, + filetype="csv", + ) + self.projection_paths = get_all_filepaths( + path=self.projections, + filetype="csv", + ) + self.forcing_metadata = self._get_forcing_metadata() + + def merge_dataset(self): + """ + Merges the forcing and projection files and creates a dataset. + + Returns: + int: Returns 0 after successfully merging and saving the dataset. 
+ """ + full_dataset = pd.DataFrame() + self.experiments["exp"] = self.experiments["exp"].apply(lambda x: x.lower()) + + for i, projection in enumerate( + tqdm( + self.projection_paths, + total=len(self.projection_paths), + desc="Merging forcing & projection files", + ) + ): + # get experiment from projection filepath + exp = projection.replace(".csv", "").split("/")[-1].split("_")[-1] + + # make sure cases match when doing table lookup + + # get AOGCM value from table lookup + try: + aogcm = self.experiments.loc[ + (self.experiments.exp == exp.lower()) + & (self.experiments.ice_sheet == self.ice_sheet.lower()) + ]["AOGCM"].values[0] + except IndexError: + aogcm = self.experiments.loc[self.experiments.exp == exp.lower()]["AOGCM"].values[0] + proj_cmip_model = aogcm.split("_")[0] + proj_pathway = aogcm.split("_")[-1] + + # names of CMIP models are slightly different, adjust based on AIS/GrIS directories + if self.ice_sheet == "AIS": + if proj_cmip_model == "csiro-mk3.6": + proj_cmip_model = "csiro-mk3-6-0" + elif proj_cmip_model == "ipsl-cm5-mr": + proj_cmip_model = "ipsl-cm5a-mr" + elif proj_cmip_model == "cnrm-esm2" or proj_cmip_model == "cnrm-cm6": + proj_cmip_model = f"{proj_cmip_model}-1" + elif self.ice_sheet == "GrIS": + if proj_cmip_model.lower() == "noresm1-m": + proj_cmip_model = "noresm1" + elif proj_cmip_model.lower() == "ipsl-cm5-mr": + proj_cmip_model = "ipsl-cm5" + elif proj_cmip_model.lower() == "access1-3": + proj_cmip_model = "access1.3" + elif proj_cmip_model.lower() == "ukesm1-0-ll": + proj_cmip_model = "ukesm1-cm6" + + # get forcing file from table lookup that matches projection + forcing_files = self.forcing_metadata.file.loc[ + (self.forcing_metadata.cmip_model == proj_cmip_model) + & (self.forcing_metadata.pathway == proj_pathway) + ] + + if forcing_files.empty: + raise IndexError( + f"Could not find forcing file for {aogcm}. Check formatting of experiment file." 
+ ) + + if len(forcing_files) > 1: + forcings = pd.DataFrame() + for file in forcing_files.values: + forcings = pd.concat( + [forcings, pd.read_csv(f"{self.forcings}/{file}.csv")], axis=1 + ) + else: + forcing_file = forcing_files.values[0] + forcings = pd.read_csv(f"{self.forcings}/{forcing_file}.csv") + + # load forcing and projection datasets + projections = pd.read_csv(projection) + # if forcings are longer than projections, cut off the beginning of the forcings + if len(forcings) > len(projections): + forcings = forcings.iloc[-len(projections) :].reset_index(drop=True) + + # add forcings and projections together and add some metadata + merged_dataset = pd.concat([forcings, projections], axis=1) + merged_dataset["time"] = np.arange(1, len(merged_dataset) + 1) + merged_dataset["cmip_model"] = proj_cmip_model + merged_dataset["pathway"] = proj_pathway + merged_dataset["exp"] = exp + merged_dataset["id"] = i + + # now add to dataset with all forcing/projection pairs + full_dataset = pd.concat([full_dataset, merged_dataset]) + + # save the full dataset + full_dataset.to_csv(f"{self.output_dir}/dataset.csv", index=False) + + return 0 + + def merge_sectors(self, forcings_file=None, projections_file=None, save_dir=None): + + pass + + # def merge(self, inputs='pca', outputs='sectors', save_dir=None): + # if save_dir is None: + # save_dir = self.output_dir + + # full_dataset = pd.DataFrame() + # self.experiments['exp'] = self.experiments['exp'].apply(lambda x: x.lower()) + + # if outputs.lower() == 'average' or outputs.lower() == 'sectors': + # paths = get_all_filepaths(path=self.projection_dir, filetype='nc', contains='rm', not_contains='historical') + # paths = [x for x in paths if 'ctrl' not in x] + + # for i, projection in enumerate(tqdm(paths, total=len(paths), desc="Merging forcing & projection files")): + # # get experiment from projection filepath + + # exp = projection.replace('.nc', '').replace('.csv', '').split('/')[-1].split('_')[-1] + + # # make sure cases match when doing table lookup + + # # get AOGCM value from table lookup + # try: + # aogcm = self.experiments.loc[(self.experiments.exp == exp.lower()) & (self.experiments.ice_sheet ==self.ice_sheet.lower())]['AOGCM'].values[0] + # except IndexError: + # aogcm = self.experiments.loc[self.experiments.exp == exp.lower()]['AOGCM'].values[0] + # proj_cmip_model = aogcm.split('_')[0] + # proj_pathway = aogcm.split('_')[-1] + + # # names of CMIP models are slightly different, adjust based on AIS/GrIS directories + # if self.ice_sheet == 'AIS': + # if proj_cmip_model == 'csiro-mk3.6': + # proj_cmip_model = 'csiro-mk3-6-0' + # elif proj_cmip_model == 'ipsl-cm5-mr': + # proj_cmip_model = 'ipsl-cm5a-mr' + # elif proj_cmip_model == 'cnrm-esm2' or proj_cmip_model == 'cnrm-cm6': + # proj_cmip_model = f'{proj_cmip_model}-1' + # elif self.ice_sheet == 'GrIS': + # if proj_cmip_model.lower() == 'noresm1-m': + # proj_cmip_model = 'noresm1' + # elif proj_cmip_model.lower() == 'ipsl-cm5-mr': + # proj_cmip_model = 'ipsl-cm5' + # elif proj_cmip_model.lower() == 'access1-3': + # proj_cmip_model = 'access1' + + # # get forcing file from table lookup that matches projection + # forcing_files = self.forcing_metadata.file.loc[(self.forcing_metadata.cmip_model == proj_cmip_model) & (self.forcing_metadata.pathway == proj_pathway)] + + # if forcing_files.empty: + # raise IndexError(f"Could not find forcing file for {aogcm}. 
Check formatting of experiment file.") + + # if len(forcing_files) > 1: + # forcings = pd.DataFrame() + # for file in forcing_files.values: + # forcings = pd.concat([forcings, pd.read_csv(f"{self.forcing_dir}/{file}.csv")], axis=1) + # else: + # forcing_file = forcing_files.values[0] + # forcings = pd.read_csv(f"{self.forcing_dir}/{forcing_file}.csv") + + # # load forcing and projection datasets + # if 'nc' in projection: + # projections = xr.open_dataset(projection) + # projections = projections.to_dataframe() + # projections = projections[[x for x in projections.columns if 'ivaf' in x]] + # projections = projections / 1e9 / 362.5 + # else: + # projections = pd.read_csv(projection) + + # # if forcings are longer than projections, cut off the beginning of the forcings + # if len(forcings) > len(projections): + # forcings = forcings.iloc[-len(projections):].reset_index(drop=True) + + # # add forcings and projections together and add some metadata + # merged_dataset = pd.concat([forcings, projections], axis=1) + # merged_dataset['cmip_model'] = proj_cmip_model + # merged_dataset['pathway'] = proj_pathway + # merged_dataset['exp'] = exp + # merged_dataset['id'] = i + + # # now add to dataset with all forcing/projection pairs + # full_dataset = pd.concat([full_dataset, merged_dataset]) + + # # save the full dataset + # full_dataset.to_csv(f"{self.output_dir}/dataset.csv", index=False) + + def _get_forcing_metadata(self): + """ + Retrieves the metadata for the forcing files. + + Returns: + df (pandas.DataFrame): DataFrame containing the metadata for the forcing files. + The DataFrame has three columns: 'file', 'cmip_model', and 'pathway'. + """ + pairs = {} + # loop through forcings, looking for cmip model and pathway + for forcing in self.forcing_paths: + if ( + forcing + == r"/oscar/home/pvankatw/scratch/pca/AIS/forcings/PCA_IPSL-CM5A-MR_RCP26_salinity_8km_x_60m.csv" + ): + stop = "stop" + forcing = forcing.replace(".csv", "").split("/")[-1] + cmip_model = forcing.split("_")[1] + + # GrIS has MAR3.9 in name, ignore + if cmip_model == "MAR3.9": + cmip_model = forcing.split("_")[2] + elif cmip_model.lower() == "gris": + cmip_model = forcing.split("_")[2] + + if "rcp" in forcing.lower() or "ssp" in forcing.lower(): + for substring in forcing.lower().split("_"): + if "rcp" in substring or "ssp" in substring: + pathway = substring.lower() + if len(pathway.split("-")) > 1 and ( + "rcp" in pathway.split("-")[-1] or "ssp" in pathway.split("-")[-1] + ): + if len(pathway.split("-")) > 2: + cmip_model = "-".join(pathway.split("-")[0:2]) + pathway = pathway.split("-")[-1] + else: + cmip_model = pathway.split("-")[0] + pathway = pathway.split("-")[-1] + break + else: + pathway = "rcp85" + if self.ice_sheet == "GrIS": + if cmip_model.lower() == "noresm1-m": + cmip_model = "noresm1" + elif cmip_model.lower() == "ipsl-cm5-mr": + cmip_model = "ipsl-cm5" + elif cmip_model.lower() == "access1-3": + cmip_model = "access1.3" + elif cmip_model.lower() == "ukesm1-0-ll": + cmip_model = "ukesm1-cm6" + + pairs[forcing] = [cmip_model.lower(), pathway.lower()] + df = pd.DataFrame(pairs).T + df = pd.DataFrame(pairs).T.reset_index() + df.columns = ["file", "cmip_model", "pathway"] + + return df + + +def combine_gris_forcings(forcing_dir): + """ + Combine GrIS forcings from multiple CMIP directories into a single NetCDF file. + + Parameters: + - forcing_dir (str): The directory containing the GrIS forcings. + + Returns: + - int: 0 indicating successful completion of the function. 
+ """ + + atmosphere_dir = f"{forcing_dir}/GrIS/Atmosphere_Forcing/aSMB_observed/v1/" + cmip_directories = next(os.walk(atmosphere_dir))[1] + for cmip_dir in tqdm( + cmip_directories, total=len(cmip_directories), desc="Processing CMIP directories" + ): + for var in [f"aSMB", f"aST"]: + files = os.listdir(f"{atmosphere_dir}/{cmip_dir}/{var}") + files = np.array([x for x in files if x.endswith(".nc")]) + years = np.array([int(x.replace(".nc", "").split("-")[-1]) for x in files]) + year_files = files[(years >= 2015) & (years <= 2100)] + + for i, file in enumerate(year_files): + # first iteration, open dataset and store + if i == 0: + dataset = xr.open_dataset(f"{atmosphere_dir}/{cmip_dir}/{var}/{file}") + for dim in ["nv", "nv4", "mapping"]: + try: + dataset = dataset.drop_dims(dim) + except: + pass + dataset = dataset.drop("mapping") + dataset = dataset.sel(x=dataset.x.values[::5], y=dataset.y.values[::5]) + continue + + # following iterations, open dataset and concatenate + data = xr.open_dataset(f"{atmosphere_dir}/{cmip_dir}/{var}/{file}") + for dim in ["nv", "nv4"]: + try: + data = data.drop_dims(dim) + except: + pass + data = data.drop("mapping") + data = data.sel(x=data.x.values[::5], y=data.y.values[::5]) + # data['time'] = pd.to_datetime(year, format='%Y') + dataset = xr.concat([dataset, data], dim="time") + + # Now you have the dataset with the files loaded and time dimension set + dataset.to_netcdf( + os.path.join(atmosphere_dir, cmip_dir, f"GrIS_{cmip_dir}_{var}_combined.nc") + ) + + return 0 + + +def process_GrIS_atmospheric_sectors(forcing_directory, grid_file): + + start_time = time.time() + path_to_forcings = f"Atmosphere_Forcing/aSMB_observed/v1/" + af_directory = ( + f"{forcing_directory}/{path_to_forcings}" + if not forcing_directory.endswith(path_to_forcings) + else forcing_directory + ) + + # check to see if GrIS forcings have been combined + filepaths = get_all_filepaths(path=af_directory, contains="combined", filetype="nc") + if not filepaths: + combine_gris_forcings(af_directory) + filepaths = get_all_filepaths(path=af_directory, contains="combined", filetype="nc") + if not filepaths: + raise ValueError("No combined files found. 
Check combine_gris_forcings function.") + + aogcm_directories = os.listdir(af_directory) + aogcm_directories = [x for x in aogcm_directories if "DS_Store" not in x and "README" not in x] + + sectors = _format_grid_file(grid_file) + unique_sectors = np.unique(sectors) + all_data = [] + for i, fp in enumerate(aogcm_directories): + print("") + print(f"Directory {i+1} / {len(aogcm_directories)}") + print(f'Directory: {fp.split("/")[-1]}') + print(f"Time since start: {(time.time()-start_time) // 60} minutes") + + files = get_all_filepaths(path=f"{af_directory}/{fp}", contains="combined", filetype="nc") + if len(files) != 2: + raise ValueError(f"There should only be 2 combined files in each firectory, see {fp}.") + + st_and_smb = [] + for file in files: + dataset = xr.open_dataset(file, decode_times=False) + dataset = convert_and_subset_times(dataset) + + # handle extra dimensions and variables + try: + dataset = dataset.drop_dims("nv4") + except ValueError: + pass + + for var in [ + "z_bnds", + "lat", + "lon", + "mapping", + "time_bounds", + "lat2d", + "lon2d", + "polar_stereographic", + ]: + try: + dataset = dataset.drop(labels=[var]) + except ValueError: + pass + if "z" in dataset.dims: + dataset = dataset.mean(dim="z", skipna=True) + + dataset["sector"] = sectors + + formatted_aogcm = fp.rsplit("-", 1) + formatted_aogcm = "_".join(formatted_aogcm).lower() + + aogcm_data = [] + for sector in unique_sectors: + mask = dataset.sector == sector + sector_averages = dataset.where(mask, drop=True).mean(dim=["x", "y"]) + sector_averages = sector_averages.to_dataframe() + sector_averages["aogcm"] = formatted_aogcm + sector_averages["year"] = np.arange(1, 87) + sector_averages = sector_averages.reset_index(drop=True) + aogcm_data.append(sector_averages) + st_and_smb.append(pd.concat(aogcm_data)) + + all_data.append(pd.concat(st_and_smb, axis=1)) + + return pd.concat(all_data) + + +def process_AIS_atmospheric_sectors(forcing_directory, grid_file): + + ice_sheet = "AIS" + + start_time = time.time() + path_to_forcings = "AIS/Atmosphere_Forcing/" + af_directory = ( + f"{forcing_directory}/{path_to_forcings}" + if not forcing_directory.endswith(path_to_forcings) + else forcing_directory + ) + + filepaths = get_all_filepaths(path=af_directory, filetype="nc") + filepaths = [f for f in filepaths if "1995-2100" in f] + filepaths = [f for f in filepaths if "8km" in f] + + sectors = _format_grid_file(grid_file) + unique_sectors = np.unique(sectors) + all_data = [] + for i, fp in enumerate(filepaths): + fp = r"/oscar/home/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing//AIS/Atmosphere_Forcing/miroc-esm-chem_rcp8.5/Regridded_8km/MIROC-ESM-CHEM_8km_anomaly_1995-2100.nc" + print("") + print(f"File {i+1} / {len(filepaths)}") + print(f'File: {fp.split("/")[-1]}') + print(f"Time since start: {(time.time()-start_time) // 60} minutes") + + dataset = xr.open_dataset(fp, decode_times=False) + dataset = convert_and_subset_times(dataset) + + # handle extra dimensions and variables + try: + dataset = dataset.drop_dims("nv4") + except ValueError: + pass + + for var in ["z_bnds", "lat", "lon", "mapping", "time_bounds", "lat2d", "lon2d"]: + try: + dataset = dataset.drop(labels=[var]) + except ValueError: + pass + if "z" in dataset.dims: + dataset = dataset.mean(dim="z", skipna=True) + + # dataset = dataset.transpose("time", "x", "y", ...) 
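Editor's aside: the dense loop that follows (and its GrIS counterpart above) relies on one xarray idiom, assigning a sector-ID grid to the dataset and taking a masked spatial mean per sector. A minimal, self-contained sketch of that idiom on synthetic data is shown below; the grid size, the `ts_anomaly` variable name, and the two-sector layout are placeholders for illustration only, not ISMIP6 values.

```python
# Sketch of the where(mask).mean(dim=["x", "y"]) sector-averaging pattern, on toy data.
import numpy as np
import pandas as pd
import xarray as xr

n_time, n_x, n_y = 3, 4, 4
ds = xr.Dataset(
    {"ts_anomaly": (("time", "x", "y"), np.random.rand(n_time, n_x, n_y))},
    coords={"time": np.arange(n_time), "x": np.arange(n_x), "y": np.arange(n_y)},
)
# toy sector grid: half the domain is sector 1, the other half sector 2
ds["sector"] = xr.DataArray(
    np.repeat([1, 2], n_x * n_y // 2).reshape(n_x, n_y), dims=("x", "y")
)

per_sector = []
for sector in np.unique(ds["sector"]):
    mask = ds.sector == sector
    # spatial mean over the cells belonging to this sector, one row per time step
    avg = ds.where(mask).mean(dim=["x", "y"], skipna=True).to_dataframe().reset_index()
    avg["sector"] = sector
    per_sector.append(avg)

sector_means = pd.concat(per_sector)  # long table: one row per (time, sector)
```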
+ dataset["sector"] = sectors + + aogcm_data = [] + for sector in unique_sectors: + mask = dataset.sector == sector + sector_averages = dataset.where( + mask, + ).mean(dim=["x", "y"], skipna=True) + sector_averages = sector_averages.to_dataframe() + sector_averages["aogcm"] = fp.split("/")[-3].lower() + sector_averages["year"] = np.arange(1, 87) + sector_averages = sector_averages.reset_index(drop=True) + aogcm_data.append(sector_averages) + + all_data.append(pd.concat(aogcm_data)) + atmospheric_df = pd.concat(all_data) + atmospheric_df = atmospheric_df.loc[:, ~atmospheric_df.columns.duplicated()] + return atmospheric_df + + +def process_AIS_oceanic_sectors(forcing_directory, grid_file): + + start_time = time.time() + directory = ( + f"{forcing_directory}/Ocean_Forcing/" + if not forcing_directory.endswith("Ocean_Forcing/") + else forcing_directory + ) + # Get all NC files that contain data from 1995-2100 + filepaths = get_all_filepaths(path=directory, filetype="nc") + filepaths = [f for f in filepaths if "1995-2100" in f] + filepaths = [f for f in filepaths if "8km" in f] + + # In the case of ocean forcings, use the filepaths of the files to determine + # which directories need to be used for OceanForcing processing. Change to + # those directories rather than individual files. + aogcms = list(set([f.split("/")[-3] for f in filepaths])) + filepaths = [f"{directory}/{aogcm}/" for aogcm in aogcms] + + # Useful progress prints + print("Files to be processed...") + print([f.split("/")[-2] for f in filepaths]) + + sectors = _format_grid_file(grid_file) + unique_sectors = np.unique(sectors) + all_data = [] + for i, directory in enumerate(filepaths): + print("") + print(f"File {i+1} / {len(filepaths)}") + print(f'File: {directory.split("/")[-1]}') + print(f"Time since start: {(time.time()-start_time) // 60} minutes") + + files = os.listdir(f"{directory}/1995-2100/") + if len(files) != 3: + warnings.warn(f"Directory {directory} does not contain 3 files.") + + thermal_forcing_file = [f for f in files if "thermal_forcing" in f][0] + salinity_file = [f for f in files if "salinity" in f][0] + temperature_file = [f for f in files if "temperature" in f][0] + + thermal_forcing = xr.open_dataset( + f"{directory}/1995-2100/{thermal_forcing_file}", decode_times=False + ) + salinity = xr.open_dataset(f"{directory}/1995-2100/{salinity_file}", decode_times=False) + temperature = xr.open_dataset( + f"{directory}/1995-2100/{temperature_file}", decode_times=False + ) + + thermal_forcing = convert_and_subset_times(thermal_forcing) + salinity = convert_and_subset_times(salinity) + temperature = convert_and_subset_times(temperature) + + data = { + "thermal_forcing": thermal_forcing, + "salinity": salinity, + "temperature": temperature, + } + aogcm_data = {"thermal_forcing": [], "salinity": [], "temperature": []} + for name, dataset in data.items(): + # handle extra dimensions and variables + try: + dataset = dataset.drop_dims("nv4") + except ValueError: + pass + + for var in [ + "z_bnds", + "lat", + "lon", + "mapping", + "time_bounds", + "lat2d", + "lon2d", + "polar_stereographic", + ]: + try: + dataset = dataset.drop(labels=[var]) + except ValueError: + pass + if "z" in dataset.dims: + dataset = dataset.mean(dim="z", skipna=True) + + try: + dataset["sector"] = sectors + except ValueError: + dataset["time"] = np.arange(1, 87) + dataset["sector"] = sectors + + for sector in unique_sectors: + mask = dataset.sector == sector + sector_averages = dataset.where(mask, drop=True).mean(dim=["x", "y"]) + sector_averages = 
sector_averages.to_dataframe() + sector_averages["aogcm"] = _format_AIS_ocean_aogcm_name( + directory.split("/")[-2].lower() + ) + sector_averages["year"] = np.arange(1, 87) + sector_averages = sector_averages.reset_index(drop=True) + aogcm_data[name].append(sector_averages) + df = pd.concat( + [ + pd.concat(aogcm_data["thermal_forcing"]), + pd.concat(aogcm_data["salinity"]), + pd.concat(aogcm_data["temperature"]), + ], + axis=1, + ) + df = df.loc[:, ~df.columns.duplicated()] + all_data.append(df) + return pd.concat(all_data) + + +def process_GrIS_oceanic_sectors(forcing_directory, grid_file): + + start_time = time.time() + path_to_forcing = "Ocean_Forcing/Melt_Implementation/v4/" + forcing_directory = ( + f"{forcing_directory}/{path_to_forcing}" + if not forcing_directory.endswith(path_to_forcing) + else forcing_directory + ) + + aogcm_directories = os.listdir(forcing_directory) + aogcm_directories = [x for x in aogcm_directories if "DS_Store" not in x and "README" not in x] + + sectors = _format_grid_file(grid_file) + unique_sectors = np.unique(sectors) + all_data = [] + for i, directory in enumerate(aogcm_directories): + print("") + print(f"Directory {i+1} / {len(aogcm_directories)}") + print(f'Directory: {directory.split("/")[-1]}') + print(f"Time since start: {(time.time()-start_time) // 60} minutes") + + files = os.listdir(f"{forcing_directory}/{directory}") + if len(files) != 2: + warnings.warn(f"Directory {directory} does not contain 2 files.") + + thermal_forcing_file = [f for f in files if "thermalforcing" in f.lower()][0] + basin_runoff_file = [f for f in files if "basinrunoff" in f.lower()][0] + + thermal_forcing = xr.open_dataset( + f"{forcing_directory}/{directory}/{thermal_forcing_file}", decode_times=False + ) + basin_runoff = xr.open_dataset( + f"{forcing_directory}/{directory}/{basin_runoff_file}", decode_times=False + ) + + # subset the dataset for 5km resolution (GrIS) + if thermal_forcing.dims["x"] == 1681 and thermal_forcing.dims["y"] == 2881: + thermal_forcing = thermal_forcing.sel( + x=thermal_forcing.x.values[::5], y=thermal_forcing.y.values[::5] + ) + basin_runoff = basin_runoff.sel( + x=basin_runoff.x.values[::5], y=basin_runoff.y.values[::5] + ) + + thermal_forcing = convert_and_subset_times(thermal_forcing) + basin_runoff = convert_and_subset_times(basin_runoff) + + data = { + "thermal_forcing": thermal_forcing, + "basin_runoff": basin_runoff, + } + aogcm_data = { + "thermal_forcing": [], + "basin_runoff": [], + } + for name, dataset in data.items(): + # handle extra dimensions and variables + try: + dataset = dataset.drop_dims("nv4") + except ValueError: + pass + + for var in [ + "z_bnds", + "lat", + "lon", + "mapping", + "time_bounds", + "lat2d", + "lon2d", + "polar_stereographic", + ]: + try: + dataset = dataset.drop(labels=[var]) + except ValueError: + pass + if "z" in dataset.dims: + dataset = dataset.mean(dim="z", skipna=True) + + try: + dataset["sector"] = sectors + except ValueError: + dataset["time"] = np.arange(1, 87) + dataset["sector"] = sectors + + for sector in unique_sectors: + mask = dataset.sector == sector + sector_averages = dataset.where(mask, drop=True).mean(dim=["x", "y"]) + sector_averages = sector_averages.to_dataframe() + sector_averages["aogcm"] = _format_GrIS_ocean_aogcm_name(directory) + sector_averages["year"] = np.arange(1, 87) + sector_averages = sector_averages.reset_index(drop=True) + aogcm_data[name].append(sector_averages) + df = pd.concat( + [ + pd.concat(aogcm_data["thermal_forcing"]), + 
pd.concat(aogcm_data["basin_runoff"]), + ], + axis=1, + ) + df = df.loc[:, ~df.columns.duplicated()] + all_data.append(df) + return pd.concat(all_data) + + +def _format_grid_file(grid_file): + if isinstance(grid_file, str): + grids = xr.open_dataset(grid_file) # .transpose('x', 'y',) + sector_name = "sectors" if "8km" in grid_file.lower() else "ID" + elif isinstance(grid_file, xr.Dataset): + sector_name = "ID" if "Rignot" in grids.Description else "sectors" + else: + raise ValueError("grid_file must be a string or an xarray Dataset.") + + grids = grids.expand_dims(dim={"time": 86}) + sectors = grids[sector_name] + grids = grids.transpose("time", "x", "y", ...) + + return sectors + + +def process_AIS_outputs( + zenodo_directory, +): + + directory = ( + f"{zenodo_directory}/ComputedScalarsPaper/" + if not zenodo_directory.endswith("ComputedScalarsPaper") + else zenodo_directory + ) + files = get_all_filepaths(directory, contains="ivaf_minus_ctrl_proj", filetype="nc") + count = 0 + + all_files_data = [] + for i, f in enumerate(files): + exp = f.replace(".nc", "").split("/")[-1].split("_")[-1] + model = f"{f.replace('.nc', '').split('/')[-1].split('_')[-3]}_{f.replace('.nc', '').split('/')[-1].split('_')[-2]}" + + dataset = xr.open_dataset(f, decode_times=False) + + if len(dataset.time) == 85: + count += 1 + warnings.warn( + f"{f.split('/')[-1]} does not contain 86 years. Inserting a copy into the first year." + ) + + # Copy the first entry + first_entry = dataset.isel({"time": 0}) + # Assuming numeric coordinates, create a new coordinate value + new_coord_value = ( + first_entry["time"].values - 1 + ) # Adjust this calculation based on your coordinate system + # Set the new coordinate value for the copied entry + first_entry["time"] = new_coord_value + # Concatenate the new entry with the original dataset + dataset = xr.concat([first_entry, dataset], dim="time") + + # dataset = convert_and_subset_times(dataset) + all_sectors = [] + for sector in range(1, 19): + sector_x_data = dataset[f"ivaf_sector_{sector}"].to_dataframe().reset_index(drop=True) + sector_x_data.rename(columns={f"ivaf_sector_{sector}": "ivaf"}, inplace=True) + sector_x_data["sector"] = sector + sector_x_data["year"] = np.arange(1, 87) + sector_x_data["id"] = f"{model}_sector{sector}" + + all_sectors.append(sector_x_data) + full_dataset = pd.concat(all_sectors, axis=0) + full_dataset["exp"] = exp + full_dataset["model"] = model + all_files_data.append(full_dataset) + outputs = pd.concat(all_files_data) + outputs["sle"] = outputs["ivaf"] / 362.5 / 1e9 + + return outputs + + +def merge_datasets(forcings, projections, experiments_file, ice_sheet="AIS", export_directory=None): + + if isinstance(experiments_file, str): + experiments = pd.read_csv(experiments_file) + elif isinstance(experiments_file, pd.DataFrame): + experiments = experiments_file + else: + raise ValueError("experiments_file must be a string or a pandas DataFrame.") + + experiments = experiments[experiments.ice_sheet == ice_sheet] + projections = pd.merge(projections, experiments, on="exp", how="inner") + formatting_function = ( + _format_AIS_forcings_aogcm_name if ice_sheet == "AIS" else _format_GrIS_forcings_aogcm_name + ) + forcings["aogcm"] = forcings["aogcm"].apply(formatting_function) + projections.rename(columns={"AOGCM": "aogcm"}, inplace=True) + dataset = pd.merge(forcings, projections, on=["aogcm", "year", "sector"], how="inner") + + return dataset + + +def process_GrIS_outputs( + zenodo_directory, +): + + directory = ( + 
f"{zenodo_directory}/v7_CMIP5_pub/" + if not zenodo_directory.endswith("v7_CMIP5_pub") + else zenodo_directory + ) + files = get_all_filepaths(directory, contains="rm", not_contains="ctrl_proj", filetype="nc") + files = [f for f in files if "historical" not in f] + count = 0 + + all_files_data = [] + for f in files: + exp = f.replace(".nc", "").split("/")[-1].split("_")[-1] + exp = exp.replace("_05", "") + model = f"{f.replace('.nc', '').split('/')[-1].split('_')[-3]}_{f.replace('.nc', '').split('/')[-1].split('_')[-2]}" + dataset = xr.open_dataset(f, decode_times=False) + + if len(dataset.time) == 85: + count += 1 + warnings.warn( + f"{f.split('/')[-1]} does not contain 86 years. Inserting a copy into the first year." + ) + + # Copy the first entry + first_entry = dataset.isel({"time": 0}) + # Assuming numeric coordinates, create a new coordinate value + new_coord_value = ( + first_entry["time"].values - 1 + ) # Adjust this calculation based on your coordinate system + # Set the new coordinate value for the copied entry + first_entry["time"] = new_coord_value + # Concatenate the new entry with the original dataset + dataset = xr.concat([first_entry, dataset], dim="time") + + sector_mapping = {"1": "no", "2": "ne", "3": "se", "4": "sw", "5": "cw", "6": "nw"} + # dataset = convert_and_subset_times(dataset) + all_sectors = [] + for sector in range(1, 7): + var_name = f"ivaf_{sector_mapping[str(sector)]}" + sector_x_data = dataset[var_name].to_dataframe().reset_index(drop=True) + sector_x_data.rename(columns={var_name: "ivaf"}, inplace=True) + sector_x_data["sector"] = sector + sector_x_data["year"] = np.arange(1, 87) + sector_x_data["id"] = f"{model}_{exp}_sector{sector}" + + all_sectors.append(sector_x_data) + full_dataset = pd.concat(all_sectors, axis=0) + full_dataset["exp"] = exp + full_dataset["model"] = model + all_files_data.append(full_dataset) + outputs = pd.concat(all_files_data) + outputs["sle"] = outputs["ivaf"] / 362.5 / 1e9 + + return outputs + + +def process_sectors( + ice_sheet, + forcing_directory, + grid_file, + zenodo_directory, + experiments_file, + export_directory=None, + overwrite=False, +): + + forcing_exists = os.path.exists(f"{export_directory}/forcings.csv") + if not forcing_exists or (forcing_exists and overwrite): + atmospheric_df = ( + process_AIS_atmospheric_sectors(forcing_directory, grid_file) + if ice_sheet == "AIS" + else process_GrIS_atmospheric_sectors(forcing_directory, grid_file) + ) + atmospheric_df.to_csv(f"{export_directory}/{ice_sheet}_atmospheric.csv", index=False) + oceanic_df = ( + process_AIS_oceanic_sectors(forcing_directory, grid_file) + if ice_sheet == "AIS" + else process_GrIS_oceanic_sectors(forcing_directory, grid_file) + ) + oceanic_df.to_csv(f"{export_directory}/{ice_sheet}_oceanic.csv", index=False) + + # atmospheric_df = pd.read_csv(f"{export_directory}/{ice_sheet}_atmospheric.csv") + # oceanic_df = pd.read_csv(f"{export_directory}/{ice_sheet}_oceanic.csv") + # atmospheric_df = atmospheric_df[[x for x in atmospheric_df.columns if '.1' not in x]] + # oceanic_df = oceanic_df[[x for x in oceanic_df.columns if '.1' not in x]] + + atmospheric_df = atmospheric_df.loc[:, ~atmospheric_df.columns.duplicated()] + oceanic_df = oceanic_df.loc[:, ~oceanic_df.columns.duplicated()] + forcings = pd.merge( + atmospheric_df, + oceanic_df, + on=[ + "aogcm", + "year", + "sector", + ], + how="inner", + ) + forcings.to_csv(f"{export_directory}/forcings.csv", index=False) + else: + forcings = pd.read_csv(f"{export_directory}/forcings.csv") + + 
projections_exists = os.path.exists(f"{export_directory}/projections.csv") + if not projections_exists or (projections_exists and overwrite): + projections = ( + process_AIS_outputs( + zenodo_directory, + ) + if ice_sheet == "AIS" + else process_GrIS_outputs( + zenodo_directory, + ) + ) + projections.to_csv(f"{export_directory}/projections.csv", index=False) + else: + projections = pd.read_csv(f"{export_directory}/projections.csv") + + projections = projections.loc[:, ~projections.columns.duplicated()] + dataset = merge_datasets( + forcings, + projections, + experiments_file, + ice_sheet, + ) + dataset = dataset[[x for x in dataset.columns if ".1" not in x]] + + if export_directory is not None: + dataset.to_csv(f"{export_directory}/dataset.csv", index=False) + + return dataset + + +def _format_AIS_ocean_aogcm_name(aogcm): + aogcm = aogcm.lower() + if ( + aogcm == "ipsl-cm5a-mr_rcp2.6" + or aogcm == "ipsl-cm5a-mr_rcp8.5" + or aogcm == "hadgem2-es_rcp8.5" + or aogcm == "csiro-mk3-6-0_rcp8.5" + ): + aogcm = aogcm.replace(".", "") + elif ( + aogcm == "cnrm-cm6-1_ssp585" + or aogcm == "cnrm-esm2-1_ssp585" + or aogcm == "cnrm-cm6-1_ssp126" + ): + aogcm = aogcm.replace("-1", "") + aogcm = aogcm.replace("-", "_") + elif aogcm == "ukesm1-0-ll_ssp585": + aogcm = "ukesm1-0-ll" + else: + pass + return aogcm + + +def _format_AIS_forcings_aogcm_name(aogcm): + aogcm = aogcm.lower() + if ( + aogcm == "noresm1-m_rcp2.6" + or aogcm == "noresm1-m_rcp8.5" + or aogcm == "miroc-esm-chem_rcp8.5" + or aogcm == "ccsm4_rcp8.5" + ): + aogcm = aogcm.replace(".", "") + elif aogcm == "csiro-mk3-6-0_rcp85": + aogcm = "csiro-mk3.6_rcp85" + elif aogcm == "ipsl-cm5a-mr_rcp26" or aogcm == "ipsl-cm5a-mr_rcp85": + aogcm = aogcm.replace("a", "") + else: + pass + return aogcm + + +def _format_GrIS_forcings_aogcm_name(aogcm): + aogcm = aogcm.lower() + if aogcm == "noresm1_rcp85": + aogcm = "noresm1-m_rcp85" + elif aogcm == "ukesm1-cm6_ssp585": + aogcm = "ukesm1-0-ll_ssp585" + else: + pass + return aogcm + + +def format_GrIS_atmospheric_aogcm_name(aogcm): + modified_string = aogcm.rsplit("-", 1) + return "_".join(modified_string).lower() + + +def _format_GrIS_ocean_aogcm_name(aogcm): + aogcm = aogcm.lower() + if aogcm == "access1-3_rcp8.5": + aogcm = "access1.3_rcp85" + elif aogcm == "csiro-mk3.6_rcp8.5": + aogcm = aogcm.replace("8.5", "85") + elif aogcm in ( + "hadgem2-es_rcp8.5", + "ipsl-cm5-mr_rcp8.5", + "miroc5_rcp2.6", + "miroc5_rcp8.5", + "miroc5_rcp8.5", + ): + aogcm = aogcm.replace(".", "") + elif aogcm == "noresm1-m_rcp8.5": + aogcm = "noresm1_rcp85" + elif aogcm == "ukesm1-0-ll_ssp585": + aogcm = "ukesm1-cm6_ssp585" + else: + pass + return aogcm diff --git a/ise/data/processors/__init__.py b/ise/data/processors/__init__.py deleted file mode 100644 index 54f71b5..0000000 --- a/ise/data/processors/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -r""" -# [forcings](https://brown-sciml.github.io/ise/ise/data/processors/forcings.html) -Processing functions for ISMIP6 atmospheric, oceanic, and ice-collapse forcings found in the Globus ISMIP6 Archive - -# [ismip6](https://brown-sciml.github.io/ise/ise/data/processors/ismip6.html) -Processing functions for ismip6 ice sheet model outputs. - -# [merge](https://brown-sciml.github.io/ise/ise/data/processors/merge.html) -Processing functions for joining the processed inputs from the forcing directory and the outputs from the ismip6 ice sheet models to create a master dataset. 
- -# [control](https://brown-sciml.github.io/ise/ise/data/processors/control.html) -Processing functions for the control experiments. -""" - - -from ise.data.processors.forcings import ( - process_forcings, - AtmosphereForcing, - GridSectors, - IceCollapse, - OceanForcing, - aggregate_atmosphere, - aggregate_by_sector, - aggregate_icecollapse, - aggregate_ocean, -) - -from ise.data.processors.ismip6 import ( - process_ismip6_outputs, - _get_sector, - process_experiment, - process_repository, - process_single_file, -) - -from ise.data.processors.merge import combine_datasets, exp_to_attributes, format_aogcms - -from ise.data.processors.control import create_control_dataset diff --git a/ise/data/processors/control.py b/ise/data/processors/control.py deleted file mode 100644 index c033ece..0000000 --- a/ise/data/processors/control.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Processing functions for ISMIP6 control experiments. Includes creating dataset for ctrl_proj values.""" - -from ise.utils.utils import get_all_filepaths -import xarray as xr -import pandas as pd -from tqdm import tqdm - - -def create_control_dataset(zenodo_directory: str, export_directory: str): - """Creates dataset for lookup of control values. Can be added to prediction to adjust for - control group subtraction. - - Args: - zenodo_directory (str): Directory containing Zenodo output files. - export_directory (str): Directory to export processed outputs. - """ - - # traverse file structure to find directories with ctrl - data_directory = f"{zenodo_directory}/ComputedScalarsPaper/" - all_files = get_all_filepaths(data_directory, filetype="nc", contains="computed_ivaf") - - # for every folder in ctrl, get the computed_ivaf_ file - ctrl_files = [f for f in all_files if '/ctrl_proj_' in f] - - all_ctrls = [] - for file in tqdm(ctrl_files): - path_split = file.split('/') - filename = path_split[-1] - forcing_type = 'Open' if 'open' in filename else 'Standard' - modelname = f"{path_split[-4]}_{path_split[-3]}" - - data = xr.open_dataset(file, decode_times=False) - data = data.to_dataframe().reset_index().drop(columns=['ivaf', 'rhoi', 'rhow', 'ivaf_region_1', 'ivaf_region_2', 'ivaf_region_3']) - data = pd.melt(data, id_vars="time") - data['variable'] = data.variable.apply(lambda x: x.split('_')[-1]) - data['value'] = data.value / 1e9 / 362.5 # to sle - data.columns = ['year', 'sectors', 'ctrl_sle'] - data['year'] = data.year.astype(int) - data['modelname'] = modelname - data["ocean_forcing"] = forcing_type - all_ctrls.append(data) - - all_ctrls = pd.concat(all_ctrls) - all_ctrls.to_csv(f'{export_directory}/ctrl_sle.csv', index=False) - - return all_ctrls diff --git a/ise/data/processors/forcings.py b/ise/data/processors/forcings.py deleted file mode 100644 index ba58293..0000000 --- a/ise/data/processors/forcings.py +++ /dev/null @@ -1,647 +0,0 @@ -"""Processing functions for ISMIP6 atmospheric, oceanic, and ice-collapse forcings found in -the [Globus ISMIP6 Archive](https://app.globus.org/file-manager?origin_id=ad1a6ed8-4de0-4490-93a9-8258931766c7&origin_path=%2F) -""" -import time -import numpy as np -import pandas as pd -import xarray as xr -from ise.utils.utils import get_all_filepaths, check_input - -np.random.seed(10) - - -def process_forcings( - forcing_directory: str, - grids_directory: str, - export_directory: str, - to_process: str = "all", - verbose: bool = False, -) -> None: - """Perform preprocessing of atmospheric, oceanic, and ice-collapse forcing from [Globus ISMIP6 - 
Directory](https://app.globus.org/file-manager?origin_id=ad1a6ed8-4de0-4490-93a9-8258931766c7 - &origin_path=%2F). - - Args: - forcing_directory (str): Directory containing grid data files - export_directory (str): Directory to export processed files. - to_process (str, optional): Forcings to process, options=[all, - atmosphere, ocean, ice_collapse], - verbose (bool, optional): Flag denoting whether to output logs - in terminal, defaults to False - defaults to 'all' - """ - # check inputs - to_process_options = ["all", "atmosphere", "ocean", "ice_collapse"] - if isinstance(to_process, str): - if to_process.lower() not in to_process_options: - raise ValueError( - f"to_process arg must be in [{to_process_options}], \ - received {to_process}" - ) - elif isinstance(to_process, list): - to_process_valid = all(s in to_process_options for s in to_process) - if not to_process_valid: - raise ValueError( - f"to_process arg must be in [{to_process_options}], \ - received {to_process}" - ) - - if to_process.lower() == "all": - to_process = ["atmosphere", "ocean", "ice_collapse"] - - if verbose: - print("Processing...") - - # Process each using respective functions - curr_time = time.time() - if "atmosphere" in to_process: - af_directory = f"{forcing_directory}/Atmosphere_Forcing/" - aggregate_atmosphere( - af_directory, - grids_directory, - export=export_directory, - ) - if verbose: - prev_time, curr_time = curr_time, time.time() - curr_time = time.time() - print( - f"Finished processing atmosphere, Total Running Time: \ - {(curr_time - prev_time) // 60} minutes" - ) - - if "ocean" in to_process: - of_directory = f"{forcing_directory}/Ocean_Forcing/" - aggregate_ocean( - of_directory, - grids_directory, - export=export_directory, - ) - if verbose: - prev_time, curr_time = curr_time, time.time() - curr_time = time.time() - print( - f"Finished processing ocean, Total Running Time: \ - {(curr_time - prev_time) // 60} minutes" - ) - - if "ice_collapse" in to_process: - ice_directory = f"{forcing_directory}/Ice_Shelf_Fracture" - aggregate_icecollapse( - ice_directory, - grids_directory, - export=export_directory, - ) - if verbose: - prev_time, curr_time = curr_time, time.time() - curr_time = time.time() - print( - f"Finished processing ice_collapse, Total Running Time: \ - {(curr_time - prev_time) // 60} minutes" - ) - if verbose: - print(f"Finished. Data exported to {export_directory}") - - -class GridSectors: - """Class for grid sector data and attributes.""" - - def __init__( - self, - grids_dir: str, - grid_size: int = 8, - filetype: str = "nc", - format_index: bool = True, - ): - """Initializes class and opens/stores data. - - Args: - grids_dir (str): Directory containing grid data. 
- grid_size (int, optional): KM grid size to be used, must be - [4, 8, 16, 32] defaults to 8 - filetype (str, optional): Filetype of data, must be in [nc, - csv], defaults to 'nc' - format_index (bool, optional): Flag denoting whether to fix - index so that join works appropriately, defaults to True - """ - check_input(grid_size, [4, 8, 16, 32]) - check_input(filetype.lower(), ["nc", "csv"]) - self.grids_dir = grids_dir - - if filetype.lower() == "nc": - self.path = self.grids_dir + f"sectors_{grid_size}km.nc" - self.data = xr.open_dataset(self.path, decode_times=False) - self._to_dataframe() - if format_index: - self._format_index() - elif filetype.lower() == "csv": - self.path = self.grids_dir + f"sector_{grid_size}.csv" - self.data = pd.read_csv(self.path) - else: - raise NotImplementedError('Only "NetCDF" and "CSV" are currently supported') - - def _to_dataframe(self): - """Converts self.data to dataframe. - - Returns: - self: GridSectors: GridSectors object with data as - dataframe. - """ - if not isinstance(self, pd.DataFrame): - self.data = self.data.to_dataframe() - return self - - def _format_index(self): - """Formats indices from 0 to 761 so merge with forcing data is possible. - - Returns: - self: GridSectors: GridSectors object with indices - formatted. - """ - index_array = list(np.arange(0, 761)) - self.data.index = pd.MultiIndex.from_product( - [index_array, index_array], names=["x", "y"] - ) - return self - - -class AtmosphereForcing: - """Class for atmospheric forcing data and attributes.""" - - def __init__(self, path: str): - """Initializes class and opens/stores data. - - Args: - path (str): Filepath to atmospheric forcing file. - """ - self.forcing_type = "atmosphere" - self.path = path - self.aogcm = path.split("/")[-3] # 3rd to last folder in directory structure - - if path[-2:] == "nc": - self.data = xr.open_dataset(self.path, decode_times=False) - self.datatype = "NetCDF" - - elif path[-3:] == "csv": - self.data = pd.read_csv( - self.path, - ) - self.datatype = "CSV" - - def aggregate_dims( - self, - ): - """Aggregates over excess dimesions, particularly over time or grid cells. - - Returns: - self: AtmosphereForcing: AtmosphereForcing object with - dimensions reduced. - """ - dims = self.data.dims - if "time" in dims: - self.data = self.data.mean(dim="time") - if "nv4" in dims: - self.data = self.data.mean(dim="nv4") - return self - - def add_sectors(self, grids: GridSectors): - """Adds information on which sector each grid cell belongs to. This is done through a merge - of grid cell data with a sectors NC file. - - Args: - grids (GridSectors): GridSectors class containing grid cell - information and attributes - - Returns: - self: AtmosphereForcing: AtmosphereForcing class with - sectors added. - """ - for col in ["lon_bnds", "lat_bnds", "lat2d", "lon2d"]: - try: - self.data = self.data.drop(labels=[col]) - except ValueError: - pass - self.data = self.data.to_dataframe().reset_index(level="time", drop=True) - # merge forcing data with grid data - self.data = pd.merge( - self.data, grids.data, left_index=True, right_index=True, how="outer" - ) - return self - - -class OceanForcing: - """Class for oceanic forcing data and attributes.""" - - def __init__(self, aogcm_dir: str): - """Initializes class and opens/stores data. - - Args: - aogcm_dir (str): Directory path to oceanic forcings. 
- """ - self.forcing_type = "ocean" - self.path = f"{aogcm_dir}/1995-2100/" - self.aogcm = aogcm_dir.split("/")[ - -2 - ] # 3rd to last folder in directory structure - - # Load all data: thermal forcing, salinity, and temperature - files = get_all_filepaths(path=self.path, filetype="nc") - for file in files: - if "salinity" in file: - self.salinity_data = xr.open_dataset(file) - elif "thermal_forcing" in file: - self.thermal_forcing_data = xr.open_dataset(file) - elif "temperature" in file: - self.temperature_data = xr.open_dataset(file) - else: - pass - - def aggregate_dims( - self, - ): - """Aggregates over excess dimesions, particularly over time or grid cells. - - Returns: - self: AtmosphereForcing: AtmosphereForcing object with - dimensions reduced. - """ - dims = self.data.dims - if "z" in dims: - self.data = self.data.mean(dim="time") - if "nbounds" in dims: - self.data = self.data.mean(dim="nv4") - return self - - def add_sectors(self, grids: GridSectors): - """Adds information on which sector each grid cell belongs to. This is done through a merge - of grid cell data with a sectors NC file. - - Args: - grids (GridSectors): GridSectors class containing grid cell - information and attributes - - Returns: - self: OceanForcing: OceanForcing class with sectors added. - """ - self.salinity_data = self.salinity_data.drop(labels=["z_bnds", "lat", "lon"]) - # Take mean over all z values (only found in oceanic forcings) - self.salinity_data = self.salinity_data.mean( - dim="z", skipna=True - ).to_dataframe() - self.salinity_data = self.salinity_data.reset_index( - level="time", - ) - # merge with grid data - self.salinity_data = pd.merge( - self.salinity_data, - grids.data, - left_index=True, - right_index=True, - how="outer", - ) - self.salinity_data["year"] = self.salinity_data["time"].apply(lambda x: x.year) - self.salinity_data = self.salinity_data.drop(columns=["time", "mapping"]) - - self.thermal_forcing_data = self.thermal_forcing_data.drop(labels=["z_bnds"]) - self.thermal_forcing_data = ( - self.thermal_forcing_data.mean(dim="z", skipna=True) - .to_dataframe() - .reset_index( - level="time", - ) - ) - self.thermal_forcing_data = pd.merge( - self.thermal_forcing_data, - grids.data, - left_index=True, - right_index=True, - how="outer", - ) - self.thermal_forcing_data["year"] = self.thermal_forcing_data["time"].apply( - lambda x: x.year - ) - self.thermal_forcing_data = self.thermal_forcing_data.drop( - columns=["time", "mapping"] - ) - - self.temperature_data = self.temperature_data.drop(labels=["z_bnds"]) - self.temperature_data = ( - self.temperature_data.mean(dim="z", skipna=True) - .to_dataframe() - .reset_index( - level="time", - ) - ) - self.temperature_data = pd.merge( - self.temperature_data, - grids.data, - left_index=True, - right_index=True, - how="outer", - ) - self.temperature_data["year"] = self.temperature_data["time"].apply( - lambda x: x.year - ) - self.temperature_data = self.temperature_data.drop(columns=["time", "mapping"]) - - return self - - -class IceCollapse: - """Class for ice collapse forcing data and attributes.""" - - def __init__(self, aogcm_dir: str): - """Initializes class and opens/stores data. - - Args: - aogcm_dir (str): Directory path to ice collapse forcings - forcings. 
- """ - self.forcing_type = "ice_collapse" - self.path = f"{aogcm_dir}" - self.aogcm = aogcm_dir.split("/")[-2] # last folder in directory structure - - # Load all data: thermal forcing, salinity, and temperature - files = get_all_filepaths(path=self.path, filetype="nc") - files = [f for f in files if "8km" in f] - if len(files) > 1: # if there is a "v2" file in the directory, use that one - for file in files: - if "v2" in file: - self.data = xr.open_dataset(file) - else: - pass - else: - self.data = xr.open_dataset(files[0]) - - def add_sectors(self, grids: GridSectors): - """Adds information on which sector each grid cell belongs to. This is done through a merge - of grid cell data with a sectors NC file. - - Args: - grids (GridSectors): GridSectors class containing grid cell - information and attributes - - Returns: - self: IceCollapse: IceCollapse class with sectors added. - """ - for col in ["lon_bnds", "lat_bnds", "lat2d", "lon2d"]: - try: - self.data = self.data.drop(labels=[col]) - except ValueError: - pass - self.data = self.data.to_dataframe().reset_index(level="time", drop=False) - self.data = pd.merge( - self.data, grids.data, left_index=True, right_index=True, how="outer" - ) - self.data["year"] = self.data["time"].apply(lambda x: x.year) - self.data = self.data.drop( - columns=[ - "time", - "mapping", - "lat_x", - "lat_y", - "lon_x", - "lon_y", - ] - ) - return self - - -def aggregate_by_sector(path: str, grids_dir: str): - """Takes a atmospheric forcing dataset, adds sector numbers to it, - and gets aggregate data based on sector and year. Returns atmospheric - forcing data object. - - Args: - path (str): Filepath to atmospheric forcing nc file. - grids_dir (str): Directory containing grid data. - - Returns: - forcing: AtmosphereForcing: AtmosphereForcing instance with aggregated data - """ - # Load grid data with 8km grid size - print("") - - # Load in Atmospheric forcing data and add the sector numbers to it - if "Atmosphere" in path: - grids = GridSectors( - grids_dir, - grid_size=8, - ) - forcing = AtmosphereForcing(path=path) - - elif "Ocean" in path: - grids = GridSectors(grids_dir, grid_size=8, format_index=False) - forcing = OceanForcing(aogcm_dir=path) - - elif "Ice" in path: - grids = GridSectors( - grids_dir, - grid_size=8, - ) - forcing = IceCollapse(path) - - forcing = forcing.add_sectors(grids) - - # Group the dataset and assign aogcm column to the aogcm simulation - if forcing.forcing_type in ("atmosphere", "ice_collapse"): - forcing.data = forcing.data.groupby(["sectors", "year"]).mean() - forcing.data["aogcm"] = forcing.aogcm.lower() - elif forcing.forcing_type == "ocean": - forcing.salinity_data = forcing.salinity_data.groupby( - ["sectors", "year"] - ).mean() - forcing.salinity_data["aogcm"] = forcing.aogcm.lower() - forcing.temperature_data = forcing.temperature_data.groupby( - ["sectors", "year"] - ).mean() - forcing.temperature_data["aogcm"] = forcing.aogcm.lower() - forcing.thermal_forcing_data = forcing.thermal_forcing_data.groupby( - ["sectors", "year"] - ).mean() - forcing.thermal_forcing_data["aogcm"] = forcing.aogcm.lower() - - return forcing - - -def aggregate_atmosphere( - directory: str, - grids_directory: str, - export: str, -): - """Loops through every NC file in the provided forcing directory - from 1995-2100 and applies the aggregate_by_sector function. 
It then outputs - the concatenation of all processed data to all_data.csv - - Args: - directory (str): Directory containing forcing files - grids_directory (str): Directory containing grid data. - export (str): Directory to export output files. - """ - - start_time = time.time() - - # Get all NC files that contain data from 1995-2100 - filepaths = get_all_filepaths(path=directory, filetype="nc") - filepaths = [f for f in filepaths if "1995-2100" in f] - filepaths = [f for f in filepaths if "8km" in f] - - # Useful progress prints - print("Files to be processed...") - print([f.split("/")[-1] for f in filepaths]) - - # Loop over each file specified above - all_data = pd.DataFrame() - for i, fp in enumerate(filepaths): - print("") - print(f"File {i+1} / {len(filepaths)}") - print(f'File: {fp.split("/")[-1]}') - print(f"Time since start: {(time.time()-start_time) // 60} minutes") - - # attach the sector to the data and groupby sectors & year - forcing = aggregate_by_sector(fp, grids_dir=grids_directory) - - # Handle files that don't have mrro_anomaly input (ISPL RCP 85?) - try: - forcing.data["mrro_anomaly"] - except KeyError: - forcing.data["mrro_anomaly"] = np.nan - - # Keep selected columns and output each file individually - forcing.data = forcing.data[ - [ - "pr_anomaly", - "evspsbl_anomaly", - "mrro_anomaly", - "smb_anomaly", - "ts_anomaly", - "regions", - "aogcm", - ] - ] - - # meanwhile, create a concatenated dataset - all_data = pd.concat([all_data, forcing.data]) - - print(" -- ") - - if export: - all_data.to_csv(f"{export}/atmospheric_forcing.csv") - - -def aggregate_ocean( - directory, - grids_directory, - export, -): - """Loops through every NC file in the provided forcing directory - from 1995-2100 and applies the aggregate_by_sector function. It then outputs - the concatenation of all processed data to all_data.csv. - - Args: - directory (str): Directory containing forcing files - grids_directory (str): Directory containing grid data. - export (str): Directory to export output files. - """ - start_time = time.time() - - # Get all NC files that contain data from 1995-2100 - filepaths = get_all_filepaths(path=directory, filetype="nc") - filepaths = [f for f in filepaths if "1995-2100" in f] - filepaths = [f for f in filepaths if "8km" in f] - - # In the case of ocean forcings, use the filepaths of the files to determine - # which directories need to be used for OceanForcing processing. Change to - # those directories rather than individual files. 
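# Illustrative sketch (hypothetical path, not from the source): a filepath such as
#   <forcing_dir>/Ocean_Forcing/<aogcm>/1995-2100/<aogcm>_thermal_forcing_8km.nc
# splits on "/" so that index -3 is the AOGCM folder name; deduplicating those names
# below yields one directory per AOGCM, which is the form OceanForcing(aogcm_dir=...) expects.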
- aogcms = list(set([f.split("/")[-3] for f in filepaths])) - filepaths = [f"{directory}/{aogcm}/" for aogcm in aogcms] - - # Useful progress prints - print("Files to be processed...") - print([f.split("/")[-2] for f in filepaths]) - - # Loop over each directory specified above - salinity_data = pd.DataFrame() - temperature_data = pd.DataFrame() - thermal_forcing_data = pd.DataFrame() - for i, fp in enumerate(filepaths): - print("") - print(f"Directory {i+1} / {len(filepaths)}") - print(f'Directory: {fp.split("/")[-2]}') - print(f"Time since start: {(time.time()-start_time) // 60} minutes") - - # attach the sector to the data and groupby sectors & year - forcing = aggregate_by_sector(fp, grids_dir=grids_directory) - - forcing.salinity_data = forcing.salinity_data[["salinity", "regions", "aogcm"]] - forcing.temperature_data = forcing.temperature_data[ - ["temperature", "regions", "aogcm"] - ] - forcing.thermal_forcing_data = forcing.thermal_forcing_data[ - ["thermal_forcing", "regions", "aogcm"] - ] - - # meanwhile, create a concatenated dataset - salinity_data = pd.concat([salinity_data, forcing.salinity_data]) - temperature_data = pd.concat([temperature_data, forcing.temperature_data]) - thermal_forcing_data = pd.concat( - [thermal_forcing_data, forcing.thermal_forcing_data] - ) - - print(" -- ") - - if export: - salinity_data.to_csv(export + "/salinity.csv") - temperature_data.to_csv(export + "/temperature.csv") - thermal_forcing_data.to_csv(export + "/thermal_forcing.csv") - - -def aggregate_icecollapse( - directory, - grids_directory, - export, -): - """Loops through every NC file in the provided forcing directory - from 1995-2100 and applies the aggregate_by_sector function. It then outputs - the concatenation of all processed data to all_data.csv. - - Args: - directory (str): Directory containing forcing files - grids_directory (str): Directory containing grid data. - export (str): Directory to export output files. - """ - start_time = time.time() - - # Get all NC files that contain data from 1995-2100 - filepaths = get_all_filepaths(path=directory, filetype="nc") - - # In the case of ocean forcings, use the filepaths of the files to determine - # which directories need to be used for processing. Change to - # those directories rather than individual files. 
- aogcms = list(set([f.split("/")[-2] for f in filepaths])) - filepaths = [f"{directory}/{aogcm}/" for aogcm in aogcms] - # filepaths = [f for f in filepaths if "8km" in f] - - # Useful progress prints - print("Files to be processed...") - print([f.split("/")[-2] for f in filepaths]) - - # Loop over each directory specified above - ice_collapse = pd.DataFrame() - for i, fp in enumerate(filepaths): - print("") - print(f"Directory {i+1} / {len(filepaths)}") - print(f'Directory: {fp.split("/")[-2]}') - print(f"Time since start: {(time.time()-start_time) // 60} minutes") - - # attach the sector to the data and groupby sectors & year - forcing = aggregate_by_sector(fp, grids_dir=grids_directory) - - forcing.data = forcing.data[["mask", "regions", "aogcm"]] - - # meanwhile, create a concatenated dataset - ice_collapse = pd.concat([ice_collapse, forcing.data]) - - print(" -- ") - - if export: - ice_collapse.to_csv(export + "/ice_collapse.csv") diff --git a/ise/data/processors/ismip6.py b/ise/data/processors/ismip6.py deleted file mode 100644 index dbd9eff..0000000 --- a/ise/data/processors/ismip6.py +++ /dev/null @@ -1,163 +0,0 @@ -"""Processing functions for ismip6 ice sheet model outputs.""" -import os -from itertools import compress -from tqdm import tqdm -import pandas as pd -import xarray as xr -import numpy as np -from ise.utils.utils import get_all_filepaths - -np.random.seed(10) - - -variables = ["iareafl", "iareagr", "icearea", "ivol", "ivaf", "smb", "smbgr", "bmbfl"] - - -def process_ismip6_outputs(zenodo_directory: str, export_directory: str): - """Wrapper function to run all output processing functions. See process_repository documentation - for more details. - - Args: - zenodo_directory (str): Directory containing Zenodo output files. - export_directory (str): Directory to export processed outputs. - """ - - process_repository( - zenodo_directory, export_filepath=f"{export_directory}/ismip6_outputs.csv" - ) - - -def _get_sector(x): - """Helper for lambda function.""" - x = x.split("_") - if len(x) == 1 or "region" in x: - return np.nan - return int(x[-1]) - - -def process_repository(zenodo_directory: str, export_filepath=None) -> pd.DataFrame: - """Processes zenodo output repository. - - Args: - zenodo_directory (str): Directory containing Zenodo output files. - export_filepath (_type_, optional): Directory to export processed outputs., defaults to None - - Returns: - pd.DataFrame: all_data, Processed outputs - """ - groups_dir = f"{zenodo_directory}/ComputedScalarsPaper/" - all_groups = os.listdir(groups_dir) - all_data = pd.DataFrame() - - # For each modeling group - for group in tqdm(all_groups, total=len(all_groups)): - group_path = f"{groups_dir}/{group}/" - - # For each model they submitted - for model in os.listdir(group_path): - model_path = f"{group_path}/{model}/" - - # For each experiment they carried out... - not_experiments = ( - "historical", - "ctr", - "ctr_proj", - "asmb", - "abmb", - "ctrl_proj_std", - "hist_std", - "hist_open", - "ctrl_proj_open", - ) - all_experiments = [ - f for f in os.listdir(model_path) if f not in not_experiments - ] - for exp in all_experiments: - exp_path = f"{model_path}/{exp}/" - processed_experiment = process_experiment(exp_path) - all_data = pd.concat([all_data, processed_experiment]) - - if export_filepath: - all_data.to_csv(export_filepath, index=False) - - return all_data - - -def process_experiment(experiment_directory: str) -> pd.DataFrame: - """Process all files within a particular experiment folder. 
- - Args: - experiment_directory (str): Directory containing experiments. - - Returns: - pd.DataFrame: all_data, Data from a particular experiment - directory. - """ - files = get_all_filepaths( - experiment_directory, filetype="nc", contains="minus_ctrl_proj" - ) - - all_data = process_single_file(files[0]) - for file in files[1:]: - temp = process_single_file(file) - all_data = pd.merge( - all_data, - temp, - on=["year", "sectors", "groupname", "modelname", "exp_id", "rhoi", "rhow"], - how="outer", - ) - - return all_data - - -def process_single_file(path: str) -> pd.DataFrame: - """Processes single file within experiment folder - - Args: - path (str): Filepath to file - - Returns: - pd.DataFrame: data, Data from that file. - """ - - # TODO: Need to figure out what this (and other lines of code in this function) is doing and comment - var = list(compress(variables, [v in path for v in variables])) - - # ! Fix this: getting confused with "smb" vs "smbgr" using "in" operator - if len(var) > 1: - var = var[1] - else: - var = var[0] - - data = xr.open_dataset(path, decode_times=False) - - fp_split = [f for f in path.split("/") if f != ""] - groupname = fp_split[-4] - modelname = fp_split[-3] - exp_id = fp_split[-2] - - try: - rhoi = data.rhoi.values - rhow = data.rhow.values - data = data.drop(labels=["rhoi", "rhow"]) - except AttributeError: - rhoi = np.nan - rhow = np.nan - - data = data.to_dataframe().reset_index() - data["year"] = np.floor(data["time"]).astype(int) - data = data.drop(columns="time") - data = pd.melt(data, id_vars="year") - - data["sectors"] = data.variable.apply(_get_sector) - data = data.dropna().drop(columns=["variable"]) - data[var] = data["value"] - data = data.drop(columns=["value"]) - - data["groupname"] = groupname - data["modelname"] = modelname - data["exp_id"] = exp_id - data["rhoi"] = rhoi - data["rhow"] = rhow - - return data diff --git a/ise/data/processors/merge.py b/ise/data/processors/merge.py deleted file mode 100644 index 69653e9..0000000 --- a/ise/data/processors/merge.py +++ /dev/null @@ -1,205 +0,0 @@ -"""Processing functions for joining the processed inputs from the forcing directory and the outputs -from the ismip6 ice sheet models to create a master dataset. -""" -import pandas as pd -import re -import requests -import json - - -# Open up the JSON with Table 1 from H. Seroussi et al.: ISMIP6 Antarctica projections -# Link: https://tc.copernicus.org/articles/14/3033/2020/tc-14-3033-2020.pdf -resp = requests.get( - r"https://raw.githubusercontent.com/Brown-SciML/ise/master/ise/utils/ismip6_experiments.json" -) -ismip6_experiments = json.loads(resp.text) - - -def merge_datasets( - processed_forcing_directory: str, - processed_ismip6_directory: str, - export_directory: str, - include_icecollapse: bool = False, -): - """Wrapper function that runs all merging functions. Includes combining the input data - from the forcing data with the output data from the Zenodo directory. - - Args: - processed_forcing_directory (str): Directory with processed - forcing files. - processed_ismip6_directory (str): Directory with processed - output files. - export_directory (str): Directory to export combined files. 
- include_icecollapse (bool, optional): Flag denoting whether to include ice collapse, defaults to False - - Returns: - pd.DataFrame: master, inputs, outputs, Combined datasets - """ - master, inputs, outputs = combine_datasets( - processed_forcing_directory=processed_forcing_directory, - processed_ismip6_directory=processed_ismip6_directory, - include_icecollapse=include_icecollapse, - export=export_directory, - ) - return master, inputs, outputs - - -def combine_datasets( - processed_forcing_directory: str, - processed_ismip6_directory: str, - include_icecollapse: bool = False, - export=True, -): - """Combines the input datasets -- atmospheric forcing, three oceanic forcing (salinity, - temperature and thermal forcing), and ice sheet collapse forcing with the output dataset - generated in H. Seroussi et al.: ISMIP6 Antarctica projections. - - Args: - processed_forcing_directory (str): Directory of the processed files. Should contain atmospheric_forcing, ice_collapse, and three oceanic forcing CSV's. - processed_ismip6_directory (str): Directory of the processed output files. - include_icecollapse (bool): Flag denoting whether to include ice collapse, defaults to False - export (str, optional): Directory of exported files, defaults to True - - - Returns: - pd.DataFrame: master, inputs, outputs, Combined datasets - """ - - # Get the files and if that doesn't work, return a FIleNotFoundError - try: - af = pd.read_csv(f"{processed_forcing_directory}/atmospheric_forcing.csv") - ice = pd.read_csv(f"{processed_forcing_directory}/ice_collapse.csv") - salinity = pd.read_csv(f"{processed_forcing_directory}/salinity.csv") - temp = pd.read_csv(f"{processed_forcing_directory}/temperature.csv") - tf = pd.read_csv(f"{processed_forcing_directory}/thermal_forcing.csv") - outputs = pd.read_csv(f"{processed_ismip6_directory}/ismip6_outputs.csv") - except FileNotFoundError as exc: - raise FileNotFoundError( - "Files not found, make sure to run all processing functions." - ) from exc - - # Merge the oceanic datasets together (thermal forcing, temperature, salinity) - # and apply the format_aogcms function for formatting strings in AOGCM column - ocean = salinity - ocean["aogcm"] = ocean["aogcm"].apply(format_aogcms) - af["aogcm"] = af["aogcm"].apply(format_aogcms) - for data in [temp, tf,]: - data["aogcm"] = data["aogcm"].apply(format_aogcms) - ocean = pd.merge( - ocean, data, on=["sectors", "year", "aogcm", "regions"], how="outer" - ) - ocean = ocean.drop_duplicates() - ocean = ocean[ - [ - "sectors", - "regions", - "year", - "aogcm", - "salinity", - "temperature", - "thermal_forcing", - ] - ] - - # Apply the same formatting function to atmospheric and ice forcing - af["aogcm"] = af["aogcm"].apply(format_aogcms) - ice["aogcm"] = ice["aogcm"].apply(format_aogcms) - - # Merge all inputs into one dataframe using an inner join - inputs = pd.merge( - ocean, af, on=["sectors", "year", "aogcm", "regions"], how="inner" - ) - - # If indicated, add ice collapse - if include_icecollapse: - inputs = pd.merge( - inputs, ice, on=["sectors", "year", "aogcm", "regions"], how="inner" - ) - - # Map the experiment to attribute function, which takes Table 1 from H. - # Seroussi et al.: ISMIP6 Antarctica projections - # and adds columns for other attributes listed in the table... 
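# Illustrative sketch (the experiment ID is hypothetical): exp_to_attributes("exp05") looks up
# "exp05" in the ismip6_experiments JSON and returns a 7-tuple of
# (Experiment, AOGCM, Scenario, Ocean forcing, Ocean sensitivity, Ice shelf fracture, Tier),
# which the zip(*...) call below unpacks into seven new columns of the outputs dataframe.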
- ( - outputs["experiment"], - outputs["aogcm"], - outputs["scenario"], - outputs["ocean_forcing"], - outputs["ocean_sensitivity"], - outputs["ice_shelf_fracture"], - outputs["tier"], - ) = zip(*outputs["exp_id"].map(exp_to_attributes)) - - # Merge inputs and outputs - master = pd.merge(inputs, outputs, on=["year", "sectors", "aogcm"]) - - if export: - master.to_csv(f"{export}/master.csv", index=False) - inputs.to_csv(f"{export}/inputs.csv", index=False) - outputs.to_csv(f"{export}/outputs.csv", index=False) - - return master, inputs, outputs - - -def exp_to_attributes( - x: str, -): - """Combines Table 1 in H. Seroussi et al.: ISMIP6 Antarctica projections and associates - the attributes listed in Table 1 with each experiment in the output dataset. - - Args: - x (str): AOGCM string as it is stored in the 'aogcm' column - - Returns: - tuple(str): attributes, Returns all new attributes associated with each experiment - """ - - try: - attributes = ismip6_experiments[x] - return ( - attributes["Experiment"], - attributes["AOGCM"], - attributes["Scenario"], - attributes["Ocean forcing"], - attributes["Ocean sensitivity"], - attributes["Ice shelf fracture"], - attributes["Tier"], - ) - except: - pass - - -def format_aogcms(x: str) -> str: - """Formats AOGCM strings so that joins between datasets work properly. This is necessary due to - differing file directory names in the original AIS Globus dataset. - - Args: - x (str): AOGCM string as it is stored in the 'aogcm' column - - Returns: - str: x, Formatted AOGCM string - """ - - # To homogeonize, get rid of periods (rcp85 vs rcp8.5) and make all dashes underscores - x = x.lower().replace(".", "").replace("-", "_") - try: - try: - # If it is already in the format "ssp585", do nothing and continue - correct_format = re.search("(ssp|rcp)\d{2,3}", x).group() - except AttributeError: - # If not, find the numeric value (e.g. 85) and change to rcp (_rcp85) - numeric = re.search("\d{2,3}", x).group() - x = x[:-2] - if x.endswith("_"): - x += f"rcp{numeric}" - else: - x += f"_rcp{numeric}" - - except AttributeError: - # if none of the above worked, just skip it - pass - - # Get rid of _1 and include case for ukesm1_0_ll to match other formats - x = x.replace("_1", "") - if x == "ukesm1_0_ll": - x += "_ssp585" - return x diff --git a/ise/data/scaler.py b/ise/data/scaler.py new file mode 100644 index 0000000..ba74176 --- /dev/null +++ b/ise/data/scaler.py @@ -0,0 +1,257 @@ +import numpy as np +import pandas as pd +import torch +from torch import nn +from scipy.stats import yeojohnson, yeojohnson_normmax + +from ise.utils.functions import to_tensor + +class StandardScaler(nn.Module): + """ + A class for scaling input data using mean and standard deviation. + + Args: + nn.Module: The base class for all neural network modules in PyTorch. + + Attributes: + mean_ (torch.Tensor): The mean values of the input data. + scale_ (torch.Tensor): The standard deviation values of the input data. + device (torch.device): The device (CPU or GPU) on which the calculations are performed. + + Methods: + fit(X): Computes the mean and standard deviation of the input data. + transform(X): Scales the input data using the computed mean and standard deviation. + inverse_transform(X): Reverses the scaling operation on the input data. + save(path): Saves the mean and standard deviation to a file. + load(path): Loads the mean and standard deviation from a file. 
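    A minimal usage sketch (illustrative; X_train stands for any array-like or torch.Tensor
    accepted by to_tensor):

        scaler = StandardScaler()
        scaler.fit(X_train)
        X_scaled = scaler.transform(X_train)
        X_restored = scaler.inverse_transform(X_scaled)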
+ + """ + + def __init__( + self, + ): + super(StandardScaler, self).__init__() + self.mean_ = None + self.scale_ = None + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.to(self.device) + + def fit(self, X): + """ + Computes the mean and standard deviation of the input data. + + Args: + X (torch.Tensor): The input data to be scaled. + + """ + X = to_tensor(X).to(self.device) + self.mean_ = torch.mean(X, dim=0) + self.scale_ = torch.std(X, dim=0, unbiased=False) + self.eps = 1e-8 # to avoid divide by zero + self.scale_ = torch.where( + self.scale_ == 0, torch.ones_like(self.scale_) * self.eps, self.scale_ + ) # Avoid division by zero + + def transform(self, X): + """ + Scales the input data using the computed mean and standard deviation. + + Args: + X (torch.Tensor): The input data to be scaled. + + Returns: + torch.Tensor: The scaled input data. + + Raises: + RuntimeError: If the Scaler instance is not fitted yet. + + """ + X = to_tensor(X).to(self.device) + if self.mean_ is None or self.scale_ is None: + raise RuntimeError("This Scaler instance is not fitted yet.") + transformed = (X - self.mean_) / self.scale_ + + # handle NAN (i.e. divide by zero) + # could also use epsilon value and divide by epsilon instead... + if torch.isnan(transformed).any(): + transformed = torch.nan_to_num(transformed) + + return transformed + + def inverse_transform(self, X): + """ + Reverses the scaling operation on the input data. + + Args: + X (torch.Tensor): The scaled input data to be transformed back. + + Returns: + torch.Tensor: The transformed input data. + + Raises: + RuntimeError: If the Scaler instance is not fitted yet. + + """ + X = to_tensor(X).to(self.device) + if self.mean_ is None or self.scale_ is None: + raise RuntimeError("This Scaler instance is not fitted yet.") + return X * self.scale_ + self.mean_ + + def save(self, path): + """ + Saves the mean and standard deviation to a file. + + Args: + path (str): The path to save the file. + + """ + torch.save( + { + "mean_": self.mean_, + "scale_": self.scale_, + }, + path, + ) + + @staticmethod + def load(path): + """ + Loads the mean and standard deviation from a file. + + Args: + path (str): The path to load the file from. + + Returns: + Scaler: A Scaler instance with the loaded mean and standard deviation. 
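    A save/load round trip, sketched with a hypothetical file path:

        scaler.save("standard_scaler.pth")
        restored = StandardScaler.load("standard_scaler.pth")
        X_scaled = restored.transform(X_new)   # X_new: any array-like or tensor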
+ + """ + checkpoint = torch.load(path) + scaler = StandardScaler() + scaler.mean_ = checkpoint["mean_"] + scaler.scale_ = checkpoint["scale_"] + return scaler + + +class RobustScaler(nn.Module): + def __init__(self): + super(RobustScaler, self).__init__() + self.median_ = None + self.iqr_ = None + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.to(self.device) + + def fit(self, X): + X = to_tensor(X).to(self.device) + self.median_ = torch.median(X, dim=0).values + q75, q25 = torch.quantile(X, 0.75, dim=0), torch.quantile(X, 0.25, dim=0) + self.iqr_ = q75 - q25 + + def transform(self, X): + X = to_tensor(X).to(self.device) + if self.median_ is None or self.iqr_ is None: + raise RuntimeError("This RobustScaler instance is not fitted yet.") + return (X - self.median_) / (self.iqr_ + 1e-8) + + def inverse_transform(self, X): + X = to_tensor(X).to(self.device) + if self.median_ is None or self.iqr_ is None: + raise RuntimeError("This RobustScaler instance is not fitted yet.") + return X * (self.iqr_ + 1e-8) + self.median_ + + def save(self, path): + torch.save( + { + "median_": self.median_, + "iqr_": self.iqr_, + }, + path, + ) + + @staticmethod + def load(path): + checkpoint = torch.load(path) + scaler = RobustScaler() + scaler.median_ = checkpoint["median_"] + scaler.iqr_ = checkpoint["iqr_"] + return scaler + + +class LogScaler(nn.Module): + def __init__(self, epsilon=1e-8): + super(LogScaler, self).__init__() + self.epsilon = epsilon + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.to(self.device) + self.min_value = None + + def fit(self, X): + X = to_tensor(X).to(self.device) + dataset_min = torch.min(X) - self.epsilon + if dataset_min >= 0: + self.min_value = 0 + else: + self.min_value = dataset_min + + def transform(self, X): + X = to_tensor(X).to(self.device) + X_shifted = X - self.min_value # adding shift (subtracting negative or zero) + return torch.log(X_shifted + self.epsilon) + + def inverse_transform(self, X): + X = to_tensor(X).to(self.device) + X_exp = torch.exp(X) - self.epsilon + return X_exp + self.min_value + + def save(self, path): + torch.save( + { + "epsilon": self.epsilon, + "min_value": self.min_value, + }, + path, + ) + + @staticmethod + def load(path): + checkpoint = torch.load(path) + scaler = LogScaler() + scaler.epsilon = checkpoint["epsilon"] + scaler.min_value = checkpoint["min_value"] + return scaler + + +class YeoJohnsonScaler(nn.Module): + def __init__(self): + super(YeoJohnsonScaler, self).__init__() + self.lambdas_ = None + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.to(self.device) + + def fit(self, X): + X_np = X.cpu().numpy() if isinstance(X, torch.Tensor) else np.array(X) + _, self.lambdas_ = yeojohnson(X_np) + self.lambdas_ = torch.tensor(self.lambdas_, dtype=torch.float32).to(self.device) + + def transform(self, X): + X = to_tensor(X, self.device) + if self.lambdas_ is None: + raise RuntimeError("This YeoJohnsonScaler instance is not fitted yet.") + # Transformation logic here... 
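# Sketch of the missing step (the file leaves it as a placeholder): apply the standard
# Yeo-Johnson forward transform element-wise, using the lambda fitted in fit() above.
#   lam = self.lambdas_
#   out = torch.empty_like(X)
#   pos = X >= 0
#   if lam != 0:
#       out[pos] = ((X[pos] + 1) ** lam - 1) / lam
#   else:
#       out[pos] = torch.log1p(X[pos])
#   if lam != 2:
#       out[~pos] = -(((-X[~pos] + 1) ** (2 - lam)) - 1) / (2 - lam)
#   else:
#       out[~pos] = -torch.log1p(-X[~pos])
#   return out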
+ + def inverse_transform(self, X): + raise NotImplementedError("Inverse transform is not implemented due to its complexity and dependency on the original data scale.") + + def save(self, path): + torch.save({ + "lambdas_": self.lambdas_, + }, path) + + @staticmethod + def load(path, device=None): + device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu") + checkpoint = torch.load(path, map_location=device) + scaler = YeoJohnsonScaler() + scaler.lambdas_ = checkpoint["lambdas_"] + scaler.to(device) + return scaler + diff --git a/ise/evaluation/__init__.py b/ise/evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ise/evaluation/metrics.py b/ise/evaluation/metrics.py new file mode 100644 index 0000000..2062d5a --- /dev/null +++ b/ise/evaluation/metrics.py @@ -0,0 +1,120 @@ +import numpy as np +import torch +import xarray as xr +from scipy.spatial.distance import jensenshannon +from scipy.stats import kstest, ttest_ind + + +def sum_by_sector(array, grid_file): + if isinstance(array, torch.Tensor): + array = array.cpu().detach().numpy() + if isinstance(grid_file, str): + grids = xr.open_dataset(grid_file) + sector_name = "sectors" if "ais" in grid_file.lower() else "ID" + elif isinstance(grid_file, xr.Dataset): + sector_name = "ID" if "Rignot" in grids.Description else "sectors" + else: + raise ValueError("grid_file must be a string or an xarray Dataset.") + + if len(array.shape) == 3: + num_timesteps = array.shape[0] + elif len(array.shape) == 2: + num_timesteps = 1 + array = array.reshape((1, array.shape[0], array.shape[1])) + + # if len(array.shape) == 3: + # grids = grids.expand_dims(dim={'time': num_timesteps}) + sectors = grids[sector_name].values + + ice_sheet = "AIS" if 761 in array.shape else "GIS" + num_sectors = 18 if ice_sheet == "AIS" else 6 + + sums_by_sector = np.zeros((num_timesteps, num_sectors)) + for i in range(array.shape[0]): + for sector in range(1, num_sectors + 1): + sums_by_sector[i, sector - 1] = np.sum(array[i, :, :][sectors == sector]) + return sums_by_sector + + +def mean_squared_error_sector(sum_sectors_true, sum_sectors_pred): + return np.mean((sum_sectors_true - sum_sectors_pred) ** 2) + + +def kl_divergence(p: np.ndarray, q: np.ndarray): + """Calculates the Kullback-Leibler Divergence between two distributions. Q is typically a + 'known' distirubtion and should be the true values, whereas P is typcically the test distribution, + or the predicted distribution. Note the the KL divergence is assymetric, and near-zero values for + p with a non-near zero values for q cause the KL divergence to inflate [citation]. + + Args: + p (np.ndarray): Test distribution + q (np.ndarray): Known distribution + + Returns: + float: KL Divergence + """ + return np.sum(np.where(p != 0, p * np.log(p / q), 0)) + + +def js_divergence(p: np.ndarray, q: np.ndarray): + """Calculates the Jensen-Shannon Divergence between two distributions. Q is typically a + 'known' distirubtion and should be the true values, whereas P is typcically the test distribution, + or the predicted distribution. Note the the JS divergence, unlike the KL divergence, is symetric. + + Args: + p (np.ndarray): Test distribution + q (np.ndarray): Known distribution + + Returns: + float: JS Divergence + """ + return jensenshannon(p, q) + + +def mape(y_true, y_pred): + """ + Calculate Mean Absolute Percentage Error (MAPE). 
+ + Args: + - y_true: numpy array or a list of actual numbers + - y_pred: numpy array or a list of predicted numbers + + Returns: + - mape: Mean Absolute Percentage Error + """ + y_true, y_pred = np.array(y_true), np.array(y_pred) + # Avoid division by zero + non_zero_mask = y_true != 0 + if not np.any(non_zero_mask): + return np.inf + mape = ( + np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) + * 100 + ) + return mape + + +def relative_squared_error(y_true, y_pred): + """ + Calculate Relative Squared Error (RSE). + + Args: + - y_true: numpy array or a list of actual numbers + - y_pred: numpy array or a list of predicted numbers + + Returns: + - rse: Relative Squared Error + """ + y_true, y_pred = np.array(y_true), np.array(y_pred) + ss_res = np.sum((y_true - y_pred) ** 2) + ss_tot = np.sum((y_true - np.mean(y_true)) ** 2) + rse = ss_res / ss_tot + return rse + +def kolmogorov_smirnov(x1, x2): + res = kstest(x1, x2) + return res.statistic, res.pvalue + +def t_test(x1, x2): + res = ttest_ind(x1, x2) + return res.statistic, res.pvalue \ No newline at end of file diff --git a/ise/evaluation/plots.py b/ise/evaluation/plots.py new file mode 100644 index 0000000..84b01e3 --- /dev/null +++ b/ise/evaluation/plots.py @@ -0,0 +1,823 @@ +import os +import random +import warnings + +import imageio +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +import torch +import xarray as xr +from tqdm import tqdm + +import ise +from ise.evaluation.metrics import ( + js_divergence, + kl_divergence, + mean_squared_error_sector, + sum_by_sector, +) +from ise.utils.functions import ( + create_distribution, + get_uncertainty_bands, + group_by_run, + load_ml_data, +) + + +class SectorPlotter: + def __init__(self, results_dataset, column=None, condition=None, save_directory=None): + super().__init__() + self.dataset = results_dataset + self.save_directory = save_directory + self.trues, self.preds, self.scenarios = group_by_run( + self.dataset, column=column, condition=condition + ) + self.true_bounds = UncertaintyBounds(self.trues) + self.pred_bounds = UncertaintyBounds(self.preds) + self.cache = { + "true_sle_runs": self.trues, + "pred_sle_runs": self.preds, + "true_bounds": self.true_bounds, + "pred_bounds": self.pred_bounds, + } + self.true_distribution, self.support = create_distribution(year=2100, dataset=self.trues) + self.pred_distribution, _ = create_distribution(year=2100, dataset=self.preds) + self.distribution_metrics = { + "kl": kl_divergence(self.pred_distribution, self.true_distribution), + "js": js_divergence(self.pred_distribution, self.true_distribution), + } + self.model = None + self.ml_directory = None + + def plot_ensemble( + self, + uncertainty="quantiles", + column=None, + condition=None, + save=None, + ): + return plot_ensemble( + dataset=self.dataset, + uncertainty=uncertainty, + column=column, + condition=condition, + save=save, + cache=self.cache, + ) + + def plot_ensemble_mean( + self, + uncertainty=False, + column=None, + condition=None, + save=None, + ): + return plot_ensemble_mean( + dataset=self.dataset, + uncertainty=uncertainty, + column=column, + condition=condition, + save=save, + cache=self.cache, + ) + + def plot_distributions( + self, + year, + column=None, + condition=None, + save=None, + ): + return plot_distributions( + dataset=self.dataset, + year=year, + column=column, + condition=condition, + save=save, + cache=self.cache, + ) + + def plot_histograms( + self, + year, + column=None, + 
condition=None, + save=None, + ): + return plot_histograms( + dataset=self.dataset, + year=year, + column=column, + condition=condition, + save=save, + cache=self.cache, + ) + + def plot_test_series( + self, + model, + data_directory, + time_series=True, + approx_dist=True, + mc_iterations=100, + confidence="95", + draws="random", + k=10, + save_directory=None, + ): + if not isinstance(model, ise.models.timeseries.TimeSeriesEmulator): + raise NotImplementedError( + "currently the only model compatible with this function is TimeSeriesEmulator." + ) + self.model = model + self.ml_directory = data_directory + return plot_test_series( + model=model, + data_directory=data_directory, + time_series=time_series, + approx_dist=approx_dist, + mc_iterations=mc_iterations, + confidence=confidence, + draws=draws, + k=k, + save_directory=save_directory, + ) + + def plot_callibration(self, color_by=None, alpha=0.2, column=None, condition=None, save=None): + return plot_callibration( + dataset=self.dataset, + column=column, + condition=condition, + color_by=color_by, + alpha=alpha, + save=save, + ) + + +class EvaluationPlotter: + def __init__(self, save_dir="."): + + self.save_dir = save_dir + self.video = False + + def spatial_side_by_side( + self, y_true, y_pred, timestep=None, save_path=None, cmap=plt.cm.RdBu, video=False + ): + + if video and timestep: + warnings.warn("Video will be generated, ignoring timestep argument.") + # Create a custom colormap for masked values (white) + + if video: + self.video = True + self._generate_side_by_side_video(y_true, y_pred, fps=3) + return self + + if len(y_true.shape) == 3 and len(y_pred.shape) == 3 and timestep is None: + raise ValueError("timestep must be specified for 3D arrays") + elif len(y_true.shape) == 3 and len(y_pred.shape) == 3 and timestep is not None: + self.y_true = y_true[timestep - 1, :, :] + self.y_pred = y_pred[timestep - 1, :, :] + else: + self.y_true = y_true + self.y_pred = y_pred + difference = np.abs(self.y_pred - self.y_true) + masked_y_true = np.ma.masked_equal(self.y_true, 0) + masked_y_pred = np.ma.masked_equal(self.y_pred, 0) + masked_difference = np.ma.masked_equal(difference, 0) + global_min = min(masked_y_true.min(), masked_y_pred.min()) + global_max = max(masked_y_true.max(), masked_y_pred.max()) + + global_extreme = max(abs(global_min), abs(global_max)) + cmap.set_bad(color="white") + + # Create subplots + fig, axs = plt.subplots(1, 3, figsize=(18, 6)) + + # Plot y_true with mask, align color scale + cax1 = axs[0].imshow( + masked_y_true, cmap=cmap, vmin=global_extreme * -1, vmax=global_extreme + ) + fig.colorbar(cax1, ax=axs[0], orientation="vertical") + axs[0].set_title("True Y") + + # Plot y_pred with mask, align color scale + cax2 = axs[1].imshow( + masked_y_pred, cmap=cmap, vmin=global_extreme * -1, vmax=global_extreme + ) + fig.colorbar(cax2, ax=axs[1], orientation="vertical") + axs[1].set_title("Predicted Y") + + # Plot absolute difference with mask, using 'Reds' colormap + cax3 = axs[2].imshow(masked_difference, cmap="Reds") + fig.colorbar(cax3, ax=axs[2], orientation="vertical") + axs[2].set_title("Absolute Difference |Y_pred - Y_true|") + + # Show plot + plt.tight_layout() + if save_path is not None: + if self.video: + plt.savefig( + f"{self.save_dir}/{save_path}", + ) + else: + plt.savefig(f"{self.save_dir}/{save_path}", dpi=600) + + plt.close("all") + + def _generate_side_by_side_video(self, y_true, y_pred, fps=3): + if not (len(y_true.shape) == 3 and len(y_pred.shape) == 3): + raise ValueError( + "y_true and y_pred 
must be 3D arrays with shape (timesteps, height, width)" + ) + + timesteps = y_true.shape[0] + + for timestep in tqdm(range(timesteps), total=timesteps, desc="Generating video"): + save_path = f"timestep_{timestep}.png" # Save each frame with timestep + self.spatial_side_by_side( + y_true, y_pred, timestep, save_path, cmap=plt.cm.viridis, video=False + ) + + images = [] + # Improved sorting function that handles unexpected filenames more gracefully + try: + files = sorted( + os.listdir(self.save_dir), + key=lambda x: int(x.replace("timestep_", "").split(".")[0]), + ) + except ValueError: + raise ValueError( + "Unexpected filenames found in save directory. Expected format: 'timestep_#.png'" + ) + for filename in files: + if filename.endswith(".png"): + image_path = os.path.join(self.save_dir, filename) + images.append(imageio.imread(image_path)) + + # Create a video from the images + video_path = f"{self.save_dir}/plot_video.mp4" + imageio.mimwrite(video_path, images, fps=fps, codec="libx264") # fps is frames per second + + def sector_side_by_side( + self, + y_true, + y_pred, + grid_file, + outline_array_true=None, + outline_array_pred=None, + timestep=None, + save_path=None, + cmap=plt.cm.RdBu, + ): + + if y_true.shape != y_pred.shape: + raise ValueError("y_true and y_pred must have the same shape.") + if y_pred.shape[1] != 18 and y_pred.shape[1] != 6: + raise ValueError("y_pred must have 18 sectors.") + + if len(y_true.shape) == 2 and len(y_pred.shape) == 2 and timestep is None: + raise ValueError("timestep must be specified for 2D arrays") + elif len(y_true.shape) == 2 and len(y_pred.shape) == 2 and timestep is not None: + self.y_true = y_true[timestep - 1, :] + self.y_pred = y_pred[timestep - 1, :] + outline_array_pred = outline_array_pred[timestep - 1, :] + outline_array_true = outline_array_true[timestep - 1, :] + else: + self.y_true = y_true + self.y_pred = y_pred + + if isinstance(grid_file, str): + grids = xr.open_dataset(grid_file).transpose("x", "y", ...) 
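# The sector variable name is inferred from the grid file: AIS sector grids store it as
# "sectors", while GrIS (Rignot) grids store it as "ID", mirroring sum_by_sector in
# ise.evaluation.metrics.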
+ sector_name = "sectors" if "ais" in grid_file.lower() else "ID" + elif isinstance(grid_file, xr.Dataset): + sector_name = "ID" if "Rignot" in grids.Description else "sectors" + else: + raise ValueError("grid_file must be a string or an xarray Dataset.") + + sectors = grids[sector_name].values + true_plot_data = np.zeros_like(sectors) + pred_plot_data = np.zeros_like(sectors) + + num_sectors = 18 if sector_name == "sectors" else 6 + + for sector in range(1, num_sectors + 1): + true_plot_data[sectors == sector] = self.y_true[sector - 1] + pred_plot_data[sectors == sector] = self.y_pred[sector - 1] + + # Convert outline arrays to binary masks + outline_mask_true = np.where(outline_array_true != 0, 1, 0) + outline_mask_pred = np.where(outline_array_pred != 0, 1, 0) + + # Define the color scale based on the combined range of true and predicted matrices + vmin = min(true_plot_data.min(), pred_plot_data.min()) + vmax = max(true_plot_data.max(), pred_plot_data.max()) + + # Create a figure and a set of subplots + fig, axs = plt.subplots(1, 2, figsize=(12, 5), gridspec_kw={"wspace": 0.5}) + + # Plot the modified outline array for the true matrix (black for non-zero values, white elsewhere) + axs[0].imshow(np.flipud(outline_mask_true.T), cmap="Greys", interpolation="nearest") + # Plot the true matrix with slight transparency + cax1 = axs[0].imshow( + np.flipud(true_plot_data.T), + cmap="Reds", + interpolation="nearest", + vmin=vmin, + vmax=vmax, + alpha=0.90, + ) + fig.colorbar(cax1, ax=axs[0], fraction=0.046, pad=0.04) + axs[0].set_title("True") + + # Plot the modified outline array for the predicted matrix (black for non-zero values, white elsewhere) + axs[1].imshow(np.flipud(outline_mask_pred.T), cmap="Greys", interpolation="nearest") + # Plot the predicted matrix with slight transparency + cax2 = axs[1].imshow( + np.flipud(pred_plot_data.T), + cmap="Reds", + interpolation="nearest", + vmin=vmin, + vmax=vmax, + alpha=0.90, + ) + fig.colorbar(cax2, ax=axs[1], fraction=0.046, pad=0.04) + axs[1].set_title("Predicted") + + sum_by_sector_true = sum_by_sector(self.y_true, grid_file) + sum_by_sector_pred = sum_by_sector(self.y_pred, grid_file) + + mse = mean_squared_error_sector(sum_by_sector_true, sum_by_sector_pred) + plt.suptitle(f"Mean Squared Error: {mse:0.2f}") + # plt.tight_layout() + + if save_path is not None: + plt.savefig(f"{self.save_dir}/{save_path}", dpi=600) + + plt.close("all") + + stop = "" + + +class UncertaintyBounds: + def __init__(self, data, confidence="95", quantiles=None): + if quantiles is None: + quantiles = [0.05, 0.95] + self.data = data + ( + self.mean, + self.sd, + self.upper_ci, + self.lower_ci, + self.upper_q, + self.lower_q, + ) = get_uncertainty_bands(data, confidence=confidence, quantiles=quantiles) + + +def plot_ensemble( + dataset: pd.DataFrame, + uncertainty: str = "quantiles", + column: str = None, + condition: str = None, + save: str = None, + cache: dict = None, +): + """Generates a plot of the comparison of ensemble results from the true simulations and the predicted emulation. + Adds uncertainty bounds and plots them side-by-side. + + Args: + dataset (pd.DataFrame): testing results dataframe, result from [ise.utils.data.combine_testing_results](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#combine_testing_results). + uncertainty (str, optional): Type of uncertainty for creating bounds, must be in [quantiles, confidence]. Defaults to 'quantiles'. 
+ column (str, optional): Column to subset on, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#group_by_run). Defaults to None. + condition (str, optional): Condition to subset with, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#group_by_run). Can be int, str, float, etc. Defaults to None. + save (str, optional): Path to save plot. Defaults to None. + cache (dict, optional): Cached results from previous calculation, used internally in [ise.visualization.Plotter](https://brown-sciml.github.io/ise/ise/sectors/visualization/Plotter.html#Plotter). Defaults to None. + """ + + if cache is None: + all_trues, all_preds, _ = group_by_run(dataset, column=column, condition=condition) + ( + mean_true, + _, + true_upper_ci, + true_lower_ci, + true_upper_q, + true_lower_q, + ) = get_uncertainty_bands( + all_trues, + ) + ( + mean_pred, + _, + pred_upper_ci, + pred_lower_ci, + pred_upper_q, + pred_lower_q, + ) = get_uncertainty_bands( + all_preds, + ) + else: + all_trues = cache["true_sle_runs"] + all_preds = cache["pred_sle_runs"] + t = cache["true_bounds"] + p = cache["pred_bounds"] + mean_true, true_upper_ci, true_lower_ci, true_upper_q, true_lower_q = ( + t.mean, + t.upper_ci, + t.lower_ci, + t.upper_q, + t.lower_q, + ) + mean_pred, pred_upper_ci, pred_lower_ci, pred_upper_q, pred_lower_q = ( + p.mean, + p.upper_ci, + p.lower_ci, + p.upper_q, + p.lower_q, + ) + + true_df = pd.DataFrame(all_trues).transpose() + pred_df = pd.DataFrame(all_preds).transpose() + + _, axs = plt.subplots(1, 2, figsize=(15, 6), sharey=True, sharex=True) + axs[0].plot(true_df) + axs[0].plot(mean_true, "r-", linewidth=4, label="Mean") + axs[1].plot(pred_df) + axs[1].plot(mean_pred, "r-", linewidth=4, label="Mean") + if uncertainty and uncertainty.lower() == "confidence": + axs[0].plot(true_upper_ci, "b--", linewidth=3, label="5/95% Confidence (True)") + axs[0].plot(true_lower_ci, "b--", linewidth=3) + axs[1].plot(pred_upper_ci, "b--", linewidth=3, label="5/95% Confidence (Predicted)") + axs[1].plot(pred_lower_ci, "b--", linewidth=3) + + elif uncertainty and uncertainty.lower() == "quantiles": + axs[0].plot(pred_upper_q, "b--", linewidth=3, label="5/95% Percentile (Predicted)") + axs[0].plot(pred_lower_q, "b--", linewidth=3) + axs[1].plot(true_upper_q, "b--", linewidth=3, label="5/95% Percentile (True)") + axs[1].plot(true_lower_q, "b--", linewidth=3) + + elif uncertainty and uncertainty.lower() == "both": + axs[0].plot(true_upper_ci, "r--", linewidth=2, label="5/95% Confidence (True)") + axs[0].plot(true_lower_ci, "r--", linewidth=2) + axs[1].plot(pred_upper_ci, "b--", linewidth=2, label="5/95% Confidence (Predicted)") + axs[1].plot(pred_lower_ci, "b--", linewidth=2) + axs[1].plot(pred_upper_q, "o--", linewidth=2, label="5/95% Percentile (Predicted)") + axs[1].plot(pred_lower_q, "o--", linewidth=2) + axs[0].plot(true_upper_q, "k--", linewidth=2, label="5/95% Percentile (True)") + axs[0].plot(true_lower_q, "k--", linewidth=2) + + elif uncertainty and uncertainty.lower() not in ["confidence", "quantiles"]: + raise AttributeError( + f"uncertainty argument must be in ['confidence', 'quantiles'], received {uncertainty}" + ) + + axs[0].title.set_text("True") + axs[0].set_ylabel("True SLE (mm)") + axs[1].title.set_text("Predicted") + plt.xlabel("Years since 2015") + if column is not None and condition is not None: + plt.suptitle(f"Time Series of ISM Ensemble - where {column} == {condition}") + else: + plt.suptitle("Time Series of 
ISM Ensemble") + plt.subplots_adjust(wspace=0, hspace=0) + plt.legend() + + # TODO: FileNotFoundError: [Errno 2] No such file or directory: 'None/ensemble_plot.png' + if save: + plt.savefig(save) + + +def plot_ensemble_mean( + dataset: pd.DataFrame, + uncertainty: str = False, + column=None, + condition=None, + save=None, + cache=None, +): + """Generates a plot of the mean sea level contribution from the true simulations and the predicted emulation. + + Args: + dataset (pd.DataFrame): testing results dataframe, result from [ise.utils.data.combine_testing_results](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#combine_testing_results). + uncertainty (str, optional): Type of uncertainty for creating bounds. If not None/False, must be in [quantiles, confidence]. Defaults to 'quantiles'. + column (str, optional): Column to subset on, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#group_by_run). Defaults to None. + condition (str, optional): Condition to subset with, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#group_by_run). Can be int, str, float, etc. Defaults to None. + save (str, optional): Path to save plot. Defaults to None. + cache (dict, optional): Cached results from previous calculation, used internally in [ise.visualization.Plotter](https://brown-sciml.github.io/ise/ise/sectors/visualization/Plotter.html#Plotter). Defaults to None. + """ + + if cache is None: + all_trues, all_preds, _ = group_by_run(dataset, column=column, condition=condition) + ( + mean_true, + _, + true_upper_ci, + true_lower_ci, + true_upper_q, + true_lower_q, + ) = get_uncertainty_bands( + all_trues, + ) + ( + mean_pred, + _, + pred_upper_ci, + pred_lower_ci, + pred_upper_q, + pred_lower_q, + ) = get_uncertainty_bands( + all_preds, + ) + else: + all_trues = cache["true_sle_runs"] + all_preds = cache["pred_sle_runs"] + t = cache["true_bounds"] + p = cache["pred_bounds"] + mean_true, true_upper_ci, true_lower_ci, true_upper_q, true_lower_q = ( + t.mean, + t.upper_ci, + t.lower_ci, + t.upper_q, + t.lower_q, + ) + mean_pred, pred_upper_ci, pred_lower_ci, pred_upper_q, pred_lower_q = ( + p.mean, + p.upper_ci, + p.lower_ci, + p.upper_q, + p.lower_q, + ) + + plt.figure(figsize=(15, 6)) + plt.plot(mean_true, label="True Mean SLE") + plt.plot(mean_pred, label="Predicted Mean SLE") + + if uncertainty and uncertainty.lower() == "confidence": + plt.plot(true_upper_ci, "r--", linewidth=2, label="5/95% Percentile (True)") + plt.plot(true_lower_ci, "r--", linewidth=2) + plt.plot(pred_upper_ci, "b--", linewidth=2, label="5/95% Percentile (Predicted)") + plt.plot(pred_lower_ci, "b--", linewidth=2) + + elif uncertainty and uncertainty.lower() == "quantiles": + plt.plot(pred_upper_q, "r--", linewidth=2, label="5/95% Confidence (Predicted)") + plt.plot(pred_lower_q, "r--", linewidth=2) + plt.plot(true_upper_q, "b--", linewidth=2, label="5/95% Confidence (True)") + plt.plot(true_lower_q, "b--", linewidth=2) + + elif uncertainty and uncertainty.lower() == "both": + plt.plot(true_upper_ci, "r--", linewidth=2, label="5/95% Percentile (True)") + plt.plot(true_lower_ci, "r--", linewidth=2) + plt.plot(pred_upper_ci, "b--", linewidth=2, label="5/95% Percentile (Predicted)") + plt.plot(pred_lower_ci, "b--", linewidth=2) + plt.plot(pred_upper_q, "o--", linewidth=2, label="5/95% Confidence (Predicted)") + plt.plot(pred_lower_q, "o--", linewidth=2) + plt.plot(true_upper_q, "k--", linewidth=2, label="5/95% Confidence 
(True)") + plt.plot(true_lower_q, "k--", linewidth=2) + + elif uncertainty and uncertainty.lower() not in ["confidence", "quantiles"]: + raise AttributeError( + f"uncertainty argument must be in ['confidence', 'quantiles'], received {uncertainty}" + ) + + else: + pass + + if column is not None and condition is not None: + plt.suptitle(f"ISM Ensemble Mean SLE over Time - where {column} == {condition}") + else: + plt.suptitle("ISM Ensemble Mean over Time") + plt.xlabel("Years since 2015") + plt.ylabel("Mean SLE (mm)") + plt.legend() + + if save: + plt.savefig(save) + + +def plot_distributions( + dataset: pd.DataFrame, + year: int = 2100, + column: str = None, + condition: str = None, + save: str = None, + cache: dict = None, +): + """Generates a plot of comparison of distributions at a given time slice (year) from the true simulations and the predicted emulation. + + Args: + dataset (pd.DataFrame): testing results dataframe, result from [ise.utils.data.combine_testing_results](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#combine_testing_results). + year (int, optional): Distribution year (time slice). Defaults to 2100. + column (str, optional): Column to subset on, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#group_by_run). Defaults to None. + condition (str, optional): Condition to subset with, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#group_by_run). Can be int, str, float, etc. Defaults to None. + save (str, optional): Path to save plot. Defaults to None. + cache (dict, optional): Cached results from previous calculation, used internally in [ise.visualization.Plotter](https://brown-sciml.github.io/ise/ise/sectors/visualization/Plotter.html#Plotter). Defaults to None. + """ + + if cache is None: + all_trues, all_preds, _ = group_by_run(dataset, column=column, condition=condition) + else: + all_trues = cache["true_sle_runs"] + all_preds = cache["pred_sle_runs"] + + true_dist, true_support = create_distribution(year=year, dataset=all_trues) + pred_dist, pred_support = create_distribution(year=year, dataset=all_preds) + plt.figure(figsize=(15, 8)) + plt.plot(true_support, true_dist, label="True") + plt.plot(pred_support, pred_dist, label="Predicted") + plt.title( + f"Distribution Comparison at year {year}, KL Divergence: {kl_divergence(pred_dist, true_dist):0.3f}" + ) + plt.xlabel("SLE (mm)") + plt.ylabel("Probability") + plt.legend() + if save: + plt.savefig(save) + + +def plot_histograms( + dataset: pd.DataFrame, + year: int = 2100, + column: str = None, + condition: str = None, + save: str = None, + cache: dict = None, +): + """Generates a plot of comparison of histograms at a given time slice (year) from the true simulations and the predicted emulation. + + Args: + dataset (pd.DataFrame): testing results dataframe, result from [ise.utils.data.combine_testing_results](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#combine_testing_results). + year (int, optional): Histogram year (time slice). Defaults to 2100. + column (str, optional): Column to subset on, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#group_by_run). Defaults to None. + condition (str, optional): Condition to subset with, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/sectors/utils/data.html#group_by_run). Can be int, str, float, etc. Defaults to None. + save (str, optional): Path to save plot. 
Defaults to None. + cache (dict, optional): Cached results from previous calculation, used internally in [ise.visualization.Plotter](https://brown-sciml.github.io/ise/ise/sectors/visualization/Plotter.html#Plotter). Defaults to None. + """ + if cache is None: + all_trues, all_preds, _ = group_by_run(dataset, column=column, condition=condition) + + else: + all_trues = cache["true_sle_runs"] + all_preds = cache["pred_sle_runs"] + + fig = plt.figure(figsize=(15, 8)) + ax1 = plt.subplot( + 1, + 2, + 1, + ) + sns.histplot( + all_preds[:, year - 2101], + label="Predicted Distribution", + color="blue", + alpha=0.3, + ) + plt.legend() + plt.subplot(1, 2, 2, sharex=ax1, sharey=ax1) + sns.histplot(all_trues[:, year - 2101], label="True Distribution", color="red", alpha=0.3) + plt.suptitle(f"Histograms of Predicted vs True SLE at year {year}") + plt.ylabel("") + plt.legend() + fig.tight_layout(rect=[0, 0.03, 1, 0.95]) + if save: + plt.savefig(save) + + +def plot_test_series( + model, + data_directory, + time_series, + approx_dist=True, + mc_iterations=100, + confidence="95", + draws="random", + k=10, + save_directory=None, +): + _, _, test_features, test_labels, test_scenarios = load_ml_data( + data_directory, time_series=time_series + ) + + sectors = list(set(test_features.sectors)) + sectors.sort() + + if draws == "random": + data = random.sample(test_scenarios, k=k) + elif draws == "first": + data = test_scenarios[:k] + else: + raise ValueError(f"draws must be in [random, first], received {draws}") + + for scen in data: + single_scenario = scen + test_model = single_scenario[0] + test_exp = single_scenario[2] + test_sector = single_scenario[1] + single_test_features = torch.tensor( + np.array( + test_features[ + (test_features[test_model] == 1) + & (test_features[test_exp] == 1) + & (test_features.sectors == test_sector) + ], + dtype=np.float64, + ), + dtype=torch.float, + ) + single_test_labels = np.array( + test_labels[ + (test_features[test_model] == 1) + & (test_features[test_exp] == 1) + & (test_features.sectors == test_sector) + ], + dtype=np.float64, + ) + preds, means, sd = model.predict( + single_test_features, + approx_dist=approx_dist, + mc_iterations=mc_iterations, + confidence=confidence, + ) # TODO: this doesn't work with traditional + + quantiles = np.quantile(preds, [0.05, 0.95], axis=0) + lower_ci = means - 1.96 * sd + upper_ci = means + 1.96 * sd + upper_q = quantiles[1, :] + lower_q = quantiles[0, :] + + if not approx_dist: + plt.figure(figsize=(15, 8)) + plt.plot(single_test_labels, "r-", label="True") + plt.plot(preds, "b-", label="Predicted") + plt.xlabel("Time (years since 2015)") + plt.ylabel("SLE (mm)") + plt.title(f"Model={test_model}, Exp={test_exp}, sector={sectors.index(test_sector)+1}") + plt.legend() + if save_directory: + plt.savefig(f"{save_directory}/{test_model}_{test_exp}_test_sector.png") + else: + preds = pd.DataFrame(preds).transpose() + plt.figure(figsize=(15, 8)) + plt.plot( + preds, + alpha=0.2, + ) + plt.plot(means, "b-", label="Predicted") + plt.plot(upper_ci, "k-", label=f"{confidence}% CI") + plt.plot( + lower_ci, + "k-", + ) + plt.plot(quantiles[0, :], "k--", label=f"Quantiles") + plt.plot(quantiles[1, :], "k--") + plt.plot( + lower_ci, + "k-", + ) + plt.plot(single_test_labels, "r-", label="True") + + plt.xlabel("Time (years since 2015)") + plt.ylabel("SLE (mm)") + plt.title(f"Model={test_model}, Exp={test_exp}, sector={sectors.index(test_sector)+1}") + plt.legend() + if save_directory: + plt.savefig( + f'{save_directory}/{test_model.replace("-", 
"_")}_{test_exp}_test_sector.png' + ) + + +def plot_callibration(dataset, column=None, condition=None, color_by=None, alpha=0.2, save=None): + + # TODO: Add ability to subset multiple columns and conditions. Not needed now so saving for later... + if column is None and condition is None: + subset = dataset + elif column is not None and condition is not None: + subset = dataset[(dataset[column] == condition)] + else: + raise ValueError( + "Column and condition type must be the same (None & None, not None & not None)." + ) + + plt.figure(figsize=(15, 8)) + sns.scatterplot(data=subset, x="true", y="pred", hue=color_by, alpha=alpha) + plt.plot( + [min(subset.true), max(subset.true)], + [min(subset.true), max(subset.true)], + "r-", + ) + + # TODO: Add density plots (below) + # sns.kdeplot(data=subset, x='true', y='pred', hue=color_by, fill=True) + # plt.plot([min(subset.true),max(subset.true)], [min(subset.true),max(subset.true)], 'r-',) + + # TODO: add plotly export + plt.xlabel("True Value") + plt.ylabel("Predicted Value") + plt.title("Callibration Plot") + + if color_by is not None: + plt.legend() + + if save: + plt.savefig(save) diff --git a/ise/models/__init__.py b/ise/models/__init__.py index 3ae046c..e69de29 100644 --- a/ise/models/__init__.py +++ b/ise/models/__init__.py @@ -1,11 +0,0 @@ -r""" -# [gp](https://brown-sciml.github.io/ise/ise/models/gp.html) - -# [testing](https://brown-sciml.github.io/ise/ise/models/testing.html) - -# [timeseries](https://brown-sciml.github.io/ise/ise/models/timeseries.html) - -# [traditional](https://brown-sciml.github.io/ise/ise/models/traditional.html) - -# [training](https://brown-sciml.github.io/ise/ise/models/training.html) -""" diff --git a/ise/models/training/iterative.py b/ise/models/experiments.py similarity index 94% rename from ise/models/training/iterative.py rename to ise/models/experiments.py index 183ae48..b27910d 100644 --- a/ise/models/training/iterative.py +++ b/ise/models/experiments.py @@ -1,20 +1,24 @@ -from ise.data.EmulatorData import EmulatorData -from ise.models.training.Trainer import Trainer -from ise.models.timeseries import TimeSeriesEmulator -from ise.models.traditional.ExploratoryModel import ExploratoryModel -from ise.utils.utils import _structure_emulatordata_args, _structure_architecture_args from datetime import datetime -from torch import nn -from ise.utils.data import load_ml_data from typing import List +from torch import nn + +from ise.data._EmulatorData import EmulatorData +from ise.models.sector import ExploratoryModel, VariationalLSTMEmulator +from ise.models.train import Trainer +from ise.utils.functions import ( + _structure_architecture_args, + _structure_emulatordata_args, + load_ml_data, +) + def lag_sequence_test( data_directory, lag_array, sequence_array, iterations, - model_class=TimeSeriesEmulator, + model_class=VariationalLSTMEmulator, emulator_data_args=None, architecture=None, verbose=True, @@ -26,9 +30,7 @@ def lag_sequence_test( if verbose: print("1/3: Loading processed data...") - emulator_data_args = _structure_emulatordata_args( - emulator_data_args, time_series=True - ) + emulator_data_args = _structure_emulatordata_args(emulator_data_args, time_series=True) architecture = _structure_architecture_args(architecture, time_series=True) count = 0 @@ -89,7 +91,7 @@ def rnn_architecture_test( rnn_layers_array: List[int], hidden_nodes_array: List[int], iterations: int, - model_class=TimeSeriesEmulator, + model_class=VariationalLSTMEmulator, verbose: bool = True, epochs: int = 100, batch_size: int = 100, 
diff --git a/ise/models/gp/kernels.py b/ise/models/gp.py similarity index 58% rename from ise/models/gp/kernels.py rename to ise/models/gp.py index 17811c2..bed36b9 100644 --- a/ise/models/gp/kernels.py +++ b/ise/models/gp.py @@ -1,8 +1,11 @@ -"""Custom Kernels required for gaussian process regression.""" +"""Gaussian process model architecture, containing training and testing capabilities.""" -from sklearn.gaussian_process.kernels import RBF, _check_length_scale, WhiteKernel -from scipy.spatial.distance import pdist, cdist, squareform import numpy as np +from joblib import dump, load +from scipy.spatial.distance import cdist, pdist, squareform +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF, WhiteKernel, _check_length_scale +from sklearn.metrics import r2_score class PowerExponentialKernel(RBF): @@ -75,9 +78,9 @@ def __call__( return K, K_gradient elif self.anisotropic: # We need to recompute the pairwise dimension-wise distances - K_gradient = ( - X[:, np.newaxis, :] - X[np.newaxis, :, :] - ) ** self.exponential / (length_scale**2) + K_gradient = (X[:, np.newaxis, :] - X[np.newaxis, :, :]) ** self.exponential / ( + length_scale**2 + ) K_gradient *= K[..., np.newaxis] return K, K_gradient else: @@ -87,3 +90,58 @@ def __call__( class NuggetKernel(WhiteKernel): def __init__(self, noise_level=1.0, noise_level_bounds=(1e-5, 1e5)): super().__init__(noise_level=noise_level, noise_level_bounds=noise_level_bounds) + + +class GP(GaussianProcessRegressor): + def __init__(self, kernel, verbose=True): + super().__init__( + n_restarts_optimizer=9, + ) + self.kernel = kernel + self.verbose = verbose + + def train( + self, + train_features, + train_labels, + ): + self.train_features, self.train_labels = train_features, train_labels + self.fit( + train_features, + train_labels, + ) + return self + + def test(self, test_features, test_labels): + self.test_features, self.test_labels = test_features, test_labels + preds, std_prediction = self.predict(test_features, return_std=True) + test_labels = np.array(test_labels.squeeze()) + mse = sum((preds - test_labels) ** 2) / len(preds) + mae = sum(abs((preds - test_labels))) / len(preds) + rmse = np.sqrt(mse) + r2 = r2_score(test_labels, preds) + + metrics = {"MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2} + + if self.verbose: + print( + f"""Test Metrics +MSE: {mse:0.6f} +MAE: {mae:0.6f} +RMSE: {rmse:0.6f} +R2: {r2:0.6f}""" + ) + return preds, std_prediction, metrics + + def save(self, path): + """Save model to path.""" + if not path.endswith(".joblib"): + raise ValueError("Path must end with .joblib") + dump(self, path) + + def load(self, path): + """Load model from path.""" + if not path.endswith(".joblib"): + raise ValueError("Path must end with .joblib") + self = load(path) + return self diff --git a/ise/models/gp/GP.py b/ise/models/gp/GP.py deleted file mode 100644 index 65d8de1..0000000 --- a/ise/models/gp/GP.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Gaussian process model architecture, containing training and testing capabilities.""" - -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.metrics import r2_score -import numpy as np -from joblib import dump, load - -np.random.seed(10) - - -class GP(GaussianProcessRegressor): - def __init__(self, kernel, verbose=True): - super().__init__( - n_restarts_optimizer=9, - ) - self.kernel = kernel - self.verbose = verbose - - def train( - self, - train_features, - train_labels, - ): - self.train_features, self.train_labels = 
train_features, train_labels - self.fit( - train_features, - train_labels, - ) - return self - - def test(self, test_features, test_labels): - self.test_features, self.test_labels = test_features, test_labels - preds, std_prediction = self.predict(test_features, return_std=True) - test_labels = np.array(test_labels.squeeze()) - mse = sum((preds - test_labels) ** 2) / len(preds) - mae = sum(abs((preds - test_labels))) / len(preds) - rmse = np.sqrt(mse) - r2 = r2_score(test_labels, preds) - - metrics = {"MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2} - - if self.verbose: - print( - f"""Test Metrics -MSE: {mse:0.6f} -MAE: {mae:0.6f} -RMSE: {rmse:0.6f} -R2: {r2:0.6f}""" - ) - return preds, std_prediction, metrics - - - def save(self, path): - """Save model to path.""" - if not path.endswith('.joblib'): - raise ValueError('Path must end with .joblib') - dump(self, path) - - def load(self, path): - """Load model from path.""" - if not path.endswith('.joblib'): - raise ValueError('Path must end with .joblib') - self = load(path) - return self - - diff --git a/ise/models/gp/__init__.py b/ise/models/gp/__init__.py deleted file mode 100644 index c25a6fd..0000000 --- a/ise/models/gp/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -r""" -[GaussianProcess](https://brown-sciml.github.io/ise/ise/models/gp/GaussianProcess.html) - -[kernels](https://brown-sciml.github.io/ise/ise/models/gp/kernels.html) -""" - -from ise.models.gp.GP import GP -from ise.models.gp.kernels import PowerExponentialKernel diff --git a/ise/models/grid.py b/ise/models/grid.py new file mode 100644 index 0000000..0f6453b --- /dev/null +++ b/ise/models/grid.py @@ -0,0 +1,1086 @@ + + + +import json +import os +import warnings + +import numpy as np +import pandas as pd +import torch +from nflows import distributions, flows, transforms +from torch import nn, optim + +from ise.data.dataclasses import EmulatorDataset +from ise.data.scaler import StandardScaler, RobustScaler, LogScaler +from ise.models.loss import MSEDeviationLoss, WeightedMSELoss +from ise.utils.functions import to_tensor + + +class PCA(nn.Module): + def __init__(self, n_components): + """ + Principal Component Analysis (PCA) model. + + This class provides a PCA model which can be fit to data. + + Attributes: + n_components (int or float): The number of components to keep. Can be an int or a float between 0 and 1. + mean (torch.Tensor): The mean of the input data. Calculated during fit. + components (torch.Tensor): The principal components. Calculated during fit. + singular_values (torch.Tensor): The singular values corresponding to each of the principal components. Calculated during fit. + explained_variance (torch.Tensor): The amount of variance explained by each of the selected components. Calculated during fit. + explained_variance_ratio (torch.Tensor): Percentage of variance explained by each of the selected components. Calculated during fit. + """ + super(PCA, self).__init__() + self.n_components = n_components + self.mean = None + self.components = None + self.singular_values = None + self.explained_variance = None + self.explained_variance_ratio = None + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.to(self.device) + + def fit(self, X): + """ + Fit the PCA model to the input data. + + Args: + X (np.array | pd.DataFrame): Input data, a tensor of shape (n_samples, n_features). + + Returns: + self (PCModel): The fitted PCA model. + + Raises: + ValueError: If n_components is not a float in the range (0, 1) or an integer. 
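+
+        Example (illustrative sketch; X is an assumed 2D array with at least a few
+        hundred rows and columns, as required by the low-rank solver used here):
+            pca = PCA(n_components=0.95)
+            pca.fit(X)
+            pcs = pca.transform(X)              # reduced representation
+            X_hat = pca.inverse_transform(pcs)  # approximate reconstruction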
+ """ + # Center the data + X = self._to_tensor(X) + self.mean = torch.mean(X, dim=0) + X_centered = X - self.mean + + # Compute low-rank PCA + U, S, V = torch.pca_lowrank( + X_centered, + q=301, + ) + + # Compute total variance + total_variance = S.pow(2).sum() + explained_variance = S.pow(2) + explained_variance_ratio = explained_variance / total_variance + + # Determine the number of components for the desired variance + if isinstance(self.n_components, float) and 0 < self.n_components < 1: + cumulative_variance_ratio = torch.cumsum(explained_variance_ratio, dim=0) + # Number of components to explain desired variance + num_components = torch.searchsorted(cumulative_variance_ratio, self.n_components) + 1 + self.n_components = min(num_components, S.size(0)) + elif isinstance(self.n_components, int): + self.n_components = min(self.n_components, S.size(0)) + else: + raise ValueError("n_components must be a float in the range (0, 1) or an integer.") + + self.components = V[:, : self.n_components] + self.singular_values = S[: self.n_components] + self.explained_variance = explained_variance[: self.n_components] + self.explained_variance_ratio = explained_variance_ratio[: self.n_components] + return self + + def transform(self, X): + """ + Apply dimensionality reduction to the input data using the fitted PCA model. + + Args: + X (np.array | pd.DataFrame): Input data, a tensor of shape (n_samples, n_features). + + Returns: + torch.Tensor: Transformed data, a tensor of shape (n_samples, n_components). + + Raises: + RuntimeError: If the PCA model has not been fitted yet. + """ + X = self._to_tensor(X) + if self.mean is None or self.components is None: + raise RuntimeError("PCA model has not been fitted yet.") + X_centered = X - self.mean + return torch.mm(X_centered, self.components) + + def inverse_transform(self, X): + """ + Apply inverse dimensionality reduction to the input data using the fitted PCA model. + + Args: + X (np.array | pd.DataFrame): Transformed data, a tensor of shape (n_samples, n_components). + + Returns: + torch.Tensor: Inverse transformed data, a tensor of shape (n_samples, n_features). + + Raises: + RuntimeError: If the PCA model has not been fitted yet. + """ + X = self._to_tensor(X) + + if self.mean is None or self.components is None: + raise RuntimeError("PCA model has not been fitted yet.") + inverse = torch.mm(X, self.components.t()) + self.mean + return inverse + + def save(self, path): + """ + Save the PCA model to a file. + + Args: + path (str): The path to save the model. + + Raises: + RuntimeError: If the PCA model has not been fitted yet. + """ + if self.mean is None or self.components is None: + raise RuntimeError("PCA model has not been fitted yet.") + torch.save( + { + "n_components": self.n_components, + "mean": self.mean, + "components": self.components, + "singular_values": self.singular_values, + "explained_variance": self.explained_variance, + "explained_variance_ratio": self.explained_variance_ratio, + }, + path, + ) + + def _to_tensor(self, x): + """ + Converts the input data to a PyTorch tensor. + + Args: + x: The input data to be converted. + + Returns: + The converted PyTorch tensor. + + Raises: + ValueError: If the input data is not a pandas DataFrame, numpy array, or PyTorch tensor. 
+ """ + if x is None: + return None + if isinstance(x, pd.DataFrame): + x = x.values + elif isinstance(x, np.ndarray): + x = torch.tensor(x) + elif isinstance(x, torch.Tensor): + pass + else: + raise ValueError("Data must be a pandas dataframe, numpy array, or PyTorch tensor") + + return x + + @staticmethod + def load(path): + """ + Load a saved PCA model from a file. + + Args: + path (str): The path to the saved model. + + Returns: + PCA: The loaded PCA model. + + Raises: + FileNotFoundError: If the file does not exist. + RuntimeError: If the loaded model is not a PCA model. + """ + checkpoint = torch.load(path) + model = PCA(checkpoint["n_components"]) + model.mean = checkpoint["mean"] + model.components = checkpoint["components"] + model.singular_values = checkpoint["singular_values"] + model.explained_variance = checkpoint["explained_variance"] + model.explained_variance_ratio = checkpoint["explained_variance_ratio"] + return model + + +class DimensionProcessor(nn.Module): + """ + A class that performs dimension processing using PCA and scaling. + + Args: + pca_model (str or PCA): The PCA model to use for dimension reduction. It can be either a path to a saved PCA model or an instance of the PCA class. + scaler_model (str or Scaler): The scaler model to use for scaling the data. It can be either a path to a saved scaler model or an instance of the scaler class. + scaler_method (str): The method to use for scaling. Must be one of 'standard', 'robust', or 'log'. + + Attributes: + device (str): The device to use for computation. It is set to 'cuda' if a CUDA-enabled GPU is available, otherwise it is set to 'cpu'. + pca (PCA): The PCA model used for dimension reduction. + scaler (Scaler): The scaler model used for scaling the data. + + Raises: + ValueError: If the `pca_model` is not a valid path or instance of PCA, or if the `scaler_model` is not a valid path or instance of the scaler class. + RuntimeError: If the PCA model has not been fitted yet. + + """ + + def __init__( + self, + pca_model, + scaler_model, + scaler_method="standard", + ): + super(DimensionProcessor, self).__init__() + self.device = "cuda" if torch.cuda.is_available() else "cpu" + + # LOAD PCA + if isinstance(pca_model, str): + self.pca = PCA.load(pca_model) + elif isinstance(pca_model, PCA): + self.pca = pca_model + else: + raise ValueError("pca_model must be a path (str) or a PCA instance") + if self.pca.mean is None or self.pca.components is None: + raise RuntimeError("PCA model has not been fitted yet.") + + # LOAD SCALER + if scaler_method == "standard": + scaler_class = StandardScaler + elif scaler_method == "robust": + scaler_class = RobustScaler + elif scaler_method == "log": + scaler_class = LogScaler + else: + raise ValueError("scaler_method must be 'standard', 'robust', or 'log'") + + if isinstance(scaler_model, str): + self.scaler = scaler_class.load(scaler_model) + elif isinstance(scaler_model, scaler_class): + self.scaler = scaler_model + else: + raise ValueError("pca_model must be a path (str) or a PCA instance") + # if self.scaler.mean_ is None or self.scaler.scale_ is None: + # raise RuntimeError("This StandardScalerPyTorch instance is not fitted yet.") + + self.scaler.to(self.device) + self.pca.to(self.device) + self.to(self.device) + + def to_pca(self, data): + """ + Transforms the input data to the PCA space. + + Args: + data (torch.Tensor or pd.DataFrame): The input data to transform. + + Returns: + torch.Tensor: The transformed data in the PCA space. 
+ + """ + data = data.to(self.device) + scaled = self.scaler.transform(data) # scale + return self.pca.transform(scaled) # convert to pca + + def to_grid(self, pcs, unscale=True): + """ + Transforms the input principal components (pcs) to the original data space. + + Args: + pcs (torch.Tensor or pd.DataFrame): The principal components to transform. + unscale (bool): Whether to unscale the transformed data. If True, the data will be unscaled using the scaler model. + + Returns: + torch.Tensor or pd.DataFrame: The transformed data in the original data space. + + """ + if not isinstance(pcs, torch.Tensor): + if isinstance(pcs, pd.DataFrame): + pcs = pcs.values + pcs = torch.tensor(pcs, dtype=torch.float32).to(self.device) + else: + pcs = pcs.to(self.device) + # Ensure components and mean are on the same device as pcs + components = self.pca.components.to(self.device) + pca_mean = self.pca.mean.to(self.device) + # Now, the operation should not cause a device mismatch error + scaled_grid = torch.mm(pcs, components.t()) + pca_mean + + if unscale: + return self.scaler.inverse_transform(scaled_grid) + # scale = self.scaler.scale_.to(self.device) + # scaler_mean = self.scaler.mean_.to(self.device) + # unscaled_grid = scaled_grid * scale + scaler_mean + # return unscaled_grid + + return scaled_grid + + +class WeakPredictor(nn.Module): + """ + A class representing a weak predictor model. + + Args: + lstm_num_layers (int): The number of LSTM layers. + lstm_hidden_size (int): The hidden size of the LSTM layers. + input_size (int, optional): The input size of the model. Defaults to 43. + output_size (int, optional): The output size of the model. Defaults to 1. + dim_processor (DimensionProcessor or str, optional): The dimension processor object or path to a PCA object. Defaults to None. + scaler_path (str, optional): The path to a scaler object. Required if dim_processor is a path to a PCA object. Defaults to None. + ice_sheet (str, optional): The ice sheet type. Defaults to "AIS". + criterion (torch.nn.Module, optional): The loss criterion. Defaults to torch.nn.MSELoss(). + + Attributes: + lstm_num_layers (int): The number of LSTM layers. + lstm_num_hidden (int): The hidden size of the LSTM layers. + input_size (int): The input size of the model. + output_size (int): The output size of the model. + ice_sheet (str): The ice sheet type. + ice_sheet_dim (tuple): The dimensions of the ice sheet. + device (str): The device used for computation. + lstm (torch.nn.LSTM): The LSTM layer. + relu (torch.nn.ReLU): The ReLU activation function. + linear1 (torch.nn.Linear): The first linear layer. + linear_out (torch.nn.Linear): The output linear layer. + optimizer (torch.optim.Optimizer): The optimizer used for training. + dropout (torch.nn.Dropout): The dropout layer. + criterion (torch.nn.Module): The loss criterion. + trained (bool): Indicates if the model has been trained. + dim_processor (DimensionProcessor or None): The dimension processor object. + + Methods: + forward(x): Performs a forward pass through the model. + fit(X, y, epochs, sequence_length, batch_size, loss, val_X, val_y): Trains the model. + predict(X, sequence_length, batch_size): Makes predictions using the trained model. 
+ """ + + def __init__( + self, + lstm_num_layers, + lstm_hidden_size, + input_size=43, + output_size=1, + dim_processor=None, + scaler_path=None, + ice_sheet="AIS", + criterion=torch.nn.MSELoss(), + ): + super(WeakPredictor, self).__init__() + + # Initialize attributes + self.lstm_num_layers = lstm_num_layers + self.lstm_num_hidden = lstm_hidden_size + self.input_size = input_size + self.output_size = output_size + self.ice_sheet = ice_sheet + self.ice_sheet_dim = (761, 761) if ice_sheet == "AIS" else (337, 577) + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.to(self.device) + + # Initialize model layers + self.lstm = nn.LSTM( + input_size=input_size, + hidden_size=lstm_hidden_size, + batch_first=True, + num_layers=lstm_num_layers, + ) + self.relu = nn.ReLU() + self.linear1 = nn.Linear(in_features=lstm_hidden_size, out_features=32) + self.linear_out = nn.Linear(in_features=32, out_features=output_size) + + # Initialize optimizer and other components + self.optimizer = optim.Adam(self.parameters()) + self.dropout = nn.Dropout(p=0.2) + self.criterion = criterion + self.trained = False + + # Initialize dimension processor + if isinstance(dim_processor, DimensionProcessor): + self.dim_processor = dim_processor.to(self.device) + elif isinstance(dim_processor, str) and scaler_path is None: + raise ValueError( + "If dim_processor is a path to a PCA object, scaler_path must be provided" + ) + elif isinstance(dim_processor, str) and scaler_path is not None: + self.dim_processor = DimensionProcessor( + pca_model=self.pca_model, scaler_model=scaler_path + ).to(self.device) + elif dim_processor is None: + self.dim_processor = None + else: + raise ValueError( + "dim_processor must be a DimensionProcessor instance or a path (str) to a PCA object with scaler_path specified as a Scaler object." + ) + + def forward(self, x): + """ + Performs a forward pass through the model. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: The output tensor. + """ + # Perform LSTM forward pass + batch_size = x.shape[0] + h0 = ( + torch.zeros(self.lstm_num_layers, batch_size, self.lstm_num_hidden) + .requires_grad_() + .to(self.device) + ) + c0 = ( + torch.zeros(self.lstm_num_layers, batch_size, self.lstm_num_hidden) + .requires_grad_() + .to(self.device) + ) + _, (hn, _) = self.lstm(x, (h0, c0)) + x = hn[-1, :, :] + + # Perform linear layer operations + x = self.linear1(x) + x = self.relu(x) + x = self.dropout(x) + x = self.linear_out(x) + + return x + + def fit( + self, X, y, epochs=100, sequence_length=5, batch_size=64, loss=None, val_X=None, val_y=None + ): + """ + Trains the model. + + Args: + X (numpy.ndarray or pandas.DataFrame): The input data. + y (numpy.ndarray or pandas.DataFrame): The target data. + epochs (int, optional): The number of epochs to train for. Defaults to 100. + sequence_length (int, optional): The sequence length for creating input sequences. Defaults to 5. + batch_size (int, optional): The batch size. Defaults to 64. + loss (torch.nn.Module, optional): The loss function to use. If None, the default criterion is used. Defaults to None. + val_X (numpy.ndarray or pandas.DataFrame, optional): The validation input data. Defaults to None. + val_y (numpy.ndarray or pandas.DataFrame, optional): The validation target data. Defaults to None. 
+ """ + # Convert data to tensors and move to device + X, y = to_tensor(X).to(self.device), to_tensor(y).to(self.device) + + # Check if validation data is provided + if val_X is not None and val_y is not None: + validate = True + else: + validate = False + + # Set loss criterion + if loss is not None: + self.criterion = loss.to(self.device) + elif loss is None and self.criterion is None: + raise ValueError("loss must be provided if criterion is None.") + self.criterion = self.criterion.to(self.device) + + # Convert data to numpy arrays if pandas DataFrames + if isinstance(X, pd.DataFrame): + X = X.values + if isinstance(y, pd.DataFrame): + y = y.values + + # Create dataset and data loader + dataset = EmulatorDataset(X, y, sequence_length=sequence_length) + data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True) + + # Set model to training mode + self.train() + self.to(self.device) + + # Training loop + for epoch in range(1, epochs + 1): + self.train() + batch_losses = [] + for i, (x, y) in enumerate(data_loader): + x = x.to(self.device) + y = y.to(self.device) + self.optimizer.zero_grad() + y_pred = self.forward(x) + loss = self.criterion(y_pred, y) + loss.backward() + self.optimizer.step() + batch_losses.append(loss.item()) + + # Print average batch loss and validation loss (if provided) + if validate: + val_preds = self.predict( + val_X, sequence_length=sequence_length, batch_size=batch_size + ).to(self.device) + val_loss = self.criterion( + val_preds, torch.tensor(val_y, device=self.device) + ) + print( + f"Epoch {epoch}, Average Batch Loss: {sum(batch_losses) / len(batch_losses)}, Validation Loss: {val_loss}" + ) + else: + average_batch_loss = sum(batch_losses) / len(batch_losses) + print(f"Epoch {epoch}, Average Batch Loss: {average_batch_loss}") + + self.trained = True + + def predict(self, X, sequence_length=5, batch_size=64): + """ + Makes predictions using the trained model. + + Args: + X (numpy.ndarray or pandas.DataFrame): The input data. + sequence_length (int, optional): The sequence length for creating input sequences. Defaults to 5. + batch_size (int, optional): The batch size. Defaults to 64. + + Returns: + torch.Tensor: The predicted values. + """ + # Set model to evaluation mode + self.eval() + self.to(self.device) + + # Convert data to numpy array if pandas DataFrame + if isinstance(X, pd.DataFrame): + X = X.values + + # Create dataset and data loader + dataset = EmulatorDataset(X, y=None, sequence_length=sequence_length) + data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False) + + # Move input data to device + X = to_tensor(X).to(self.device) + + preds = torch.tensor([]).to(self.device) + for X_test_batch in data_loader: + self.eval() + X_test_batch = X_test_batch.to(self.device) + y_pred = self.forward(X_test_batch) + preds = torch.cat((preds, y_pred), 0) + + return preds + + +class DeepEnsemble(nn.Module): + """ + A deep ensemble model for prediction tasks. + + Args: + - weak_predictors (list, optional): List of WeakPredictor instances. If not provided, weak predictors will be randomly generated. + - forcing_size (int, optional): Size of the input forcing data. Required if weak_predictors is not provided. + - sle_size (int, optional): Size of the input SLE data. Required if weak_predictors is not provided. + - num_predictors (int, optional): Number of weak predictors to generate if weak_predictors is not provided. + + Attributes: + - weak_predictors (list): List of WeakPredictor instances. 
+ - trained (bool): Indicates whether all weak predictors have been trained. + + Methods: + - forward(x): Performs forward pass through the model. + - predict(x): Makes predictions using the model. + - fit(X, y, epochs=100, batch_size=64, sequence_length=5): Trains the model. + - save(model_path): Saves the model parameters and metadata. + - load(model_path): Loads the model parameters and metadata and returns an instance of the model. + """ + + def __init__( + self, weak_predictors: list = [], forcing_size=44, sle_size=1, num_predictors=3 + ): + super(DeepEnsemble, self).__init__() + + if not weak_predictors: + if forcing_size is None or sle_size is None: + raise ValueError( + "forcing_size and sle_size must be provided if weak_predictors is not provided" + ) + self.loss_choices = [torch.nn.MSELoss(), MSEDeviationLoss(threshold=1.0, penalty_multiplier=2.0), torch.nn.L1Loss(), torch.nn.HuberLoss()] + loss_probabilities = [.45, .05, .3, .2] + self.weak_predictors = [ + WeakPredictor( + lstm_num_layers=np.random.randint(low=1, high=3, size=1)[0], + lstm_hidden_size=np.random.choice([512, 256, 128, 64], 1)[0], + criterion=np.random.choice(self.loss_choices, 1, p=loss_probabilities)[0], + input_size=forcing_size, + output_size=1, + ) + for _ in range(num_predictors) + ] + else: + if isinstance(weak_predictors, list): + self.weak_predictors = weak_predictors + if not all([isinstance(x, WeakPredictor) for x in weak_predictors]): + raise ValueError("weak_predictors must be a list of WeakPredictor instances") + else: + raise ValueError("weak_predictors must be a list of WeakPredictor instances") + + if any([x for x in weak_predictors if not isinstance(x, WeakPredictor)]): + raise ValueError("weak_predictors must be a list of WeakPredictor instances") + + # check to see if all weak predictors are trained + self.trained = all([wp.trained for wp in self.weak_predictors]) + + def forward(self, x): + """ + Performs a forward pass through the model. + + Args: + - x: Input data. + + Returns: + - mean_prediction: Mean prediction of the ensemble. + - epistemic_uncertainty: Epistemic uncertainty of the ensemble. + """ + if not self.trained: + warnings.warn("This model has not been trained. Predictions will not be accurate.") + mean_prediction = torch.mean(torch.stack([wp.predict(x) for wp in self.weak_predictors], axis=1), axis=1).squeeze() + epistemic_uncertainty = torch.std(torch.stack([wp.predict(x) for wp in self.weak_predictors], axis=1), axis=1).squeeze() + return mean_prediction, epistemic_uncertainty + + def predict(self, x): + """ + Makes predictions using the model. + + Args: + - x: Input data. + + Returns: + - predictions: Predictions made by the model. + """ + return self.forward(x) + + def fit( + self, + X, + y, + epochs=100, + batch_size=64, + sequence_length=5, + ): + """ + Trains the model. + + Args: + - X: Input data. + - y: Target data. + - epochs (int, optional): Number of epochs to train the model. Default is 100. + - batch_size (int, optional): Batch size for training. Default is 64. + - sequence_length (int, optional): Length of input sequences. Default is 5. + """ + if self.trained: + warnings.warn("This model has already been trained. Training anyways.") + for i, wp in enumerate(self.weak_predictors): + print(f"Training Weak Predictor {i+1} of {len(self.weak_predictors)}:") + wp.fit(X, y, epochs=epochs, batch_size=batch_size, sequence_length=sequence_length) + print("") + self.trained = True + + def save(self, model_path): + """ + Saves the model parameters and metadata. 
+ + Args: + - model_path: Path to save the model. + """ + if not self.trained: + raise ValueError("This model has not been trained yet. Please train the model before saving.") + # Save model metadata + metadata = { + 'model_type': self.__class__.__name__, + 'weak_predictors': [ + { + 'lstm_num_layers': int(wp.lstm_num_layers), + 'lstm_num_hidden': int(wp.lstm_num_hidden), + 'criterion': wp.criterion.__class__.__name__, + 'trained': wp.trained + } for wp in self.weak_predictors + ], + + } + metadata_path = model_path.replace('.pth', '_metadata.json') + with open(metadata_path, 'w') as file: + json.dump(metadata, file, indent=4) + print(f"Model metadata saved to {metadata_path}") + + # Save model parameters + torch.save(self.state_dict(), model_path) + print(f"Model parameters saved to {model_path}") + + @classmethod + def load(cls, model_path): + """ + Loads the model architecture metadata from a JSON file and the model parameters, + reconstructs the model, and returns an instance of the model. + + Parameters: + - model_path: Path to the file from which model parameters should be loaded. + + Returns: + - An instance of the model with loaded parameters. + """ + # Load metadata + metadata_path = model_path.replace('.pth', '_metadata.json') + with open(metadata_path, 'r') as file: + metadata = json.load(file) + + # Instantiate the model based on metadata + if cls.__name__ != metadata['model_type']: + raise ValueError(f"Model type in metadata ({metadata['model_type']}) does not match the class type ({cls.__class__.__name__})") + + loss_lookup = {'MSELoss': torch.nn.MSELoss(), 'L1Loss': torch.nn.L1Loss(), 'HuberLoss': torch.nn.HuberLoss()} + + weak_predictors = [WeakPredictor(lstm_num_layers=wp['lstm_num_layers'], lstm_hidden_size=wp['lstm_num_hidden'], criterion=loss_lookup[wp['criterion']]) for wp in metadata['weak_predictors']] + model = cls(weak_predictors=weak_predictors) + + # Load model parameters + model.load_state_dict(torch.load(model_path)) + model.eval() # Set the model to evaluation mode + + return model + + +class NormalizingFlow(nn.Module): + """ + A class representing a Normalizing Flow model. + + Args: + forcing_size (int): The size of the forcing input features. + sle_size (int): The size of the predicted SLE (Stochastic Lagrangian Ensemble) output. + + Attributes: + num_flow_transforms (int): The number of flow transforms in the model. + num_input_features (int): The number of input features. + num_predicted_sle (int): The number of predicted SLE features. + flow_hidden_features (int): The number of hidden features in the flow. + device (str): The device used for computation (either "cuda" or "cpu"). + base_distribution (distributions.normal.ConditionalDiagonalNormal): The base distribution for the flow. + t (transforms.base.CompositeTransform): The composite transform for the flow. + flow (flows.base.Flow): The flow model. + optimizer (optim.Adam): The optimizer used for training the flow. + criterion (callable): The criterion used for calculating the log probability of the flow. + trained (bool): Indicates whether the model has been trained or not. + + Methods: + fit(X, y, epochs=100, batch_size=64): Trains the model on the given input and output data. + sample(features, num_samples, return_type="numpy"): Generates samples from the model. + get_latent(x, latent_constant=0.0): Computes the latent representation of the input data. + aleatoric(features, num_samples): Computes the aleatoric uncertainty of the model predictions. 
+ save(path): Saves the trained model to the specified path. + """ + + def __init__( + self, + forcing_size=43, + sle_size=1, + ): + super(NormalizingFlow, self).__init__() + self.num_flow_transforms = 5 + self.num_input_features = forcing_size + self.num_predicted_sle = sle_size + self.flow_hidden_features = sle_size * 2 + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.to(self.device) + + self.base_distribution = distributions.normal.ConditionalDiagonalNormal( + shape=[self.num_predicted_sle], + context_encoder=nn.Linear(self.num_input_features, self.flow_hidden_features), + ) + + t = [] + for _ in range(self.num_flow_transforms): + t.append( + transforms.permutations.RandomPermutation( + features=self.num_predicted_sle, + ) + ) + t.append( + transforms.autoregressive.MaskedAffineAutoregressiveTransform( + features=self.num_predicted_sle, + hidden_features=self.flow_hidden_features, + context_features=self.num_input_features, + ) + ) + + self.t = transforms.base.CompositeTransform(t) + + self.flow = flows.base.Flow(transform=self.t, distribution=self.base_distribution) + + self.optimizer = optim.Adam(self.flow.parameters()) + self.criterion = self.flow.log_prob + self.trained = False + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.to(self.device) + + def fit(self, X, y, epochs=100, batch_size=64): + """ + Trains the model on the given input and output data. + + Args: + X (array-like): The input data. + y (array-like): The output data. + epochs (int): The number of training epochs (default: 100). + batch_size (int): The batch size for training (default: 64). + """ + X, y = to_tensor(X).to(self.device), to_tensor(y).to(self.device) + dataset = EmulatorDataset(X, y, sequence_length=1) + data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True) + self.train() + + for epoch in range(1, epochs + 1): + epoch_loss = [] + for i, (x, y) in enumerate(data_loader): + x = x.to(self.device).view(x.shape[0], -1) + y = y.to(self.device) + self.optimizer.zero_grad() + loss = torch.mean(-self.flow.log_prob(inputs=y, context=x)) + if torch.isnan(loss): + stop = '' + loss.backward() + self.optimizer.step() + epoch_loss.append(loss.item()) + print(f"Epoch {epoch}, Loss: {sum(epoch_loss) / len(epoch_loss)}") + self.trained = True + + def sample(self, features, num_samples, return_type="numpy"): + """ + Generates samples from the model. + + Args: + features (array-like): The input features for generating samples. + num_samples (int): The number of samples to generate. + return_type (str): The return type of the samples ("numpy" or "tensor", default: "numpy"). + + Returns: + array-like or torch.Tensor: The generated samples. + """ + if not isinstance(features, torch.Tensor): + features = to_tensor(features) + samples = self.flow.sample(num_samples, context=features).reshape( + features.shape[0], num_samples + ) + if return_type == "tensor": + pass + elif return_type == "numpy": + samples = samples.detach().cpu().numpy() + else: + raise ValueError("return_type must be 'numpy' or 'tensor'") + return samples + + def get_latent(self, x, latent_constant=0.0): + """ + Computes the latent representation of the input data. + + Args: + x (array-like): The input data. + latent_constant (float): The constant value for the latent representation (default: 0.0). + + Returns: + torch.Tensor: The latent representation of the input data. 
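+
+        Example (sketch; X and y are assumed arrays with 43 forcing columns and a
+        single SLE column):
+            flow = NormalizingFlow(forcing_size=43, sle_size=1)
+            flow.fit(X, y, epochs=20)
+            z = flow.get_latent(X, latent_constant=0.0)  # one latent column per sample, appended to the forcings by HybridEmulator.fit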
+ """ + x = to_tensor(x).to(self.device) + latent_constant_tensor = torch.ones((x.shape[0], 1)).to(self.device) * latent_constant + z, _ = self.t(latent_constant_tensor.float(), context=x) + return z + + def aleatoric(self, features, num_samples): + """ + Computes the aleatoric uncertainty of the model predictions. + + Args: + features (array-like): The input features for computing the uncertainty. + num_samples (int): The number of samples to use for computing the uncertainty. + + Returns: + array-like: The aleatoric uncertainty of the model predictions. + """ + if not isinstance(features, torch.Tensor): + features = to_tensor(features) + samples = self.flow.sample(num_samples, context=features) + samples = samples.detach().cpu().numpy() + std = np.std(samples, axis=1).squeeze() + return std + + def save(self, path): + """ + Saves the model parameters and metadata to the specified path. + + Args: + path (str): The path to save the model. + """ + # Prepare metadata for saving + metadata = { + 'forcing_size': self.forcing_size, + 'sle_size': self.sle_size, + } + metadata_path = path + '_metadata.json' + + # Save metadata + with open(metadata_path, 'w') as f: + json.dump(metadata, f, indent=4) + + # Save model parameters + torch.save(self.state_dict(), path) + print(f"Model and metadata saved to {path} and {metadata_path}, respectively.") + + @staticmethod + def load(path): + """ + Loads the NormalizingFlow model from the specified path. + + Args: + path (str): The path to load the model from. + + Returns: + NormalizingFlow: The loaded NormalizingFlow model. + """ + # Load metadata + metadata_path = path + '_metadata.json' + with open(metadata_path, 'r') as f: + metadata = json.load(f) + + # Reconstruct the model using the loaded metadata + model = NormalizingFlow(forcing_size=metadata['forcing_size'], sle_size=metadata['sle_size']) + + # Load the model parameters + model.load_state_dict(torch.load(path)) + model.eval() # Set the model to evaluation mode + + return model + + + + + + +class HybridEmulator(torch.nn.Module): + """ + A hybrid emulator that combines a deep ensemble and a normalizing flow model. + + Args: + deep_ensemble (DeepEnsemble): The deep ensemble model. + normalizing_flow (NormalizingFlow): The normalizing flow model. + + Attributes: + device (str): The device used for computation (cuda or cpu). + deep_ensemble (DeepEnsemble): The deep ensemble model. + normalizing_flow (NormalizingFlow): The normalizing flow model. + trained (bool): Indicates whether the model has been trained. + + Methods: + fit(X, y, epochs=100, nf_epochs=None, de_epochs=None, sequence_length=5): + Fits the hybrid emulator to the training data. + forward(x): + Performs a forward pass through the hybrid emulator. + save(save_dir): + Saves the trained model to the specified directory. + load(deep_ensemble_path, normalizing_flow_path): + Loads a trained model from the specified paths. 
+ + """ + + def __init__(self, deep_ensemble, normalizing_flow): + super(HybridEmulator, self).__init__() + + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.to(self.device) + + if not isinstance(deep_ensemble, DeepEnsemble): + raise ValueError("deep_ensemble must be a DeepEmulator instance") + if not isinstance(normalizing_flow, NormalizingFlow): + raise ValueError("normalizing_flow must be a NormalizingFlow instance") + + self.deep_ensemble = deep_ensemble.to(self.device) + self.normalizing_flow = normalizing_flow.to(self.device) + self.trained = self.deep_ensemble.trained and self.normalizing_flow.trained + + def fit(self, X, y, epochs=100, nf_epochs=None, de_epochs=None, sequence_length=5): + """ + Fits the hybrid emulator to the training data. + + Args: + X (array-like): The input training data. + y (array-like): The target training data. + epochs (int): The number of epochs to train the model (default: 100). + nf_epochs (int): The number of epochs to train the normalizing flow model (default: None). + If not specified, the same number of epochs as the overall model will be used. + de_epochs (int): The number of epochs to train the deep ensemble model (default: None). + If not specified, the same number of epochs as the overall model will be used. + sequence_length (int): The sequence length used for training the deep ensemble model (default: 5). + + """ + torch.manual_seed(np.random.randint(0, 100000)) + # if specific epoch numbers are not supplied, use the same number of epochs for both + if nf_epochs is None: + nf_epochs = epochs + if de_epochs is None: + de_epochs = epochs + + X, y = to_tensor(X).to(self.device), to_tensor(y).to(self.device) + if self.trained: + warnings.warn("This model has already been trained. Training anyways.") + if not self.normalizing_flow.trained: + print(f"\nTraining Normalizing Flow ({nf_epochs} epochs):") + self.normalizing_flow.fit(X, y, epochs=nf_epochs) + z = self.normalizing_flow.get_latent(X,).detach() + X_latent = torch.concatenate((X, z), axis=1) + if not self.deep_ensemble.trained: + print(f"\nTraining Deep Ensemble ({de_epochs} epochs):") + self.deep_ensemble.fit(X_latent, y, epochs=de_epochs, sequence_length=sequence_length) + self.trained = True + + def forward(self, x, smooth_projection=False,): + """ + Performs a forward pass through the hybrid emulator. + + Args: + x (array-like): The input data. + + Returns: + tuple: A tuple containing the prediction, epistemic uncertainty, and aleatoric uncertainty. + + """ + x = to_tensor(x).to(self.device) + if not self.trained: + warnings.warn("This model has not been trained. Predictions will not be accurate.") + z = self.normalizing_flow.get_latent(x, ).detach() + X_latent = torch.concatenate((x, z), axis=1) + prediction, epistemic = self.deep_ensemble(X_latent) + aleatoric = self.normalizing_flow.aleatoric(x, 100) + + if smooth_projection: + stop = '' + return prediction, epistemic, aleatoric + + def save(self, save_dir): + """ + Saves the trained model to the specified directory. + + Args: + save_dir (str): The directory to save the model. + + Raises: + ValueError: If the model has not been trained yet or if save_dir is a file. + + """ + if not self.trained: + raise ValueError("This model has not been trained yet. 
Please train the model before saving.") + if save_dir.endswith('.pth'): + raise ValueError("save_dir must be a directory, not a file") + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + self.deep_ensemble.save(f"{save_dir}/deep_ensemble.pth") + self.normalizing_flow.save(f"{save_dir}/normalizing_flow.pth") + + @staticmethod + def load(deep_ensemble_path, normalizing_flow_path): + """ + Loads a trained model from the specified paths. + + Args: + deep_ensemble_path (str): The path to the saved deep ensemble model. + normalizing_flow_path (str): The path to the saved normalizing flow model. + + Returns: + HybridEmulator: The loaded hybrid emulator model. + + """ + deep_ensemble = DeepEnsemble.load(deep_ensemble_path) + normalizing_flow = NormalizingFlow.load(normalizing_flow_path) + model = HybridEmulator(deep_ensemble, normalizing_flow) + model.trained = True + return model diff --git a/ise/models/loss.py b/ise/models/loss.py new file mode 100644 index 0000000..cf4f0e9 --- /dev/null +++ b/ise/models/loss.py @@ -0,0 +1,355 @@ +import torch + + +class WeightedGridLoss(torch.nn.Module): + def __init__(self): + super(WeightedGridLoss, self).__init__() + self.to(self.device) + + def total_variation_regularization(self, grid): + # Calculate the sum of horizontal and vertical differences + horizontal_diff = torch.abs(torch.diff(grid, axis=2)) + vertical_diff = torch.abs(torch.diff(grid, axis=1)) + total_variation = torch.sum(horizontal_diff, axis=(1, 2)) + torch.sum( + vertical_diff, axis=(1, 2) + ) + return torch.mean(total_variation) + + def weighted_pixelwise_mse(self, true, predicted, weights): + # Compute the squared error + squared_error = (true - predicted) ** 2 + # Apply weights + weighted_error = weights * squared_error + # Return the mean of the weighted error + return torch.mean(weighted_error) + + def forward(self, true, predicted, smoothness_weight=0.001, extreme_value_threshold=1e-6): + true = torch.tensor(true, dtype=torch.float32, device=self.device) + predicted = torch.tensor(predicted, dtype=torch.float32, device=self.device) + + # Determine weights based on extreme values + if extreme_value_threshold is not None: + # Identify extreme values in the true data + extreme_mask = torch.abs(true) > extreme_value_threshold + # Assign higher weight to extreme values, 1 to others + weights = torch.where(extreme_mask, 10.0 * torch.ones_like(true), torch.ones_like(true)) + else: + # If no threshold is provided, use uniform weights + weights = torch.ones_like(true) + + pixelwise_mse = self.weighted_pixelwise_mse(true, predicted, weights) + tvr = self.total_variation_regularization(predicted) + return pixelwise_mse + smoothness_weight * tvr + + +class WeightedMSELoss(torch.nn.Module): + def __init__(self, data_mean, data_std, weight_factor=1.0): + """ + Custom loss function that penalizes errors on extreme values more. + + Args: + data_mean (float): Mean of the target variable in the training set. + data_std (float): Standard deviation of the target variable in the training set. + weight_factor (float): Factor to adjust the weighting. Higher values will increase + the penalty on extremes. Default is 1.0. 
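+
+        Example (illustrative; the statistics below are made-up numbers, not values
+        from any training set):
+            criterion = WeightedMSELoss(data_mean=0.0, data_std=1.0, weight_factor=2.0)
+            loss = criterion(predictions, targets)
+            # each squared error is scaled by 1 + weight_factor * |target - data_mean| / data_std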
+ """ + super(WeightedMSELoss, self).__init__() + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.data_mean = torch.tensor(data_mean, dtype=torch.float32, device=self.device) + self.data_std = torch.tensor(data_std, dtype=torch.float32, device=self.device) + self.weight_factor = torch.tensor(weight_factor, dtype=torch.float32, device=self.device) + self.to(self.device) + + def forward(self, input, target): + """ + Calculate the Weighted MSE Loss. + + Args: + input (tensor): Predicted values. + target (tensor): Actual values. + + Returns: + Tensor: Computed loss. + """ + # Ensure data_mean, data_std, and weight_factor are on the same device as input + input = input.to(self.device) + target = target.to(self.device) + + # Calculate the deviation of each target value from the mean + deviation = torch.abs(target - self.data_mean) + + # Scale deviations by the standard deviation to normalize them + # normalized_deviation = torch.tensor(deviation / self.data_std, dtype=torch.float32, device=self.device) + normalized_deviation = deviation / self.data_std + + # Compute weights: increase penalty for extreme values + weights = 1 + (normalized_deviation * self.weight_factor) + + # Compute the squared error + squared_error = torch.nn.functional.mse_loss(input, target, reduction="none") + + # Apply the weights and take the mean to get the final loss + weighted_squared_error = weights * squared_error + loss = torch.mean(weighted_squared_error) + + return loss + + +class WeightedMSEPCALoss(torch.nn.Module): + def __init__(self, data_mean, data_std, weight_factor=1.0, custom_weights=None): + """ + Custom loss function that penalizes errors on extreme values more and allows for custom weighting of each prediction + in a batched manner. + + Args: + data_mean (float): Mean of the target variable in the training set. + data_std (float): Standard deviation of the target variable in the training set. + weight_factor (float): Factor to adjust the weighting. Higher values will increase the penalty on extremes. Default is 1.0. + custom_weights (torch.Tensor, optional): A tensor of weights corresponding to each y-value in the batch. Default is None. + """ + super(WeightedMSEPCALoss, self).__init__() + + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.to(self.device) + + self.data_mean = torch.tensor(data_mean, dtype=torch.float32, device=self.device) + self.data_std = torch.tensor(data_std, dtype=torch.float32, device=self.device) + self.weight_factor = torch.tensor(weight_factor, dtype=torch.float32, device=self.device) + self.custom_weights = ( + torch.tensor(custom_weights, dtype=torch.float32, device=self.device) + if custom_weights is not None + else None + ) + + def forward(self, input, target): + """ + Calculate the Weighted MSE Loss for batched inputs and outputs. + + Args: + input (tensor): Predicted values with shape (batch_size, num_targets). + target (tensor): Actual values with shape (batch_size, num_targets). + + Returns: + Tensor: Computed loss. 
+ """ + + input = input.to(self.device) + target = target.to(self.device) + + # Ensure input and target are of the same shape + if input.shape != target.shape: + raise ValueError("Input and target must have the same shape.") + + # Calculate the deviation of each target value from the mean + deviation = torch.abs(target - self.data_mean) + + # Scale deviations by the standard deviation to normalize them + normalized_deviation = deviation / self.data_std + + # Compute weights: increase penalty for extreme values + weights = 1 + (normalized_deviation * self.weight_factor) + + # If custom weights are provided, multiply them by the calculated weights + if self.custom_weights is not None: + # Expand custom weights to match batch size if necessary + if self.custom_weights.dim() == 1: + self.custom_weights = self.custom_weights.unsqueeze(0) # Make it a 2D tensor + if self.custom_weights.shape != weights.shape: + raise ValueError("Custom weights shape must match input/target shape.") + weights *= self.custom_weights + + # Compute the squared error for each element in the batch without reducing + squared_error = (input - target) ** 2 + + # Apply the weights to the squared error + weighted_squared_error = weights * squared_error + + # Take the mean across all dimensions to get the final loss + loss = torch.mean(weighted_squared_error) + + return loss + + +class WeightedMSELossWithSignPenalty(torch.nn.Module): + def __init__(self, data_mean, data_std, weight_factor=1.0, sign_penalty_factor=1.0): + """ + Custom loss function that penalizes errors on extreme values more and adds a penalty for opposite sign predictions. + + Args: + data_mean (float): Mean of the target variable in the training set. + data_std (float): Standard deviation of the target variable in the training set. + weight_factor (float): Factor to adjust the weighting for extremes. Higher values will increase + the penalty on extremes. Default is 1.0. + sign_penalty_factor (float): Factor to adjust the penalty for opposite sign predictions. + Higher values increase the penalty. Default is 1.0. + """ + super(WeightedMSELossWithSignPenalty, self).__init__() + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.data_mean = torch.tensor(data_mean, dtype=torch.float32, device=self.device) + self.data_std = torch.tensor(data_std, dtype=torch.float32, device=self.device) + self.weight_factor = torch.tensor(weight_factor, dtype=torch.float32, device=self.device) + self.sign_penalty_factor = torch.tensor( + sign_penalty_factor, dtype=torch.float32, device=self.device + ) + self.to(self.device) + + def forward(self, input, target): + """ + Calculate the Weighted MSE Loss with an additional penalty for opposite sign predictions. + + Args: + input (tensor): Predicted values. + target (tensor): Actual values. + + Returns: + Tensor: Computed loss. 
+ """ + # Calculate the deviation of each target value from the mean + deviation = torch.abs(target - self.data_mean) + + # Scale deviations by the standard deviation to normalize them + normalized_deviation = deviation / self.data_std + + # Compute weights: increase penalty for extreme values + weights = 1 + (normalized_deviation * self.weight_factor) + + # Compute the squared error + squared_error = torch.nn.functional.mse_loss(input, target, reduction="none") + + # Calculate sign penalty + sign_penalty = torch.where( + torch.sign(input) != torch.sign(target), + torch.abs(input - target) * self.sign_penalty_factor, + torch.zeros_like(input), + ) + + # Apply the weights and sign penalty, then take the mean to get the final loss + weighted_squared_error = weights * (squared_error + sign_penalty) + loss = torch.mean(weighted_squared_error) + + return loss + + +class GridCriterion(torch.nn.Module): + def __init__( + self, + ): + super(GridCriterion, self).__init__() + + def total_variation_regularization( + self, + grid, + ): + # Calculate the sum of horizontal and vertical differences + horizontal_diff = torch.abs(torch.diff(grid, axis=2)) + vertical_diff = torch.abs(torch.diff(grid, axis=1)) + total_variation = torch.sum(horizontal_diff, axis=(1, 2)) + torch.sum( + vertical_diff, axis=(1, 2) + ) + return torch.mean(total_variation) + + # def spatial_loss(self, true, predicted, smoothness_weight=0.001): + def forward(self, true, predicted, smoothness_weight=0.001): + pixelwise_mse = torch.mean( + torch.abs(true - predicted) ** 2, + ) # loss for each image in the batch (batch_size,) + tvr = self.total_variation_regularization( + predicted, + ) + return pixelwise_mse + smoothness_weight * tvr + + # def forward(self, true, predicted, x, y, flow, predictor_weight=0.5, nf_weight=0.5,): + # if predictor_weight + nf_weight != 1: + # raise ValueError("The sum of predictor_weight and nf_weight must be 1") + # predictor_loss = self.spatial_loss(true, predicted, smoothness_weight=0.2) + # nf_loss = -flow.log_prob(inputs=y, context=x) + # return predictor_weight*predictor_loss + nf_weight*nf_loss + + +class WeightedPCALoss(torch.nn.Module): + def __init__(self, component_weights, reduction="mean"): + """ + Custom loss function that applies different weights to the error of each principal component prediction. + + Args: + component_weights (list or torch.Tensor): Weights for each principal component's error, + where the first component has the highest weight. + reduction (str): Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'. + """ + super(WeightedPCALoss, self).__init__() + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.component_weights = torch.tensor( + component_weights, dtype=torch.float32, device=self.device + ) + if len(self.component_weights.size()) == 1: + self.component_weights = self.component_weights.unsqueeze(0) # Make it a row vector + self.reduction = reduction + self.to(self.device) + + def forward(self, input, target): + """ + Calculate the weighted loss for principal component predictions. + + Args: + input (tensor): Predicted principal components. + target (tensor): Actual principal components. + + Returns: + Tensor: Computed Weighted PCA Loss. 
+ """ + + input = input.to(self.device) + target = target.to(self.device) + + # Ensure input and target are of the same shape + if input.shape != target.shape: + raise ValueError("Input and target must have the same shape") + + # Calculate the squared error + squared_error = (input - target) ** 2 + + # Apply weights to the squared error + weighted_error = squared_error * self.component_weights.to(input.device) + + # Apply reduction + if self.reduction == "mean": + return torch.mean(weighted_error) + elif self.reduction == "sum": + return torch.sum(weighted_error) + else: + return weighted_error + + +class MSEDeviationLoss(torch.nn.Module): + def __init__(self, threshold=1.0, penalty_multiplier=2.0): + """ + Custom MSE Loss with an additional penalty for large deviations. + + Parameters: + - threshold: The error magnitude beyond which the penalty is applied. + - penalty_multiplier: Multiplier for the penalty term for errors exceeding the threshold. + """ + super(MSEDeviationLoss, self).__init__() + self.threshold = threshold + self.penalty_multiplier = penalty_multiplier + + def forward(self, predictions, targets): + """ + Compute the custom loss. + + Parameters: + - predictions: The predicted values. + - targets: The ground truth values. + + Returns: + - loss: The computed custom loss. + """ + mse_loss = torch.mean((predictions - targets) ** 2) + large_deviation_penalty = torch.mean( + torch.where( + torch.abs(predictions - targets) > self.threshold, + self.penalty_multiplier * (predictions - targets) ** 2, + torch.tensor(0.0, device=predictions.device) + ) + ) + return mse_loss + large_deviation_penalty \ No newline at end of file diff --git a/ise/models/pretrained/emulator.pt b/ise/models/pretrained/variational_lstm_emulator.pt similarity index 100% rename from ise/models/pretrained/emulator.pt rename to ise/models/pretrained/variational_lstm_emulator.pt diff --git a/ise/models/sector.py b/ise/models/sector.py new file mode 100644 index 0000000..fd019d8 --- /dev/null +++ b/ise/models/sector.py @@ -0,0 +1,312 @@ +import numpy as np +import pandas as pd +import torch +from torch import nn +from torch.utils.data import DataLoader + +from ise.data.dataclasses import PyTorchDataset, TSDataset + + +class ExploratoryModel(torch.nn.Module): + def __init__( + self, + input_layer_size, + architecture, + ): + super(ExploratoryModel, self).__init__() + self.model_name = "ExploratoryModel" + self.input_layer_size = input_layer_size + self.num_linear_layers = architecture["num_linear_layers"] + self.nodes = architecture["nodes"] + + if len(self.nodes) != self.num_linear_layers: + raise AttributeError( + f"Length of nodes argument must be equal to num_linear_layers, received {self.num_linear_layers} != {len(self.nodes)}" + ) + + if self.nodes[-1] != 1: + raise ValueError(f"Last node must be equal to 1, received {self.nodes[-1]}") + + model = nn.Sequential() + for i in range(self.num_linear_layers): + if i == 0: + model.append(nn.Linear(self.input_layer_size, self.nodes[i])) + model.append(nn.ReLU()) + elif i == self.num_linear_layers - 1: + model.append(nn.Linear(self.nodes[i - 1], self.nodes[i])) + else: + model.append(nn.Linear(self.nodes[i - 1], self.nodes[i])) + model.append(nn.ReLU()) + + self.model = model + + def forward(self, x): + return self.model(x) + + +class VariationalLSTMEmulator(torch.nn.Module): + def __init__(self, architecture, mc_dropout=False, dropout_prob=None): + super().__init__() + self.model_name = "TimeSeriesEmulator" + self.input_layer_size = architecture["input_layer_size"] 
+ self.num_rnn_layers = architecture["num_rnn_layers"] + self.num_rnn_hidden = architecture["num_rnn_hidden"] + self.time_series = True + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.mc_dropout = mc_dropout + + if not all( + [ + self.num_rnn_layers, + self.num_rnn_hidden, + ] + ): + raise AttributeError( + "Model architecture argument missing. Requires: [num_rnn_layers, num_rnn_hidden, ]." + ) + + if mc_dropout and dropout_prob is None: + raise ValueError("If mc_dropout, dropout_prob cannot be None.") + + if self.mc_dropout: + self.rnn = nn.LSTM( + input_size=self.input_layer_size, + hidden_size=self.num_rnn_hidden, + batch_first=True, + num_layers=self.num_rnn_layers, + dropout=dropout_prob if self.num_rnn_layers > 1 else 0, + ) + else: + self.rnn = nn.LSTM( + input_size=self.input_layer_size, + hidden_size=self.num_rnn_hidden, + batch_first=True, + num_layers=self.num_rnn_layers, + ) + + self.relu = nn.ReLU() + + if self.mc_dropout: + self.dropout = nn.Dropout(p=dropout_prob) + self.linear1 = nn.Linear(in_features=self.num_rnn_hidden, out_features=32) + self.linear_out = nn.Linear(in_features=32, out_features=1) + + def forward(self, x): + batch_size = x.shape[0] + h0 = ( + torch.zeros(self.num_rnn_layers, batch_size, self.num_rnn_hidden) + .requires_grad_() + .to(self.device) + ) + c0 = ( + torch.zeros(self.num_rnn_layers, batch_size, self.num_rnn_hidden) + .requires_grad_() + .to(self.device) + ) + _, (hn, _) = self.rnn(x, (h0, c0)) + x = hn[-1, :, :] + if self.mc_dropout: + x = self.dropout(x) + x = self.linear1(x) + x = self.relu(x) + if self.mc_dropout: + x = self.dropout(x) # fc dropout + x = self.linear_out(x) + + return x + + def predict( + self, + x, + approx_dist=None, + mc_iterations=None, + quantile_range=[0.025, 0.975], + confidence="95", + ): + + approx_dist = self.mc_dropout if approx_dist is None else approx_dist + if approx_dist and mc_iterations is None: + raise ValueError( + "If the model was trained with MC Dropout, mc_iterations cannot be None." 
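# Aside: the forward pass above initialises zeroed hidden/cell states, runs the LSTM, and feeds only
# the last layer's final hidden state (hn[-1]) into the fully connected head. A minimal sketch of
# that pattern outside the class; layer sizes and batch shape are illustrative.
import torch
from torch import nn

batch_size, seq_len, n_features, hidden, layers = 8, 5, 10, 64, 2
x = torch.rand(batch_size, seq_len, n_features)

rnn = nn.LSTM(input_size=n_features, hidden_size=hidden, num_layers=layers, batch_first=True)
head = nn.Sequential(nn.Linear(hidden, 32), nn.ReLU(), nn.Linear(32, 1))

h0 = torch.zeros(layers, batch_size, hidden)
c0 = torch.zeros(layers, batch_size, hidden)
_, (hn, _) = rnn(x, (h0, c0))   # hn: (num_layers, batch, hidden)
out = head(hn[-1])              # last layer's final hidden state -> (batch, 1)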
+ ) + + self.eval() + if isinstance(x, np.ndarray): + dataset = TSDataset(X=torch.from_numpy(x).float(), y=None, sequence_length=5) + elif isinstance(x, torch.FloatTensor) or isinstance(x, torch.Tensor): + dataset = TSDataset(X=x.float(), y=None, sequence_length=5) + elif isinstance(x, pd.DataFrame): + dataset = TSDataset( + X=torch.from_numpy(np.array(x, dtype=np.float64)).float(), + y=None, + sequence_length=5, + ) + else: + raise ValueError( + f"Input x must be of type [np.ndarray, torch.FloatTensor], received {type(x)}" + ) + + loader = DataLoader(dataset, batch_size=1024, shuffle=False) + iterations = 1 if not approx_dist else mc_iterations + out_preds = np.zeros([iterations, len(dataset)]) + + for i in range(iterations): + preds = torch.tensor([]).to(self.device) + for X_test_batch in loader: + self.eval() + self.enable_dropout() + if approx_dist: + self.enable_dropout() + + X_test_batch = X_test_batch.to(self.device) + test_pred = self(X_test_batch) + preds = torch.cat((preds, test_pred), 0) + + if self.device.type == "cuda": + preds = preds.squeeze().cpu().detach().numpy() + else: + preds = preds.squeeze().detach().numpy() + out_preds[i, :] = preds + + if 1 in out_preds.shape: + out_preds = out_preds.squeeze() + + means = out_preds.mean(axis=0) + sd = out_preds.std(axis=0) + + return out_preds, means, sd + + def enable_dropout( + self, + ): + # For each layer, if it starts with Dropout, turn it from eval mode to train mode + for layer in self.modules(): + if layer.__class__.__name__.startswith("Dropout"): + layer.train() + + +class TimeIndependentEmulator(torch.nn.Module): + def __init__(self, architecture, mc_dropout=False, dropout_prob=None): + super().__init__() + self.model_name = "IndependentEmulator" + self.input_layer_size = architecture["input_layer_size"] + self.num_linear_layers = architecture["num_linear_layers"] + self.num_nodes_hidden = architecture["num_nodes_hidden"] + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.mc_dropout = mc_dropout + + if not all( + [ + self.num_linear_layers, + self.num_nodes_hidden, + ] + ): + raise AttributeError( + "Model architecture argument missing. Requires: [num_rnn_layers, num_rnn_hidden, ]." + ) + + if mc_dropout and dropout_prob is None: + raise ValueError("If mc_dropout, dropout_prob cannot be None.") + + self.linear_in = nn.Linear( + in_features=self.input_layer_size, out_features=self.num_nodes_hidden + ) + self.linear_hidden = nn.Linear( + in_features=self.num_nodes_hidden, out_features=self.num_nodes_hidden + ) + self.relu = nn.ReLU() + if self.mc_dropout: + self.dropout = nn.Dropout(p=dropout_prob) + self.linear1 = nn.Linear(in_features=self.num_nodes_hidden, out_features=32) + self.linear_out = nn.Linear(in_features=32, out_features=1) + + def forward(self, x): + batch_size = x.shape[0] + x = self.linear_in(x) + + if self.num_linear_layers > 1: + for _ in range(self.num_linear_layers): + x = self.linear_hidden(x) + if self.mc_dropout: + x = self.dropout(x) + x = self.linear1(x) + x = self.relu(x) + if self.mc_dropout: + x = self.dropout(x) + x = self.linear_out(x) + + return x + + def predict( + self, + x, + approx_dist=None, + mc_iterations=None, + quantile_range=[0.025, 0.975], + confidence="95", + ): + + approx_dist = self.mc_dropout if approx_dist is None else approx_dist + if approx_dist and mc_iterations is None: + raise ValueError( + "If the model was trained with MC Dropout, mc_iterations cannot be None." 
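# Aside: predict() approximates a predictive distribution by keeping dropout active at inference
# (enable_dropout flips Dropout modules back to train mode) and repeating the forward pass
# mc_iterations times; the mean and standard deviation over passes summarise the spread.
# A condensed sketch with a toy dropout model -- the model and shapes are illustrative.
import numpy as np
import torch
from torch import nn

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Dropout(p=0.1), nn.Linear(32, 1))
x = torch.rand(256, 10)
mc_iterations = 100

model.eval()
for layer in model.modules():                       # re-enable only the Dropout layers
    if layer.__class__.__name__.startswith("Dropout"):
        layer.train()

with torch.no_grad():
    out_preds = np.stack([model(x).squeeze().numpy() for _ in range(mc_iterations)])

means, sd = out_preds.mean(axis=0), out_preds.std(axis=0)   # (n_samples,) each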
+ ) + + self.eval() + if isinstance(x, np.ndarray): + dataset = PyTorchDataset( + X=torch.from_numpy(x).float(), + y=None, + ) + elif isinstance(x, torch.FloatTensor) or isinstance(x, torch.Tensor): + dataset = PyTorchDataset( + X=x.float(), + y=None, + ) + elif isinstance(x, pd.DataFrame): + dataset = PyTorchDataset( + X=torch.from_numpy(np.array(x, dtype=np.float64)).float(), + y=None, + ) + else: + raise ValueError( + f"Input x must be of type [np.ndarray, torch.FloatTensor], received {type(x)}" + ) + + loader = DataLoader(dataset, batch_size=10, shuffle=False) + iterations = 1 if not approx_dist else mc_iterations + out_preds = np.zeros([iterations, len(dataset)]) + + for i in range(iterations): + preds = torch.tensor([]).to(self.device) + for X_test_batch in loader: + self.eval() + self.enable_dropout() + if approx_dist: + self.enable_dropout() + + X_test_batch = X_test_batch.to(self.device) + test_pred = self(X_test_batch) + preds = torch.cat((preds, test_pred), 0) + + if self.device.type == "cuda": + preds = preds.squeeze().cpu().detach().numpy() + else: + preds = preds.squeeze().detach().numpy() + out_preds[i, :] = preds + + if 1 in out_preds.shape: + out_preds = out_preds.squeeze() + + means = out_preds.mean(axis=0) + sd = out_preds.std(axis=0) + + return out_preds, means, sd + + def enable_dropout( + self, + ): + # For each layer, if it starts with Dropout, turn it from eval mode to train mode + for layer in self.modules(): + if layer.__class__.__name__.startswith("Dropout"): + layer.train() diff --git a/ise/models/testing/__init__.py b/ise/models/testing/__init__.py deleted file mode 100644 index 284c190..0000000 --- a/ise/models/testing/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from ise.models.testing.pretrained import ( - test_pretrained_model, - mc_accuracy, - binned_sle_table, -) diff --git a/ise/models/testing/pretrained.py b/ise/models/testing/pretrained.py deleted file mode 100644 index c87943c..0000000 --- a/ise/models/testing/pretrained.py +++ /dev/null @@ -1,266 +0,0 @@ -"""Testing functions for analyzing performance of pretrained models.""" -import torch -import pandas as pd -from ise.models.training.Trainer import Trainer -import numpy as np - -np.random.seed(10) -from sklearn.metrics import r2_score -from ise.utils.data import load_ml_data -from typing import List - - -def test_pretrained_model( - model_path: str, - model_class, - architecture: dict, - data_directory: str, - time_series: bool, - mc_dropout: bool = False, - dropout_prob: float = 0.1, - mc_iterations: int = 100, - verbose: bool = True, -): - """Runs testing procedure on a pretrained and saved model. Makes model predictions and tests - them based on standard metrics. Outputs the metrics in a dictionary as well as the predictions. - - Args: - model_path (str): Path to the pretrained model. Must be a '.pt' model. - model_class (ModelClass): Model class used to train the model. - architecture (dict): Architecture arguments used to train the model. - data_directory (str): Directory containing training and testing data. - time_series (bool): Flag denoting wether model was trained with time-series data. - mc_dropout (bool, optional): Flag denoting whether the model was trained with MC dropout protocol. Defaults to False. - dropout_prob (float, optional): Dropout probability in MC dropout protocol. Unused if mc_dropout=False. Defaults to 0.1. - mc_iterations (int, optional): MC iterations to be used in testing. Unused if mc_dropout=False. Defaults to 100. 
- verbose (bool, optional): Flag denoting whether to output logs to terminal. Defaults to True. - - Returns: - tuple: Tuple containing [metrics, preds, bounds], or test metrics, predictions, and uncertainty bounds on test_features. - """ - - if verbose: - print("1/3: Loading processed data...") - - ( - train_features, - train_labels, - test_features, - test_labels, - test_scenarios, - ) = load_ml_data(data_directory=data_directory, time_series=time_series) - - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - data_dict = { - "train_features": train_features, - "train_labels": train_labels, - "test_features": test_features, - "test_labels": test_labels, - } - - # Load Model - trainer = Trainer() - sequence_length = 5 if time_series else None - - # TODO: do i need this? what about ise.utils.models.load_model ? - trainer._initiate_model( - model_class, - data_dict=data_dict, - architecture=architecture, - sequence_length=sequence_length, - batch_size=256, - mc_dropout=mc_dropout, - dropout_prob=dropout_prob, - ) - - if verbose: - print("2/3: Loading pretrained weights...") - # Assigned pre-trained weights - if isinstance(model_path, str): - trainer.model.load_state_dict(torch.load(model_path, map_location=device)) - model = trainer.model - else: - model = model_path - - # Evaluate on test_features - if verbose: - print("3/3: Evaluating...") - model.eval() - X_test = torch.from_numpy(np.array(test_features, dtype=np.float64)).float() - - if mc_dropout: - all_preds, means, sd = model.predict( - X_test, mc_iterations=mc_iterations - ) - preds = means - else: - preds, means, sd = model.predict( - X_test, mc_iterations=1 - ) - - quantiles = np.quantile(all_preds, [0.05, 0.95], axis=0) - upper_q = quantiles[1, :] - lower_q = quantiles[0, :] - - test_labels = np.array(test_labels).squeeze() - mse = sum((preds - test_labels) ** 2) / len(preds) - mae = sum(abs((preds - test_labels))) / len(preds) - rmse = np.sqrt(mse) - r2 = r2_score(np.array(test_labels), preds) - - metrics = {"MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2} - - print( - f"""Test Metrics -MSE: {mse:0.6f} -MAE: {mae:0.6f} -RMSE: {rmse:0.6f} -R2: {r2:0.6f}""" - ) - - return metrics, preds, sd - - -def mc_accuracy( - model_path: str, - model_class, - architecture: dict, - data_directory: str, - time_series: bool, - dropout_prob: float = 0.1, - mc_iterations: int = 30, - verbose: bool = True, -): - """Tests the accuracy of the MC dropout uncertainty bands. Shows the proportion of true - values that fall within the uncertainty range. - - Args: - model_path (str): Path to the pretrained model. Must be a '.pt' model. - model_class (ModelClass): Model class used to train the model. - architecture (dict): Architecture arguments used to train the model. - data_directory (str): Directory containing training and testing data. - time_series (bool): Flag denoting wether model was trained with time-series data. - dropout_prob (float, optional): Dropout probability in MC dropout protocol. Unused if mc_dropout=False. Defaults to 0.1. - mc_iterations (int, optional): MC iterations to be used in testing. Unused if mc_dropout=False. Defaults to 30. - verbose (bool, optional): Flag denoting whether to output logs to terminal. Defaults to True. 
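# Aside: the removed test_pretrained_model routine reports MSE, MAE, RMSE and R^2 between emulator
# predictions and the test labels. The same metrics reduce to a few lines; preds and test_labels
# below are placeholder arrays, not real ISMIP6 results.
import numpy as np
from sklearn.metrics import r2_score

preds = np.array([0.1, 0.4, -0.2, 1.3])
test_labels = np.array([0.0, 0.5, -0.1, 1.0])

mse = np.mean((preds - test_labels) ** 2)
mae = np.mean(np.abs(preds - test_labels))
rmse = np.sqrt(mse)
r2 = r2_score(test_labels, preds)
metrics = {"MSE": mse, "MAE": mae, "RMSE": rmse, "R2": r2}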
- - Returns: - tuple: Tuple containing [ci_accuracy, q_accuracy], or confidence interval accuracy and quantile accuracy - """ - if verbose: - print("1/3: Loading processed data...") - - ( - train_features, - train_labels, - test_features, - test_labels, - test_scenarios, - ) = load_ml_data(data_directory=data_directory, time_series=time_series) - - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - - data_dict = { - "train_features": train_features, - "train_labels": train_labels, - "test_features": test_features, - "test_labels": test_labels, - } - - # Load Model - trainer = Trainer() - sequence_length = 5 if time_series else None - - # TODO: do i need this? what about ise.utils.models.load_model ? - trainer._initiate_model( - model_class, - data_dict=data_dict, - architecture=architecture, - sequence_length=sequence_length, - batch_size=256, - mc_dropout=True, - dropout_prob=dropout_prob, - ) - - if verbose: - print("2/3: Loading pretrained weights...") - # Assigned pre-trained weights - trainer.model.load_state_dict(torch.load(model_path, map_location=device)) - model = trainer.model - - # Evaluate on test_features - if verbose: - print("3/3: Evaluating...") - model.eval() - X_test = torch.from_numpy(np.array(test_features, dtype=np.float64)).float() - - # Predict on test set and return confidence intervals and quantiles - all_preds, means, sd = model.predict( - X_test, mc_iterations=mc_iterations - ) - quantiles = np.quantile(all_preds, [0.05, 0.95], axis=0) - lower_ci = means - 1.96*sd - upper_ci = means + 1.96*sd - upper_q = quantiles[1, :] - lower_q = quantiles[0, :] - preds = means - - # Get accuracy based on how many true values fall between CI and Q. - test_labels = np.array(test_labels).squeeze() - q_acc = ((test_labels >= lower_q) & (test_labels <= upper_q)).mean() - ci_acc = ((test_labels >= lower_ci) & (test_labels <= upper_ci)).mean() - - return ci_acc, q_acc - - -def binned_sle_table( - results_dataframe: pd.DataFrame, - bins: List[float], -): - """Creates table that analyzes loss functions over given ranges of SLE. Input is the results - dataframe from ise.utils.data.combine_testing_results. Note that bins can be an integer denoting - how many equal-width bins you want to cut the data into, or it can be a list of cutoffs. If the list does not - contain the mins and maxes of SLE in the dataset, it will be added automatically. - - Args: - results_dataframe (pd.DataFrame): Testing results dataframe outputted from ise.utils.data.combine_testing_results - bins (list, optional): List of bin cutoffs or integer number of equal-width bins. Defaults to None. - - Returns: - pd.DataFrame: Table of metrics per binned SLE. 
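# Aside: mc_accuracy measures how often true values fall inside the 95% confidence interval
# (mean +/- 1.96*sd) and inside the empirical 5th-95th quantile band of the MC dropout samples.
# The coverage check reduces to the comparison below; all_preds and test_labels are synthetic.
import numpy as np

rng = np.random.default_rng(0)
all_preds = rng.normal(size=(30, 500))     # (mc_iterations, n_samples)
test_labels = rng.normal(size=500)

means, sd = all_preds.mean(axis=0), all_preds.std(axis=0)
lower_ci, upper_ci = means - 1.96 * sd, means + 1.96 * sd
lower_q, upper_q = np.quantile(all_preds, [0.05, 0.95], axis=0)

ci_acc = ((test_labels >= lower_ci) & (test_labels <= upper_ci)).mean()
q_acc = ((test_labels >= lower_q) & (test_labels <= upper_q)).mean()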
- """ - if not bins: - bins = 5 - - if not isinstance(bins, list) and not isinstance(bins, int): - raise AttributeError( - f"bins type must be list[numeric] or int, received {type(bins)}" - ) - - if isinstance(bins, list): - min_sle, max_sle = min(results_dataframe.true), max(results_dataframe.true) - if bins[0] != min_sle: - bins.insert(0, min_sle) - if bins[-1] != max_sle: - bins.append(max_sle) - - results_dataframe["sle_bin"], groups = pd.cut( - results_dataframe.true, bins, labels=None, retbins=True, include_lowest=True - ) - mse_by_group = results_dataframe.groupby("sle_bin").mean()[["mse", "mae"]] - mse_by_group["Count"] = results_dataframe.groupby("sle_bin").count()["true"] - mse_by_group["Prop"] = (mse_by_group["Count"] / len(results_dataframe)) * 100 - mse_by_group["Prop"] = round(mse_by_group["Prop"], 4).astype(str) + "%" - mse_by_group.index = [ - f"Between {val:0.2f} and {groups[i+1]:0.2f} mm SLE" - for i, val in enumerate(groups[:-1]) - ] - mse_by_group.columns = [ - "Mean Squared Error", - "Mean Absolute Error", - "Count in Test Dataset", - "Proportion in Test Dataset", - ] - - return pd.DataFrame(mse_by_group) diff --git a/ise/models/timeseries/TimeSeriesEmulator.py b/ise/models/timeseries/TimeSeriesEmulator.py deleted file mode 100644 index 999a77e..0000000 --- a/ise/models/timeseries/TimeSeriesEmulator.py +++ /dev/null @@ -1,162 +0,0 @@ -import torch -from torch import nn -from ise.models.training.dataclasses import TSDataset -from torch.utils.data import DataLoader -import numpy as np - -np.random.seed(10) -import pandas as pd -import torch - - -class TimeSeriesEmulator(torch.nn.Module): - def __init__(self, architecture, mc_dropout=False, dropout_prob=None): - super().__init__() - self.model_name = "TimeSeriesEmulator" - self.input_layer_size = architecture["input_layer_size"] - self.num_rnn_layers = architecture["num_rnn_layers"] - self.num_rnn_hidden = architecture["num_rnn_hidden"] - self.time_series = True - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu" - ) - self.mc_dropout = mc_dropout - - if not all( - [ - self.num_rnn_layers, - self.num_rnn_hidden, - ] - ): - raise AttributeError( - "Model architecture argument missing. Requires: [num_rnn_layers, num_rnn_hidden, ]." 
- ) - - if mc_dropout and dropout_prob is None: - raise ValueError("If mc_dropout, dropout_prob cannot be None.") - - if self.mc_dropout: - self.rnn = nn.LSTM( - input_size=self.input_layer_size, - hidden_size=self.num_rnn_hidden, - batch_first=True, - num_layers=self.num_rnn_layers, - dropout=dropout_prob if self.num_rnn_layers>1 else 0, - ) - else: - self.rnn = nn.LSTM( - input_size=self.input_layer_size, - hidden_size=self.num_rnn_hidden, - batch_first=True, - num_layers=self.num_rnn_layers, - ) - - self.relu = nn.ReLU() - - if self.mc_dropout: - self.dropout = nn.Dropout(p=dropout_prob) - self.linear1 = nn.Linear(in_features=self.num_rnn_hidden, out_features=32) - self.linear_out = nn.Linear(in_features=32, out_features=1) - - def forward(self, x): - batch_size = x.shape[0] - h0 = ( - torch.zeros(self.num_rnn_layers, batch_size, self.num_rnn_hidden) - .requires_grad_() - .to(self.device) - ) - c0 = ( - torch.zeros(self.num_rnn_layers, batch_size, self.num_rnn_hidden) - .requires_grad_() - .to(self.device) - ) - _, (hn, _) = self.rnn(x, (h0, c0)) - x = hn[-1, :, :] - if self.mc_dropout: - x = self.dropout(x) - x = self.linear1(x) - x = self.relu(x) - if self.mc_dropout: - x = self.dropout(x) # fc dropout - x = self.linear_out(x) - - return x - - def predict(self, x, approx_dist=None, mc_iterations=None, quantile_range=[0.025, 0.975], confidence="95"): - - approx_dist = self.mc_dropout if approx_dist is None else approx_dist - if approx_dist and mc_iterations is None: - raise ValueError( - "If the model was trained with MC Dropout, mc_iterations cannot be None." - ) - - self.eval() - if isinstance(x, np.ndarray): - dataset = TSDataset( - X=torch.from_numpy(x).float(), y=None, sequence_length=5 - ) - elif isinstance(x, torch.FloatTensor) or isinstance(x, torch.Tensor): - dataset = TSDataset(X=x.float(), y=None, sequence_length=5) - elif isinstance(x, pd.DataFrame): - dataset = TSDataset( - X=torch.from_numpy(np.array(x, dtype=np.float64)).float(), - y=None, - sequence_length=5, - ) - else: - raise ValueError( - f"Input x must be of type [np.ndarray, torch.FloatTensor], received {type(x)}" - ) - - loader = DataLoader(dataset, batch_size=1024, shuffle=False) - iterations = 1 if not approx_dist else mc_iterations - out_preds = np.zeros([iterations, len(dataset)]) - - for i in range(iterations): - preds = torch.tensor([]).to(self.device) - for X_test_batch in loader: - self.eval() - self.enable_dropout() - if approx_dist: - self.enable_dropout() - - X_test_batch = X_test_batch.to(self.device) - test_pred = self(X_test_batch) - preds = torch.cat((preds, test_pred), 0) - - if self.device.type == "cuda": - preds = preds.squeeze().cpu().detach().numpy() - else: - preds = preds.squeeze().detach().numpy() - out_preds[i, :] = preds - - if 1 in out_preds.shape: - out_preds = out_preds.squeeze() - - means = out_preds.mean(axis=0) - sd = out_preds.std(axis=0) - - # # If you chose to approximate output distribution (MC Dropout) - # if approx_dist: - # z = {"95": 1.96, "99": 2.58} - # if confidence not in z.keys(): - # raise ValueError( - # f"confidence must be in {z.keys()}, received {confidence}" - # ) - # means = out_preds.mean(axis=0) - # quantiles = np.quantile(out_preds, quantile_range, axis=0) - # sd = np.sqrt(np.var(out_preds, axis=0)) - # upper_ci = means + (z[confidence] * sd) - # lower_ci = means - (z[confidence] * sd) - # else: - # means, upper_ci, lower_ci, quantiles = None, None, None, None - - return out_preds, means, sd - - def enable_dropout( - self, - ): - # For each layer, if it starts 
with Dropout, turn it from eval mode to train mode - for layer in self.modules(): - if layer.__class__.__name__.startswith("Dropout"): - layer.train() diff --git a/ise/models/timeseries/__init__.py b/ise/models/timeseries/__init__.py deleted file mode 100644 index 6e433ee..0000000 --- a/ise/models/timeseries/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from ise.models.timeseries.TimeSeriesEmulator import TimeSeriesEmulator \ No newline at end of file diff --git a/ise/models/traditional/ExploratoryModel.py b/ise/models/traditional/ExploratoryModel.py deleted file mode 100644 index 0fc5873..0000000 --- a/ise/models/traditional/ExploratoryModel.py +++ /dev/null @@ -1,39 +0,0 @@ -import torch -from torch import nn - - -class ExploratoryModel(torch.nn.Module): - def __init__( - self, - input_layer_size, - architecture, - ): - super(ExploratoryModel, self).__init__() - self.model_name = "ExploratoryModel" - self.input_layer_size = input_layer_size - self.num_linear_layers = architecture["num_linear_layers"] - self.nodes = architecture["nodes"] - - if len(self.nodes) != self.num_linear_layers: - raise AttributeError( - f"Length of nodes argument must be equal to num_linear_layers, received {self.num_linear_layers} != {len(self.nodes)}" - ) - - if self.nodes[-1] != 1: - raise ValueError(f"Last node must be equal to 1, received {self.nodes[-1]}") - - model = nn.Sequential() - for i in range(self.num_linear_layers): - if i == 0: - model.append(nn.Linear(self.input_layer_size, self.nodes[i])) - model.append(nn.ReLU()) - elif i == self.num_linear_layers - 1: - model.append(nn.Linear(self.nodes[i - 1], self.nodes[i])) - else: - model.append(nn.Linear(self.nodes[i - 1], self.nodes[i])) - model.append(nn.ReLU()) - - self.model = model - - def forward(self, x): - return self.model(x) \ No newline at end of file diff --git a/ise/models/traditional/IndependentEmulator.py b/ise/models/traditional/IndependentEmulator.py deleted file mode 100644 index 8e7cfe3..0000000 --- a/ise/models/traditional/IndependentEmulator.py +++ /dev/null @@ -1,125 +0,0 @@ -import torch -from torch import nn -from ise.models.training.dataclasses import PyTorchDataset -from torch.utils.data import DataLoader -import numpy as np - -np.random.seed(10) -import pandas as pd -import torch - - -class IndependentEmulator(torch.nn.Module): - def __init__(self, architecture, mc_dropout=False, dropout_prob=None): - super().__init__() - self.model_name = "IndependentEmulator" - self.input_layer_size = architecture["input_layer_size"] - self.num_linear_layers = architecture["num_linear_layers"] - self.num_nodes_hidden = architecture["num_nodes_hidden"] - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu" - ) - self.mc_dropout = mc_dropout - - if not all( - [ - self.num_linear_layers, - self.num_nodes_hidden, - ] - ): - raise AttributeError( - "Model architecture argument missing. Requires: [num_rnn_layers, num_rnn_hidden, ]." 
- ) - - if mc_dropout and dropout_prob is None: - raise ValueError("If mc_dropout, dropout_prob cannot be None.") - - self.linear_in = nn.Linear(in_features=self.input_layer_size, out_features=self.num_nodes_hidden) - self.linear_hidden = nn.Linear(in_features=self.num_nodes_hidden, out_features=self.num_nodes_hidden) - self.relu = nn.ReLU() - if self.mc_dropout: - self.dropout = nn.Dropout(p=dropout_prob) - self.linear1 = nn.Linear(in_features=self.num_nodes_hidden, out_features=32) - self.linear_out = nn.Linear(in_features=32, out_features=1) - - def forward(self, x): - batch_size = x.shape[0] - x = self.linear_in(x) - - if self.num_linear_layers > 1: - for _ in range(self.num_linear_layers): - x = self.linear_hidden(x) - if self.mc_dropout: - x = self.dropout(x) - x = self.linear1(x) - x = self.relu(x) - if self.mc_dropout: - x = self.dropout(x) - x = self.linear_out(x) - - return x - - def predict(self, x, approx_dist=None, mc_iterations=None, quantile_range=[0.025, 0.975], confidence="95"): - - approx_dist = self.mc_dropout if approx_dist is None else approx_dist - if approx_dist and mc_iterations is None: - raise ValueError( - "If the model was trained with MC Dropout, mc_iterations cannot be None." - ) - - self.eval() - if isinstance(x, np.ndarray): - dataset = PyTorchDataset( - X=torch.from_numpy(x).float(), y=None, - ) - elif isinstance(x, torch.FloatTensor) or isinstance(x, torch.Tensor): - dataset = PyTorchDataset(X=x.float(), y=None, ) - elif isinstance(x, pd.DataFrame): - dataset = PyTorchDataset( - X=torch.from_numpy(np.array(x, dtype=np.float64)).float(), - y=None, - - ) - else: - raise ValueError( - f"Input x must be of type [np.ndarray, torch.FloatTensor], received {type(x)}" - ) - - loader = DataLoader(dataset, batch_size=10, shuffle=False) - iterations = 1 if not approx_dist else mc_iterations - out_preds = np.zeros([iterations, len(dataset)]) - - for i in range(iterations): - preds = torch.tensor([]).to(self.device) - for X_test_batch in loader: - self.eval() - self.enable_dropout() - if approx_dist: - self.enable_dropout() - - X_test_batch = X_test_batch.to(self.device) - test_pred = self(X_test_batch) - preds = torch.cat((preds, test_pred), 0) - - if self.device.type == "cuda": - preds = preds.squeeze().cpu().detach().numpy() - else: - preds = preds.squeeze().detach().numpy() - out_preds[i, :] = preds - - if 1 in out_preds.shape: - out_preds = out_preds.squeeze() - - means = out_preds.mean(axis=0) - sd = out_preds.std(axis=0) - - - return out_preds, means, sd - - def enable_dropout( - self, - ): - # For each layer, if it starts with Dropout, turn it from eval mode to train mode - for layer in self.modules(): - if layer.__class__.__name__.startswith("Dropout"): - layer.train() diff --git a/ise/models/traditional/__init__.py b/ise/models/traditional/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/ise/models/traditional/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/ise/models/training/Trainer.py b/ise/models/train.py similarity index 97% rename from ise/models/training/Trainer.py rename to ise/models/train.py index ac878e1..c40265d 100644 --- a/ise/models/training/Trainer.py +++ b/ise/models/train.py @@ -1,14 +1,16 @@ -from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score +import time +from datetime import datetime + import matplotlib.pyplot as plt import numpy as np +import pandas as pd import torch -from torch import optim, nn -from ise.models.training.dataclasses import PyTorchDataset, TSDataset +from sklearn.metrics 
import mean_absolute_error, mean_squared_error, r2_score +from torch import nn, optim from torch.utils.data import DataLoader -import time -import pandas as pd from torch.utils.tensorboard import SummaryWriter -from datetime import datetime + +from ise.data.dataclasses import PyTorchDataset, TSDataset np.random.seed(10) @@ -173,7 +175,7 @@ def train( nodes (list, optional): List of integers denoting the number of nodes in num_linear_layers. Len(nodes) must equal num_linear_layers. Defaults to None. save_model (bool, optional): Flag determining whether the trained model should be saved. Defaults to False. performance_optimized (bool, optional): Flag determining whether the training loop should be optimized for fast training. Defaults to False. - + Returns: self (Trainer): Trainer object """ @@ -212,10 +214,10 @@ def train( total_loss = 0 total_mae = 0 - + # for each batch in train_loader for X_train_batch, y_train_batch in self.train_loader: - + # send to gpu if available X_train_batch = X_train_batch.to(self.device) y_train_batch = y_train_batch.to(self.device) @@ -226,7 +228,7 @@ def train( # get prediction and calculate loss pred = self.model(X_train_batch) loss = criterion(pred, y_train_batch.unsqueeze(1)) - + # calculate dloss/dx for every parameter x (gradients) and advance optimizer loss.backward() optimizer.step() @@ -248,24 +250,23 @@ def train( training_end = time.time() - # If it isn't performance_optimized, run a validation process as well if not performance_optimized: self.model.eval() test_total_loss = 0 test_total_mae = 0 - + # for each batch in the test_loader for X_test_batch, y_test_batch in self.test_loader: - + # send to gpu if available X_test_batch = X_test_batch.to(self.device) y_test_batch = y_test_batch.to(self.device) - + # get prediction and calculate loss test_pred = self.model(X_test_batch) loss = criterion(test_pred, y_test_batch.unsqueeze(1)) - + # add losses to total epoch loss test_total_loss += loss.item() test_total_mae += mae(test_pred, y_test_batch.unsqueeze(1)).item() @@ -296,7 +297,6 @@ def train( tb.add_scalar("Validation MAE", test_mae, epoch) tb.add_scalar("R^2", r2, epoch) - # if verbose, do all the print statements that are calculated if verbose: if not performance_optimized: @@ -367,9 +367,7 @@ def evaluate(self, verbose=True): self.model.eval() preds = torch.tensor([]).to(self.device) for X_test_batch, y_test_batch in self.test_loader: - X_test_batch, y_test_batch = X_test_batch.to(self.device), y_test_batch.to( - self.device - ) + X_test_batch, y_test_batch = X_test_batch.to(self.device), y_test_batch.to(self.device) test_pred = self.model(X_test_batch) preds = torch.cat((preds, test_pred), 0) diff --git a/ise/models/training/__init__.py b/ise/models/training/__init__.py deleted file mode 100644 index 35f1d4d..0000000 --- a/ise/models/training/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from ise.models.training.dataclasses import PyTorchDataset, TSDataset -from ise.models.training.Trainer import Trainer -from ise.models.training.iterative import ( - _structure_emulatordata_args, - _structure_architecture_args, - lag_sequence_test, - rnn_architecture_test, -) diff --git a/ise/models/training/dataclasses.py b/ise/models/training/dataclasses.py deleted file mode 100644 index 464cf85..0000000 --- a/ise/models/training/dataclasses.py +++ /dev/null @@ -1,41 +0,0 @@ -from torch.utils.data import Dataset -import torch - - -class PyTorchDataset(Dataset): - def __init__(self, X, y): - self.X_data = X - self.y_data = y - - def __getitem__(self, index): - 
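# Aside: the Trainer's epoch loop follows the standard PyTorch pattern visible in the hunks above:
# zero the gradients, forward, compute the loss, backpropagate, step the optimizer, and log
# per-epoch scalars to TensorBoard. A stripped-down sketch of one epoch; the model, data and
# hyperparameters are placeholders.
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.tensorboard import SummaryWriter

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 1))
loader = DataLoader(TensorDataset(torch.rand(512, 10), torch.rand(512)), batch_size=100)
criterion, optimizer = nn.MSELoss(), optim.Adam(model.parameters(), lr=1e-3)
tb = SummaryWriter()

model.train()
total_loss = 0.0
for X_batch, y_batch in loader:
    optimizer.zero_grad()                         # reset gradients from the previous batch
    loss = criterion(model(X_batch), y_batch.unsqueeze(1))
    loss.backward()                               # gradients for every parameter
    optimizer.step()
    total_loss += loss.item()

tb.add_scalar("Training Loss", total_loss / len(loader), 0)   # logged per epoch (epoch 0 here)
tb.close()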
if self.y_data is None: - return self.X_data[index] - return self.X_data[index], self.y_data[index] - - def __len__(self): - return len(self.X_data) - - -class TSDataset(Dataset): - def __init__(self, X, y, sequence_length=5): - super().__init__() - self.X = X - self.y = y - self.sequence_length = sequence_length - - def __len__(self): - return len(self.X) - - def __getitem__(self, i): - if i >= self.sequence_length - 1: - i_start = i - self.sequence_length + 1 - x = self.X[i_start : (i + 1), :] - else: - padding = self.X[0].repeat(self.sequence_length - i - 1, 1) - x = self.X[0 : (i + 1), :] - x = torch.cat((padding, x), 0) - - if self.y is None: - return x - - return x, self.y[i] diff --git a/ise/pipelines/__init__.py b/ise/pipelines/__init__.py index 5cb715d..e69de29 100644 --- a/ise/pipelines/__init__.py +++ b/ise/pipelines/__init__.py @@ -1,4 +0,0 @@ -from ise.pipelines.processing import process_data -from ise.pipelines.feature_engineering import feature_engineer - -# from ise.pipelines.testing import analyze_model diff --git a/ise/pipelines/feature_engineering.py b/ise/pipelines/feature_engineering.py index f44b0c7..71de0f6 100644 --- a/ise/pipelines/feature_engineering.py +++ b/ise/pipelines/feature_engineering.py @@ -1,13 +1,13 @@ """Pipeline for feature engineering. After data has been processed from the raw NC files using ise.pipelines.processing, this module will get data ready for modeling.""" -from ise.data.EmulatorData import EmulatorData -from ise.utils.utils import _structure_emulatordata_args import pandas as pd +from ise.utils.functions import _structure_emulatordata_args + def feature_engineer( data_directory: str, - spatial_grouping: str = 'sectors', + spatial_grouping: str = "sectors", time_series: bool = True, export_directory: str = None, emulator_data_args: dict = None, @@ -41,12 +41,8 @@ def feature_engineer( if export_directory: export_flag = "ts" if time_series else "traditional" - train_features.to_csv( - f"{export_directory}/{export_flag}_train_features.csv", index=False - ) - test_features.to_csv( - f"{export_directory}/{export_flag}_test_features.csv", index=False - ) + train_features.to_csv(f"{export_directory}/{export_flag}_train_features.csv", index=False) + test_features.to_csv(f"{export_directory}/{export_flag}_test_features.csv", index=False) pd.Series(train_labels, name="sle").to_csv( f"{export_directory}/{export_flag}_train_labels.csv", index=False ) diff --git a/ise/pipelines/processing.py b/ise/pipelines/processing.py deleted file mode 100644 index ded7362..0000000 --- a/ise/pipelines/processing.py +++ /dev/null @@ -1,44 +0,0 @@ -"""Pipeline function for processing forcing data, Zenodo output data, and merging them together for -use in modelling. -""" -import pandas as pd -from ise.data.processors.control import create_control_dataset -from ise.data.processors.forcings import process_forcings -from ise.data.processors.ismip6 import process_ismip6_outputs -from ise.data.processors.merge import merge_datasets - - -def process_data( - forcing_directory: str, - grids_directory: str, - ismip6_output_directory: str, - export_directory: str, -) -> pd.DataFrame: - """Function for processing forcing data, Zenodo output data, and merging them together for - use in modelling. - - Args: - forcing_directory (str): Directory containing forcing files. - grids_directory (str): Directory containing forcing files. - ismip6_output_directory (str): Directory containing forcing - files. - export_directory (str): Directory containing forcing files. 
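# Aside: TSDataset (deleted above, now provided by ise.data.dataclasses) turns a flat feature matrix
# into fixed-length sequences: for early indices it left-pads the window by repeating the first row
# so every item has sequence_length timesteps. A minimal sketch of that indexing rule.
import torch

X = torch.arange(20, dtype=torch.float32).reshape(10, 2)   # 10 timesteps, 2 features
sequence_length = 5

def window(i):
    if i >= sequence_length - 1:                           # enough history: take the last 5 rows
        return X[i - sequence_length + 1 : i + 1, :]
    padding = X[0].repeat(sequence_length - i - 1, 1)      # otherwise pad with copies of row 0
    return torch.cat((padding, X[0 : i + 1, :]), 0)

assert window(0).shape == window(7).shape == torch.Size([5, 2])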
- - Returns: - pd.DataFrame: master, Master dataset containing all processing - outputs. - """ - - process_forcings( - forcing_directory, - grids_directory, - export_directory, - to_process="all", - verbose=False, - ) - process_ismip6_outputs(ismip6_output_directory, export_directory) - master, inputs, outputs = merge_datasets( - export_directory, export_directory, export_directory, include_icecollapse=False - ) - create_control_dataset(ismip6_output_directory, export_directory) - return master diff --git a/ise/pipelines/testing.py b/ise/pipelines/testing.py index fa6dc79..3ac2319 100644 --- a/ise/pipelines/testing.py +++ b/ise/pipelines/testing.py @@ -1,17 +1,19 @@ """Pipeline functions for analyzing a trained network, including model testing, automatic generation of descriptive plots, and analyzing the accuracy of uncertainty bounds.""" +import json import os -from ise.models.timeseries import TimeSeriesEmulator -from ise.utils.data import ( + +import pandas as pd + +from ise.evaluation._tests import binned_sle_table, test_pretrained_model +from ise.evaluation.plots import SectorPlotter +from ise.models.sector import VariationalLSTMEmulator +from ise.utils.functions import ( + calculate_distribution_metrics, combine_testing_results, load_ml_data, - calculate_distribution_metrics, + load_model, ) -from ise.models.testing import test_pretrained_model, binned_sle_table -from ise.utils.models import load_model -from ise.visualization import Plotter -import json -import pandas as pd def analyze_model( @@ -65,7 +67,7 @@ def analyze_model( # save metrics, preds, and bounds with open(f"{save_directory}/metrics.txt", "w") as metrics_file: metrics_file.write(json.dumps(metrics)) - + pd.Series(preds, name="preds").to_csv(f"{save_directory}/NN_predictions.csv") # with open(f"{save_directory}/bounds.txt", "w") as bounds_file: # bounds_file.write(json.dumps(bounds)) @@ -85,7 +87,7 @@ def analyze_model( if isinstance(model_path, str): model = load_model( model_path=model_path, - model_class=TimeSeriesEmulator, + model_class=VariationalLSTMEmulator, architecture=architecture, mc_dropout=mc_dropout, dropout_prob=dropout_prob, @@ -112,14 +114,10 @@ def analyze_model( print("3/4: Generating plots") if plot: - plotter = Plotter.Plotter( - results_dataset=dataset, save_directory=save_directory - ) + plotter = SectorPlotter(results_dataset=dataset, save_directory=save_directory) plotter.plot_ensemble(save=f"{save_directory}/ensemble_plot.png") plotter.plot_ensemble_mean(save=f"{save_directory}/ensemble_means.png") - plotter.plot_distributions( - year=2100, save=f"{save_directory}/distributions.png" - ) + plotter.plot_distributions(year=2100, save=f"{save_directory}/distributions.png") plotter.plot_histograms(year=2100, save=f"{save_directory}/histograms.png") plotter.plot_callibration(alpha=0.5, save=f"{save_directory}/callibration.png") diff --git a/ise/pipelines/training.py b/ise/pipelines/training.py index 458fb04..3cbc96c 100644 --- a/ise/pipelines/training.py +++ b/ise/pipelines/training.py @@ -1,23 +1,20 @@ """"Pipeline functions for training various kinds of emulators, including traditional and time-based neural networks as well as a gaussian process-based emulator.""" -from ise.models.training.Trainer import Trainer -from ise.models.traditional import ExploratoryModel -from ise.models.timeseries import TimeSeriesEmulator -from ise.models.traditional.ExploratoryModel import ExploratoryModel -from ise.models.gp.GP import GP -from torch import nn -import pandas as pd import os +from typing import List + 
+import numpy as np +import pandas as pd +from sklearn.decomposition import PCA from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF -import numpy as np - -from ise.utils.data import unscale_column - -np.random.seed(10) from sklearn.metrics import r2_score -from sklearn.decomposition import PCA -from typing import List +from torch import nn + +from ise.models.gp import GP +from ise.models.sector import ExploratoryModel, VariationalLSTMEmulator +from ise.models.train import Trainer +from ise.utils.functions import unscale_column def train_timeseries_network( @@ -25,7 +22,7 @@ def train_timeseries_network( architecture: dict = None, epochs: int = 20, batch_size: int = 100, - model_class=TimeSeriesEmulator, + model_class=VariationalLSTMEmulator, loss=nn.MSELoss(), mc_dropout: bool = True, dropout_prob: float = 0.1, @@ -64,13 +61,9 @@ def train_timeseries_network( train_features = pd.read_csv(f"{data_directory}/ts_train_features.csv") test_labels = pd.read_csv(f"{data_directory}/ts_test_labels.csv") train_labels = pd.read_csv(f"{data_directory}/ts_train_labels.csv") - scenarios = pd.read_csv( - f"{data_directory}/ts_test_scenarios.csv" - ).values.tolist() + scenarios = pd.read_csv(f"{data_directory}/ts_test_scenarios.csv").values.tolist() except FileNotFoundError: - raise FileNotFoundError( - 'Files not found. Format must be in format "ts_train_features.csv"' - ) + raise FileNotFoundError('Files not found. Format must be in format "ts_train_features.csv"') data_dict = { "train_features": train_features, @@ -157,9 +150,7 @@ def train_traditional_network( train_features = pd.read_csv(f"{data_directory}/traditional_train_features.csv") test_labels = pd.read_csv(f"{data_directory}/traditional_test_labels.csv") train_labels = pd.read_csv(f"{data_directory}/traditional_train_labels.csv") - scenarios = pd.read_csv( - f"{data_directory}/traditional_test_scenarios.csv" - ).values.tolist() + scenarios = pd.read_csv(f"{data_directory}/traditional_test_scenarios.csv").values.tolist() except FileNotFoundError: raise FileNotFoundError( 'Files not found. 
Format must be in format "traditional_train_features.csv"' @@ -208,9 +199,14 @@ def train_traditional_network( metrics, test_preds = trainer.evaluate(verbose=verbose) return model, metrics, test_preds + def train_independent_gp( data_directory: str, - features: List[str] = ['ts_anomaly', 'salinity','temperature',], + features: List[str] = [ + "ts_anomaly", + "salinity", + "temperature", + ], kernel=None, verbose: bool = False, save_directory: str = None, @@ -232,7 +228,11 @@ def train_independent_gp( def train_gaussian_process( data_directory: str, n: int, - features: List[str] = ['ts_anomaly', 'salinity','temperature',], + features: List[str] = [ + "ts_anomaly", + "salinity", + "temperature", + ], sampling_method: str = "first_n", kernel=None, verbose: bool = False, @@ -262,26 +262,20 @@ def train_gaussian_process( train_features = pd.read_csv(f"{data_directory}/traditional_train_features.csv") test_labels = pd.read_csv(f"{data_directory}/traditional_test_labels.csv") train_labels = pd.read_csv(f"{data_directory}/traditional_train_labels.csv") - scenarios = pd.read_csv( - f"{data_directory}/traditional_test_scenarios.csv" - ).values.tolist() + scenarios = pd.read_csv(f"{data_directory}/traditional_test_scenarios.csv").values.tolist() except FileNotFoundError: test_features = pd.read_csv(f"{data_directory}/ts_test_features.csv") train_features = pd.read_csv(f"{data_directory}/ts_train_features.csv") test_labels = pd.read_csv(f"{data_directory}/ts_test_labels.csv") train_labels = pd.read_csv(f"{data_directory}/ts_train_labels.csv") - scenarios = pd.read_csv( - f"{data_directory}/ts_test_scenarios.csv" - ).values.tolist() + scenarios = pd.read_csv(f"{data_directory}/ts_test_scenarios.csv").values.tolist() if not isinstance(features, list): raise ValueError(f"features argument must be a list, received {type(features)}") # type check features if not isinstance(features, list): - raise AttributeError( - f"features argument must be of type list, received {type(features)}" - ) + raise AttributeError(f"features argument must be of type list, received {type(features)}") # See if features argument contain columns or principal components features_are_pcs = all([f.lower().startswith("pc") for f in features]) @@ -346,9 +340,7 @@ def train_gaussian_process( print("3/3: Evaluating Model") # evaluate on test - preds, std_prediction, metrics = gaussian_process.test( - gp_test_features, test_labels - ) + preds, std_prediction, metrics = gaussian_process.test(gp_test_features, test_labels) if save_directory: if isinstance(save_directory, str): @@ -365,11 +357,13 @@ def train_gaussian_process( return preds, std_prediction, metrics - def train_multiyear_gaussian_process( data_directory: str, n: int, - features: List[str] = ['temperature', 'salinity',], + features: List[str] = [ + "temperature", + "salinity", + ], kernel=None, save_directory: str = None, ): @@ -390,13 +384,13 @@ def train_multiyear_gaussian_process( """ gp = GP(kernel=kernel) - train_features = pd.read_csv(f'{data_directory}/ts_train_features.csv') - train_labels = pd.read_csv(f'{data_directory}/ts_train_labels.csv') - test_features = pd.read_csv(f'{data_directory}/ts_test_features.csv') - test_labels = pd.read_csv(f'{data_directory}/ts_test_labels.csv') + train_features = pd.read_csv(f"{data_directory}/ts_train_features.csv") + train_labels = pd.read_csv(f"{data_directory}/ts_train_labels.csv") + test_features = pd.read_csv(f"{data_directory}/ts_test_features.csv") + test_labels = pd.read_csv(f"{data_directory}/ts_test_labels.csv") - 
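# Aside: the GP pipelines fit a GaussianProcessRegressor (RBF kernel by default) on a subset of the
# features and report both a mean prediction and a predictive standard deviation. A bare-bones
# sketch of that fit/predict step; the synthetic data, feature count and subset size n are illustrative.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 3))    # e.g. ts_anomaly, salinity, temperature
y_train = X_train @ np.array([0.5, -0.2, 1.0]) + rng.normal(0, 0.1, 200)
X_test = rng.normal(size=(50, 3))

n = 100                                # "first_n" style subsampling of the training set
gp = GaussianProcessRegressor(kernel=RBF())
gp.fit(X_train[:n], y_train[:n])

preds, std_prediction = gp.predict(X_test, return_std=True)   # mean and predictive std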
train_features = unscale_column(train_features, column=['year', 'sectors']) - test_features = unscale_column(test_features, column=['year', 'sectors']) + train_features = unscale_column(train_features, column=["year", "sectors"]) + test_features = unscale_column(test_features, column=["year", "sectors"]) all_preds = [] all_std = [] @@ -412,17 +406,17 @@ def train_multiyear_gaussian_process( test_features_year = test_features_year[features] gp.fit(np.array(train_features_year)[:n], np.array(train_labels_year)[:n]) preds, std_prediction, metrics = gp.test(test_features_year, test_labels_year) - + all_preds.append(preds) all_std.append(std_prediction) all_metrics.append(metrics) - + preds = pd.concat(all_preds).sort_index() std = pd.concat(all_std).sort_index() - + gp_results = pd.Series(pd.concat(all_preds).sort_index(), name="preds") - gp_results['std'] = pd.concat(all_std).sort_index() - + gp_results["std"] = pd.concat(all_std).sort_index() + if save_directory: if isinstance(save_directory, str): preds_path = f"{save_directory}/gp_preds.csv" @@ -431,10 +425,8 @@ def train_multiyear_gaussian_process( elif isinstance(save_directory, bool): preds_path = f"gp_preds.csv" uq_path = f"gp_std.csv" - + pd.Series(preds, name="gp_preds").to_csv(preds_path, index=False) pd.Series(std, name="gp_std").to_csv(uq_path, index=False) - - - return preds, std \ No newline at end of file + return preds, std diff --git a/ise/utils/AIS_densities.csv b/ise/utils/AIS_densities.csv new file mode 100644 index 0000000..a83b651 --- /dev/null +++ b/ise/utils/AIS_densities.csv @@ -0,0 +1,17 @@ +group,model,rhoi,rhow +AWI,PISM1,910.0,1023.0 +DOE,MALI,910.0,1023.0 +ILTS_PIK,SICOPOLIS,910.0,1028.0 +IMAU,IMAUICE1,910.0,1028.0 +IMAU,IMAUICE2,910.0,1028.0 +JPL1,ISSM,917.0,1023.0 +LSCE,GRISLI,918.0,1023.0 +NCAR,CISM,910.0,1026.0 +PIK,PISM1,910.0,1028.0 +PIK,PISM2,910.0,1028.0 +UCIJPL,ISSM,917.0,1023.0 +ULB,fETISh_16km,910.0,1028.0 +ULB,fETISh_32km,910.0,1028.0 +UTAS,ElmerIce,900.0,1025.0 +VUB,AISMPALEO,910.0,1028.0 +VUW,PISM,910.0,1028.0 diff --git a/ise/utils/gris_model_densities.csv b/ise/utils/GIS_densities.csv similarity index 100% rename from ise/utils/gris_model_densities.csv rename to ise/utils/GIS_densities.csv diff --git a/ise/utils/__init__.py b/ise/utils/__init__.py index 773bb3d..e69de29 100644 --- a/ise/utils/__init__.py +++ b/ise/utils/__init__.py @@ -1,13 +0,0 @@ -from ise.utils.utils import ( - get_all_filepaths, - check_input, - _structure_architecture_args, - _structure_emulatordata_args, -) -from ise.utils.data import ( - load_ml_data, - undummify, - combine_testing_results, - get_uncertainty_bands, -) -from ise.utils.models import load_model diff --git a/ise/utils/data.py b/ise/utils/data.py deleted file mode 100644 index f618396..0000000 --- a/ise/utils/data.py +++ /dev/null @@ -1,367 +0,0 @@ -"""Utility functions for handling data.""" - -import pandas as pd -from ise.utils.utils import _structure_emulatordata_args -from ise.data import EmulatorData -from itertools import product -import numpy as np -from scipy.stats import gaussian_kde -from scipy.spatial.distance import jensenshannon -from sklearn.preprocessing import MinMaxScaler -from typing import List - - - -def load_ml_data(data_directory: str, time_series: bool = True): - """Loads training and testing data for machine learning models. These files are generated using - functions in the ise.data.processing modules or process_data in the ise.pipelines.processing module. - - Args: - data_directory (str): Directory containing processed files. 
- time_series (bool): Flag denoting whether to load the time-series version of the data. - - Returns: - tuple: Tuple containing [train features, train_labels, test_features, test_labels, test_scenarios], or the training and testing datasets including the scenarios used in testing. - """ - if time_series: - # TODO: Test scenarios has no use, get rid of it - try: - test_features = pd.read_csv(f"{data_directory}/ts_test_features.csv") - train_features = pd.read_csv(f"{data_directory}/ts_train_features.csv") - test_labels = pd.read_csv(f"{data_directory}/ts_test_labels.csv") - train_labels = pd.read_csv(f"{data_directory}/ts_train_labels.csv") - test_scenarios = pd.read_csv( - f"{data_directory}/ts_test_scenarios.csv" - ).values.tolist() - except FileNotFoundError: - try: - test_features = pd.read_csv(f"{data_directory}/val_features.csv") - train_features = pd.read_csv(f"{data_directory}/train_features.csv") - test_labels = pd.read_csv(f"{data_directory}/val_labels.csv") - train_labels = pd.read_csv(f"{data_directory}/train_labels.csv") - test_scenarios = pd.read_csv( - f"{data_directory}/ts_test_scenarios.csv" - ).values.tolist() - except: - raise FileNotFoundError( - f'Files not found at {data_directory}. Format must be in format "ts_train_features.csv"' - ) - else: - try: - test_features = pd.read_csv( - f"{data_directory}/traditional_test_features.csv" - ) - train_features = pd.read_csv( - f"{data_directory}/traditional_train_features.csv" - ) - test_labels = pd.read_csv(f"{data_directory}/traditional_test_labels.csv") - train_labels = pd.read_csv(f"{data_directory}/traditional_train_labels.csv") - test_scenarios = pd.read_csv( - f"{data_directory}/traditional_test_scenarios.csv" - ).values.tolist() - except FileNotFoundError: - raise FileNotFoundError( - f'Files not found at {data_directory}. Format must be in format "traditional_train_features.csv"' - ) - - return ( - train_features, - pd.Series(train_labels["sle"], name="sle"), - test_features, - pd.Series(test_labels["sle"], name="sle"), - test_scenarios, - ) - - -def undummify(df: pd.DataFrame, prefix_sep: str = "-"): - """Undummifies, or reverses pd.get_dummies, a dataframe. Includes taking encoded categorical - variable columns (boolean indices), and converts them back into the original data format. - - Args: - df (pd.DataFrame): Dataframe to be converted. - prefix_sep (str, optional): Prefix separator used in pd.get_dummies. Recommended not to change this. Defaults to "-". - - Returns: - _type_: _description_ - """ - cols2collapse = { - item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns - } - - series_list = [] - for col, needs_to_collapse in cols2collapse.items(): - if needs_to_collapse: - undummified = ( - df.filter(like=col) - .idxmax(axis=1) - .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1]) - .rename(col) - ) - series_list.append(undummified) - else: - series_list.append(df[col]) - undummified_df = pd.concat(series_list, axis=1) - return undummified_df - - -def combine_testing_results( - data_directory: str, - preds: np.ndarray, #|pd.Series|str, - sd: dict = None, #|pd.DataFrame = None, - gp_data: dict = None, #|pd.DataFrame = None, - time_series: bool = True, - save_directory: str = None, -): - """Creates testing results dataframe that reverts input data to original formatting and adds on - predictions, losses, and uncertainty bounds. Useful for plotting purposes and overall analysis. - - Args: - data_directory (str): Directory containing training and testing data. 
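# Aside: undummify reverses pd.get_dummies by taking, for each one-hot group, the column with the
# maximum value (idxmax) and stripping the prefix. A short round-trip sketch using the same "-"
# separator; the toy frame is illustrative.
import pandas as pd

df = pd.DataFrame({"modelname": ["ISSM", "PISM1", "ISSM"], "sle": [0.1, 0.3, 0.2]})
dummies = pd.get_dummies(df, columns=["modelname"], prefix_sep="-")

recovered = (
    dummies.filter(like="modelname")
    .idxmax(axis=1)                                   # winning dummy column per row
    .apply(lambda c: c.split("-", maxsplit=1)[1])     # strip the "modelname-" prefix
    .rename("modelname")
)
assert recovered.tolist() == ["ISSM", "PISM1", "ISSM"]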
- preds (np.ndarray | pd.Series | str): Array/Series of neural network predictions, or the path to the csv containing predictions. - bounds (dict | pd.DataFrame): Dictionary or pd.DataFrame of uncertainty bounds to be added to the dataframe, generally outputted from ise.models.testing.pretrained.test_pretrained_model. Defaults to None. - gp_data (dict | pd.DataFrame): Dictionary or pd.DataFrame containing gaussian process predictions to add to the dataset. Columns/keys must be `preds` and `std`. Defaults to None. - time_series (bool, optional): Flag denoting whether to process the data as a time-series dataset or traditional non-time dataset. Defaults to True. - save_directory (str, optional): Directory where output files will be saved. Defaults to None. - - Returns: - pd.DataFrame: test results dataframe. - """ - - ( - train_features, - train_labels, - test_features, - test_labels, - test_scenarios, - ) = load_ml_data(data_directory, time_series=time_series) - - - X_test = pd.DataFrame(test_features) - if isinstance(test_labels, pd.Series): - y_test = test_labels - elif isinstance(test_labels, pd.DataFrame): - y_test = pd.Series(test_labels["sle"]) - else: - y_test = pd.Series(test_labels) - - test = X_test.drop(columns=[col for col in X_test.columns if "lag" in col]) - test["true"] = y_test - test["pred"] = np.array(pd.read_csv(preds)) if isinstance(preds, str) else preds - test["mse"] = (test.true - test.pred) ** 2 - test["mae"] = abs(test.true - test.pred) - - if gp_data: - test["gp_preds"] = gp_data["preds"] - test["gp_std"] = gp_data["std"] - test['gp_upper_bound'] = test.gp_preds + 1.96 * test.gp_std - test['gp_lower_bound'] = test.gp_preds - 1.96 * test.gp_std - - - test = undummify(test) - test = unscale_column(test, column=['year', 'sector']) - - if sd is not None: - test['sd'] = sd - test['upper_bound'] = preds + 1.96 * sd - test['lower_bound'] = preds - 1.96 * sd - - if save_directory: - if isinstance(save_directory, str): - save_path = f"{save_directory}/results.csv" - - elif isinstance(save_directory, bool): - save_path = f"results.csv" - - test.to_csv(save_path, index=False) - - return test - - -def group_by_run( - dataset: pd.DataFrame, - column: str = None, - condition: str = None, -): - """Groups the dataset into each individual simulation series by both the true value of the - simulated SLE as well as the model predicted SLE. The resulting arrays are NXM matrices with - N being the number of simulations and M being 85, or the length of the series. - - Args: - dataset (pd.DataFrame): Dataset to be grouped - column (str, optional): Column to subset on. Defaults to None. - condition (str, optional): Condition to subset with. Can be int, str, float, etc. Defaults to None. - - Returns: - tuple: Tuple containing [all_trues, all_preds], or NXM matrices of each series corresponding to true values and predicted values. 
- """ - - modelnames = dataset.modelname.unique() - exp_ids = dataset.exp_id.unique() - sectors = dataset.sectors.unique() - - all_runs = [list(i) for i in list(product(modelnames, exp_ids, sectors))] - - all_trues = [] - all_preds = [] - scenarios = [] - for i, run in enumerate(all_runs): - modelname = run[0] - exp = run[1] - sector = run[2] - if column is None and condition is None: - subset = dataset[ - (dataset.modelname == modelname) - & (dataset.exp_id == exp) - & (dataset.sectors == sector) - ] - elif column is not None and condition is not None: - subset = dataset[ - (dataset.modelname == modelname) - & (dataset.exp_id == exp) - & (dataset.sectors == sector) - & (dataset[column] == condition) - ] - else: - raise ValueError( - "Column and condition type must be the same (None & None, not None & not None)." - ) - if not subset.empty: - scenarios.append([modelname, exp, sector]) - all_trues.append(subset.true.to_numpy()) - all_preds.append(subset.pred.to_numpy()) - - return np.array(all_trues), np.array(all_preds), scenarios - - -def get_uncertainty_bands( - data: pd.DataFrame, confidence: str = "95", quantiles: List[float] = [0.05, 0.95] -): - """Calculates uncertainty bands on the monte carlo dropout protocol. Includes traditional - confidence interval calculation as well as a quantile-based approach. - - Args: - data (pd.DataFrame): Dataframe or array of NXM, typically from ise.utils.data.group_by_run. - confidence (str, optional): Confidence level, must be in [95, 99]. Defaults to '95'. - quantiles (list[float], optional): Quantiles of uncertainty bands. Defaults to [0.05, 0.95]. - - Returns: - tuple: Tuple containing [mean, sd, upper_ci, lower_ci, upper_q, lower_q], or the mean prediction, standard deviation, and the lower and upper confidence interval and quantile bands. - """ - z = {"95": 1.96, "99": 2.58} - data = np.array(data) - mean = data.mean(axis=0) - sd = np.sqrt(data.var(axis=0)) - upper_ci = mean + (z[confidence] * (sd / np.sqrt(data.shape[0]))) - lower_ci = mean - (z[confidence] * (sd / np.sqrt(data.shape[0]))) - quantiles = np.quantile(data, quantiles, axis=0) - upper_q = quantiles[1, :] - lower_q = quantiles[0, :] - return mean, sd, upper_ci, lower_ci, upper_q, lower_q - - -def create_distribution(year: int, dataset: np.ndarray): - """Creates a distribution from an array of numbers using a gaussian kernel density estimator. - Takes an array and ensures it follows probability rules (e.g. integrate to 1, nonzero, etc.), - useful for calculating divergences such as ise.utils.data.kl_divergence and ise.utils.data.js_divergence. - - Args: - year (int): Year to generate the distribution. - dataset (np.ndarray): MX85 matrix of true values or predictions for the series, see ise.utils.data.group_by_run. - - Returns: - tuple: Tuple containing [density, support], or the output distribution and the x values associated with those probabilities. - """ - data = dataset[:, year - 2101] # -1 will be year 2100 - kde = gaussian_kde(data, bw_method="silverman") - support = np.arange(-30, 20, 0.001) - density = kde(support) - return density, support - - -def kl_divergence(p: np.ndarray, q: np.ndarray): - """Calculates the Kullback-Leibler Divergence between two distributions. Q is typically a - 'known' distirubtion and should be the true values, whereas P is typcically the test distribution, - or the predicted distribution. Note the the KL divergence is assymetric, and near-zero values for - p with a non-near zero values for q cause the KL divergence to inflate [citation]. 
- - Args: - p (np.ndarray): Test distribution - q (np.ndarray): Known distribution - - Returns: - float: KL Divergence - """ - return np.sum(np.where(p != 0, p * np.log(p / q), 0)) - - -def js_divergence(p: np.ndarray, q: np.ndarray): - """Calculates the Jensen-Shannon Divergence between two distributions. Q is typically a - 'known' distirubtion and should be the true values, whereas P is typcically the test distribution, - or the predicted distribution. Note the the JS divergence, unlike the KL divergence, is symetric. - - Args: - p (np.ndarray): Test distribution - q (np.ndarray): Known distribution - - Returns: - float: JS Divergence - """ - return jensenshannon(p, q) - - -def calculate_distribution_metrics( - dataset: pd.DataFrame, column: str = None, condition: str = None -): - """Wrapper for calculating distribution metrics from a dataset. Includes ise.utils.data.group_by_run to - group the true values and predicted values into NXM matrices (with N=number of samples and - M=85, or the number of years in the series). Then, it uses ise.utils.data.create_distribution to - calculate individual distributions from the arrays and calculates the divergences. - - Args: - dataset (pd.DataFrame): Dataset to be grouped - column (str, optional): Column to subset on. Defaults to None. - condition (str, optional): Condition to subset with. Can be int, str, float, etc. Defaults to None. - - Returns: - dict: Dictionary containing dict['kl'] for the KL-Divergence and dict['js'] for the Jensen-Shannon Divergence. - """ - trues, preds, _ = group_by_run(dataset, column=column, condition=condition) - true_distribution, _ = create_distribution(year=2100, dataset=trues) - pred_distribution, _ = create_distribution(year=2100, dataset=preds) - distribution_metrics = { - "kl": kl_divergence(pred_distribution, true_distribution), - "js": js_divergence(pred_distribution, true_distribution), - } - return distribution_metrics - - -def unscale_column(dataset: pd.DataFrame, column: str = "year"): - """Unscale column in dataset, particularly for unscaling year and sectors column given that - they have a known range of values (2016-2100 and 1-18 respectively). - - Args: - dataset (pd.DataFrame): Dataset containing columns to unscale. - column (str | list, optional): Columns to be unscaled, must be in [year, sectors]. Can be both. Defaults to 'year'. - - Returns: - pd.DataFrame: dataset containing unscaled columns. 
- """ - - if isinstance(column, str): - column = [column] - - if "sectors" in column: - sectors_scaler = MinMaxScaler().fit(np.arange(1, 19).reshape(-1, 1)) - dataset["sectors"] = sectors_scaler.inverse_transform( - np.array(dataset.sectors).reshape(-1, 1) - ) - dataset["sectors"] = round(dataset.sectors).astype(int) - - if "year" in column: - year_scaler = MinMaxScaler().fit(np.arange(2016, 2101).reshape(-1, 1)) - dataset["year"] = year_scaler.inverse_transform( - np.array(dataset.year).reshape(-1, 1) - ) - dataset["year"] = round(dataset.year).astype(int) - - return dataset diff --git a/ise/utils/functions.py b/ise/utils/functions.py new file mode 100644 index 0000000..8b42f1f --- /dev/null +++ b/ise/utils/functions.py @@ -0,0 +1,609 @@ +import os +from itertools import product +from typing import List + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import torch +from netCDF4 import Dataset +from scipy.stats import gaussian_kde +from sklearn.preprocessing import MinMaxScaler + +from ise.evaluation.metrics import js_divergence, kl_divergence + + +def load_model(model_path, model_class, architecture, mc_dropout=False, dropout_prob=0.1): + """Loads PyTorch model from saved state_dict. + + Args: + model_path (str): Filepath to model state_dict. + model_class (Model): Model class. + architecture (dict): Defined architecture of pretrained model. + mc_dropout (bool): Flag denoting wether the model was trained using MC Dropout. + dropout_prob (float): Value between 0 and 1 denoting the dropout probability. + + Returns: + model (Model): Pretrained model. + """ + model = model_class(architecture, mc_dropout=mc_dropout, dropout_prob=dropout_prob) + device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + model.load_state_dict(torch.load(model_path, map_location=device)) + return model.to(device) + + +def get_all_filepaths( + path: str, filetype: str = None, contains: str = None, not_contains: str = None +): + """Retrieves all filepaths for files within a directory. Supports subsetting based on filetype + and substring search. + + Args: + path (str): Path to directory to be searched. + filetype (str, optional): File type to be returned (e.g. csv, nc). Defaults to None. + contains (str, optional): Substring that files found must contain. Defaults to None. + not_contains(str, optional): Substring that files found must NOT contain. Defaults to None. + + Returns: + List[str]: list of files within the directory matching the input criteria. + """ + all_files = list() + for (dirpath, dirnames, filenames) in os.walk(path): + all_files += [os.path.join(dirpath, file) for file in filenames] + + if filetype: + if filetype.lower() != "all": + all_files = [file for file in all_files if file.endswith(filetype)] + + if contains: + all_files = [file for file in all_files if contains in file] + + if not_contains: + all_files = [file for file in all_files if not_contains not in file] + + return all_files + + +def add_variable_to_nc(source_file_path, target_file_path, variable_name): + """ + Copies a variable from a source NetCDF file to a target NetCDF file. + + Parameters: + - source_file_path: Path to the source NetCDF file. + - target_file_path: Path to the target NetCDF file. + - variable_name: Name of the variable to be copied. + + Both files are assumed to have matching dimensions for the variable. 
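+
+    Example (illustrative sketch; the file paths and variable name below are
+    placeholders, not files shipped with the package):
+        >>> add_variable_to_nc(
+        ...     source_file_path="/path/to/source.nc",
+        ...     target_file_path="/path/to/target.nc",
+        ...     variable_name="my_variable",
+        ... )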
+ """ + # Open the source NetCDF file in read mode + with Dataset(source_file_path, "r") as src_nc: + # Check if the variable exists in the source file + if variable_name in src_nc.variables: + # Read the variable data and attributes + variable_data = src_nc.variables[variable_name][:] + variable_attributes = src_nc.variables[variable_name].ncattrs() + + # Open the target NetCDF file in append mode + with Dataset(target_file_path, "a") as target_nc: + # Create or overwrite the variable in the target file + if variable_name in target_nc.variables: + print( + f"The '{variable_name}' variable already exists in the target file. Overwriting data." + ) + target_nc.variables[variable_name][:] = variable_data + else: + # Create the variable with the same datatype and dimensions + variable = target_nc.createVariable( + variable_name, + src_nc.variables[variable_name].datatype, + src_nc.variables[variable_name].dimensions, + ) + + # Copy the variable attributes + for attr_name in variable_attributes: + variable.setncattr( + attr_name, src_nc.variables[variable_name].getncattr(attr_name) + ) + + # Assign the data to the new variable + variable[:] = variable_data + else: + print(f"'{variable_name}' variable not found in the source file.") + + +def load_ml_data(data_directory: str, time_series: bool = True): + """Loads training and testing data for machine learning models. These files are generated using + functions in the ise.data.processing modules or process_data in the ise.pipelines.processing module. + + Args: + data_directory (str): Directory containing processed files. + time_series (bool): Flag denoting whether to load the time-series version of the data. + + Returns: + tuple: Tuple containing [train features, train_labels, test_features, test_labels, test_scenarios], or the training and testing datasets including the scenarios used in testing. + """ + if time_series: + # TODO: Test scenarios has no use, get rid of it + try: + test_features = pd.read_csv(f"{data_directory}/ts_test_features.csv") + train_features = pd.read_csv(f"{data_directory}/ts_train_features.csv") + test_labels = pd.read_csv(f"{data_directory}/ts_test_labels.csv") + train_labels = pd.read_csv(f"{data_directory}/ts_train_labels.csv") + test_scenarios = pd.read_csv(f"{data_directory}/ts_test_scenarios.csv").values.tolist() + except FileNotFoundError: + try: + test_features = pd.read_csv(f"{data_directory}/val_features.csv") + train_features = pd.read_csv(f"{data_directory}/train_features.csv") + test_labels = pd.read_csv(f"{data_directory}/val_labels.csv") + train_labels = pd.read_csv(f"{data_directory}/train_labels.csv") + test_scenarios = pd.read_csv( + f"{data_directory}/ts_test_scenarios.csv" + ).values.tolist() + except: + raise FileNotFoundError( + f'Files not found at {data_directory}. Format must be in format "ts_train_features.csv"' + ) + else: + try: + test_features = pd.read_csv(f"{data_directory}/traditional_test_features.csv") + train_features = pd.read_csv(f"{data_directory}/traditional_train_features.csv") + test_labels = pd.read_csv(f"{data_directory}/traditional_test_labels.csv") + train_labels = pd.read_csv(f"{data_directory}/traditional_train_labels.csv") + test_scenarios = pd.read_csv( + f"{data_directory}/traditional_test_scenarios.csv" + ).values.tolist() + except FileNotFoundError: + raise FileNotFoundError( + f'Files not found at {data_directory}. 
Format must be in format "traditional_train_features.csv"' + ) + + return ( + train_features, + pd.Series(train_labels["sle"], name="sle"), + test_features, + pd.Series(test_labels["sle"], name="sle"), + test_scenarios, + ) + + +def undummify(df: pd.DataFrame, prefix_sep: str = "-"): + """Undummifies, or reverses pd.get_dummies, a dataframe. Includes taking encoded categorical + variable columns (boolean indices), and converts them back into the original data format. + + Args: + df (pd.DataFrame): Dataframe to be converted. + prefix_sep (str, optional): Prefix separator used in pd.get_dummies. Recommended not to change this. Defaults to "-". + + Returns: + _type_: _description_ + """ + cols2collapse = {item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns} + + series_list = [] + for col, needs_to_collapse in cols2collapse.items(): + if needs_to_collapse: + undummified = ( + df.filter(like=col) + .idxmax(axis=1) + .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1]) + .rename(col) + ) + series_list.append(undummified) + else: + series_list.append(df[col]) + undummified_df = pd.concat(series_list, axis=1) + return undummified_df + + +def combine_testing_results( + data_directory: str, + preds: np.ndarray, # |pd.Series|str, + sd: dict = None, # |pd.DataFrame = None, + gp_data: dict = None, # |pd.DataFrame = None, + time_series: bool = True, + save_directory: str = None, +): + """Creates testing results dataframe that reverts input data to original formatting and adds on + predictions, losses, and uncertainty bounds. Useful for plotting purposes and overall analysis. + + Args: + data_directory (str): Directory containing training and testing data. + preds (np.ndarray | pd.Series | str): Array/Series of neural network predictions, or the path to the csv containing predictions. + bounds (dict | pd.DataFrame): Dictionary or pd.DataFrame of uncertainty bounds to be added to the dataframe, generally outputted from ise.models.testing.pretrained.test_pretrained_model. Defaults to None. + gp_data (dict | pd.DataFrame): Dictionary or pd.DataFrame containing gaussian process predictions to add to the dataset. Columns/keys must be `preds` and `std`. Defaults to None. + time_series (bool, optional): Flag denoting whether to process the data as a time-series dataset or traditional non-time dataset. Defaults to True. + save_directory (str, optional): Directory where output files will be saved. Defaults to None. + + Returns: + pd.DataFrame: test results dataframe. 
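+
+    Example (illustrative sketch; the directory and file paths are placeholders):
+        >>> results = combine_testing_results(
+        ...     data_directory="/path/to/processed/data",
+        ...     preds="/path/to/nn_predictions.csv",
+        ...     time_series=True,
+        ...     save_directory="/path/to/output",
+        ... )  # returns a dataframe with `true`, `pred`, `mse`, and `mae` columns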
+ """ + + ( + train_features, + train_labels, + test_features, + test_labels, + test_scenarios, + ) = load_ml_data(data_directory, time_series=time_series) + + X_test = pd.DataFrame(test_features) + if isinstance(test_labels, pd.Series): + y_test = test_labels + elif isinstance(test_labels, pd.DataFrame): + y_test = pd.Series(test_labels["sle"]) + else: + y_test = pd.Series(test_labels) + + test = X_test.drop(columns=[col for col in X_test.columns if "lag" in col]) + test["true"] = y_test + test["pred"] = np.array(pd.read_csv(preds)) if isinstance(preds, str) else preds + test["mse"] = (test.true - test.pred) ** 2 + test["mae"] = abs(test.true - test.pred) + + if gp_data: + test["gp_preds"] = gp_data["preds"] + test["gp_std"] = gp_data["std"] + test["gp_upper_bound"] = test.gp_preds + 1.96 * test.gp_std + test["gp_lower_bound"] = test.gp_preds - 1.96 * test.gp_std + + test = undummify(test) + test = unscale_column(test, column=["year", "sector"]) + + if sd is not None: + test["sd"] = sd + test["upper_bound"] = preds + 1.96 * sd + test["lower_bound"] = preds - 1.96 * sd + + if save_directory: + if isinstance(save_directory, str): + save_path = f"{save_directory}/results.csv" + + elif isinstance(save_directory, bool): + save_path = f"results.csv" + + test.to_csv(save_path, index=False) + + return test + + +def group_by_run( + dataset: pd.DataFrame, + column: str = None, + condition: str = None, +): + """Groups the dataset into each individual simulation series by both the true value of the + simulated SLE as well as the model predicted SLE. The resulting arrays are NXM matrices with + N being the number of simulations and M being 85, or the length of the series. + + Args: + dataset (pd.DataFrame): Dataset to be grouped + column (str, optional): Column to subset on. Defaults to None. + condition (str, optional): Condition to subset with. Can be int, str, float, etc. Defaults to None. + + Returns: + tuple: Tuple containing [all_trues, all_preds], or NXM matrices of each series corresponding to true values and predicted values. + """ + + modelnames = dataset.modelname.unique() + exp_ids = dataset.exp_id.unique() + sectors = dataset.sectors.unique() + + all_runs = [list(i) for i in list(product(modelnames, exp_ids, sectors))] + + all_trues = [] + all_preds = [] + scenarios = [] + for i, run in enumerate(all_runs): + modelname = run[0] + exp = run[1] + sector = run[2] + if column is None and condition is None: + subset = dataset[ + (dataset.modelname == modelname) + & (dataset.exp_id == exp) + & (dataset.sectors == sector) + ] + elif column is not None and condition is not None: + subset = dataset[ + (dataset.modelname == modelname) + & (dataset.exp_id == exp) + & (dataset.sectors == sector) + & (dataset[column] == condition) + ] + else: + raise ValueError( + "Column and condition type must be the same (None & None, not None & not None)." + ) + if not subset.empty: + scenarios.append([modelname, exp, sector]) + all_trues.append(subset.true.to_numpy()) + all_preds.append(subset.pred.to_numpy()) + + return np.array(all_trues), np.array(all_preds), scenarios + + +def get_uncertainty_bands( + data: pd.DataFrame, confidence: str = "95", quantiles: List[float] = [0.05, 0.95] +): + """Calculates uncertainty bands on the monte carlo dropout protocol. Includes traditional + confidence interval calculation as well as a quantile-based approach. + + Args: + data (pd.DataFrame): Dataframe or array of NXM, typically from ise.utils.functions.group_by_run. 
+ confidence (str, optional): Confidence level, must be in [95, 99]. Defaults to '95'. + quantiles (list[float], optional): Quantiles of uncertainty bands. Defaults to [0.05, 0.95]. + + Returns: + tuple: Tuple containing [mean, sd, upper_ci, lower_ci, upper_q, lower_q], or the mean prediction, standard deviation, and the lower and upper confidence interval and quantile bands. + """ + z = {"95": 1.96, "99": 2.58} + data = np.array(data) + mean = data.mean(axis=0) + sd = np.sqrt(data.var(axis=0)) + upper_ci = mean + (z[confidence] * (sd / np.sqrt(data.shape[0]))) + lower_ci = mean - (z[confidence] * (sd / np.sqrt(data.shape[0]))) + quantiles = np.quantile(data, quantiles, axis=0) + upper_q = quantiles[1, :] + lower_q = quantiles[0, :] + return mean, sd, upper_ci, lower_ci, upper_q, lower_q + + +def create_distribution(dataset: np.ndarray, min_range=-30, max_range=20, step=0.001): + kde = gaussian_kde(dataset, bw_method="silverman") + support = np.arange(min_range, max_range, step) + density = kde(support) + return density, support + + +def calculate_distribution_metrics( + dataset: pd.DataFrame, column: str = None, condition: str = None +): + """Wrapper for calculating distribution metrics from a dataset. Includes ise.utils.data.group_by_run to + group the true values and predicted values into NXM matrices (with N=number of samples and + M=85, or the number of years in the series). Then, it uses ise.utils.data.create_distribution to + calculate individual distributions from the arrays and calculates the divergences. + + Args: + dataset (pd.DataFrame): Dataset to be grouped + column (str, optional): Column to subset on. Defaults to None. + condition (str, optional): Condition to subset with. Can be int, str, float, etc. Defaults to None. + + Returns: + dict: Dictionary containing dict['kl'] for the KL-Divergence and dict['js'] for the Jensen-Shannon Divergence. + """ + trues, preds, _ = group_by_run(dataset, column=column, condition=condition) + true_distribution, _ = create_distribution(year=2100, dataset=trues) + pred_distribution, _ = create_distribution(year=2100, dataset=preds) + distribution_metrics = { + "kl": kl_divergence(pred_distribution, true_distribution), + "js": js_divergence(pred_distribution, true_distribution), + } + return distribution_metrics + + +def unscale_column(dataset: pd.DataFrame, column: str = "year"): + """Unscale column in dataset, particularly for unscaling year and sectors column given that + they have a known range of values (2016-2100 and 1-18 respectively). + + Args: + dataset (pd.DataFrame): Dataset containing columns to unscale. + column (str | list, optional): Columns to be unscaled, must be in [year, sectors]. Can be both. Defaults to 'year'. + + Returns: + pd.DataFrame: dataset containing unscaled columns. 
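+
+    Example (illustrative; assumes `results` is a results dataframe whose `year` and
+    `sectors` columns were min-max scaled over 2016-2100 and 1-18 respectively):
+        >>> results = unscale_column(results, column=["year", "sectors"])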
+ """ + + if isinstance(column, str): + column = [column] + + if "sectors" in column: + sectors_scaler = MinMaxScaler().fit(np.arange(1, 19).reshape(-1, 1)) + dataset["sectors"] = sectors_scaler.inverse_transform( + np.array(dataset.sectors).reshape(-1, 1) + ) + dataset["sectors"] = round(dataset.sectors).astype(int) + + if "year" in column: + year_scaler = MinMaxScaler().fit(np.arange(2016, 2101).reshape(-1, 1)) + dataset["year"] = year_scaler.inverse_transform(np.array(dataset.year).reshape(-1, 1)) + dataset["year"] = round(dataset.year).astype(int) + + return dataset + + +"""Utility functions for handling various parts of the package, including argument checking and +formatting and file traversal.""" + + +file_dir = os.path.dirname(os.path.realpath(__file__)) + + +def check_input(input: str, options: List[str], argname: str = None): + """Checks validity of input argument. Not used frequently due to error raising being better practice. + + Args: + input (str): Input value. + options (List[str]): Valid options for the input value. + argname (str, optional): Name of the argument being tested. Defaults to None. + """ + # simple assert that input is in the designated options (readability purposes only) + if isinstance(input, str): + input = input.lower() + if input not in options: + if argname is not None: + raise ValueError(f"{argname} must be in {options}, received {input}") + raise ValueError(f"input must be in {options}, received {input}") + + +def get_all_filepaths( + path: str, filetype: str = None, contains: str = None, not_contains: str = None +): + """Retrieves all filepaths for files within a directory. Supports subsetting based on filetype + and substring search. + + Args: + path (str): Path to directory to be searched. + filetype (str, optional): File type to be returned (e.g. csv, nc). Defaults to None. + contains (str, optional): Substring that files found must contain. Defaults to None. + not_contains(str, optional): Substring that files found must NOT contain. Defaults to None. + + Returns: + List[str]: list of files within the directory matching the input criteria. + """ + all_files = list() + for (dirpath, dirnames, filenames) in os.walk(path): + all_files += [os.path.join(dirpath, file) for file in filenames] + + if filetype: + if filetype.lower() != "all": + all_files = [file for file in all_files if file.endswith(filetype)] + + if contains: + all_files = [file for file in all_files if contains in file] + + if not_contains: + all_files = [file for file in all_files if not_contains not in file] + + return all_files + + +def _structure_emulatordata_args(input_args: dict, time_series: bool): + """Formats kwargs for EmulatorData processing. Includes establishing defaults if values are not + supplied. + + Args: + input_args (dict): Dictionary containin kwargs for EmulatorData.process() + time_series (bool): Flag denoting whether the processing is time-series. + + Returns: + dict: EmulatorData.process() kwargs formatted with defaults. 
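+
+    Example (illustrative; overrides only the lag default):
+        >>> args = _structure_emulatordata_args({"lag": 3}, time_series=True)
+        >>> args["lag"]
+        3
+        >>> args["target_column"]
+        'sle'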
+ """ + emulator_data_defaults = dict( + target_column="sle", + drop_missing=True, + drop_columns=["groupname", "experiment"], + boolean_indices=True, + scale=True, + split_type="batch", + drop_outliers="explicit", + drop_expression=[("sle", "<", -26.3)], + time_series=time_series, + lag=None, + ) + + if time_series: + emulator_data_defaults["lag"] = 5 + + # If no other args are supplied, use defaults + if input_args is None: + return emulator_data_defaults + # else, replace provided key value pairs in the default dict and reassign + else: + for key in input_args.keys(): + emulator_data_defaults[key] = input_args[key] + output_args = emulator_data_defaults + + return output_args + + +def _structure_architecture_args(architecture, time_series): + """Formats the arguments for model architectures. + + Args: + architecture (dict): User input for desired architecture. + time_series (bool): Flag denoting whether to use time series model arguments or traditional. + + Returns: + architecture (dict): Formatted architecture argument. + """ + + # Check to make sure inappropriate args are not used + if not time_series and ( + "num_rnn_layers" in architecture.keys() or "num_rnn_hidden" in architecture.keys() + ): + raise AttributeError( + f"Time series architecture args must be in [num_linear_layers, nodes], received {architecture}" + ) + if time_series and ( + "nodes" in architecture.keys() or "num_linear_layers" in architecture.keys() + ): + raise AttributeError( + f"Time series architecture args must be in [num_rnn_layers, num_rnn_hidden], received {architecture}" + ) + + if architecture is None: + if time_series: + architecture = { + "num_rnn_layers": 3, + "num_rnn_hidden": 128, + } + else: + architecture = { + "num_linear_layers": 4, + "nodes": [128, 64, 32, 1], + } + else: + return architecture + return architecture + + +def get_X_y(data, dataset_type="sectors", return_format=None, ): + if dataset_type.lower() == "sectors": + dropped_columns = [ + "id", + "cmip_model", + "pathway", + "exp", + "ice_sheet", + "Scenario", + "Ocean forcing", + "Ocean sensitivity", + "Ice shelf fracture", + "Tier", + "aogcm", + "id", + "exp", + "model", + "ivaf", + "outlier", + ] + dropped_columns = [x for x in data.columns if x in dropped_columns] + X_drop = [x for x in data.columns if "sle" in x] + dropped_columns + X = data.drop(columns=X_drop) + y = data[[x for x in data.columns if "sle" in x]] + if return_format is not None: + if return_format.lower() == "numpy": + return X.values, y.values + elif return_format.lower() == "tensor": + return torch.tensor(X.values), torch.tensor(y.values) + elif return_format.lower() == "pandas": + pass + else: + raise ValueError( + f"return_format must be in ['numpy', 'tensor', 'pandas'], received {return_format}" + ) + + return X, y + + +def to_tensor(x): + """ + Converts input data to a PyTorch tensor of type float. + + Args: + x: Input data to be converted. Must be a pandas dataframe, numpy array, or PyTorch tensor. + + Returns: + A PyTorch tensor of type float. 
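+
+    Example (illustrative):
+        >>> import numpy as np
+        >>> to_tensor(np.array([1.0, 2.0])).dtype
+        torch.float32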
+ """ + if x is None: + return None + if isinstance(x, pd.DataFrame): + x = torch.tensor(x.values) + elif isinstance(x, np.ndarray): + x = torch.tensor(x) + elif isinstance(x, torch.Tensor): + pass + else: + raise ValueError("Data must be a pandas dataframe, numpy array, or PyTorch tensor") + return x.float() diff --git a/ise/utils/grids.png b/ise/utils/grids.png new file mode 100644 index 0000000..7cc66b5 Binary files /dev/null and b/ise/utils/grids.png differ diff --git a/ise/utils/ismip6_experiments.json b/ise/utils/ismip6_experiments.json deleted file mode 100644 index 103fecb..0000000 --- a/ise/utils/ismip6_experiments.json +++ /dev/null @@ -1,211 +0,0 @@ -{ - "exp01": { - "Experiment": "1", - "AOGCM": "noresm1_m_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp02": { - "Experiment": "2", - "AOGCM": "miroc_esm_chem_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp03": { - "Experiment": "3", - "AOGCM": "noresm1_m_rcp26", - "Scenario": "rcp2.6", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp04": { - "Experiment": "4", - "AOGCM": "ccsm4_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp05": { - "Experiment": "5", - "AOGCM": "noresm1_m_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp06": { - "Experiment": "6", - "AOGCM": "miroc_esm_chem_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp07": { - "Experiment": "7", - "AOGCM": "noresm1_m_rcp26", - "Scenario": "rcp2.6", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp08": { - "Experiment": "8", - "AOGCM": "ccsm4_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp09": { - "Experiment": "9", - "AOGCM": "noresm1_m_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "High", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp10": { - "Experiment": "10", - "AOGCM": "noresm1_m_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Low", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "exp11": { - "Experiment": "11", - "AOGCM": "ccsm4_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": true, - "Tier": 1 - }, - - "exp12": { - "Experiment": "12", - "AOGCM": "ccsm4_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": true, - "Tier": 1 - }, - - "exp13": { - "Experiment": "13", - "AOGCM": "noresm1_m_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "PIGL", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "expA1": { - "Experiment": "A1", - "AOGCM": "hadgem2_es_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 2 - }, - - "expA2": { - "Experiment": "A2", - "AOGCM": "csiro_mk3_6_0_rcp85", - "Scenario": 
"rcp8.5", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 2 - }, - - "expA3": { - "Experiment": "A3", - "AOGCM": "ipsl_cm5a_mr_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 2 - }, - - "expA4": { - "Experiment": "A4", - "AOGCM": "ipsl_cm5a_mr_rcp26", - "Scenario": "rcp2.6", - "Ocean forcing": "Open", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "expA5": { - "Experiment": "A5", - "AOGCM": "hadgem2_es_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "expA6": { - "Experiment": "1", - "AOGCM": "csiro_mk3_6_0_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 1 - }, - - "expA7": { - "Experiment": "1", - "AOGCM": "ipsl_cm5a_mr_rcp85", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 2 - }, - - "expA8": { - "Experiment": "1", - "AOGCM": "ipsl_cm5a_mr_rcp26", - "Scenario": "rcp8.5", - "Ocean forcing": "Standard", - "Ocean sensitivity": "Medium", - "Ice shelf fracture": false, - "Tier": 2 - } -} \ No newline at end of file diff --git a/ise/utils/ismip6_experiments_updated.csv b/ise/utils/ismip6_experiments_updated.csv new file mode 100644 index 0000000..536f2ed --- /dev/null +++ b/ise/utils/ismip6_experiments_updated.csv @@ -0,0 +1,138 @@ +ice_sheet,exp,AOGCM,Scenario,Ocean forcing,Ocean sensitivity,Ice shelf fracture,Tier +AIS,exp01,noresm1-m_rcp85,rcp8.5,Open,Medium,FALSE,1 +AIS,exp02,miroc-esm-chem_rcp85,rcp8.5,Open,Medium,FALSE,1 +AIS,exp03,noresm1-m_rcp26,rcp2.6,Open,Medium,FALSE,1 +AIS,exp04,ccsm4_rcp85,rcp8.5,Open,Medium,FALSE,1 +AIS,exp05,noresm1-m_rcp85,rcp8.5,Standard,Medium,FALSE,1 +AIS,exp06,miroc-esm-chem_rcp85,rcp8.5,Standard,Medium,FALSE,1 +AIS,exp07,noresm1-m_rcp26,rcp2.6,Standard,Medium,FALSE,1 +AIS,exp08,ccsm4_rcp85,rcp8.5,Standard,Medium,FALSE,1 +AIS,exp09,noresm1-m_rcp85,rcp8.5,Standard,High,FALSE,1 +AIS,exp10,noresm1-m_rcp85,rcp8.5,Standard,Low,FALSE,1 +AIS,exp11,ccsm4_rcp85,rcp8.5,Open,Medium,TRUE,1 +AIS,exp12,ccsm4_rcp85,rcp8.5,Standard,Medium,TRUE,1 +AIS,exp13,noresm1-m_rcp85,rcp8.5,Standard,PIGL,FALSE,1 +GrIS,exp01,miroc5_rcp85,rcp8.5,Open,Medium,FALSE,1 +GrIS,exp02,noresm1-m_rcp85,rcp8.5,Open,Medium,FALSE,1 +GrIS,exp03,miroc5_rcp85,rcp8.5,Open,Medium,FALSE,1 +GrIS,exp04,hadgem2-es_rcp85,rcp8.5,Open,Medium,FALSE,1 +GrIS,exp05,miroc5_rcp85,rcp8.5,Standard,Medium,FALSE,1 +GrIS,exp06,noresm1-m_rcp85,rcp8.5,Standard,Medium,FALSE,1 +GrIS,exp07,miroc5_rcp26,rcp2.6,Standard,Medium,FALSE,1 +GrIS,exp08,hadgem2-es_rcp85,rcp8.5,Standard,Medium,FALSE,1 +GrIS,exp09,miroc5_rcp85,rcp8.5,Standard,High,FALSE,1 +GrIS,exp10,miroc5_rcp85,rcp8.5,Standard,Low,FALSE,1 +AIS,expA1,hadgem2-es_rcp85,rcp8.5,Open,Medium,FALSE,2 +AIS,expA2,csiro-mk3.6_rcp85,rcp8.5,Open,Medium,FALSE,2 +AIS,expA3,ipsl-cm5-mr_rcp85,rcp8.5,Open,Medium,FALSE,2 +AIS,expA4,ipsl-cm5-mr_rcp26,rcp2.6,Open,Medium,FALSE,1 +AIS,expA5,hadgem2-es_rcp85,rcp8.5,Standard,Medium,FALSE,1 +AIS,expA6,csiro-mk3.6_rcp85,rcp8.5,Standard,Medium,FALSE,1 +AIS,expA7,ipsl-cm5-mr_rcp85,rcp8.5,Standard,Medium,FALSE,2 +AIS,expA8,ipsl-cm5-mr_rcp26,rcp8.5,Standard,Medium,FALSE,2 +AIS,expB1,cnrm-cm6_ssp585,ssp58.5,Open,Medium,FALSE,2 +AIS,expB2,cnrm-cm6_ssp126,ssp12.6,Open,Medium,FALSE,2 
+AIS,expB3,ukesm1-0-ll_ssp585,ssp58.5,Open,Medium,FALSE,2 +AIS,expB4,cesm2_ssp585,ssp58.5,Open,Medium,FALSE,2 +AIS,expB5,cnrm-esm2_ssp585,ssp58.5,Open,Medium,FALSE,2 +AIS,expB6,cnrm-cm6_ssp585,ssp58.5,Standard,Medium,FALSE,2 +AIS,expB7,cnrm-cm6_ssp126,ssp12.6,Standard,Medium,FALSE,2 +AIS,expB8,ukesm1-0-ll_ssp585,ssp58.5,Standard,Medium,FALSE,2 +AIS,expB9,cesm2_ssp585,ssp58.5,Standard,Medium,FALSE,2 +AIS,expB10,cnrm-esm2_ssp585,ssp58.5,Standard,Medium,FALSE,2 +AIS,expC1,noresm1-m_ao_rcp85,rcp8.5,NAN,Medium,FALSE,3 +AIS,expC2,noresm1-m_oo_rcp85,rcp8.5,Open,Medium,FALSE,3 +AIS,expC3,noresm1-m_oo_rcp85,rcp8.5,Standard,Medium,FALSE,3 +AIS,expC4,miroc-esm-chem_ao_rcp85,rcp8.5,NAN,Medium,FALSE,3 +AIS,expC5,miroc-esm-chem_oo_rcp85,rcp8.5,Open,Medium,FALSE,3 +AIS,expC6,noresm1-m_ao_rcp85,rcp8.5,Standard,Medium,FALSE,3 +AIS,expC7,noresm1-m_ao_rcp26,rcp2.6,NAN,Medium,FALSE,3 +AIS,expC8,noresm1-m_ao_rcp26,rcp2.6,Open,Medium,FALSE,3 +AIS,expC9,noresm1-m_ao_rcp26,rcp2.6,Standard,Medium,FALSE,3 +AIS,expC10,ccsm4_oo_rcp85,rcp8.5,NAN,Medium,FALSE,3 +AIS,expC11,ccsm4_oo_rcp85,rcp8.5,Open,Medium,FALSE,3 +AIS,expC12,ccsm4_oo_rcp85,rcp8.5,Standard,Medium,FALSE,3 +AIS,expD1,miroc-esm-chem_rcp85,rcp8.5,Standard,High,FALSE,3 +AIS,expD2,miroc-esm-chem_rcp85,rcp8.5,Standard,Low,FALSE,3 +AIS,expD3,noresm1-m_rcp26,rcp2.6,Standard,High,FALSE,3 +AIS,expD4,noresm1-m_rcp26,rcp2.6,Standard,Low,FALSE,3 +AIS,expD5,ccsm4_rcp85,rcp8.5,Standard,High,FALSE,3 +AIS,expD6,ccsm4_rcp85,rcp8.5,Standard,Low,FALSE,3 +AIS,expD7,hadgem2-es_rcp85,rcp8.5,Standard,High,FALSE,3 +AIS,expD8,hadgem2-es_rcp85,rcp8.5,Standard,Low,FALSE,3 +AIS,expD9,csiro-mk3.6_rcp85,rcp8.5,Standard,High,FALSE,3 +AIS,expD10,csiro-mk3.6_rcp85,rcp8.5,Standard,Low,FALSE,3 +AIS,expD11,ipsl-cm5-mr_rcp85,rcp8.5,Standard,High,FALSE,3 +AIS,expD12,ipsl-cm5-mr_rcp85,rcp8.5,Standard,Low,FALSE,3 +AIS,expD13,cnrm-cm6_ssp585,ssp58.5,Standard,High,FALSE,3 +AIS,expD14,cnrm-cm6_ssp585,ssp58.5,Standard,Low,FALSE,3 +AIS,expD15,ukesm1-0-ll_ssp585,ssp58.5,Standard,High,FALSE,3 +AIS,expD16,ukesm1-0-ll_ssp585,ssp58.5,Standard,Low,FALSE,3 +AIS,expD17,cesm2_ssp585,ssp58.5,Standard,High,FALSE,3 +AIS,expD18,cesm2_ssp585,ssp58.5,Standard,Low,FALSE,3 +AIS,expD51,noresm1-m_rcp85,rcp8.5,Standard,PIGL_low,FALSE,3 +AIS,expD52,noresm1-m_rcp85,rcp8.5,Standard,PIGL_high,FALSE,3 +AIS,expD53,miroc-esm-chem_rcp85,rcp8.5,Standard,PIGL_medium,FALSE,3 +AIS,expD54,miroc-esm-chem_rcp85,rcp8.5,Standard,PIGL_low,FALSE,3 +AIS,expD55,miroc-esm-chem_rcp85,rcp8.5,Standard,PIGL_high,FALSE,3 +AIS,expD56,ccsm4_rcp85,rcp8.5,Standard,PIGL_medium,FALSE,3 +AIS,expD57,ccsm4_rcp85,rcp8.5,Standard,PIGL_low,FALSE,3 +AIS,expD58,ccsm4_rcp85,rcp8.5,Standard,PIGL_high,FALSE,3 +AIS,expE1,noresm1-m_rcp85,rcp8.5,Open,Medium,TRUE,3 +AIS,expE2,miroc-esm-chem_rcp85,rcp8.5,Open,Medium,TRUE,3 +AIS,expE3,hadgem2-es_rcp85,rcp8.5,Open,Medium,TRUE,3 +AIS,expE4,csiro-mk3.6_rcp85,rcp8.5,Open,Medium,TRUE,3 +AIS,expE5,ipsl-cm5-mr_rcp85,rcp8.5,Open,Medium,TRUE,3 +AIS,expE6,noresm1-m_rcp85,rcp8.5,Standard,Medium,TRUE,3 +AIS,expE7,miroc-esm-chem_rcp85,rcp8.5,Standard,Medium,TRUE,3 +AIS,expE8,hadgem2-es_rcp85,rcp8.5,Standard,Medium,TRUE,3 +AIS,expE9,csiro-mk3.6_rcp85,rcp8.5,Standard,Medium,TRUE,3 +AIS,expE10,ipsl-cm5-mr_rcp85,rcp8.5,Standard,Medium,TRUE,3 +AIS,expE11,cnrm-cm6_ssp585,ssp58.5,Open,Medium,TRUE,3 +AIS,expE12,ukesm1-0-ll_ssp585,ssp58.5,Open,Medium,TRUE,3 +AIS,expE13,cesm2_ssp585,ssp58.5,Open,Medium,TRUE,3 +AIS,expE14,cnrm-esm2_ssp585,ssp58.5,Open,Medium,TRUE,3 +AIS,expE15,cnrm-cm6_ssp585,ssp58.5,Standard,Medium,TRUE,3 
+AIS,expE16,ukesm1-0-ll_ssp585,ssp58.5,Standard,Medium,TRUE,3 +AIS,expE17,cesm2_ssp585,ssp58.5,Standard,Medium,TRUE,3 +AIS,expE18,cnrm-esm2_ssp585,ssp58.5,Standard,Medium,TRUE,3 +GrIS,expa01,ipsl-cm5-mr_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expa02,csiro-mk3.6_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expa03,access1.3_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expb01,cnrm-cm6_ssp585,ssp58.5,,Medium,FALSE,2 +GrIS,expb02,cnrm-cm6_ssp126,ssp2.6,,Medium,FALSE,2 +GrIS,expb03,ukesm1-0-ll_ssp585,ssp58.5,,Medium,FALSE,2 +GrIS,expb04,cesm2_ssp585,ssp58.5,,Medium,FALSE,2 +GrIS,expb05,cnrm-esm2_ssp585,ssp58.5,,Medium,FALSE,2 +GrIS,expc01,miroc5_ao_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expc02,miroc5_oo_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expc03,csiro-mk3.6_ao_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expc04,csiro-mk3.6_oo_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expc05,miroc5_ao_rcp26,rcp2.6,,Medium,FALSE,2 +GrIS,expc06,miroc5_oo_rcp26,rcp2.6,,Medium,FALSE,2 +GrIS,expc07,noresm1-m_ao_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expc08,noresm1-m_oo_rcp85,rcp8.5,,Medium,FALSE,2 +GrIS,expc09,miroc5_oo_rcp85,rcp8.5,,High,FALSE,2 +GrIS,expc10,miroc5_oo_rcp85,rcp8.5,,Low,FALSE,2 +GrIS,expd01,noresm1-m_rcp85,rcp8.5,,High,FALSE,3 +GrIS,expd02,noresm1-m_rcp85,rcp8.5,,Low,FALSE,3 +GrIS,expd03,hadgem2-es_rcp85,rcp8.5,,High,FALSE,3 +GrIS,expd04,hadgem2-es_rcp85,rcp8.5,,Low,FALSE,3 +GrIS,expd05,miroc5_rcp26,rcp2.6,,High,FALSE,3 +GrIS,expd06,miroc5_rcp26,rcp2.6,,Low,FALSE,3 +GrIS,expd07,ipsl-cm5-mr_rcp85,rcp8.5,,High,FALSE,3 +GrIS,expd08,ipsl-cm5-mr_rcp85,rcp8.5,,Low,FALSE,3 +GrIS,expd09,csiro-mk3.6_rcp85,rcp8.5,,High,FALSE,3 +GrIS,expd10,csiro-mk3.6_rcp85,rcp8.5,,Low,FALSE,3 +GrIS,expd11,access1.3_rcp85,rcp8.5,,High,FALSE,3 +GrIS,expd12,access1.3_rcp85,rcp8.5,,Low,FALSE,3 +GrIS,expd13,cnrm-cm6_ssp585,ssp58.5,,High,FALSE,3 +GrIS,expd14,cnrm-cm6_ssp585,ssp58.5,,Low,FALSE,3 +GrIS,expd15,cnrm-cm6_ssp126,ssp12.6,,High,FALSE,3 +GrIS,expd16,cnrm-cm6_ssp126,ssp12.6,,Low,FALSE,3 +GrIS,expd17,ukesm1-0-ll_ssp585,ssp58.5,,High,FALSE,3 +GrIS,expd18,ukesm1-0-ll_ssp585,ssp58.5,,Low,FALSE,3 +GrIS,expd19,cesm2_ssp585,ssp58.5,,High,FALSE,3 +GrIS,expd20,cesm2_ssp585,ssp58.5,,Low,FALSE,3 +GrIS,expd21,cnrm-esm2_ssp585,ssp58.5,,High,FALSE,3 +GrIS,expd22,cnrm-esm2_ssp585,ssp58.5,,Low,FALSE,3 diff --git a/ise/utils/model_characteristics.csv b/ise/utils/model_characteristics.csv new file mode 100644 index 0000000..3a217a1 --- /dev/null +++ b/ise/utils/model_characteristics.csv @@ -0,0 +1,17 @@ +model,numerics,stress_balance,resolution,init_method,initial_year,melt,ice_front,open_melt_param,standard_melt_param +AWI_PISM1,FD,Hybrid,8,Eq,2005,Sub-grid,StR,Quad,Nonlocal +DOE_MALI,FE/FV,HO,variable,DA_relax,2015,Floating_condition,Fix,None,Nonlocal_anom +PIK_SICOPOLIS,FD,Hybrid,8,SP_icethickness,1990,Floating_condition,MH,None,Nonlocal +IMAU_IMAUICE1,FD,Hybrid,32,Eq,1978,No,Fix,None,Local_anom +IMAU_IMAUICE2,FD,Hybrid,32,SP_icethickness,1978,No,Fix,None,Local_anom +JPL1_ISSM,FE,SSA,variable,DA_relax,2007,Sub-grid,Fix,None,Nonlocal +LSCE_GRISLI,FD,Hybrid,16,SP_icethickness,1995,None,MH,None,Nonlocal +NCAR_CISM,FE/FV,L1L2,4,SP_icethickness,1995,Sub-grid,RO,Nonlocal_Slope,Nonlocal_anom +PIK_PISM1,FD,Hybrid,8,SP,1850,Sub-grid,StR,PICO,None +PIK_PISM2,FD,Hybrid,8,SP,2015,Sub-grid,StR,PICO,None +UCIJPL_ISSM,FE,HO,variable,DA,2007,Sub-grid,Fix,PICOP,Nonlocal +fETISh_16km,FD,Hybrid,16,DA_geom,2005,None,Div,Plume,Nonlocal +fETISh_32km,FD,Hybrid,32,DA_geom,2005,None,Div,Plume,Nonlocal +UTAS_ElmerIce,FE,Stokes,variable,DA,2015,Sub-grid,Fix,None,Local 
+VUB_AISMPALEO,FD,SIA_SSA,20,SP,2000,None,MH,None,Nonlocal_anom +VUW_PISM,FD,Hybrid,16,SP,2015,No,StR,Lin,None diff --git a/ise/utils/models.py b/ise/utils/models.py deleted file mode 100644 index b18eeff..0000000 --- a/ise/utils/models.py +++ /dev/null @@ -1,24 +0,0 @@ -"""Utility functions for working with pretrained models.""" - -import torch - - -def load_model( - model_path, model_class, architecture, mc_dropout=False, dropout_prob=0.1 -): - """Loads PyTorch model from saved state_dict. - - Args: - model_path (str): Filepath to model state_dict. - model_class (Model): Model class. - architecture (dict): Defined architecture of pretrained model. - mc_dropout (bool): Flag denoting wether the model was trained using MC Dropout. - dropout_prob (float): Value between 0 and 1 denoting the dropout probability. - - Returns: - model (Model): Pretrained model. - """ - model = model_class(architecture, mc_dropout=mc_dropout, dropout_prob=dropout_prob) - device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - model.load_state_dict(torch.load(model_path, map_location=device)) - return model.to(device) diff --git a/ise/utils/utils.py b/ise/utils/utils.py deleted file mode 100644 index 7b10f6e..0000000 --- a/ise/utils/utils.py +++ /dev/null @@ -1,142 +0,0 @@ -"""Utility functions for handling various parts of the package, including argument checking and -formatting and file traversal.""" - -import os -import matplotlib.pyplot as plt -import pandas as pd -import numpy as np -from typing import List - -np.random.seed(10) - - -file_dir = os.path.dirname(os.path.realpath(__file__)) - - -def check_input(input: str, options: List[str], argname: str = None): - """Checks validity of input argument. Not used frequently due to error raising being better practice. - - Args: - input (str): Input value. - options (List[str]): Valid options for the input value. - argname (str, optional): Name of the argument being tested. Defaults to None. - """ - # simple assert that input is in the designated options (readability purposes only) - if isinstance(input, str): - input = input.lower() - if input not in options: - if argname is not None: - raise ValueError(f"{argname} must be in {options}, received {input}") - raise ValueError(f"input must be in {options}, received {input}") - - -def get_all_filepaths(path: str, filetype: str = None, contains: str = None, not_contains: str = None): - """Retrieves all filepaths for files within a directory. Supports subsetting based on filetype - and substring search. - - Args: - path (str): Path to directory to be searched. - filetype (str, optional): File type to be returned (e.g. csv, nc). Defaults to None. - contains (str, optional): Substring that files found must contain. Defaults to None. - not_contains(str, optional): Substring that files found must NOT contain. Defaults to None. - - Returns: - List[str]: list of files within the directory matching the input criteria. - """ - all_files = list() - for (dirpath, dirnames, filenames) in os.walk(path): - all_files += [os.path.join(dirpath, file) for file in filenames] - - if filetype: - if filetype.lower() != "all": - all_files = [file for file in all_files if file.endswith(filetype)] - - if contains: - all_files = [file for file in all_files if contains in file] - - if not_contains: - all_files = [file for file in all_files if not_contains not in file] - - return all_files - - -def _structure_emulatordata_args(input_args: dict, time_series: bool): - """Formats kwargs for EmulatorData processing. 
Includes establishing defaults if values are not - supplied. - - Args: - input_args (dict): Dictionary containin kwargs for EmulatorData.process() - time_series (bool): Flag denoting whether the processing is time-series. - - Returns: - dict: EmulatorData.process() kwargs formatted with defaults. - """ - emulator_data_defaults = dict( - target_column="sle", - drop_missing=True, - drop_columns=["groupname", "experiment"], - boolean_indices=True, - scale=True, - split_type="batch", - drop_outliers="explicit", - drop_expression=[("sle", "<", -26.3)], - time_series=time_series, - lag=None, - ) - - if time_series: - emulator_data_defaults["lag"] = 5 - - # If no other args are supplied, use defaults - if input_args is None: - return emulator_data_defaults - # else, replace provided key value pairs in the default dict and reassign - else: - for key in input_args.keys(): - emulator_data_defaults[key] = input_args[key] - output_args = emulator_data_defaults - - return output_args - - -def _structure_architecture_args(architecture, time_series): - """Formats the arguments for model architectures. - - Args: - architecture (dict): User input for desired architecture. - time_series (bool): Flag denoting whether to use time series model arguments or traditional. - - Returns: - architecture (dict): Formatted architecture argument. - """ - - # Check to make sure inappropriate args are not used - if not time_series and ( - "num_rnn_layers" in architecture.keys() - or "num_rnn_hidden" in architecture.keys() - ): - raise AttributeError( - f"Time series architecture args must be in [num_linear_layers, nodes], received {architecture}" - ) - if time_series and ( - "nodes" in architecture.keys() or "num_linear_layers" in architecture.keys() - ): - raise AttributeError( - f"Time series architecture args must be in [num_rnn_layers, num_rnn_hidden], received {architecture}" - ) - - if architecture is None: - if time_series: - architecture = { - "num_rnn_layers": 3, - "num_rnn_hidden": 128, - } - else: - architecture = { - "num_linear_layers": 4, - "nodes": [128, 64, 32, 1], - } - else: - return architecture - return architecture - diff --git a/ise/visualization/Plotter.py b/ise/visualization/Plotter.py deleted file mode 100644 index aeceac4..0000000 --- a/ise/visualization/Plotter.py +++ /dev/null @@ -1,140 +0,0 @@ -from ise.utils.data import group_by_run -from ise.visualization import ensemble, testing -from ise.utils.data import create_distribution, kl_divergence, js_divergence -import ise - - -class Plotter: - def __init__( - self, results_dataset, column=None, condition=None, save_directory=None - ): - super().__init__() - self.dataset = results_dataset - self.save_directory = save_directory - self.trues, self.preds, self.scenarios = group_by_run( - self.dataset, column=column, condition=condition - ) - self.true_bounds = ensemble.UncertaintyBounds(self.trues) - self.pred_bounds = ensemble.UncertaintyBounds(self.preds) - self.cache = { - "true_sle_runs": self.trues, - "pred_sle_runs": self.preds, - "true_bounds": self.true_bounds, - "pred_bounds": self.pred_bounds, - } - self.true_distribution, self.support = create_distribution( - year=2100, dataset=self.trues - ) - self.pred_distribution, _ = create_distribution(year=2100, dataset=self.preds) - self.distribution_metrics = { - "kl": kl_divergence(self.pred_distribution, self.true_distribution), - "js": js_divergence(self.pred_distribution, self.true_distribution), - } - self.model = None - self.ml_directory = None - - def plot_ensemble( - self, - 
uncertainty="quantiles", - column=None, - condition=None, - save=None, - ): - return ensemble.plot_ensemble( - dataset=self.dataset, - uncertainty=uncertainty, - column=column, - condition=condition, - save=save, - cache=self.cache, - ) - - def plot_ensemble_mean( - self, - uncertainty=False, - column=None, - condition=None, - save=None, - ): - return ensemble.plot_ensemble_mean( - dataset=self.dataset, - uncertainty=uncertainty, - column=column, - condition=condition, - save=save, - cache=self.cache, - ) - - def plot_distributions( - self, - year, - column=None, - condition=None, - save=None, - ): - return ensemble.plot_distributions( - dataset=self.dataset, - year=year, - column=column, - condition=condition, - save=save, - cache=self.cache, - ) - - def plot_histograms( - self, - year, - column=None, - condition=None, - save=None, - ): - return ensemble.plot_histograms( - dataset=self.dataset, - year=year, - column=column, - condition=condition, - save=save, - cache=self.cache, - ) - - def plot_test_series( - self, - model, - data_directory, - time_series=True, - approx_dist=True, - mc_iterations=100, - confidence="95", - draws="random", - k=10, - save_directory=None, - ): - if not isinstance(model, ise.models.timeseries.TimeSeriesEmulator): - raise NotImplementedError( - "currently the only model compatible with this function is TimeSeriesEmulator." - ) - self.model = model - self.ml_directory = data_directory - return testing.plot_test_series( - model=model, - data_directory=data_directory, - time_series=time_series, - approx_dist=approx_dist, - mc_iterations=mc_iterations, - confidence=confidence, - draws=draws, - k=k, - save_directory=save_directory, - ) - - def plot_callibration( - self, color_by=None, alpha=0.2, column=None, condition=None, save=None - ): - return testing.plot_callibration( - dataset=self.dataset, - column=column, - condition=condition, - color_by=color_by, - alpha=alpha, - save=save, - ) diff --git a/ise/visualization/__init__.py b/ise/visualization/__init__.py deleted file mode 100644 index 4792fc9..0000000 --- a/ise/visualization/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from ise.visualization.testing import plot_test_series -from ise.visualization.ensemble import ( - plot_ensemble_mean, - plot_ensemble, - plot_distributions, -) -from ise.visualization import Plotter diff --git a/ise/visualization/ensemble.py b/ise/visualization/ensemble.py deleted file mode 100644 index 25ec497..0000000 --- a/ise/visualization/ensemble.py +++ /dev/null @@ -1,359 +0,0 @@ -"""Plotting functions for analyzing and comparing the ensembles from simulated data and emulated data. Generally compares distributions at given years or plots all paths over the entire series.""" - - -import numpy as np - -np.random.seed(10) -import matplotlib.pyplot as plt -import pandas as pd -from ise.utils.data import ( - group_by_run, - get_uncertainty_bands, - create_distribution, - kl_divergence, -) -import seaborn as sns - - -class UncertaintyBounds: - def __init__(self, data, confidence="95", quantiles=[0.05, 0.95]): - self.data = data - ( - self.mean, - self.sd, - self.upper_ci, - self.lower_ci, - self.upper_q, - self.lower_q, - ) = get_uncertainty_bands(data, confidence=confidence, quantiles=quantiles) - - -def plot_ensemble( - dataset: pd.DataFrame, - uncertainty: str = "quantiles", - column: str = None, - condition: str = None, - save: str = None, - cache: dict = None, -): - """Generates a plot of the comparison of ensemble results from the true simulations and the predicted emulation. 
- Adds uncertainty bounds and plots them side-by-side. - - Args: - dataset (pd.DataFrame): testing results dataframe, result from [ise.utils.data.combine_testing_results](https://brown-sciml.github.io/ise/ise/utils/data.html#combine_testing_results). - uncertainty (str, optional): Type of uncertainty for creating bounds, must be in [quantiles, confidence]. Defaults to 'quantiles'. - column (str, optional): Column to subset on, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/utils/data.html#group_by_run). Defaults to None. - condition (str, optional): Condition to subset with, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/utils/data.html#group_by_run). Can be int, str, float, etc. Defaults to None. - save (str, optional): Path to save plot. Defaults to None. - cache (dict, optional): Cached results from previous calculation, used internally in [ise.visualization.Plotter](https://brown-sciml.github.io/ise/ise/visualization/Plotter.html#Plotter). Defaults to None. - """ - - if cache is None: - all_trues, all_preds, scenarios = group_by_run( - dataset, column=column, condition=condition - ) - ( - mean_true, - true_sd, - true_upper_ci, - true_lower_ci, - true_upper_q, - true_lower_q, - ) = get_uncertainty_bands( - all_trues, - ) - ( - mean_pred, - pred_sd, - pred_upper_ci, - pred_lower_ci, - pred_upper_q, - pred_lower_q, - ) = get_uncertainty_bands( - all_preds, - ) - else: - all_trues = cache["true_sle_runs"] - all_preds = cache["pred_sle_runs"] - t = cache["true_bounds"] - p = cache["pred_bounds"] - mean_true, true_upper_ci, true_lower_ci, true_upper_q, true_lower_q = ( - t.mean, - t.upper_ci, - t.lower_ci, - t.upper_q, - t.lower_q, - ) - mean_pred, pred_upper_ci, pred_lower_ci, pred_upper_q, pred_lower_q = ( - p.mean, - p.upper_ci, - p.lower_ci, - p.upper_q, - p.lower_q, - ) - - true_df = pd.DataFrame(all_trues).transpose() - pred_df = pd.DataFrame(all_preds).transpose() - - fig, axs = plt.subplots(1, 2, figsize=(15, 6), sharey=True, sharex=True) - axs[0].plot(true_df) - axs[0].plot(mean_true, "r-", linewidth=4, label="Mean") - axs[1].plot(pred_df) - axs[1].plot(mean_pred, "r-", linewidth=4, label="Mean") - if uncertainty and uncertainty.lower() == "confidence": - axs[0].plot(true_upper_ci, "b--", linewidth=3, label="5/95% Confidence (True)") - axs[0].plot(true_lower_ci, "b--", linewidth=3) - axs[1].plot( - pred_upper_ci, "b--", linewidth=3, label="5/95% Confidence (Predicted)" - ) - axs[1].plot(pred_lower_ci, "b--", linewidth=3) - - elif uncertainty and uncertainty.lower() == "quantiles": - axs[0].plot( - pred_upper_q, "b--", linewidth=3, label="5/95% Percentile (Predicted)" - ) - axs[0].plot(pred_lower_q, "b--", linewidth=3) - axs[1].plot(true_upper_q, "b--", linewidth=3, label="5/95% Percentile (True)") - axs[1].plot(true_lower_q, "b--", linewidth=3) - - elif uncertainty and uncertainty.lower() == "both": - axs[0].plot(true_upper_ci, "r--", linewidth=2, label="5/95% Confidence (True)") - axs[0].plot(true_lower_ci, "r--", linewidth=2) - axs[1].plot( - pred_upper_ci, "b--", linewidth=2, label="5/95% Confidence (Predicted)" - ) - axs[1].plot(pred_lower_ci, "b--", linewidth=2) - axs[1].plot( - pred_upper_q, "o--", linewidth=2, label="5/95% Percentile (Predicted)" - ) - axs[1].plot(pred_lower_q, "o--", linewidth=2) - axs[0].plot(true_upper_q, "k--", linewidth=2, label="5/95% Percentile (True)") - axs[0].plot(true_lower_q, "k--", linewidth=2) - - elif uncertainty and uncertainty.lower() not in ["confidence", "quantiles"]: - raise 
AttributeError( - f"uncertainty argument must be in ['confidence', 'quantiles'], received {uncertainty}" - ) - - axs[0].title.set_text("True") - axs[0].set_ylabel("True SLE (mm)") - axs[1].title.set_text("Predicted") - plt.xlabel("Years since 2015") - if column is not None and condition is not None: - plt.suptitle(f"Time Series of ISM Ensemble - where {column} == {condition}") - else: - plt.suptitle(f"Time Series of ISM Ensemble") - plt.subplots_adjust(wspace=0, hspace=0) - plt.legend() - - - # TODO: FileNotFoundError: [Errno 2] No such file or directory: 'None/ensemble_plot.png' - if save: - plt.savefig(save) - - -def plot_ensemble_mean( - dataset: pd.DataFrame, - uncertainty: str = False, - column=None, - condition=None, - save=None, - cache=None, -): - """Generates a plot of the mean sea level contribution from the true simulations and the predicted emulation. - - Args: - dataset (pd.DataFrame): testing results dataframe, result from [ise.utils.data.combine_testing_results](https://brown-sciml.github.io/ise/ise/utils/data.html#combine_testing_results). - uncertainty (str, optional): Type of uncertainty for creating bounds. If not None/False, must be in [quantiles, confidence]. Defaults to 'quantiles'. - column (str, optional): Column to subset on, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/utils/data.html#group_by_run). Defaults to None. - condition (str, optional): Condition to subset with, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/utils/data.html#group_by_run). Can be int, str, float, etc. Defaults to None. - save (str, optional): Path to save plot. Defaults to None. - cache (dict, optional): Cached results from previous calculation, used internally in [ise.visualization.Plotter](https://brown-sciml.github.io/ise/ise/visualization/Plotter.html#Plotter). Defaults to None. 
- """ - - if cache is None: - all_trues, all_preds, scenarios = group_by_run( - dataset, column=column, condition=condition - ) - ( - mean_true, - true_sd, - true_upper_ci, - true_lower_ci, - true_upper_q, - true_lower_q, - ) = get_uncertainty_bands( - all_trues, - ) - ( - mean_pred, - pred_sd, - pred_upper_ci, - pred_lower_ci, - pred_upper_q, - pred_lower_q, - ) = get_uncertainty_bands( - all_preds, - ) - else: - all_trues = cache["true_sle_runs"] - all_preds = cache["pred_sle_runs"] - t = cache["true_bounds"] - p = cache["pred_bounds"] - mean_true, true_upper_ci, true_lower_ci, true_upper_q, true_lower_q = ( - t.mean, - t.upper_ci, - t.lower_ci, - t.upper_q, - t.lower_q, - ) - mean_pred, pred_upper_ci, pred_lower_ci, pred_upper_q, pred_lower_q = ( - p.mean, - p.upper_ci, - p.lower_ci, - p.upper_q, - p.lower_q, - ) - - plt.figure(figsize=(15, 6)) - plt.plot(mean_true, label="True Mean SLE") - plt.plot(mean_pred, label="Predicted Mean SLE") - - if uncertainty and uncertainty.lower() == "confidence": - plt.plot(true_upper_ci, "r--", linewidth=2, label="5/95% Percentile (True)") - plt.plot(true_lower_ci, "r--", linewidth=2) - plt.plot( - pred_upper_ci, "b--", linewidth=2, label="5/95% Percentile (Predicted)" - ) - plt.plot(pred_lower_ci, "b--", linewidth=2) - - elif uncertainty and uncertainty.lower() == "quantiles": - plt.plot(pred_upper_q, "r--", linewidth=2, label="5/95% Confidence (Predicted)") - plt.plot(pred_lower_q, "r--", linewidth=2) - plt.plot(true_upper_q, "b--", linewidth=2, label="5/95% Confidence (True)") - plt.plot(true_lower_q, "b--", linewidth=2) - - elif uncertainty and uncertainty.lower() == "both": - plt.plot(true_upper_ci, "r--", linewidth=2, label="5/95% Percentile (True)") - plt.plot(true_lower_ci, "r--", linewidth=2) - plt.plot( - pred_upper_ci, "b--", linewidth=2, label="5/95% Percentile (Predicted)" - ) - plt.plot(pred_lower_ci, "b--", linewidth=2) - plt.plot(pred_upper_q, "o--", linewidth=2, label="5/95% Confidence (Predicted)") - plt.plot(pred_lower_q, "o--", linewidth=2) - plt.plot(true_upper_q, "k--", linewidth=2, label="5/95% Confidence (True)") - plt.plot(true_lower_q, "k--", linewidth=2) - - elif uncertainty and uncertainty.lower() not in ["confidence", "quantiles"]: - raise AttributeError( - f"uncertainty argument must be in ['confidence', 'quantiles'], received {uncertainty}" - ) - - else: - pass - - if column is not None and condition is not None: - plt.suptitle(f"ISM Ensemble Mean SLE over Time - where {column} == {condition}") - else: - plt.suptitle(f"ISM Ensemble Mean over Time") - plt.xlabel("Years since 2015") - plt.ylabel("Mean SLE (mm)") - plt.legend() - - if save: - plt.savefig(save) - - -def plot_distributions( - dataset: pd.DataFrame, - year: int = 2100, - column: str = None, - condition: str = None, - save: str = None, - cache: dict = None, -): - """Generates a plot of comparison of distributions at a given time slice (year) from the true simulations and the predicted emulation. - - Args: - dataset (pd.DataFrame): testing results dataframe, result from [ise.utils.data.combine_testing_results](https://brown-sciml.github.io/ise/ise/utils/data.html#combine_testing_results). - year (int, optional): Distribution year (time slice). Defaults to 2100. - column (str, optional): Column to subset on, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/utils/data.html#group_by_run). Defaults to None. 
- condition (str, optional): Condition to subset with, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/utils/data.html#group_by_run). Can be int, str, float, etc. Defaults to None. - save (str, optional): Path to save plot. Defaults to None. - cache (dict, optional): Cached results from previous calculation, used internally in [ise.visualization.Plotter](https://brown-sciml.github.io/ise/ise/visualization/Plotter.html#Plotter). Defaults to None. - """ - - if cache is None: - all_trues, all_preds, scenarios = group_by_run( - dataset, column=column, condition=condition - ) - else: - all_trues = cache["true_sle_runs"] - all_preds = cache["pred_sle_runs"] - - true_dist, true_support = create_distribution(year=year, dataset=all_trues) - pred_dist, pred_support = create_distribution(year=year, dataset=all_preds) - plt.figure(figsize=(15, 8)) - plt.plot(true_support, true_dist, label="True") - plt.plot(pred_support, pred_dist, label="Predicted") - plt.title( - f"Distribution Comparison at year {year}, KL Divergence: {kl_divergence(pred_dist, true_dist):0.3f}" - ) - plt.xlabel("SLE (mm)") - plt.ylabel("Probability") - plt.legend() - if save: - plt.savefig(save) - - -def plot_histograms( - dataset: pd.DataFrame, - year: int = 2100, - column: str = None, - condition: str = None, - save: str = None, - cache: dict = None, -): - """Generates a plot of comparison of histograms at a given time slice (year) from the true simulations and the predicted emulation. - - Args: - dataset (pd.DataFrame): testing results dataframe, result from [ise.utils.data.combine_testing_results](https://brown-sciml.github.io/ise/ise/utils/data.html#combine_testing_results). - year (int, optional): Histogram year (time slice). Defaults to 2100. - column (str, optional): Column to subset on, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/utils/data.html#group_by_run). Defaults to None. - condition (str, optional): Condition to subset with, used in [ise.utils.data.group_by_run](https://brown-sciml.github.io/ise/ise/utils/data.html#group_by_run). Can be int, str, float, etc. Defaults to None. - save (str, optional): Path to save plot. Defaults to None. - cache (dict, optional): Cached results from previous calculation, used internally in [ise.visualization.Plotter](https://brown-sciml.github.io/ise/ise/visualization/Plotter.html#Plotter). Defaults to None. 
- """ - if cache is None: - all_trues, all_preds, scenarios = group_by_run( - dataset, column=column, condition=condition - ) - - else: - all_trues = cache["true_sle_runs"] - all_preds = cache["pred_sle_runs"] - - fig = plt.figure(figsize=(15, 8)) - ax1 = plt.subplot( - 1, - 2, - 1, - ) - sns.histplot( - all_preds[:, year - 2101], - label="Predicted Distribution", - color="blue", - alpha=0.3, - ) - plt.legend() - plt.subplot(1, 2, 2, sharex=ax1, sharey=ax1) - sns.histplot( - all_trues[:, year - 2101], label="True Distribution", color="red", alpha=0.3 - ) - plt.suptitle(f"Histograms of Predicted vs True SLE at year {year}") - plt.ylabel("") - plt.legend() - fig.tight_layout(rect=[0, 0.03, 1, 0.95]) - if save: - plt.savefig(save) diff --git a/ise/visualization/testing.py b/ise/visualization/testing.py deleted file mode 100644 index c5dfc3e..0000000 --- a/ise/visualization/testing.py +++ /dev/null @@ -1,154 +0,0 @@ -from ise.utils.data import load_ml_data -import pandas as pd -import seaborn as sns -import random -import torch -import matplotlib.pyplot as plt -import numpy as np - -np.random.seed(10) - - -def plot_test_series( - model, - data_directory, - time_series, - approx_dist=True, - mc_iterations=100, - confidence="95", - draws="random", - k=10, - save_directory=None, -): - _, _, test_features, test_labels, test_scenarios = load_ml_data( - data_directory, time_series=time_series - ) - - sectors = list(set(test_features.sectors)) - sectors.sort() - - if draws == "random": - data = random.sample(test_scenarios, k=k) - elif draws == "first": - data = test_scenarios[:k] - else: - raise ValueError(f"draws must be in [random, first], received {draws}") - - for scen in data: - single_scenario = scen - test_model = single_scenario[0] - test_exp = single_scenario[2] - test_sector = single_scenario[1] - single_test_features = torch.tensor( - np.array( - test_features[ - (test_features[test_model] == 1) - & (test_features[test_exp] == 1) - & (test_features.sectors == test_sector) - ], - dtype=np.float64, - ), - dtype=torch.float, - ) - single_test_labels = np.array( - test_labels[ - (test_features[test_model] == 1) - & (test_features[test_exp] == 1) - & (test_features.sectors == test_sector) - ], - dtype=np.float64, - ) - preds, means, sd = model.predict( - single_test_features, - approx_dist=approx_dist, - mc_iterations=mc_iterations, - confidence=confidence, - ) # TODO: this doesn't work with traditional - - quantiles = np.quantile(preds, [0.05, 0.95], axis=0) - lower_ci = means - 1.96*sd - upper_ci = means + 1.96*sd - upper_q = quantiles[1, :] - lower_q = quantiles[0, :] - - if not approx_dist: - plt.figure(figsize=(15, 8)) - plt.plot(single_test_labels, "r-", label="True") - plt.plot(preds, "b-", label="Predicted") - plt.xlabel("Time (years since 2015)") - plt.ylabel("SLE (mm)") - plt.title( - f"Model={test_model}, Exp={test_exp}, sector={sectors.index(test_sector)+1}" - ) - plt.legend() - if save_directory: - plt.savefig(f"{save_directory}/{test_model}_{test_exp}_test_sector.png") - else: - preds = pd.DataFrame(preds).transpose() - plt.figure(figsize=(15, 8)) - plt.plot( - preds, - alpha=0.2, - ) - plt.plot(means, "b-", label="Predicted") - plt.plot(upper_ci, "k-", label=f"{confidence}% CI") - plt.plot( - lower_ci, - "k-", - ) - plt.plot(quantiles[0, :], "k--", label=f"Quantiles") - plt.plot(quantiles[1, :], "k--") - plt.plot( - lower_ci, - "k-", - ) - plt.plot(single_test_labels, "r-", label="True") - - plt.xlabel("Time (years since 2015)") - plt.ylabel("SLE (mm)") - plt.title( - 
f"Model={test_model}, Exp={test_exp}, sector={sectors.index(test_sector)+1}" - ) - plt.legend() - if save_directory: - plt.savefig( - f'{save_directory}/{test_model.replace("-", "_")}_{test_exp}_test_sector.png' - ) - - -def plot_callibration( - dataset, column=None, condition=None, color_by=None, alpha=0.2, save=None -): - - # TODO: Add ability to subset multiple columns and conditions. Not needed now so saving for later... - if column is None and condition is None: - subset = dataset - elif column is not None and condition is not None: - subset = dataset[(dataset[column] == condition)] - else: - raise ValueError( - "Column and condition type must be the same (None & None, not None & not None)." - ) - - plt.figure(figsize=(15, 8)) - sns.scatterplot(data=subset, x="true", y="pred", hue=color_by, alpha=alpha) - plt.plot( - [min(subset.true), max(subset.true)], - [min(subset.true), max(subset.true)], - "r-", - ) - - # TODO: Add density plots (below) - # sns.kdeplot(data=subset, x='true', y='pred', hue=color_by, fill=True) - # plt.plot([min(subset.true),max(subset.true)], [min(subset.true),max(subset.true)], 'r-',) - - # TODO: add plotly export - plt.xlabel("True Value") - plt.ylabel("Predicted Value") - plt.title("Callibration Plot") - - if color_by is not None: - plt.legend() - - if save: - plt.savefig(save) diff --git a/manuscripts/A variational LSTM emulator/Manuscript Plots.ipynb b/manuscripts/A variational LSTM emulator/Manuscript Plots.ipynb index d6fa57f..76f63e3 100644 --- a/manuscripts/A variational LSTM emulator/Manuscript Plots.ipynb +++ b/manuscripts/A variational LSTM emulator/Manuscript Plots.ipynb @@ -8,6 +8,24 @@ "## Load packages" ] }, + { + "cell_type": "markdown", + "id": "873f0c11", + "metadata": {}, + "source": [ + "ISE v1.0.0 is required to run this code. 
Be sure to download the appropriate version using the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86c0b1cf", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install https://github.com/Brown-SciML/ise/archive/refs/tags/v1.0.0.zip" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/manuscripts/A variational LSTM emulator/ablation/1_reduced.py b/manuscripts/A variational LSTM emulator/ablation/1_reduced.py index 4f22f91..e0d4d19 100644 --- a/manuscripts/A variational LSTM emulator/ablation/1_reduced.py +++ b/manuscripts/A variational LSTM emulator/ablation/1_reduced.py @@ -1,23 +1,27 @@ -from ise.models.timeseries.TimeSeriesEmulator import TimeSeriesEmulator -from ise.models.training.Trainer import Trainer -from ise.models.training.dataclasses import TSDataset -from ise.utils.data import load_ml_data -from torch.utils.data import DataLoader +import time + import numpy as np import pandas as pd -from torch import nn import torch -import time +from torch import nn +from torch.utils.data import DataLoader from tqdm import tqdm -print('Loading data...') +from ise.models.timeseries.TimeSeriesEmulator import TimeSeriesEmulator +from ise.models.training.dataclasses import TSDataset +from ise.models.training.Trainer import Trainer +from ise.utils.data import load_ml_data + +print("Loading data...") -train_features, train_labels, test_features, test_labels, test_scenarios = load_ml_data(data_directory=r"/users/pvankatw/emulator/untracked_folder/ml_data",) +train_features, train_labels, test_features, test_labels, test_scenarios = load_ml_data( + data_directory=r"/users/pvankatw/emulator/untracked_folder/ml_data", +) data_dict = { - 'train_features': train_features, - 'train_labels': train_labels, - 'test_features': test_features, - 'test_labels': test_labels, + "train_features": train_features, + "train_labels": train_labels, + "test_features": test_features, + "test_labels": test_labels, } criterion = nn.MSELoss() @@ -25,41 +29,41 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Get all columns associated with the variable -ts_anomaly_cols = [c for c in train_features.columns if 'ts_anomaly' in c] -salinity_cols = [c for c in train_features.columns if 'salinity' in c] -temperature_cols = [c for c in train_features.columns if 'temperature' in c] -columns = ts_anomaly_cols + salinity_cols + temperature_cols # + ['year', 'sectors'] +ts_anomaly_cols = [c for c in train_features.columns if "ts_anomaly" in c] +salinity_cols = [c for c in train_features.columns if "salinity" in c] +temperature_cols = [c for c in train_features.columns if "temperature" in c] +columns = ts_anomaly_cols + salinity_cols + temperature_cols # + ['year', 'sectors'] train_features = np.array(train_features[columns]) test_features = np.array(test_features[columns]) train_dataset = TSDataset( - torch.from_numpy(train_features).float(), - torch.from_numpy(np.array(train_labels)).float().squeeze(), - sequence_length=5, - ) + torch.from_numpy(train_features).float(), + torch.from_numpy(np.array(train_labels)).float().squeeze(), + sequence_length=5, +) test_dataset = TSDataset( - torch.from_numpy(test_features).float(), - torch.from_numpy(np.array(test_labels)).float().squeeze(), - sequence_length=5, - ) - -# Create dataset and data loaders to be used in training loop -train_loader = DataLoader( - dataset=train_dataset, batch_size=256, shuffle=True + torch.from_numpy(test_features).float(), + 
torch.from_numpy(np.array(test_labels)).float().squeeze(), + sequence_length=5, ) + +# Create dataset and data loaders to be used in training loop +train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True) test_loader = DataLoader( dataset=test_dataset, batch_size=256, ) - + architecture = { - 'input_layer_size': train_features.shape[1], - 'num_rnn_layers': 1, - 'num_rnn_hidden': 256, + "input_layer_size": train_features.shape[1], + "num_rnn_layers": 1, + "num_rnn_hidden": 256, } model = TimeSeriesEmulator(architecture=architecture, mc_dropout=True, dropout_prob=0.2).to(device) -optimizer = torch.optim.Adam(model.parameters(),) +optimizer = torch.optim.Adam( + model.parameters(), +) # Loop through epochs for epoch in tqdm(range(1, individual_epochs + 1)): @@ -68,10 +72,10 @@ total_loss = 0 total_mae = 0 - + # for each batch in train_loader for X_train_batch, y_train_batch in train_loader: - + # send to gpu if available X_train_batch = X_train_batch.to(device) y_train_batch = y_train_batch.to(device) @@ -82,7 +86,7 @@ # get prediction and calculate loss pred = model(X_train_batch) loss = criterion(pred, y_train_batch.unsqueeze(1)) - + # calculate dloss/dx for every parameter x (gradients) and advance optimizer loss.backward() optimizer.step() @@ -97,5 +101,7 @@ raw_preds, preds, sd = model.predict(test_features, mc_iterations=100) -out_df = pd.DataFrame(dict(preds=preds, sd=sd),) -out_df.to_csv('/users/pvankatw/emulator/untracked_folder/baylor_tests/1_reduced.csv') +out_df = pd.DataFrame( + dict(preds=preds, sd=sd), +) +out_df.to_csv("/users/pvankatw/emulator/untracked_folder/baylor_tests/1_reduced.csv") diff --git a/manuscripts/A variational LSTM emulator/ablation/85_full.py b/manuscripts/A variational LSTM emulator/ablation/85_full.py index d9f808e..e59ad9b 100644 --- a/manuscripts/A variational LSTM emulator/ablation/85_full.py +++ b/manuscripts/A variational LSTM emulator/ablation/85_full.py @@ -1,28 +1,29 @@ -from ise.models.training.dataclasses import PyTorchDataset -from ise.utils.data import load_ml_data -from torch.utils.data import DataLoader +import time + import numpy as np import pandas as pd -from torch import nn import torch -import time +from torch import nn +from torch.utils.data import DataLoader from tqdm import tqdm +from ise.models.training.dataclasses import PyTorchDataset +from ise.utils.data import load_ml_data + + class YearlyModel(torch.nn.Module): def __init__(self, architecture, dropout_prob=0.2): super().__init__() self.model_name = "YearlyModel" self.input_layer_size = architecture["input_layer_size"] self.num_nodes = architecture["num_nodes"] - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu" - ) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.linear_main = nn.Linear(self.input_layer_size, self.num_nodes) self.relu = nn.ReLU() self.dropout = nn.Dropout(p=dropout_prob) self.linear1 = nn.Linear(in_features=self.num_nodes, out_features=32) self.linear_out = nn.Linear(in_features=32, out_features=1) - + def forward(self, x): x = self.relu(self.linear_main(x)) x = self.dropout(x) @@ -30,14 +31,24 @@ def forward(self, x): x = self.dropout(x) x = self.linear_out(x) return x - - def predict(self, x, mc_iterations=None,): + + def predict( + self, + x, + mc_iterations=None, + ): self.eval() if isinstance(x, np.ndarray): - dataset = PyTorchDataset(torch.from_numpy(x).float(), None,) + dataset = PyTorchDataset( + torch.from_numpy(x).float(), + None, + ) elif isinstance(x, torch.FloatTensor) 
or isinstance(x, torch.Tensor): - dataset = PyTorchDataset(x.float(), None,) + dataset = PyTorchDataset( + x.float(), + None, + ) elif isinstance(x, pd.DataFrame): dataset = PyTorchDataset( torch.from_numpy(np.array(x, dtype=np.float64)).float(), @@ -70,12 +81,12 @@ def predict(self, x, mc_iterations=None,): if 1 in out_preds.shape: out_preds = out_preds.squeeze() - + means = out_preds.mean(axis=0) sd = out_preds.std(axis=0) return out_preds, means, sd - + def enable_dropout( self, ): @@ -84,13 +95,16 @@ def enable_dropout( if layer.__class__.__name__.startswith("Dropout"): layer.train() -print('Loading data...') -train_features, train_labels, test_features, test_labels, test_scenarios = load_ml_data(data_directory=r"/users/pvankatw/emulator/untracked_folder/ml_data",) + +print("Loading data...") +train_features, train_labels, test_features, test_labels, test_scenarios = load_ml_data( + data_directory=r"/users/pvankatw/emulator/untracked_folder/ml_data", +) data_dict = { - 'train_features': train_features, - 'train_labels': train_labels, - 'test_features': test_features, - 'test_labels': test_labels, + "train_features": train_features, + "train_labels": train_labels, + "test_features": test_features, + "test_labels": test_labels, } criterion = nn.MSELoss() @@ -99,41 +113,41 @@ def enable_dropout( year_results = [] for year in tqdm(train_features.year.unique()): - + train_features_year = train_features[train_features.year == year] train_labels_year = np.array(train_labels[train_labels.index.isin(train_features_year.index)]) test_features_year = test_features[test_features.year == year] test_labels_year = np.array(test_labels[test_labels.index.isin(test_features_year.index)]) - - columns = [c for c in test_features_year.columns if 'lag' not in c] + + columns = [c for c in test_features_year.columns if "lag" not in c] train_features_year = np.array(train_features_year[columns]) test_features_year = np.array(test_features_year[columns]) - + architecture = { - 'input_layer_size': train_features_year.shape[1], - 'num_nodes': 128, + "input_layer_size": train_features_year.shape[1], + "num_nodes": 128, } model = YearlyModel(architecture=architecture, dropout_prob=0.2).to(device) - optimizer = torch.optim.Adam(model.parameters(),) - + optimizer = torch.optim.Adam( + model.parameters(), + ) + train_dataset = PyTorchDataset( - torch.from_numpy(train_features_year).float(), - torch.from_numpy(train_labels_year).float().squeeze(), - ) + torch.from_numpy(train_features_year).float(), + torch.from_numpy(train_labels_year).float().squeeze(), + ) test_dataset = PyTorchDataset( - torch.from_numpy(test_features_year).float(), - torch.from_numpy(test_labels_year).float().squeeze(), - ) - - # Create dataset and data loaders to be used in training loop - train_loader = DataLoader( - dataset=train_dataset, batch_size=256, shuffle=True + torch.from_numpy(test_features_year).float(), + torch.from_numpy(test_labels_year).float().squeeze(), ) + + # Create dataset and data loaders to be used in training loop + train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True) test_loader = DataLoader( dataset=test_dataset, batch_size=256, ) - + # Loop through epochs for epoch in range(1, individual_epochs + 1): model.train() @@ -141,10 +155,10 @@ def enable_dropout( total_loss = 0 total_mae = 0 - + # for each batch in train_loader for X_train_batch, y_train_batch in train_loader: - + # send to gpu if available X_train_batch = X_train_batch.to(device) y_train_batch = y_train_batch.to(device) @@ -155,7 
+169,7 @@ def enable_dropout( # get prediction and calculate loss pred = model(X_train_batch) loss = criterion(pred, y_train_batch.unsqueeze(1)) - + # calculate dloss/dx for every parameter x (gradients) and advance optimizer loss.backward() optimizer.step() @@ -166,13 +180,12 @@ def enable_dropout( # divide total losses by number of batches and save to logs avg_mse = total_loss / len(train_loader) - raw_preds, preds, sd = model.predict(test_features_year, mc_iterations=100) - + year_df = pd.DataFrame(dict(preds=preds, std=sd)) year_results.append(year_df) year_results = pd.concat(year_results).sort_index() -print('MSE:', np.mean((year_results['preds'] - test_labels)**2)) -year_results.to_csv('/users/pvankatw/emulator/untracked_folder/baylor_tests/85_full.csv') +print("MSE:", np.mean((year_results["preds"] - test_labels) ** 2)) +year_results.to_csv("/users/pvankatw/emulator/untracked_folder/baylor_tests/85_full.csv") diff --git a/manuscripts/A variational LSTM emulator/ablation/85_reduced.py b/manuscripts/A variational LSTM emulator/ablation/85_reduced.py index 65ac87a..15e05ac 100644 --- a/manuscripts/A variational LSTM emulator/ablation/85_reduced.py +++ b/manuscripts/A variational LSTM emulator/ablation/85_reduced.py @@ -1,28 +1,29 @@ -from ise.models.training.dataclasses import PyTorchDataset -from ise.utils.data import load_ml_data -from torch.utils.data import DataLoader +import time + import numpy as np import pandas as pd -from torch import nn import torch -import time +from torch import nn +from torch.utils.data import DataLoader from tqdm import tqdm +from ise.models.training.dataclasses import PyTorchDataset +from ise.utils.data import load_ml_data + + class YearlyModel(torch.nn.Module): def __init__(self, architecture, dropout_prob=0.2): super().__init__() self.model_name = "YearlyModel" self.input_layer_size = architecture["input_layer_size"] self.num_nodes = architecture["num_nodes"] - self.device = torch.device( - "cuda" if torch.cuda.is_available() else "cpu" - ) + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.linear_main = nn.Linear(self.input_layer_size, self.num_nodes) self.relu = nn.ReLU() self.dropout = nn.Dropout(p=dropout_prob) self.linear1 = nn.Linear(in_features=self.num_nodes, out_features=32) self.linear_out = nn.Linear(in_features=32, out_features=1) - + def forward(self, x): x = self.relu(self.linear_main(x)) x = self.dropout(x) @@ -30,14 +31,24 @@ def forward(self, x): x = self.dropout(x) x = self.linear_out(x) return x - - def predict(self, x, mc_iterations=None,): + + def predict( + self, + x, + mc_iterations=None, + ): self.eval() if isinstance(x, np.ndarray): - dataset = PyTorchDataset(torch.from_numpy(x).float(), None,) + dataset = PyTorchDataset( + torch.from_numpy(x).float(), + None, + ) elif isinstance(x, torch.FloatTensor) or isinstance(x, torch.Tensor): - dataset = PyTorchDataset(x.float(), None,) + dataset = PyTorchDataset( + x.float(), + None, + ) elif isinstance(x, pd.DataFrame): dataset = PyTorchDataset( torch.from_numpy(np.array(x, dtype=np.float64)).float(), @@ -70,12 +81,12 @@ def predict(self, x, mc_iterations=None,): if 1 in out_preds.shape: out_preds = out_preds.squeeze() - + means = out_preds.mean(axis=0) sd = out_preds.std(axis=0) return out_preds, means, sd - + def enable_dropout( self, ): @@ -84,13 +95,16 @@ def enable_dropout( if layer.__class__.__name__.startswith("Dropout"): layer.train() -print('Loading data...') -train_features, train_labels, test_features, test_labels, test_scenarios = 
load_ml_data(data_directory=r"/users/pvankatw/emulator/untracked_folder/ml_data",) + +print("Loading data...") +train_features, train_labels, test_features, test_labels, test_scenarios = load_ml_data( + data_directory=r"/users/pvankatw/emulator/untracked_folder/ml_data", +) data_dict = { - 'train_features': train_features, - 'train_labels': train_labels, - 'test_features': test_features, - 'test_labels': test_labels, + "train_features": train_features, + "train_labels": train_labels, + "test_features": test_features, + "test_labels": test_labels, } criterion = nn.MSELoss() @@ -99,41 +113,41 @@ def enable_dropout( year_results = [] for year in tqdm(train_features.year.unique()): - + train_features_year = train_features[train_features.year == year] train_labels_year = np.array(train_labels[train_labels.index.isin(train_features_year.index)]) test_features_year = test_features[test_features.year == year] test_labels_year = np.array(test_labels[test_labels.index.isin(test_features_year.index)]) - - columns = [c for c in test_features_year.columns if 'lag' not in c] + + columns = [c for c in test_features_year.columns if "lag" not in c] train_features_year = np.array(train_features_year[columns]) test_features_year = np.array(test_features_year[columns]) - + architecture = { - 'input_layer_size': train_features_year.shape[1], - 'num_nodes': 64, + "input_layer_size": train_features_year.shape[1], + "num_nodes": 64, } model = YearlyModel(architecture=architecture, dropout_prob=0.2).to(device) - optimizer = torch.optim.Adam(model.parameters(),) - + optimizer = torch.optim.Adam( + model.parameters(), + ) + train_dataset = PyTorchDataset( - torch.from_numpy(train_features_year).float(), - torch.from_numpy(train_labels_year).float().squeeze(), - ) + torch.from_numpy(train_features_year).float(), + torch.from_numpy(train_labels_year).float().squeeze(), + ) test_dataset = PyTorchDataset( - torch.from_numpy(test_features_year).float(), - torch.from_numpy(test_labels_year).float().squeeze(), - ) - - # Create dataset and data loaders to be used in training loop - train_loader = DataLoader( - dataset=train_dataset, batch_size=256, shuffle=True + torch.from_numpy(test_features_year).float(), + torch.from_numpy(test_labels_year).float().squeeze(), ) + + # Create dataset and data loaders to be used in training loop + train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True) test_loader = DataLoader( dataset=test_dataset, batch_size=256, ) - + # Loop through epochs for epoch in range(1, individual_epochs + 1): model.train() @@ -141,10 +155,10 @@ def enable_dropout( total_loss = 0 total_mae = 0 - + # for each batch in train_loader for X_train_batch, y_train_batch in train_loader: - + # send to gpu if available X_train_batch = X_train_batch.to(device) y_train_batch = y_train_batch.to(device) @@ -155,7 +169,7 @@ def enable_dropout( # get prediction and calculate loss pred = model(X_train_batch) loss = criterion(pred, y_train_batch.unsqueeze(1)) - + # calculate dloss/dx for every parameter x (gradients) and advance optimizer loss.backward() optimizer.step() @@ -166,15 +180,14 @@ def enable_dropout( # divide total losses by number of batches and save to logs avg_mse = total_loss / len(train_loader) - raw_preds, preds, sd = model.predict(test_features_year, mc_iterations=100) - + year_df = test_features[test_features.year == year].copy() - year_df['preds'] = preds - year_df['std'] = sd + year_df["preds"] = preds + year_df["std"] = sd year_results.append(year_df) year_results = 
pd.concat(year_results).sort_index() -print('MSE:', np.mean((year_results['preds'] - test_labels)**2)) -year_results.to_csv('/users/pvankatw/emulator/untracked_folder/baylor_tests/85_full.csv') +print("MSE:", np.mean((year_results["preds"] - test_labels) ** 2)) +year_results.to_csv("/users/pvankatw/emulator/untracked_folder/baylor_tests/85_full.csv") diff --git a/manuscripts/A variational LSTM emulator/gpytorch/train.py b/manuscripts/A variational LSTM emulator/gpytorch/train.py index 0ad85d6..6d4b13d 100644 --- a/manuscripts/A variational LSTM emulator/gpytorch/train.py +++ b/manuscripts/A variational LSTM emulator/gpytorch/train.py @@ -1,14 +1,14 @@ -import pandas as pd -import numpy as np import gpytorch +import numpy as np +import pandas as pd import torch DATA_DIRECTORY = r"/users/pvankatw/emulator/untracked_folder/ml_data" -train_features = pd.read_csv(f'{DATA_DIRECTORY}/ts_train_features.csv') -train_labels = pd.read_csv(f'{DATA_DIRECTORY}/ts_train_labels.csv') -test_features = pd.read_csv(f'{DATA_DIRECTORY}/ts_test_features.csv') -test_labels = pd.read_csv(f'{DATA_DIRECTORY}/ts_test_labels.csv') +train_features = pd.read_csv(f"{DATA_DIRECTORY}/ts_train_features.csv") +train_labels = pd.read_csv(f"{DATA_DIRECTORY}/ts_train_labels.csv") +test_features = pd.read_csv(f"{DATA_DIRECTORY}/ts_test_features.csv") +test_labels = pd.read_csv(f"{DATA_DIRECTORY}/ts_test_labels.csv") train_features = torch.from_numpy(np.array(train_features)) @@ -16,6 +16,7 @@ test_features = torch.from_numpy(np.array(test_features)) test_labels = torch.from_numpy(np.array(test_labels)) + class GPyTorchModel(gpytorch.models.ExactGP): def __init__(self, train_x, train_y, likelihood): super(GPyTorchModel, self).__init__(train_x, train_y, likelihood) @@ -27,6 +28,7 @@ def forward(self, x): covar_x = self.covar_module(x) return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) + # initialize likelihood and model likelihood = gpytorch.likelihoods.GaussianLikelihood() model = GPyTorchModel(train_features, train_labels, likelihood) @@ -55,12 +57,17 @@ def forward(self, x): # Calc loss and backprop gradients loss = -mll(output, train_labels) loss.backward() - print('Iter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f' % ( - i + 1, training_iter, loss.item(), - model.covar_module.base_kernel.lengthscale.item(), - model.likelihood.noise.item() - )) + print( + "Iter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f" + % ( + i + 1, + training_iter, + loss.item(), + model.covar_module.base_kernel.lengthscale.item(), + model.likelihood.noise.item(), + ) + ) optimizer.step() -print(model) \ No newline at end of file +print(model) diff --git a/sbatch_queue.py b/sbatch_queue.py new file mode 100644 index 0000000..9e1594e --- /dev/null +++ b/sbatch_queue.py @@ -0,0 +1,144 @@ +import sys + +sys.path.append("../..") + +import numpy as np +from ise.data.feature_engineer import FeatureEngineer +import pandas as pd +import matplotlib.pyplot as plt +import torch +import os +from sklearn.metrics import r2_score + +from ise.models.grid import WeakPredictor +from ise.models.loss import WeightedMSELoss, WeightedMSELossWithSignPenalty, WeightedMSEPCALoss, WeightedPCALoss +from ise.evaluation.metrics import mape, relative_squared_error +from ise.utils.functions import get_X_y + +ice_sheet = 'AIS' +dataset = 'sectors' +loss = 'MSELoss' +lag = 5 +overwrite_data = True +scale = True + +train = True +epochs = 1 + + +dir_ = f"/oscar/scratch/pvankatw/datasets/{dataset}/{ice_sheet}/" +experiment_description = 
f'{dataset}_scaled{scale}_lag{lag}_{loss}' +print('Experiment Description:', experiment_description) +print('Ice Sheet:', ice_sheet) + +if not train and not os.path.exists(f"{dir_}/WeakPredictorModel_{experiment_description}.pth"): + raise FileNotFoundError(f"{dir_}/WeakPredictorModel_{experiment_description}.pth does not exist. Please train the model first.") + +grids_directory = f"/oscar/home/pvankatw/data/pvankatw/pvankatw-bfoxkemp/Grid_Files/" +grids_file = f"{grids_directory}/AIS_sectors_8km.nc" if ice_sheet=='AIS' else f"{grids_directory}/GrIS_Basins_Rignot_sectors_5km.nc" + + +train_exists = os.path.exists(f"{dir_}/train.csv") +if not train_exists or (train_exists and overwrite_data): + fe = FeatureEngineer(ice_sheet=ice_sheet, data=pd.read_csv(f"{dir_}/dataset.csv")) + fe.fill_mrro_nans(method='mean') + fe.add_lag_variables(lag=lag) + quantile = 0.005 + fe.drop_outliers(method='explicit', column='sle', expression=[('sle', '<', np.percentile(fe.data.sle, quantile*100))]) + if scale: + fe.scale_data(save_dir=f"{dir_}/") + fe.split_data(train_size=0.7, val_size=0.15, test_size=0.15, output_directory=dir_) + data = fe.train +else: + data = pd.read_csv(f"{dir_}/train.csv") + +# data = pd.read_csv('/users/pvankatw/research/A_variational_LSTM_emulator/emulator/untracked_folder/ml_data/ts_train_features.csv') +# data = data[data.columns[0:55]] +# data = data.sort_values(by=['model', 'exp', 'sector', 'year']) +X, y = get_X_y(pd.read_csv(f"{dir_}/train.csv"), 'sectors', return_format='numpy') +val_X, val_y = get_X_y(pd.read_csv(f"{dir_}/val.csv"), 'sectors', return_format='numpy') + + +# X = data +# y = pd.read_csv('/users/pvankatw/research/A_variational_LSTM_emulator/emulator/untracked_folder/ml_data/ts_train_labels.csv') + + +# dim_processor = DimensionProcessor( +# pca_model=f"{dir_}/pca_models/{ice_sheet}_pca_sle.pth", +# scaler_model=f"{dir_}/scalers/{ice_sheet}_scaler_sle.pth" +# ) +dim_processor = None + + +losses = dict(WeightedMSELoss=WeightedMSELoss(y.mean().mean(), y.flatten().std(),), + # WeightedMSEPCALoss=WeightedMSEPCALoss(y.mean().mean(), y.values.flatten().std(), component_weights), + # WeightedPCALoss=WeightedPCALoss(component_weights), + HuberLoss=torch.nn.HuberLoss(), + MSELoss=torch.nn.MSELoss(), + WeightedMSELossWithSignPenalty=WeightedMSELossWithSignPenalty(y.mean().mean(), y.flatten().std(), weight_factor=1.0, sign_penalty_factor=0.5), + ) + +model = WeakPredictor( + input_size=X.shape[1], + lstm_num_layers=1, + lstm_hidden_size=512, + output_size=1, + dim_processor=dim_processor, + ice_sheet = ice_sheet, + ) +if train: + model.fit(X, y, epochs=epochs, sequence_length=5, batch_size=256, loss=losses[loss], val_X=val_X, val_y=val_y) + torch.save(model.state_dict(), f"{dir_}/WeakPredictorModel_{experiment_description}.pth") +else: + model.load_state_dict(torch.load(f"{dir_}/WeakPredictorModel_{experiment_description}.pth", map_location=torch.device('cpu')), ) + +model.eval() +X = val_X +y = val_y + +y_preds = model.predict(X).cpu().detach().numpy() +comparison = pd.DataFrame(dict(y=y, y_preds=y_preds.flatten())) +comparison['diff'] = (comparison.y_preds - comparison.y).values +comparison['se'] = (comparison.y_preds - comparison.y).values **2 + +if scale: + fe = FeatureEngineer(ice_sheet=ice_sheet, data=pd.read_csv(f"{dir_}/dataset.csv")) + _, y = fe.unscale_data(y=y, scaler_y_path=f"{dir_}/scaler_y.pkl") + _, y_pred = fe.unscale_data(y=y_preds, scaler_y_path=f"{dir_}/scaler_y.pkl") + +# mse = pd.DataFrame(dict(mse=((y_preds-y)**2).sle, y_true=y.sle.squeeze(), 
y_pred=y_preds.flatten())) +mse = np.mean((y_pred.flatten() - y) ** 2) +mae = np.mean(abs((y_pred.flatten() - y.sle.values))) +rmse = np.sqrt(mse) +r2 = r2_score(y.sle.values, y_pred.flatten()) + +print(f"""Validation MSE: {mse} +-- MAE: {mae} +-- RMSE: {rmse} +-- R2: {r2} +-- Relative Squared Error: {relative_squared_error(y.sle.values, y_pred.flatten())} +-- Mean Absolute Percentage Error: {mape(y.sle.values, y_pred.flatten())}""") + +scenarios = data[['year', 'sector', 'aogcm', 'exp', 'model', 'Scenario', ]] +scenarios['predicted'], scenarios['true'] = y_preds, y +scenarios['squared_error'] = (scenarios['predicted'] - scenarios['true'])**2 +sector_errors = scenarios.groupby('sector').mean()['squared_error'] +scenario_timeseries = scenarios.groupby(['Scenario', 'year']).mean()['predicted'] + +plt.plot(np.arange(2015,2101), scenario_timeseries['rcp2.6'], label='RCP2.6') +plt.plot(np.arange(2015,2101), scenario_timeseries['rcp8.5'], label='RCP8.5') +plt.title('Average Projection by Scenario') +plt.legend() +plt.savefig(f'./supplemental/{ice_sheet}scenarios.png') +plt.close('all') + + +for i in [0, 5, 10, 15, 20, 25]: + plt.plot(y_pred[i*86:i*86+86, :], label='Predicted') + plt.plot(y.values[i*86:i*86+86, :], label='True') + plt.title('True v Predicted') + plt.legend() + plt.savefig(f'/users/pvankatw/research/current/supplemental/sectors/example_plots/{i}.png') + plt.close('all') + +stop = '' \ No newline at end of file diff --git a/setup.py b/setup.py index 25dc86e..ea7af07 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,22 @@ -from setuptools import setup, find_packages +from setuptools import find_packages, setup setup( - name='ise', - version='0.0.1', - description='Package for creating ice sheet emulators predicting future sea level rise.', - author='Peter Van Katwyk', - author_email='pvankatwyk@gmail.com', - packages=find_packages(), - install_requires=['pdoc', 'numpy', 'pandas', 'scikit-learn', 'torch', 'xarray', 'tensorboard', 'matplotlib', 'seaborn', 'tqdm'], -) \ No newline at end of file + name="ise", + version="0.0.1", + description="Package for creating ice sheet emulators predicting future sea level rise.", + author="Peter Van Katwyk", + author_email="pvankatwyk@gmail.com", + packages=find_packages(), + install_requires=[ + "pdoc", + "numpy", + "pandas", + "scikit-learn", + "torch", + "xarray", + "tensorboard", + "matplotlib", + "seaborn", + "tqdm", + ], +) diff --git a/tests/paths.py b/tests/paths.py index f561545..952c3c0 100644 --- a/tests/paths.py +++ b/tests/paths.py @@ -1,5 +1,9 @@ FORCING_DIRECTORY = r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/" -ISMIP6_OUTPUT_DIRECTORY = r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/Zenodo_Outputs/" -GRIDS_DIRECTORY = r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/ISMIP6_sectors/" +ISMIP6_OUTPUT_DIRECTORY = ( + r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/Zenodo_Outputs/" +) +GRIDS_DIRECTORY = ( + r"/users/pvankatw/data/pvankatw/pvankatw-bfoxkemp/GHub-ISMIP6-Forcing/AIS/ISMIP6_sectors/" +) PROCESSED_FORCING_OUTPUTS = r"/users/pvankatw/emulator/ise/data/datasets/processed_output_files/" -RESULTS_DATASET = r"/users/pvankatw/emulator/untracked_folder/analyze_model/results.csv" \ No newline at end of file +RESULTS_DATASET = r"/users/pvankatw/emulator/untracked_folder/analyze_model/results.csv" diff --git a/tests/test_processed_data.py b/tests/test_processed_data.py index ff3fad3..d730aeb 100644 --- 
a/tests/test_processed_data.py +++ b/tests/test_processed_data.py @@ -1,13 +1,31 @@ import os + import pandas as pd from paths import PROCESSED_FORCING_OUTPUTS + # Test Processed Data def test_processed_data_exists(): - assert os.path.exists(PROCESSED_FORCING_OUTPUTS), "Processed forcings directory doesn't exist. Run the processing pipelines found in ise.pipelines.processing." - + assert os.path.exists( + PROCESSED_FORCING_OUTPUTS + ), "Processed forcings directory doesn't exist. Run the processing pipelines found in ise.pipelines.processing." + + def test_processed_correct_files(): - assert all([sub in os.listdir(PROCESSED_FORCING_OUTPUTS) for sub in ['thermal_forcing.csv', 'salinity.csv', 'temperature.csv', 'master.csv', 'ice_collapse.csv', 'atmospheric_forcing.csv']]), "Forcing Directory does not contain the correct subdirectories." + assert all( + [ + sub in os.listdir(PROCESSED_FORCING_OUTPUTS) + for sub in [ + "thermal_forcing.csv", + "salinity.csv", + "temperature.csv", + "master.csv", + "ice_collapse.csv", + "atmospheric_forcing.csv", + ] + ] + ), "Forcing Directory does not contain the correct subdirectories." + thermal_forcing = pd.read_csv(f"{PROCESSED_FORCING_OUTPUTS}/thermal_forcing.csv") salinity = pd.read_csv(f"{PROCESSED_FORCING_OUTPUTS}/salinity.csv") @@ -16,13 +34,26 @@ def test_processed_correct_files(): ice_collapse = pd.read_csv(f"{PROCESSED_FORCING_OUTPUTS}/ice_collapse.csv") atmospheric_forcing = pd.read_csv(f"{PROCESSED_FORCING_OUTPUTS}/atmospheric_forcing.csv") processed_data = [thermal_forcing, salinity, temperature, master, ice_collapse, atmospheric_forcing] - + + def test_processed_nonempty(): - assert all([not dataset.empty for dataset in processed_data]), "One of the processed files is empty." + assert all( + [not dataset.empty for dataset in processed_data] + ), "One of the processed files is empty." + def test_processed_attributes(): # Test each dataset for correct columns - assert all([column in dataset for dataset in processed_data for column in ['year', 'aogcm', 'sectors']]), "Year, AOGCM, or Sectors columns are missing from processed data." - assert 'temperature' in temperature.columns, "Temperature column missing from temperature dataset." - assert 'salinity' in salinity.columns, "Salinity column missing from salinity dataset." - assert all([column in master.columns for column in ['salinity', 'temperature', 'thermal_forcing', 'evspsbl_anomaly']]), "Master dataset does not contain all columns." \ No newline at end of file + assert all( + [column in dataset for dataset in processed_data for column in ["year", "aogcm", "sectors"]] + ), "Year, AOGCM, or Sectors columns are missing from processed data." + assert ( + "temperature" in temperature.columns + ), "Temperature column missing from temperature dataset." + assert "salinity" in salinity.columns, "Salinity column missing from salinity dataset." + assert all( + [ + column in master.columns + for column in ["salinity", "temperature", "thermal_forcing", "evspsbl_anomaly"] + ] + ), "Master dataset does not contain all columns." diff --git a/tests/test_raw_data.py b/tests/test_raw_data.py index 674cb60..18981e3 100644 --- a/tests/test_raw_data.py +++ b/tests/test_raw_data.py @@ -1,66 +1,119 @@ import os + from paths import FORCING_DIRECTORY, GRIDS_DIRECTORY, ISMIP6_OUTPUT_DIRECTORY + # Test Forcing Data def test_forcing_data_exists(): - assert os.path.exists(FORCING_DIRECTORY), "Forcing Directory doesn't exist. 
Download it from Globus Collection 'GHub-ISMIP6-Forcing'" - + assert os.path.exists( + FORCING_DIRECTORY + ), "Forcing Directory doesn't exist. Download it from Globus Collection 'GHub-ISMIP6-Forcing'" + + def test_forcing_AIS_directory(): assert FORCING_DIRECTORY.endswith(r"AIS/"), "Directory must be specific to the AIS." - + + def test_forcing_correct_subfolders(): - assert all([sub in os.listdir(FORCING_DIRECTORY) for sub in ['Ocean_Forcing', 'Ice_Shelf_Fracture', 'Atmosphere_Forcing']]), "Forcing Directory does not contain the correct subdirectories." + assert all( + [ + sub in os.listdir(FORCING_DIRECTORY) + for sub in ["Ocean_Forcing", "Ice_Shelf_Fracture", "Atmosphere_Forcing"] + ] + ), "Forcing Directory does not contain the correct subdirectories." + def test_forcing_file_counts(): assert len(os.listdir(f"{FORCING_DIRECTORY}/Ocean_Forcing")) == 26 assert len(os.listdir(f"{FORCING_DIRECTORY}/Ice_Shelf_Fracture")) == 14 assert len(os.listdir(f"{FORCING_DIRECTORY}/Atmosphere_Forcing")) == 19 - + count = 0 for _, _, files in os.walk(FORCING_DIRECTORY): count += len(files) - - assert count == 3673, "All files as found in Globus are not present. Functionality may be limited." + assert ( + count == 3673 + ), "All files as found in Globus are not present. Functionality may be limited." # Test Grids Data def test_grids_data_exists(): - assert os.path.exists(GRIDS_DIRECTORY), "Grid Data Directory doesn't exist. Contact Helene Seroussi to get access." + assert os.path.exists( + GRIDS_DIRECTORY + ), "Grid Data Directory doesn't exist. Contact Helene Seroussi to get access." + - def test_grids_AIS_directory(): - assert r'AIS' in GRIDS_DIRECTORY, "Directory must be specific to the AIS." - + assert r"AIS" in GRIDS_DIRECTORY, "Directory must be specific to the AIS." + + def test_grids_correct_files(): - assert all([sub in os.listdir(GRIDS_DIRECTORY) for sub in ['sectors_32km.nc', 'sectors_16km.nc', 'sectors_8km.nc', 'sectors_4km.nc']]) - + assert all( + [ + sub in os.listdir(GRIDS_DIRECTORY) + for sub in ["sectors_32km.nc", "sectors_16km.nc", "sectors_8km.nc", "sectors_4km.nc"] + ] + ) + + def test_grids_file_counts(): assert len(os.listdir(GRIDS_DIRECTORY)) == 4 assert all(substring.endswith(".nc") for substring in os.listdir(GRIDS_DIRECTORY)) - - - - + + # Test Output Data def test_output_data_exists(): - assert os.path.exists(ISMIP6_OUTPUT_DIRECTORY), "ISMIP6 Output Directory doesn't exist. Download it from https://zenodo.org/record/3940766#.Y7yKwXZKhrp" - + assert os.path.exists( + ISMIP6_OUTPUT_DIRECTORY + ), "ISMIP6 Output Directory doesn't exist. Download it from https://zenodo.org/record/3940766#.Y7yKwXZKhrp" + + def test_output_AIS_directory(): - assert 'AIS' in ISMIP6_OUTPUT_DIRECTORY, "Directory must be specific to the AIS." - + assert "AIS" in ISMIP6_OUTPUT_DIRECTORY, "Directory must be specific to the AIS." 
+ + def test_output_correct_subfolders(): - assert all([sub in os.listdir(ISMIP6_OUTPUT_DIRECTORY) for sub in ['ComputedScalarsPaper',]]) - assert all([sub in os.listdir(f"{ISMIP6_OUTPUT_DIRECTORY}/ComputedScalarsPaper/") for sub in ['JPL1', 'VUB', 'NCAR', 'AWI', 'PIK', 'UTAS', 'UCIJPL', 'VUW', 'ULB', 'LSCE', 'DOE', 'ILTS_PIK', 'IMAU']]) + assert all( + [ + sub in os.listdir(ISMIP6_OUTPUT_DIRECTORY) + for sub in [ + "ComputedScalarsPaper", + ] + ] + ) + assert all( + [ + sub in os.listdir(f"{ISMIP6_OUTPUT_DIRECTORY}/ComputedScalarsPaper/") + for sub in [ + "JPL1", + "VUB", + "NCAR", + "AWI", + "PIK", + "UTAS", + "UCIJPL", + "VUW", + "ULB", + "LSCE", + "DOE", + "ILTS_PIK", + "IMAU", + ] + ] + ) + def test_output_file_counts(): assert len(os.listdir(f"{ISMIP6_OUTPUT_DIRECTORY}/ComputedScalarsPaper")) == 13 assert len(os.listdir(f"{ISMIP6_OUTPUT_DIRECTORY}/ComputedScalarsPaper/JPL1")) == 1 assert len(os.listdir(f"{ISMIP6_OUTPUT_DIRECTORY}/ComputedScalarsPaper/UCIJPL/ISSM")) == 21 assert len(os.listdir(f"{ISMIP6_OUTPUT_DIRECTORY}/ComputedScalarsPaper/DOE/MALI")) == 10 - + count = 0 for _, _, files in os.walk(ISMIP6_OUTPUT_DIRECTORY): count += len(files) - - assert count == 3335, "All files as found in Zenodo are not present. Functionality may be limited." \ No newline at end of file + + assert ( + count == 3335 + ), "All files as found in Zenodo are not present. Functionality may be limited." diff --git a/tests/test_results_dataset.py b/tests/test_results_dataset.py index 5e42aad..49c3e4b 100644 --- a/tests/test_results_dataset.py +++ b/tests/test_results_dataset.py @@ -1,15 +1,24 @@ import os + import pandas as pd from paths import RESULTS_DATASET + # Test Results Data def test_results_data_exists(): - assert os.path.exists(RESULTS_DATASET), "Results dataset doesn't exist. Run the testing procedure to generate using ise.utils.data.combine_testing_results." - + assert os.path.exists( + RESULTS_DATASET + ), "Results dataset doesn't exist. Run the testing procedure to generate using ise.utils.data.combine_testing_results." + + results = pd.read_csv(RESULTS_DATASET) - + + def test_results_nonempty(): assert not results.empty, "Results dataset is empty." + def test_results_attributes(): - assert all([col in results.columns for col in ['sectors', 'aogcm', 'modelname', 'exp_id', 'salinity']]) \ No newline at end of file + assert all( + [col in results.columns for col in ["sectors", "aogcm", "modelname", "exp_id", "salinity"]] + )
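
The refactored modules under tests/ follow standard pytest conventions (module-level test_* functions with bare assert statements and explanatory messages), so the suite can also be invoked programmatically rather than only through the pytest CLI. A minimal sketch, assuming pytest is installed and the command is run from the repository root; the -q flag and the tests/ path are illustrative choices, not part of the patch above:

import sys

import pytest

if __name__ == "__main__":
    # pytest.main accepts CLI-style arguments and returns an exit code
    # (0 when all tests pass); forward it to the shell so CI can detect failures.
    exit_code = pytest.main(["-q", "tests/"])
    sys.exit(exit_code)

Because the tests check for large external datasets (Globus forcings, Zenodo outputs, processed CSVs), they are expected to fail on machines where those directories have not been downloaded; the assertion messages in the patch indicate where each dataset can be obtained.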