######################################################################################################
# Author: Ellen Sarauer #
# Affiliation: German Aerospace Center (DLR) #
# Filename: preprocess_regression.py #
######################################################################################################
# In this script we preprocess our data for the Microphysics Regression Model. #
# We load our netCDF simulation files and apply preselection criteria. #
# We split the data into train, validation and test sets and save them. #
# For more information, please check the Methodology section in our paper. #
######################################################################################################
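# Inputs:  netCDF simulation output files found under data_path (see the glob patterns below).
# Outputs: three NumPy arrays (train / validation / test) saved as .npy files under out_path.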
# Import
import xarray as xr
import numpy as np
import pandas as pd
import glob
from sklearn.utils import shuffle
# Load Data
data_path = "path/to/data/"
# ml_varlist = 'ps', 'psl', 'rsdt', 'rsut', 'rsutcs', 'rlut', 'rlutcs',
# 'rsds', 'rsdscs', 'rlds', 'rldscs', 'rsus', 'rsuscs', 'rlus',
# 'ts', 'sic', 'sit', 'clt', 'prlr', 'prls', 'pr', 'prw',
# 'cllvi', 'clivi', 'qgvi', 'qrvi', 'qsvi', 'cptgzvi', 'hfls',
# 'hfss', 'evspsbl', 'tauu', 'tauv', 'sfcwind', 'uas', 'vas',
# 'tas', 'pr_rain', 'pr_ice', 'pr_snow', 'pr_grpl'
atm_2d_general_vars_path = glob.glob(data_path+"*atm_2d_general_vars_ml_20200207*")
# ml_varlist = 'rho', 'ta', 'ua', 'va', 'tv', 'omega', 'hus', 'hur', 'clw',
# 'cli', 'cl'
atm_3d_general_vars_path = glob.glob(data_path+"*atm_cl_ml_20200207*")
# ml_varlist = 'dz_mig', 'rho_mig', 'pf_mig', 'cpair_mig', 'ta_mig', 'qv_mig',
# 'qc_mig', 'qi_mig', 'qr_mig', 'qs_mig', 'qg_mig'
atm_mig_inputs_path = glob.glob(data_path+"*mig_inputs_ml_20200207*")
# ml_varlist = 'tend_ta_mig', 'tend_qhus_mig', 'tend_qclw_mig',
# 'tend_qcli_mig', 'tend_qr_mig', 'tend_qs_mig', 'tend_qg_mig'
atm_mig_tendencies_path = glob.glob(data_path+"*mig_tendencies_ml_20200207*")
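# The four glob groups above pick one simulation output stream each (date tag 20200207):
# 2D diagnostics, 3D atmospheric state, microphysics ("mig") inputs and microphysics tendencies.
# Only the mig input and tendency streams are used below; the general-variable streams are kept for reference.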
# Create data arrays from nc files
def create_data_array(path_to_files, varname):
    # Start from an empty array so that no placeholder zeros end up in the joined result.
    joined_arr = np.zeros((0,))
    for path in path_to_files:
        file = xr.open_dataset(path)
        raw_arr = file[varname]
        # Keep model levels 1-50 only; dimensions are presumably (time, level, cell).
        del_100_arr = raw_arr[:, 1:51, :]
        cut_arr = np.array(del_100_arr)
        # Each file is expected to contribute exactly 32,768,000 values after the level cut.
        out_arr = np.reshape(cut_arr, (32768000,))
        joined_arr = np.concatenate((joined_arr, out_arr))
    print(f"Final array shape of {varname}: {joined_arr.shape}")
    return joined_arr
# Create arrays
print("create input arrays")
dz_mig = create_data_array(atm_mig_inputs_path, "dz_mig")
pf_mig = create_data_array(atm_mig_inputs_path, "pf_mig")
ta_mig = create_data_array(atm_mig_inputs_path, "ta_mig")
qv_mig = create_data_array(atm_mig_inputs_path, "qv_mig")
qc_mig = create_data_array(atm_mig_inputs_path, "qc_mig")
qi_mig = create_data_array(atm_mig_inputs_path, "qi_mig")
qr_mig = create_data_array(atm_mig_inputs_path, "qr_mig")
qs_mig = create_data_array(atm_mig_inputs_path, "qs_mig")
qg_mig = create_data_array(atm_mig_inputs_path, "qg_mig")
#tv = create_data_array(atm_3d_general_vars_path,"tv")
#omega = create_data_array(atm_3d_general_vars_path,"wap")
#ua = create_data_array(atm_3d_general_vars_path,"ua")
#va = create_data_array(atm_3d_general_vars_path,"va")
#hus = create_data_array(atm_3d_general_vars_path,"hus")
print("create output arrays")
tend_ta_mig = create_data_array(atm_mig_tendencies_path, "tend_ta_mig")
tend_qv_mig = create_data_array(atm_mig_tendencies_path,"tend_qhus_mig")
tend_qc_mig = create_data_array(atm_mig_tendencies_path,"tend_qclw_mig")
tend_qi_mig = create_data_array(atm_mig_tendencies_path,"tend_qcli_mig")
tend_qr_mig = create_data_array(atm_mig_tendencies_path,"tend_qr_mig")
tend_qs_mig = create_data_array(atm_mig_tendencies_path,"tend_qs_mig")
tend_qg_mig = create_data_array(atm_mig_tendencies_path,"tend_qg_mig")
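# Note: the humidity and cloud tendencies are renamed on load for shorter column names
# (tend_qhus_mig -> tend_qv_mig, tend_qclw_mig -> tend_qc_mig, tend_qcli_mig -> tend_qi_mig).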
# Fill dataframe
print("fill dataframe")
df_mig = pd.DataFrame()
df_mig["dz_mig"] = dz_mig
df_mig["pf_mig"] = pf_mig
df_mig["ta_mig"] = ta_mig
df_mig["qv_mig"] = qv_mig
df_mig["qc_mig"] = qc_mig
df_mig["qi_mig"] = qi_mig
df_mig["qr_mig"] = qr_mig
df_mig["qs_mig"] = qs_mig
df_mig["qg_mig"] = qg_mig
#df_mig["tv"] = tv
#df_mig["omega"] = omega
#df_mig["ua"] = ua
#df_mig["va"] = va
#df_mig["hus"] = hus
df_mig["tend_ta_mig"] = tend_ta_mig
df_mig["tend_qv_mig"] = tend_qv_mig
df_mig["tend_qc_mig"] = tend_qc_mig
df_mig["tend_qi_mig"] = tend_qi_mig
df_mig["tend_qr_mig"] = tend_qr_mig
df_mig["tend_qs_mig"] = tend_qs_mig
df_mig["tend_qg_mig"] = tend_qg_mig
# Apply preselection criteria
print("apply basic preselection criteria")
df_mig = df_mig.dropna()
print(df_mig.shape)
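# A sample counts as "mig_active" only if the microphysics scheme did significant work there:
# |temperature tendency| > 1e-6, |vapour + cloud water tendency| > 1e-9, and the summed
# |ice, rain, snow and graupel tendencies| > 1e-10 (thresholds in the model's native units,
# presumably K s-1 and kg kg-1 s-1).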
df_mig_cld_sig = df_mig.copy()
cond1 = abs(df_mig_cld_sig['tend_ta_mig']) > 10**-6
cond2 = (abs(df_mig_cld_sig['tend_qv_mig']) + abs(df_mig_cld_sig['tend_qc_mig'])) > 10**-9
cond3 = (abs(df_mig_cld_sig['tend_qi_mig']) + abs(df_mig_cld_sig['tend_qr_mig']) + abs(df_mig_cld_sig['tend_qs_mig']) + abs(df_mig_cld_sig['tend_qg_mig'])) > 10**-10
df_mig_cld_sig['mig_active'] = np.where(cond1 & cond2 & cond3, 1, 0)
count_ones = df_mig_cld_sig['mig_active'].sum()
print("Number of mig_active samples:", count_ones)
my_df = df_mig_cld_sig[df_mig_cld_sig['mig_active'] == 1]
print(f"sig samples: {len(my_df)} out of {len(df_mig)} ({np.round(100*len(my_df)/len(df_mig), 1)} percent) remain.")
final_df = my_df[["dz_mig","pf_mig","ta_mig","qv_mig","qc_mig","qi_mig","qr_mig","qs_mig","qg_mig",
"tend_ta_mig","tend_qv_mig","tend_qc_mig","tend_qi_mig","tend_qr_mig","tend_qs_mig","tend_qg_mig"]]
# Convert to numpy and shuffle
print("Split in train, val, test.")
final_array = final_df.to_numpy()
# Draw a fixed-size subsample (the shuffle below samples without replacement,
# so at least this many significant rows must be available)
total_num_samples = 8000000
print(f"total number of samples: {total_num_samples}")
num_train_samples = int(10*total_num_samples/12)
num_val_samples = int((total_num_samples-num_train_samples)/2)
num_test_samples = total_num_samples - num_train_samples - num_val_samples
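# Split fractions: 10/12 (~83.3 %) of the subsample for training, with the remainder
# divided evenly between validation and test (~8.3 % each).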
data_final = shuffle(final_array, n_samples=total_num_samples)
set_train = data_final[:num_train_samples]
set_val = data_final[num_train_samples:(num_train_samples+num_val_samples), :]
set_test = data_final[(num_train_samples+num_val_samples):, :]
# Save preprocessed files
out_path = "path/to/out/"
np.save(out_path + "df_nextgems_mig_subset_regression_train.npy", set_train)
np.save(out_path + "df_nextgems_mig_subset_regression_val.npy", set_val)
np.save(out_path + "df_nextgems_mig_subset_regression_test.npy", set_test)
# Check shapes
print(set_train.shape)
print(set_test.shape)
print(set_val.shape)
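# Note: the saved arrays keep the column order of final_df, i.e. columns 0-8 hold the
# 9 inputs (dz_mig ... qg_mig) and columns 9-15 the 7 tendency targets. A downstream script
# could reload them with, e.g., np.load(out_path + "df_nextgems_mig_subset_regression_train.npy").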