-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSTEP2_interp_levels_avg_ens.py
362 lines (283 loc) · 12.4 KB
/
STEP2_interp_levels_avg_ens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
# STEP2 of preprocessing raw 3D files
# Calculate ensemble averages and interpolate
# hybrid sigma to pressure levels
#%%
import Ngl
import os
import re
import glob
from datetime import datetime
import xarray as xr
import numpy as np
import pandas as pd
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
from numba import jit
# %matplotlib inline
from pooled_stats import *
#%%
# filename = "/media/gabriel/hd2_6tb/ccsm4/test/rcp0.0_veg_005.cam2.h0.1950-01.nc"
basefolder = "output/"
baseoutfolder = "output/"
scens = ["rcp8.5_weg", "rcp8.5_seg", "rcp2.6_weg", "rcp2.6_seg"]
syear = 2040
eyear = 2050
years = list(range(syear, eyear + 1))
nyears = len(years)
# Interpolation parameters
# New pressure levels. We need them in hPa/mb here for Ngl.vinth2p,
# but will convert them to Pa later for consistency with reference CMIP5 files
pnew = [1000.0,925.0,850.0,700.0,600.0,500.0,400.0,300.0,250.0,200.0,150.0,100.0,70.0,50.0,30.0,20.0,10.0]
intyp = 1 # 1=linear, 2=log, 3=log-log
kxtrp = True # Should we extrapolate into levels below the topography?
# %% Function definitions
def drop_unused_vars(ds):
# smallvarnames = [var for var in ds.data_vars if ds[var].shape.__len__() < 4]
fixedvarnames = [var for var in ds.data_vars if ds[var].shape.__len__() <= 2]
smallvarnames = [\
"CLDHGH", "CLDLOW", "CLDMED", "CLDTOT", \
"FSDS", "FLDS", "SOLIN", "SRFRAD",\
"LHFLX", "SHFLX", "QFLX",\
"PBLH", "PHIS", "PS","PSL",
"TMQ",\
"PRECC", "PRECL", "PRECT",\
"QREFHT", "RHREFHT", "TREFHT", "TREFMNAV", "TREFMXAV","TS","U10",\
]
smallvarnames.extend([i + "_var" for i in smallvarnames])
bigvarnames = ["Z3","T","Q","U","V","OMEGA","VQ","OMEGAQ"] #There is no UQ for some reason
bigvarnames.extend([i + "_var" for i in bigvarnames])
# bigvarnames = ["Z3","T"]
bothvarnames = smallvarnames + bigvarnames + fixedvarnames
dropvarnames = [i for i in ds.data_vars if i not in bothvarnames]
ds = ds[bothvarnames]
return(ds)
#%% Scenario loop
# scen = scens[3]
for scen in scens:
print("=================================================")
print("Running " + scen)
print("=================================================")
# Create output folder
outfolder = baseoutfolder + scen + "/"
os.makedirs(outfolder, exist_ok=True)
regex = re.compile(scen + "_[0-9][0-9][0-9]_" + str(syear) + "_" + str(eyear) + ".*")
infnames = [baseoutfolder + scen + "/" + i for i in os.listdir(baseoutfolder + scen) if regex.match(i)]
# items = [{"filename" : filename, "dset" : xr.open_dataset(filename), "member" : re.sub(scen + "_([0-9][0-9][0-9])_.*","\g<1>", os.path.basename(filename))} for filename in infnames]
# Load data into a list of dicts with the data, the filename and the member
print("Loading inputs...")
items = [{"filename" : filename, "dset" : drop_unused_vars(xr.open_dataset(filename)), "member" : re.sub(scen + "_([0-9][0-9][0-9])_.*","\g<1>", os.path.basename(filename))} for filename in infnames]
# Expand a "member" dimension in each DataSet and return a simple list of them
print("Expanding dimensions...")
expitems = [item["dset"].expand_dims("member") for item in items]
#%%
# Combine the DataSets on the "member" dimension and take the ensemble mean
# print("Combining and taking ensemble mean...")
# dsmean = xr.combine_nested(expitems, concat_dim="member", combine_attrs= "override").mean(dim = "member", keep_attrs = True)
print("Combining ...")
bigds = xr.combine_nested(expitems, concat_dim="member", combine_attrs= "override")
# Separate Datasets for means and variances
regexvar = re.compile(".*_var$")
dsmeans = bigds[[i for i in bigds.data_vars if not regexvar.match(i)]]
# Rename variance variable names to match those of the means for pooling
dsvars = bigds[[i for i in bigds.data_vars if regexvar.match(i)]]
dsvars = dsvars.rename({i:(re.sub("(.*)_var$","\g<1>",i)) for i in dsvars.data_vars})
# Aggregate means by taking a simple mean
dsmean = dsmeans.mean(dim = "member", keep_attrs = True)
# Aggregate variances by applying pooled_
dsvariance = xr.apply_ufunc(pool_variances_dask, dsmeans, dsvars, nyears, \
dask = "parallelized", input_core_dims=[["member"], ["member"], [] ],
keep_attrs=True)
# Join variance variables into dsmean
dsvariance = dsvariance.rename({i:(i + "_var") for i in dsvariance.data_vars})
dsmean = dsmean.merge(dsvariance)
#%%
# With the climatological monthly ensemble means, we will now interpolate
# levels on each 4D variable
print("Preparing for interpolation...")
# List of variables to interpolate
intvarnames = [var for var in dsmean.data_vars if dsmean[var].shape.__len__() == 4]
# Extract numpy arrays of the relevant variables for Ngl.vinth2p
# We'll need the name of time variable, could be "time" or "month" at this point
# We need it because previous steps likely copied these variables over all times
timvarname = [i for i in ["time", "month"] if i in list(dsmean.coords)][0]
hyam = dsmean["hyam"][{timvarname : 0}] # Hybrid level coefficient A
hybm = dsmean["hybm"][{timvarname : 0}] # Hybrid level coefficient B
P0mb = 0.01*dsmean["P0"][{timvarname : 0}] # Reference pressure (scalar)
psrf = dsmean["PS"] # Pressure at surface (time-dependent)
lats = dsmean["lat"]
lons = dsmean["lon"]
times = dsmean[timvarname]
# Define a coordinate variable for the output pressure levels
# Set units to "hPa" here, but will override later when converting
lev = xr.DataArray(pnew, coords = [pnew], dims = "lev", attrs = {"units" : "hPa"})
# DataSet that will contain all output
dsout = xr.Dataset()
#%%
# 4D variables loop
# varname = intvarnames[1]
for varname in intvarnames:
print("Interpolating " + varname)
var = dsmean[varname]
# Actual interpolation
varnew = Ngl.vinth2p(var,hyam,hybm,pnew,psrf,intyp,P0mb,1,kxtrp)
# Put the interpolated variable in a DataSet with it's metadata
dsnew = xr.Dataset({varname: ((timvarname,'lev','lat','lon'), varnew[:,:,:,:])},
{'lon': lons, 'lat': lats, timvarname: times, 'lev' : lev})
# Convert vertical coordinate from hPa to Pa
dsnew["lev"] = dsnew["lev"]*100.0
dsnew["lev"].attrs['units'] = 'Pa'
# Copy metadata from the original variable
dsnew[varname].name = var.name
dsnew[varname].attrs = var.attrs
# Merge this DataSet with the output one, that starts out empty
dsout = dsout.merge(dsnew)
# Merge in the 2D variables
dsout = dsout.merge(dsmean[list(set([i for i in list(dsmean.data_vars) if dsmean[i].shape.__len__() == 3]) - \
set(dsout.data_vars))])
# Copy attributes from the original dataset and add some
dsout.attrs = dsmean.attrs
dsout.attrs['STEP2_comment'] = 'Vertical levels interpolated via Ngl.vinth2p using method (intyp) ' + str(intyp) + " and extrapolation " + str(kxtrp) + " on " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
dsout.attrs['STEP2_original_levels'] = str(dsmean.lev)
#%%
# Save output file
outfname = outfolder + scen + "_ensmean_pres_"+ str(syear) + "_" + str(eyear) + ".nc"
dsout.to_netcdf(outfname)
# Cleanup
del(dsout)
del(dsmean)
del(dsvariance)
del(dsmeans)
del(dsvars)
del(bigds)
del(expitems)
del(items)
# #%%
# bigv = np.random.normal(4,1,40)
# np.var(bigv)
# n = 10
# subv = [bigv[i:i+n] for i in range(0,len(bigv),n)]
# means = [np.mean(i) for i in subv]
# variances = [np.var(i, ddof = 1) for i in subv]
# @jit(nopython = True)
# def pool_pair_means(a,b,rn):
# out = (rn*a + b)/(rn+1)
# return(out)
# @jit(nopython = True)
# def pool_means(v):
# i = 0
# s = v[0]
# while i < (len(v)-1):
# rn = i + 1
# s = pool_pair_means(s,v[i+1],rn)
# i = i + 1
# return(s)
# @jit(nopython = True)
# def pool_pair_variances(v1,v2,m1,m2,n1,n2):
# vp = (1/(n1+n2-1)) * ( (n1-1)*v1 + (n2-1)*v2 + ((n1*n2)/(n1+n2)) * (m1-m2)**2 )
# return(vp)
# @jit(nopython = True)
# def pool_variances(means, variances,n):
# i = 0
# vp = variances[0]
# mp = means[0]
# while i < (len(variances)-1):
# rn = i + 1
# vp = pool_pair_variances(vp, variances[i+1], mp, means[i+1], n*rn, n)
# mp = pool_pair_means(mp,means[i+1],rn)
# i = i + 1
# return(vp)
# pool_variances(means,variances,n)
# np.var(bigv, ddof = 1)
# pool_pair_variances(variances[0],variances[1],means[0],means[1],len(subv[0]),len(subv[1]))
# np.var(np.concatenate([subv[0],subv[1]]), ddof =1 )
# pool_means(means)
# np.mean(bigv)
# subsoma(1,4)
#%% Old code
# #%% Folders (members) loop
# # infolder = infolders[0]
# for infolder in infolders:
# memberstr = (os.path.split(os.path.split(os.path.split(infolder)[0])[0])[1]).split(sep = "_")[-1]
# # filenames = glob.glob(infolder + '*{2030..2050}*')
# filenames = [glob.glob(infolder + "*." + mon + ".*") for mon in [str(year) + "-" + i for year in years for i in [str(j).zfill(2) for j in range(1,13)]]]
# filenames = [i for j in filenames for i in j] # Flatten it
# #%% Open a folder's (member) dataset
# inds = xr.open_mfdataset(filenames)
# tmeands = inds.groupby("time.month").mean()
# #%% Save output. Since we're lazy loading, this is where most of the processing happens
# outfname = outfolder + scen + "_" + memberstr + "_" + str(syear) + "_" + str(eyear) + ".nc"
# print(outfname + " | " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
# tmeands.to_netcdf(outfname)
# #%%
# # ds = xr.open_dataset(filename)
# ds = xr.open_mfdataset(glob.glob(infolder + '*1950*'))
# #%%
# varname = "T"
# # Extract the desired variables (need numpy arrays for vertical interpolation)
# hyam = ds["hyam"].isel(time = 0)
# hybm = ds["hybm"].isel(time = 0)
# # psrf = ds["PS"].isel(time = 0)
# psrf = ds["PS"]
# P0mb = 0.01*ds["P0"].isel(time = 0)
# lats = ds["lat"]
# lons = ds["lon"]
# times = ds['time']
# var = ds[varname]
# # Define the output pressure levels.
# # We need them in hPa/mb here for Ngl.vinth2p, but convert to Pa later for consistency with reference CMIP5 files
# pnew = [1000.0,925.0,850.0,700.0,600.0,500.0,400.0,300.0,250.0,200.0,150.0,100.0,70.0,50.0,30.0,20.0,10.0]
# lev = xr.DataArray(pnew, coords = [pnew], dims = "lev", attrs = {"units" : "hPa"})
# # Do the interpolation.
# intyp = 1 # 1=linear, 2=log, 3=log-log
# kxtrp = True # True=extrapolate (when the output pressure level is outside of the range of psrf)
# # FIXME:Load the variable onto memory if it's a dask array.
# # Not sure if Ngl works well with those, check later
# # if var.chunks is not None:
# # var.load()
# # hyam.load()
# # hybm.load()
# # psrf.load()
# # P0mb.load()
# # lats.load()
# # lons.load()
# # times.load()
# #%%
# # Vertical interpolation
# varnew = Ngl.vinth2p(var,hyam,hybm,pnew,psrf,intyp,P0mb,1,kxtrp)
# # varnew = Ngl.vinth2p(var3,hyam.values,hybm.values,pnew,psrf.values,intyp,P0mb.values,1,kxtrp)
# # varnew = Ngl.vinth2p(var.isel(time = [0]).values,hyam,hybm,pnew,psrf,intyp,P0mb,1,kxtrp)
# # varnew[varnew==1e30] = np.NaN
# varnew.shape
# # varnew
# #%%
# # Create new xarray Dataset
# dsnew = xr.Dataset({varname: (('time','lev','lat','lon'), varnew[:,:,:,:])},
# {'lon': lons, 'lat': lats, 'time': times, 'lev' : lev})
# # dsnew = xr.Dataset({varname: (('time','lev','lat','lon'), varnew[:,:,:,:])},
# # {'lon': lons, 'lat': lats, 'time': [0,1], 'lev' : lev})
# dsnew["lev"] = dsnew["lev"]*100.0
# dsnew["lev"].attrs['units'] = 'Pa'
# dsnew[varname].name = ds[varname].name
# dsnew[varname].attrs = ds[varname].attrs
# dsnew
# #%%
# dsnew.to_netcdf("~/ccsm4/test/lala.nc")
# #%%
# dsnew.rename(T = "ta").to_netcdf("~/ccsm4/test/lala.nc")
# #%%
# maxlat = 5.0
# minlat = -33.0
# maxlon = 326.0
# # minlon = 280.0
# minlon = 300.0
# pds = dsnew.sel(lat = slice(minlat,maxlat), lon = slice(minlon,maxlon))
# # pds[varname].sel(lev = 925).plot()
# pds[varname].isel(time=0).sel(lat = [-20,-10,0], lon = 300, method = "nearest").plot.line(x = "lev")
# #%%
# # Plotting
# fig = plt.figure(figsize=(15, 5))
# ax = plt.axes(projection=ccrs.PlateCarree())
# dsnew.T850.plot(ax=ax, transform=ccrs.PlateCarree(), cmap=load_cmap('vik'))
# ax.coastlines()
# plt.title(ds.time.values[0].strftime("%B year %Y"))
# %%