Skip to content

Commit

Permalink
add logic for all time unit types
Browse files Browse the repository at this point in the history
  • Loading branch information
jtmims committed Jan 16, 2025
1 parent 210acc5 commit 5a42bda
Showing 1 changed file with 69 additions and 19 deletions.
88 changes: 69 additions & 19 deletions src/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -1014,30 +1014,80 @@ def check_group_daterange(self, df: pd.DataFrame, date_range: util.DateRange,
# hit an exception; return empty DataFrame to signify failure
return pd.DataFrame(columns=group_df.columns)

def normalize_time_units(self, subset_dict: dict, log=_log) -> dict:
def normalize_time_units(self, subset_dict: dict, time_coord, log=_log) -> dict:
"""
Some datasets will have the time units defined based on each individual file.
This function updates each time unit to rely on the earliest year grabbed
in the query stage.
Some datasets will have the time units that are different in each individual file.
This function updates each time unit to rely on the earliest year grabbed in the
query stage.
This function assumes the time coord units attr will be of the form "{unit} since ????".
"""
time_units = [subset_dict[f].time.units for f in list(subset_dict)]

if len(set(time_units)) > 1:
if all(["days since " in u for u in time_units]):
#assumes "days since ????" unit format
start = np.sort([int(u.replace("days since ", "")[:4]) for u in time_units])[0]
new_unit = f"days since {start}-01-01 00:00:00"

time_units = np.sort([subset_dict[f].time.units for f in list(subset_dict)])
tn = time_coord.name #abbreviate

# assumes each dataset has the same calendar
cal = 'noleap'
if 'calendar' in subset_dict[list(subset_dict)[0]][tn].attrs:
cal = subset_dict[list(subset_dict)[0]][tn].attrs['calendar']
elif 'calendar' in subset_dict[list(subset_dict)[0]][tn].encoding:
cal = subset_dict[list(subset_dict)[0]][tn].encoding['calendar']

if len(set(time_units)) > 1: # check if each dataset has the different time coord units
# check if time coord units are in the form "{unit} since {date}"
# they can be different units as this function converts to the earliest case
if all(["since" in u for u in time_units]):
start_unit = time_units[0].split(" ")[0]
start_str = " ".join(time_units[0].split(" ")[2:])
start_cft = dl.str_to_cftime(
start_str.replace(" ","").replace(":", "").replace("-", ""),
calendar=cal
)
new_unit_str = f"{start_unit} since {start_str}"

# dictionary of how many seconds are in each time unit
seconds_in = {
"seconds": 1.0,
"minutes": 60.0,
"hours": 3600.0,
"days": 86400.0,
"weeks": 604800.0, # these are rarer and vague cases (they could be problematic)
"months": 2628000.0, # seconds in common year (365 days) / 12
"years": 31536000.0 # common year (365 days)
}


for f in list(subset_dict):
current = int(subset_dict[f].time.units.replace("days since ", "")[:4])
if current > start:
subset_dict[f].coords['time'] = subset_dict[f].time.assign_attrs(
units=new_unit
current_unit = subset_dict[f][time_coord.name].units.split(" ")[0].lower()
current_str = " ".join(subset_dict[f][tn].units.split(" ")[2:])
current_cft = dl.str_to_cftime(
current_str.replace(" ","").replace(":", "").replace("-", ""),
calendar=cal
)

#TODO: add logic to add year values for different calendars

if current_cft > start_cft:
# get difference between current files unit reference point and earliest found
diff = ((current_cft-start_cft).total_seconds())/seconds_in[start_unit]

subset_dict[f].coords['time'] = subset_dict[f][tn].assign_attrs(
units=new_unit_str
)
for i, v in enumerate(subset_dict[f].time.values):
subset_dict[f].coords['time'].values[i] = v + 365*(current-start) + 1

# convert current unit if it is not the same as the earliest reference
if current_unit != start_unit:
factor = seconds_in[current_unit]/seconds_in[start_unit]
else:
factor = 1.0

# change the values in the dataset
for i, v in enumerate(subset_dict[f][tn].values):
subset_dict[f].coords[tn].values[i] = factor*v + diff
else:
raise AttributeError("Different units were found for time coord in each file. "
"We were unable to normalize due to the units not being in 'days since ' format")
"We were unable to normalize due to the units not being in '{unit} since ' format")

return subset_dict

def query_catalog(self,
Expand Down Expand Up @@ -1151,7 +1201,7 @@ def query_catalog(self,
# tl;dr hic sunt dracones
var_xr = []
if not var.is_static:
cat_subset_dict = self.normalize_time_units(cat_subset_dict)
cat_subset_dict = self.normalize_time_units(cat_subset_dict, var.T)
time_sort_dict = {f: cat_subset_dict[f].time.values[0]
for f in list(cat_subset_dict)}
time_sort_dict = dict(sorted(time_sort_dict.items(), key=lambda item: item[1]))
Expand Down

0 comments on commit 5a42bda

Please sign in to comment.