Improve the observation preparation functions for North America SDA experiments. #3451

Open · wants to merge 11 commits into base: develop
15 changes: 13 additions & 2 deletions book_source/03_topical_pages/03_pecan_xml.Rmd
@@ -806,6 +806,11 @@ The following tags can be used for state data assimilation. More detailed inform
<unit>year</unit>
<num>1</num>
</timestep>
<sd_threshold>20</sd_threshold>
<boundary>
<upper_boundary>0.95</upper_boundary>
<lower_boundary>0.05</lower_boundary>
</boundary>
</MODIS_LAI>
<SMAP_SMP>
<search_window>30</search_window>
@@ -909,7 +914,7 @@ The following tags can be used for observation preparation for the SDA workflow.
<state.data.assimilation>
<Obs_Prep>
<Landtrendr_AGB>
<AGB_indir>/projectnb/dietzelab/dongchen/Multi-site/download_500_sites/AGB</AGB_indir>
<AGB_input_dir>/projectnb/dietzelab/dongchen/Multi-site/download_500_sites/AGB</AGB_input_dir>
<allow_download>TRUE</allow_download>
<export_csv>TRUE</export_csv>
<timestep>
@@ -926,6 +931,11 @@ The following tags can be used for observation preparation for the SDA workflow.
<unit>year</unit>
<num>1</num>
</timestep>
<sd_threshold>20</sd_threshold>
<boundary>
<upper_boundary>0.95</upper_boundary>
<lower_boundary>0.05</lower_boundary>
</boundary>
</MODIS_LAI>

<SMAP_SMP>
@@ -969,7 +979,8 @@ For the MODIS LAI and SMAP soil moisture observations, the `search_window` speci
* **search_window** : [optional] This tag is used by the LAI and SMAP observation preparation functions; it defines the search window (in days) within which the closest observation to the target date is selected. The default value is 30 days.
* **update_csv** : [optional] This tag is used by the SMAP preparation function only; it decides whether to update the CSV file. Normally it is set to TRUE only when the SMAP_GEE CSV file has been re-downloaded from Google Earth Engine (a tutorial can be found in the SMAP_SMP_prep function itself). The default value is FALSE.
* **run_parallel** : [optional] This tag defines whether to run the MODIS LAI function in parallel. The default value is FALSE.
* **sd_threshold** : [optional] This tag defines the maximum allowable standard error for MODIS LAI records; any records with higher uncertainty will be removed.
* **boundary** : [optional] This tag defines the upper and lower boundaries (as quantiles) for filtering noisy LAI time series; see the sketch after this list.
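
A minimal sketch of how these two filters work together, assuming a LAI table with the columns the prep functions write (date, site_id, lat, lon, lai, sd, qc); this illustrates the logic rather than reproducing the PR's exact implementation:

```r
# Illustrative only: drop records whose standard error exceeds sd_threshold,
# then drop records falling outside the per-site quantile boundaries.
filter_lai <- function(lai.csv, sd_threshold = 20, boundary = c(0.05, 0.95)) {
  # uncertainty filter.
  lai.csv <- lai.csv[lai.csv$sd < as.numeric(sd_threshold), ]
  # per-site quantile filter.
  for (id in unique(lai.csv$site_id)) {
    inds <- which(lai.csv$site_id == id)
    bounds <- stats::quantile(lai.csv$lai[inds], boundary, na.rm = TRUE)
    drop <- inds[lai.csv$lai[inds] < bounds[1] | lai.csv$lai[inds] > bounds[2]]
    if (length(drop) > 0) lai.csv <- lai.csv[-drop, ]
  }
  lai.csv
}
```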

### (experimental) Benchmarking {#xml-benchmarking}

@@ -31,6 +31,9 @@ LAI_search_window <- 30
LAI_timestep <- list(unit="year", num=1)
LAI_export_csv <- TRUE
run_parallel <- TRUE
sd_threshold <- 20
upper_quantile <- 0.95
lower_quantile <- 0.05

#SMP
SMP_search_window <- 30
@@ -128,7 +131,8 @@ template <- PEcAn.settings::Settings(list(

Obs_Prep = structure(list(
Landtrendr_AGB = structure(list(AGB_indir = AGB_indir, timestep = AGB_timestep, allow_download = allow_download, export_csv = AGB_export_csv)),
MODIS_LAI = structure(list(search_window = LAI_search_window, timestep = LAI_timestep, export_csv = LAI_export_csv, run_parallel = run_parallel)),
MODIS_LAI = structure(list(search_window = LAI_search_window, timestep = LAI_timestep, export_csv = LAI_export_csv, run_parallel = run_parallel,
sd_threshold = sd_threshold, boundary = structure(list(upper_quantile = upper_quantile, lower_quantile = lower_quantile)))),
SMAP_SMP = structure(list(search_window = SMP_search_window, timestep = SMP_timestep, export_csv = SMP_export_csv, update_csv = update_csv)),
Soilgrids_SoilC = structure(list(timestep = SoilC_timestep, export_csv = SoilC_export_csv)),
start.date = obs_start_date,
3 changes: 3 additions & 0 deletions modules/data.remote/NAMESPACE
@@ -3,8 +3,11 @@
export(GEDI_AGB_plot)
export(Landtrendr_AGB_prep)
export(MODIS_LAI_prep)
export(MODIS_LAI_ts_filter)
export(MODIS_LC_prep)
export(NASA_DAAC_download)
export(Prep.MODIS.CSV.from.DAAC)
export(Prep.SMAP.CSV.from.DAAC)
export(Prep_AGB_IC_from_2010_global)
export(SMAP_SMP_prep)
export(call_MODIS)
228 changes: 211 additions & 17 deletions modules/data.remote/R/MODIS_LAI_prep.R
@@ -1,20 +1,25 @@
#' Prepare MODIS LAI data for the SDA workflow.
#'
#' @param site_info Bety list of site info including site_id, lon, and lat.
#' @param time_points A vector contains each time point within the start and end date.
#' @param outdir Where the final CSV file will be stored.
#' @param search_window search window for locate available LAI values.
#' @param export_csv Decide if we want to export the CSV file.
#' @param sd_threshold Threshold for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL.
#' @param site_info list: Bety list of site info including site_id, lon, and lat.
#' @param time_points character: a vector containing each time point within the start and end dates.
#' @param outdir character: where the final CSV file will be stored.
#' @param search_window numeric: search window (in days) for locating available LAI values.
#' @param export_csv boolean: decide if we want to export the CSV file.
#' @param sd_threshold numeric or character: threshold for filtering out estimates with unrealistically high standard errors; the default is 20. The filtering is skipped if it is set to NULL.
#' @param skip.download boolean: determine if we want to use the existing LAI.csv file and skip the MODIS LAI download.
#' @param boundary numeric vector or list: the upper and lower quantiles for filtering out noisy LAI values (e.g., c(0.05, 0.95) or list(0.05, 0.95)). The default is NULL.
#'
#' @return A data frame containing LAI and sd for each site and each time step.
#' @export
#'
#' @examples
#' @author Dongchen Zhang
#' @importFrom magrittr %>%
MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window = 30, export_csv = FALSE, sd_threshold = 20){
#initialize future parallel computation.
MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window = 30, export_csv = FALSE, sd_threshold = 20, skip.download = TRUE, boundary = NULL){
# unlist boundary if it's passed from the assembler function.
if (is.list(boundary)) {
boundary <- as.numeric(unlist(boundary))
}
# initialize future parallel computation.
if (future::supportsMulticore()) {
future::plan(future::multicore, workers = 10)
} else {
@@ -37,14 +42,20 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
#if we have previous downloaded CSV file
if(file.exists(file.path(outdir, "LAI.csv"))){
PEcAn.logger::logger.info("Extracting previous LAI file!")
Previous_CSV <- utils::read.csv(file.path(outdir, "LAI.csv"))
Previous_CSV <- utils::read.csv(file.path(outdir, "LAI.csv"),
colClasses = c(rep("character", 2), rep("numeric", 4), "character"))
# filter out highly uncertain LAI records.
if (!is.null(sd_threshold)) {
PEcAn.logger::logger.info("filtering out records with high standard errors!")
ind.rm <- which(Previous_CSV$sd >= sd_threshold)
ind.rm <- which(Previous_CSV$sd >= as.numeric(sd_threshold))
if (length(ind.rm) > 0) {
Previous_CSV <- Previous_CSV[-ind.rm,]
}
}
# filter out noisy lai records from time series.
if (!is.null(boundary)) {
Previous_CSV <- MODIS_LAI_ts_filter(Previous_CSV, boundary = boundary)
}
LAI_Output <- matrix(NA, length(site_info$site_id), 2*length(time_points)+1) %>%
`colnames<-`(c("site_id", paste0(time_points, "_LAI"), paste0(time_points, "_SD"))) %>% as.data.frame()#we need: site_id, LAI, std, target time point.
LAI_Output$site_id <- site_info$site_id
@@ -77,7 +88,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
#only Site that has NA for any time points need to be downloaded.
new_site_info <- site_info %>% purrr::map(function(x)x[!stats::complete.cases(LAI_Output)])
#TODO: only download data for specific date when we have missing data.
if(length(new_site_info$site_id) != 0){
if(length(new_site_info$site_id) != 0 && !skip.download){
product <- "MCD15A3H"
PEcAn.logger::logger.info("Extracting LAI mean products!")
lai_mean <- split(as.data.frame(new_site_info), seq(nrow(as.data.frame(new_site_info)))) %>%
@@ -91,7 +102,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
start = dates$start_date,
end = dates$end_date,
progress = FALSE)))) {
return(list(mean = mean$value, date = mean$calendar_date))
return(list(mean = mean$value * as.numeric(mean$scale), date = mean$calendar_date))
} else {
return(NA)
}
@@ -109,7 +120,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
start = dates$start_date,
end = dates$end_date,
progress = FALSE)))) {
return(std$value)
return(std$value * as.numeric(std$scale))
} else {
return(NA)
}
@@ -150,18 +161,23 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
next
}
if (!is.null(sd_threshold)) {
if (lai_std[[i]][j] >= sd_threshold) {
if (lai_std[[i]][j] >= as.numeric(sd_threshold)) {
next
}
}
LAI <- rbind(LAI, list(date = lai_mean[[i]]$date[j],
site_id = new_site_info$site_id[i],
lat = new_site_info$lat[i],
lon = new_site_info$lon[i],
lai = lai_mean[[i]]$mean[j]*0.1,
sd = lai_std[[i]][j]*0.1))
lai = lai_mean[[i]]$mean[j],
sd = lai_std[[i]][j],
qc = lai_qc[[i]][j]))
}
}
# filter out noisy lai records from time series.
if (!is.null(boundary)) {
LAI <- MODIS_LAI_ts_filter(LAI, boundary = boundary)
}
#Compare with existing CSV file. (We name the CSV file as LAI.csv)
if(as.logical(export_csv)){
if(exists("Previous_CSV")){#we already read the csv file previously.
@@ -192,4 +208,182 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
}
PEcAn.logger::logger.info("MODIS LAI Prep Completed!")
list(LAI_Output = LAI_Output, time_points = time_points, var = "LAI")
}
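# Hedged usage sketch (not part of the PR; the site id, coordinates, path,
# and dates below are hypothetical). The new `sd_threshold` and `boundary`
# arguments drop records with standard errors at or above the threshold and
# LAI values outside the per-site quantile boundaries, respectively.
# lai <- MODIS_LAI_prep(
#   site_info = list(site_id = "1000004955", lon = -90.08, lat = 45.80),
#   time_points = seq(as.Date("2019-07-15"), as.Date("2021-07-15"), by = "1 year"),
#   outdir = "/tmp/LAI",
#   export_csv = TRUE,
#   sd_threshold = 20,
#   boundary = c(0.05, 0.95),
#   skip.download = FALSE)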
#' Filter the MODIS LAI time series based on the upper and lower quantile boundaries.
#'
#' @param lai.csv data frame: A data frame containing date, site id, lat, lon, lai, sd, and qc columns.
#' @param boundary numeric: A vector containing the upper and lower quantiles for filtering LAI records. The default is c(0.05, 0.95).
#'
#' @return A data frame containing date, site id, lat, lon, lai, sd, and qc columns.
#' @export
#'
#' @author Dongchen Zhang
MODIS_LAI_ts_filter <- function(lai.csv, boundary = c(0.05, 0.95)) {
# get unique site ids.
site.ids <- sort(unique(lai.csv$site_id))
# loop over sites.
for (i in seq_along(site.ids)) {
# get the lai records for the current site.
inds <- which(lai.csv$site_id == site.ids[i])
temp.lai <- lai.csv$lai[inds]
# calculate the upper and lower boundaries for the current lai records.
minmax <- stats::quantile(temp.lai, boundary, na.rm = TRUE)
# find and remove outliers; note the trailing comma for row indexing,
# and the guard against an empty inds.rm (negative indexing with an
# empty vector would otherwise drop every row).
inds.rm <- which(temp.lai < minmax[1] | temp.lai > minmax[2])
if (length(inds.rm) > 0) {
lai.csv <- lai.csv[-inds[inds.rm], ]
}
}
return(lai.csv)
}
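# Hedged usage sketch (toy values, not from the PR): build a small LAI table
# and drop records outside the 5th-95th quantiles of the site's series.
# toy <- data.frame(date = as.character(seq.Date(as.Date("2020-01-01"),
#                                                by = "4 days", length.out = 10)),
#                   site_id = "1", lat = 45.8, lon = -90.1,
#                   lai = c(0.1, 2.8, 3.0, 3.1, 2.9, 3.2, 3.0, 2.7, 3.1, 9.5),
#                   sd = 0.2, qc = "000")
# filtered <- MODIS_LAI_ts_filter(toy, boundary = c(0.05, 0.95))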
#' Prepare MODIS LAI data from the NASA DAAC server for the SDA workflow.
#'
#' @param site_info list: Bety list of site info including site_id, lon, and lat.
#' @param extent numeric: A vector containing the bounding box that covers all sites (West longitude, East longitude, South latitude, North latitude).
#' @param from character: the start time for searching the MODIS products.
#' @param to character: the end time for searching the MODIS products.
#' @param download.outdir character: Where the MODIS tiles will be stored.
#' @param csv.outdir character: Where the final CSV file will be stored.
#'
#' @return A data frame containing LAI and sd for each site and each time step.
#' @export
#'
#' @author Dongchen Zhang
#' @importFrom magrittr %>%
Prep.MODIS.CSV.from.DAAC <- function(site_info, extent, from, to, download.outdir, csv.outdir) {
# load previous CSV file.
if (file.exists(file.path(csv.outdir, "LAI.csv"))) {
previous.csv <- utils::read.csv(file.path(csv.outdir, "LAI.csv"),
colClasses = c("character", rep("numeric", 5), "character"))
} else {
previous.csv <- NULL
}
# setup the foreach parallel computation.
cl <- parallel::makeCluster(parallel::detectCores())
doParallel::registerDoParallel(cl)
# reproject site locations to MODIS projection.
site.ids <- site_info$site_id
site.locs <- cbind(site_info$lon, site_info$lat) %>%
`colnames<-`(c("lon","lat")) %>%
`rownames<-`(site.ids)
# create vector based on coordinates and MODIS projection.
pts <- data.frame(lon = site.locs[,1], lat = site.locs[,2])
sp::coordinates(pts) <- ~lon+lat
sp::proj4string(pts) <- sp::CRS("+proj=longlat +datum=WGS84")
pts.reproj <- sp::spTransform(pts, "+proj=sinu +a=6371007.181 +b=6371007.181 +units=m")
coords.reproj <- sp::coordinates(pts.reproj) %>% `colnames<-`(c("x", "y"))
# download data.
metadata <- NASA_DAAC_download(ul_lat = extent[4],
ul_lon = extent[1],
lr_lat = extent[3],
lr_lon = extent[2],
from = from,
to = to,
just_path = F,
outdir = download.outdir,
doi = "10.5067/MODIS/MCD15A3H.061",
ncore = parallel::detectCores()-1)
# grab file paths for downloaded hdf files.
modis.out <- list.files(download.outdir, full.names = TRUE, pattern = "\\.hdf$")
# grab id for each file.
ids <- basename(modis.out)
# split string.
splits <- strsplit(x = ids, split = ".", fixed = T)
# collect attributes table (date, ih, and iv).
dates <- ivs <- ihs <- c()
for (i in seq_along(ids)) {
temp <- splits[[i]]
# calculate dates based on year and day of year.
y.doy <- substr(temp[[2]], 2, 8)
year <- substr(y.doy, 1, 4)
doy <- substr(y.doy, 5, 7)
dates <- c(dates, as.character(as.Date(paste(year, doy), format="%Y %j")))
# extract tile id.
ihs <- c(ihs, substr(temp[[3]], 2, 3))
ivs <- c(ivs, substr(temp[[3]], 5, 6))
}
# create data frame of extents for MODIS tiles.
# record progress.
PEcAn.logger::logger.info("\nCreating information table for downloaded MODIS tiles.")
doSNOW::registerDoSNOW(cl)
pb <- utils::txtProgressBar(min=1, max=length(modis.out), style=3)
progress <- function(n) utils::setTxtProgressBar(pb, n)
opts <- list(progress=progress)
MODIS.tiles.extents <- foreach::foreach(
tile = modis.out,
.packages=c("terra", "Kendall"),
.options.snow=opts
) %dopar% {
return(as.vector(terra::ext(terra::rast(tile))))
} %>% dplyr::bind_rows()
MODIS.tiles.info <- cbind(dates, ihs, ivs, MODIS.tiles.extents)
# filter dates.
unique.dates <- unique(MODIS.tiles.info$dates)
unique.dates <- unique.dates[which(as.Date(unique.dates) >= from &
as.Date(unique.dates) <= to)]
# find inds of file paths that match each point.
PEcAn.logger::logger.info("\nFinding indexes of file paths that match each point.")
pb <- utils::txtProgressBar(min=1, max=length(pts), style=3)
progress <- function(n) utils::setTxtProgressBar(pb, n)
opts <- list(progress=progress)
tile.ids <- foreach::foreach(
# iterate over row indices; raw site ids may not be consecutive row numbers.
i = seq_along(site.ids),
.packages="Kendall",
.options.snow=opts
) %dopar% {
# filter tile position ids based on lat/lon range.
inds <- which(as.Date(MODIS.tiles.info$dates) >= from &
as.Date(MODIS.tiles.info$dates) <= to &
coords.reproj[i, "x"] > MODIS.tiles.info$xmin &
coords.reproj[i, "x"] < MODIS.tiles.info$xmax &
coords.reproj[i, "y"] > MODIS.tiles.info$ymin &
coords.reproj[i, "y"] < MODIS.tiles.info$ymax)
names(inds) <- rep(i,length(inds))
return(inds)
} %>% unlist
# extract MODIS products.
# loop over tiles.
PEcAn.logger::logger.info("\nExtracting MODIS datasets by points.")
unique.tile.ids <- sort(unique(tile.ids))
pb <- utils::txtProgressBar(min=1, max=length(unique.tile.ids), style=3)
progress <- function(n) utils::setTxtProgressBar(pb, n)
opts <- list(progress=progress)
outputs <- foreach::foreach(
i = unique.tile.ids,
.packages=c("purrr", "Kendall"),
.options.snow=opts
) %dopar% {
temp.tile <- terra::rast(modis.out[i])
points.inds <- as.numeric(names(tile.ids)[which(tile.ids == i)])
temp.res <- terra::extract(temp.tile, data.frame(x = coords.reproj[points.inds, "x"],
y = coords.reproj[points.inds, "y"]))
# convert QC flag.
qc <- temp.res[,4] %>% purrr::map(function(v){
qc_flag <- intToBits(as.integer(v)) # NB big-endian (ones place first)
qc_flag <- as.integer(rev(utils::head(qc_flag, 3))) # now ones place last
paste(qc_flag, collapse = "")
}) %>% unlist
# write into table.
temp.res <- data.frame(date = MODIS.tiles.info$dates[i],
site_id = site.ids[points.inds],
lat = site.locs[points.inds, "lat"],
lon = site.locs[points.inds, "lon"],
lai = temp.res[,3],
sd = temp.res[,7],
qc = qc)
return(temp.res)
} %>% dplyr::bind_rows()
# stop parallel.
parallel::stopCluster(cl)
foreach::registerDoSEQ()
# combine with previous CSV file.
if (!is.null(previous.csv)) {
outputs <- rbind(previous.csv, outputs)
outputs <- outputs[!duplicated(outputs),]
}
# filter by QC band.
outputs <- outputs[which(outputs$qc %in% c("000", "001")),]
# write into CSV file.
utils::write.csv(outputs, file = file.path(csv.outdir, "LAI.csv"), row.names = FALSE)
# delete downloaded files.
unlink(list.files(download.outdir, full.names = T), recursive = T)
return(outputs)
}
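# Hedged usage sketch (hypothetical extent, paths, and dates; not from the PR).
# The extent follows the documented order: W longitude, E longitude,
# S latitude, N latitude.
# lai <- Prep.MODIS.CSV.from.DAAC(
#   site_info = list(site_id = c("1", "2"),
#                    lon = c(-90.08, -89.98),
#                    lat = c(45.80, 45.92)),
#   extent = c(-91, -89, 45, 46),
#   from = "2020-01-01", to = "2020-12-31",
#   download.outdir = "/tmp/modis_tiles",
#   csv.outdir = "/tmp/modis_csv")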