Improve the observation preparation functions for North America SDA experiments. #3451

Open · wants to merge 11 commits into base: develop
15 changes: 13 additions & 2 deletions book_source/03_topical_pages/03_pecan_xml.Rmd
@@ -806,6 +806,11 @@ The following tags can be used for state data assimilation. More detailed inform
<unit>year</unit>
<num>1</num>
</timestep>
<sd_threshold>20</sd_threshold>
<boundary>
<upper_boundary>0.95</upper_boundary>
<lower_boundary>0.05</lower_boundary>
</boundary>
</MODIS_LAI>
<SMAP_SMP>
<search_window>30</search_window>
@@ -909,7 +914,7 @@ The following tags can be used for observation preparation for the SDA workflow.
<state.data.assimilation>
<Obs_Prep>
<Landtrendr_AGB>
<AGB_indir>/projectnb/dietzelab/dongchen/Multi-site/download_500_sites/AGB</AGB_indir>
<AGB_input_dir>/projectnb/dietzelab/dongchen/Multi-site/download_500_sites/AGB</AGB_input_dir>
<allow_download>TRUE</allow_download>
<export_csv>TRUE</export_csv>
<timestep>
@@ -926,6 +931,11 @@ The following tags can be used for observation preparation for the SDA workflow.
<unit>year</unit>
<num>1</num>
</timestep>
<sd_threshold>20</sd_threshold>
<boundary>
<upper_boundary>0.95</upper_boundary>
<lower_boundary>0.05</lower_boundary>
</boundary>
</MODIS_LAI>

<SMAP_SMP>
@@ -969,7 +979,8 @@ For the MODIS LAI and SMAP soil moisture observations, the `search_window` speci
* **search_window** : [optional] This tag is used by the LAI and SMAP observation preparation functions; it defines the search window (in days) within which the closest observation to the target date is selected. The default value is 30 days.
* **update_csv** : [optional] This tag is used by the SMAP preparation function only; it decides whether to update the CSV file. Normally it is set to TRUE only when the SMAP_GEE CSV file has been re-downloaded from Google Earth Engine (a tutorial can be found in the SMAP_SMP_prep function itself). The default value is FALSE.
* **run_parallel** : [optional] This tag defines whether to run the MODIS LAI function in parallel. The default value is FALSE.
* **sd_threshold** : [optional] This tag defines the maximum allowable standard error for MODIS LAI records; any records with higher uncertainty will be removed.
* **boundary** : [optional] This tag defines the upper and lower boundaries (as quantiles) for filtering noisy LAI time series; see the sketch after this list.
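
A minimal sketch of how these two filters work together, assuming a LAI table with the columns the prep functions write (date, site_id, lat, lon, lai, sd, qc); this illustrates the logic rather than reproducing the PR's exact implementation:

```r
# Illustrative only: drop records whose standard error exceeds sd_threshold,
# then drop records falling outside the per-site quantile boundaries.
filter_lai <- function(lai.csv, sd_threshold = 20, boundary = c(0.05, 0.95)) {
  # uncertainty filter.
  lai.csv <- lai.csv[lai.csv$sd < as.numeric(sd_threshold), ]
  # per-site quantile filter.
  for (id in unique(lai.csv$site_id)) {
    inds <- which(lai.csv$site_id == id)
    bounds <- stats::quantile(lai.csv$lai[inds], boundary, na.rm = TRUE)
    drop <- inds[lai.csv$lai[inds] < bounds[1] | lai.csv$lai[inds] > bounds[2]]
    if (length(drop) > 0) lai.csv <- lai.csv[-drop, ]
  }
  lai.csv
}
```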

### (experimental) Benchmarking {#xml-benchmarking}

@@ -31,6 +31,9 @@ LAI_search_window <- 30
LAI_timestep <- list(unit="year", num=1)
LAI_export_csv <- TRUE
run_parallel <- TRUE
sd_threshold <- 20
upper_quantile <- 0.95
lower_quantile <- 0.05

#SMP
SMP_search_window <- 30
@@ -128,7 +131,8 @@ template <- PEcAn.settings::Settings(list(

Obs_Prep = structure(list(
Landtrendr_AGB = structure(list(AGB_indir = AGB_indir, timestep = AGB_timestep, allow_download = allow_download, export_csv = AGB_export_csv)),
MODIS_LAI = structure(list(search_window = LAI_search_window, timestep = LAI_timestep, export_csv = LAI_export_csv, run_parallel = run_parallel)),
MODIS_LAI = structure(list(search_window = LAI_search_window, timestep = LAI_timestep, export_csv = LAI_export_csv, run_parallel = run_parallel,
sd_threshold = sd_threshold, boundary = structure(list(upper_quantile = upper_quantile, lower_quantile = lower_quantile)))),
SMAP_SMP = structure(list(search_window = SMP_search_window, timestep = SMP_timestep, export_csv = SMP_export_csv, update_csv = update_csv)),
Soilgrids_SoilC = structure(list(timestep = SoilC_timestep, export_csv = SoilC_export_csv)),
start.date = obs_start_date,
3 changes: 3 additions & 0 deletions modules/data.remote/NAMESPACE
@@ -3,8 +3,11 @@
export(GEDI_AGB_plot)
export(Landtrendr_AGB_prep)
export(MODIS_LAI_prep)
export(MODIS_LAI_ts_filter)
export(MODIS_LC_prep)
export(NASA_DAAC_download)
export(Prep.MODIS.CSV.from.DAAC)
export(Prep.SMAP.CSV.from.DAAC)
export(Prep_AGB_IC_from_2010_global)
export(SMAP_SMP_prep)
export(call_MODIS)
228 changes: 211 additions & 17 deletions modules/data.remote/R/MODIS_LAI_prep.R
@@ -1,20 +1,25 @@
#' Prepare MODIS LAI data for the SDA workflow.
#'
#' @param site_info Bety list of site info including site_id, lon, and lat.
#' @param time_points A vector contains each time point within the start and end date.
#' @param outdir Where the final CSV file will be stored.
#' @param search_window search window for locate available LAI values.
#' @param export_csv Decide if we want to export the CSV file.
#' @param sd_threshold Threshold for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL.
#' @param site_info list: Bety list of site info including site_id, lon, and lat.
#' @param time_points character: a vector containing each time point within the start and end dates.
#' @param outdir character: where the final CSV file will be stored.
#' @param search_window numeric: search window (in days) for locating available LAI values.
#' @param export_csv boolean: decide if we want to export the CSV file.
#' @param sd_threshold numeric or character: threshold for filtering out estimates with unrealistically high standard errors; the default is 20. The filtering is skipped if it is set to NULL.
#' @param skip.download boolean: determine if we want to use the existing LAI.csv file and skip the MODIS LAI download.
#' @param boundary numeric vector or list: the upper and lower quantiles for filtering out noisy LAI values (e.g., c(0.05, 0.95) or list(0.05, 0.95)). The default is NULL.
#'
#' @return A data frame containing LAI and sd for each site and each time step.
#' @export
#'
#' @examples
#' @author Dongchen Zhang
#' @importFrom magrittr %>%
MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window = 30, export_csv = FALSE, sd_threshold = 20){
#initialize future parallel computation.
MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window = 30, export_csv = FALSE, sd_threshold = 20, skip.download = TRUE, boundary = NULL){
# unlist boundary if it's passed from the assembler function.
if (is.list(boundary)) {
boundary <- as.numeric(unlist(boundary))
}
# initialize future parallel computation.
if (future::supportsMulticore()) {
future::plan(future::multicore, workers = 10)
} else {
@@ -37,14 +42,20 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
#if we have previous downloaded CSV file
if(file.exists(file.path(outdir, "LAI.csv"))){
PEcAn.logger::logger.info("Extracting previous LAI file!")
Previous_CSV <- utils::read.csv(file.path(outdir, "LAI.csv"))
Previous_CSV <- utils::read.csv(file.path(outdir, "LAI.csv"),
colClasses = c(rep("character", 2), rep("numeric", 4), "character"))
# filter out highly uncertain LAI records.
if (!is.null(sd_threshold)) {
PEcAn.logger::logger.info("filtering out records with high standard errors!")
ind.rm <- which(Previous_CSV$sd >= sd_threshold)
ind.rm <- which(Previous_CSV$sd >= as.numeric(sd_threshold))
if (length(ind.rm) > 0) {
Previous_CSV <- Previous_CSV[-ind.rm,]
}
}
# filter out noisy lai records from time series.
if (!is.null(boundary)) {
Previous_CSV <- MODIS_LAI_ts_filter(Previous_CSV, boundary = boundary)
}
LAI_Output <- matrix(NA, length(site_info$site_id), 2*length(time_points)+1) %>%
`colnames<-`(c("site_id", paste0(time_points, "_LAI"), paste0(time_points, "_SD"))) %>% as.data.frame()#we need: site_id, LAI, std, target time point.
LAI_Output$site_id <- site_info$site_id
@@ -77,7 +88,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
#only Site that has NA for any time points need to be downloaded.
new_site_info <- site_info %>% purrr::map(function(x)x[!stats::complete.cases(LAI_Output)])
#TODO: only download data for specific date when we have missing data.
if(length(new_site_info$site_id) != 0){
if(length(new_site_info$site_id) != 0 && !skip.download){
product <- "MCD15A3H"
PEcAn.logger::logger.info("Extracting LAI mean products!")
lai_mean <- split(as.data.frame(new_site_info), seq(nrow(as.data.frame(new_site_info)))) %>%
@@ -91,7 +102,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
start = dates$start_date,
end = dates$end_date,
progress = FALSE)))) {
return(list(mean = mean$value, date = mean$calendar_date))
return(list(mean = mean$value * as.numeric(mean$scale), date = mean$calendar_date))
} else {
return(NA)
}
@@ -109,7 +120,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
start = dates$start_date,
end = dates$end_date,
progress = FALSE)))) {
return(std$value)
return(std$value * as.numeric(std$scale))
} else {
return(NA)
}
@@ -150,18 +161,23 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
next
}
if (!is.null(sd_threshold)) {
if (lai_std[[i]][j] >= sd_threshold) {
if (lai_std[[i]][j] >= as.numeric(sd_threshold)) {
next
}
}
LAI <- rbind(LAI, list(date = lai_mean[[i]]$date[j],
site_id = new_site_info$site_id[i],
lat = new_site_info$lat[i],
lon = new_site_info$lon[i],
lai = lai_mean[[i]]$mean[j]*0.1,
sd = lai_std[[i]][j]*0.1))
lai = lai_mean[[i]]$mean[j],
sd = lai_std[[i]][j],
qc = lai_qc[[i]][j]))
}
}
# filter out noisy lai records from time series.
if (!is.null(boundary)) {
LAI <- MODIS_LAI_ts_filter(LAI, boundary = boundary)
}
#Compare with existing CSV file. (We name the CSV file as LAI.csv)
if(as.logical(export_csv)){
if(exists("Previous_CSV")){#we already read the csv file previously.
@@ -192,4 +208,182 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
}
PEcAn.logger::logger.info("MODIS LAI Prep Completed!")
list(LAI_Output = LAI_Output, time_points = time_points, var = "LAI")
}
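# Hedged usage sketch (not part of the PR; the site id, coordinates, path,
# and dates below are hypothetical). The new `sd_threshold` and `boundary`
# arguments drop records with standard errors at or above the threshold and
# LAI values outside the per-site quantile boundaries, respectively.
# lai <- MODIS_LAI_prep(
#   site_info = list(site_id = "1000004955", lon = -90.08, lat = 45.80),
#   time_points = seq(as.Date("2019-07-15"), as.Date("2021-07-15"), by = "1 year"),
#   outdir = "/tmp/LAI",
#   export_csv = TRUE,
#   sd_threshold = 20,
#   boundary = c(0.05, 0.95),
#   skip.download = FALSE)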
#' Filter the MODIS LAI time series based on the upper and lower quantile boundaries.
#'
#' @param lai.csv data frame: A data frame containing date, site id, lat, lon, lai, sd, and qc columns.
#' @param boundary numeric: A vector containing the upper and lower quantiles for filtering LAI records. The default is c(0.05, 0.95).
#'
#' @return A data frame containing date, site id, lat, lon, lai, sd, and qc columns.
#' @export
#'
#' @author Dongchen Zhang
MODIS_LAI_ts_filter <- function(lai.csv, boundary = c(0.05, 0.95)) {
# get unique site ids.
site.ids <- sort(unique(lai.csv$site_id))
# loop over sites.
for (i in seq_along(site.ids)) {
# get the lai records for the current site.
inds <- which(lai.csv$site_id == site.ids[i])
temp.lai <- lai.csv$lai[inds]
# calculate the upper and lower boundaries for the current lai records.
minmax <- stats::quantile(temp.lai, boundary, na.rm = TRUE)
# find and remove outliers; note the trailing comma for row indexing,
# and the guard against an empty inds.rm (negative indexing with an
# empty vector would otherwise drop every row).
inds.rm <- which(temp.lai < minmax[1] | temp.lai > minmax[2])
if (length(inds.rm) > 0) {
lai.csv <- lai.csv[-inds[inds.rm], ]
}
}
return(lai.csv)
}
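# Hedged usage sketch (toy values, not from the PR): build a small LAI table
# and drop records outside the 5th-95th quantiles of the site's series.
# toy <- data.frame(date = as.character(seq.Date(as.Date("2020-01-01"),
#                                                by = "4 days", length.out = 10)),
#                   site_id = "1", lat = 45.8, lon = -90.1,
#                   lai = c(0.1, 2.8, 3.0, 3.1, 2.9, 3.2, 3.0, 2.7, 3.1, 9.5),
#                   sd = 0.2, qc = "000")
# filtered <- MODIS_LAI_ts_filter(toy, boundary = c(0.05, 0.95))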
#' Prepare MODIS LAI data from the NASA DAAC server for the SDA workflow.
#'
#' @param site_info list: Bety list of site info including site_id, lon, and lat.
#' @param extent numeric: A vector containing the bounding box that covers all sites (West longitude, East longitude, South latitude, North latitude).
#' @param from character: the start time for searching the MODIS products.
#' @param to character: the end time for searching the MODIS products.
#' @param download.outdir character: Where the MODIS tiles will be stored.
#' @param csv.outdir character: Where the final CSV file will be stored.
#'
#' @return A data frame containing LAI and sd for each site and each time step.
#' @export
#'
#' @author Dongchen Zhang
#' @importFrom magrittr %>%
Prep.MODIS.CSV.from.DAAC <- function(site_info, extent, from, to, download.outdir, csv.outdir) {
# load previous CSV file.
if (file.exists(file.path(csv.outdir, "LAI.csv"))) {
previous.csv <- utils::read.csv(file.path(csv.outdir, "LAI.csv"),
colClasses = c("character", rep("numeric", 5), "character"))
} else {
previous.csv <- NULL
}
# setup the foreach parallel computation.
cl <- parallel::makeCluster(parallel::detectCores())
doParallel::registerDoParallel(cl)
# reproject site locations to MODIS projection.
site.ids <- site_info$site_id
site.locs <- cbind(site_info$lon, site_info$lat) %>%
`colnames<-`(c("lon","lat")) %>%
`rownames<-`(site.ids)
# create vector based on coordinates and MODIS projection.
pts <- data.frame(lon = site.locs[,1], lat = site.locs[,2])
sp::coordinates(pts) <- ~lon+lat
sp::proj4string(pts) <- sp::CRS("+proj=longlat +datum=WGS84")
pts.reproj <- sp::spTransform(pts, "+proj=sinu +a=6371007.181 +b=6371007.181 +units=m")
coords.reproj <- sp::coordinates(pts.reproj) %>% `colnames<-`(c("x", "y"))
# download data.
metadata <- NASA_DAAC_download(ul_lat = extent[4],
ul_lon = extent[1],
lr_lat = extent[3],
lr_lon = extent[2],
from = from,
to = to,
just_path = F,
outdir = download.outdir,
doi = "10.5067/MODIS/MCD15A3H.061",
ncore = parallel::detectCores()-1)
# grab file paths for downloaded hdf files.
modis.out <- list.files(download.outdir, full.names = TRUE, pattern = "\\.hdf$")
# grab id for each file.
ids <- basename(modis.out)
# split string.
splits <- strsplit(x = ids, split = ".", fixed = T)
# collect attributes table (date, ih, and iv).
dates <- ivs <- ihs <- c()
for (i in seq_along(ids)) {
temp <- splits[[i]]
# calculate dates based on year and day of year.
y.doy <- substr(temp[[2]], 2, 8)
year <- substr(y.doy, 1, 4)
doy <- substr(y.doy, 5, 7)
dates <- c(dates, as.character(as.Date(paste(year, doy), format="%Y %j")))
# extract tile id.
ihs <- c(ihs, substr(temp[[3]], 2, 3))
ivs <- c(ivs, substr(temp[[3]], 5, 6))
}
# create data frame of extents for MODIS tiles.
# record progress.
PEcAn.logger::logger.info("\nCreating information table for downloaded MODIS tiles.")
doSNOW::registerDoSNOW(cl)
pb <- utils::txtProgressBar(min=1, max=length(modis.out), style=3)
progress <- function(n) utils::setTxtProgressBar(pb, n)
opts <- list(progress=progress)
MODIS.tiles.extents <- foreach::foreach(
tile = modis.out,
.packages=c("terra", "Kendall"),
.options.snow=opts
) %dopar% {
return(as.vector(terra::ext(terra::rast(tile))))
} %>% dplyr::bind_rows()
MODIS.tiles.info <- cbind(dates, ihs, ivs, MODIS.tiles.extents)
# filter dates.
unique.dates <- unique(MODIS.tiles.info$dates)
unique.dates <- unique.dates[which(as.Date(unique.dates) >= from &
as.Date(unique.dates) <= to)]
# find inds of file paths that match each point.
PEcAn.logger::logger.info("\nFinding indexes of file paths that match each point.")
pb <- utils::txtProgressBar(min=1, max=length(pts), style=3)
progress <- function(n) utils::setTxtProgressBar(pb, n)
opts <- list(progress=progress)
tile.ids <- foreach::foreach(
# iterate over row indices; raw site ids may not be consecutive row numbers.
i = seq_along(site.ids),
.packages="Kendall",
.options.snow=opts
) %dopar% {
# filter tile position ids based on lat/lon range.
inds <- which(as.Date(MODIS.tiles.info$dates) >= from &
as.Date(MODIS.tiles.info$dates) <= to &
coords.reproj[i, "x"] > MODIS.tiles.info$xmin &
coords.reproj[i, "x"] < MODIS.tiles.info$xmax &
coords.reproj[i, "y"] > MODIS.tiles.info$ymin &
coords.reproj[i, "y"] < MODIS.tiles.info$ymax)
names(inds) <- rep(i,length(inds))
return(inds)
} %>% unlist
# extract MODIS products.
# loop over tiles.
PEcAn.logger::logger.info("\nExtracting MODIS datasets by points.")
unique.tile.ids <- sort(unique(tile.ids))
pb <- utils::txtProgressBar(min=1, max=length(unique.tile.ids), style=3)
progress <- function(n) utils::setTxtProgressBar(pb, n)
opts <- list(progress=progress)
outputs <- foreach::foreach(
i = unique.tile.ids,
.packages=c("purrr", "Kendall"),
.options.snow=opts
) %dopar% {
temp.tile <- terra::rast(modis.out[i])
points.inds <- as.numeric(names(tile.ids)[which(tile.ids == i)])
temp.res <- terra::extract(temp.tile, data.frame(x = coords.reproj[points.inds, "x"],
y = coords.reproj[points.inds, "y"]))
# convert QC flag.
qc <- temp.res[,4] %>% purrr::map(function(v){
qc_flag <- intToBits(as.integer(v)) # NB big-endian (ones place first)
qc_flag <- as.integer(rev(utils::head(qc_flag, 3))) # now ones place last
paste(qc_flag, collapse = "")
}) %>% unlist
# write into table.
temp.res <- data.frame(date = MODIS.tiles.info$dates[i],
site_id = site.ids[points.inds],
lat = site.locs[points.inds, "lat"],
lon = site.locs[points.inds, "lon"],
lai = temp.res[,3],
sd = temp.res[,7],
qc = qc)
return(temp.res)
} %>% dplyr::bind_rows()
# stop parallel.
parallel::stopCluster(cl)
foreach::registerDoSEQ()
# combine with previous CSV file.
if (!is.null(previous.csv)) {
outputs <- rbind(previous.csv, outputs)
outputs <- outputs[!duplicated(outputs),]
}
# filter by QC band.
outputs <- outputs[which(outputs$qc %in% c("000", "001")),]
# write into CSV file.
utils::write.csv(outputs, file = file.path(csv.outdir, "LAI.csv"), row.names = FALSE)
# delete downloaded files.
unlink(list.files(download.outdir, full.names = T), recursive = T)
return(outputs)
}
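# Hedged usage sketch (hypothetical extent, paths, and dates; not from the PR).
# The extent follows the documented order: W longitude, E longitude,
# S latitude, N latitude.
# lai <- Prep.MODIS.CSV.from.DAAC(
#   site_info = list(site_id = c("1", "2"),
#                    lon = c(-90.08, -89.98),
#                    lat = c(45.80, 45.92)),
#   extent = c(-91, -89, 45, 46),
#   from = "2020-01-01", to = "2020-12-31",
#   download.outdir = "/tmp/modis_tiles",
#   csv.outdir = "/tmp/modis_csv")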