From 6d88333f0d39d98249373cfcb6c9f82fba600551 Mon Sep 17 00:00:00 2001 From: Dongchen Zhang Date: Fri, 21 Feb 2025 10:19:37 -0500 Subject: [PATCH 1/9] Update NASA DAAC download function. --- modules/data.remote/R/NASA_DAAC_download.R | 82 +++++++++++++++---- .../data.remote/man/DAAC_Set_Credential.Rd | 18 ++++ modules/data.remote/man/NASA_DAAC_download.Rd | 3 - 3 files changed, 85 insertions(+), 18 deletions(-) create mode 100644 modules/data.remote/man/DAAC_Set_Credential.Rd diff --git a/modules/data.remote/R/NASA_DAAC_download.R b/modules/data.remote/R/NASA_DAAC_download.R index 01fa50ac04..3cdf207f1a 100644 --- a/modules/data.remote/R/NASA_DAAC_download.R +++ b/modules/data.remote/R/NASA_DAAC_download.R @@ -14,7 +14,6 @@ #' @param doi Character: data DOI on the NASA DAAC server, it can be obtained #' directly from the NASA ORNL DAAC data portal (e.g., GEDI L4A through #' https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=2056). -#' @param netrc_file Character: path to the credential file, default is NULL. #' @param just_path Boolean: if we just want the metadata and URL or proceed the actual download. #' #' @return A list containing meta data and physical path for each data downloaded. @@ -50,19 +49,14 @@ NASA_DAAC_download <- function(ul_lat, to, outdir = getwd(), doi, - netrc_file = NULL, just_path = FALSE) { - # if there is no credential file in the outdir. - # we will create a new one. - # this function is located within the GEDI_AGB_prep script. + # Determine if we have enough inputs. if (is.null(outdir) & !just_path) { PEcAn.logger::logger.info("Please provide outdir if you want to download the file.") return(0) - } else if (!is.null(outdir) & !just_path & is.null(netrc_file)) { - if (length(list.files(outdir, pattern = "netrc")) == 0) { - netrc_file <- getnetrc(outdir) - } } + # setup DAAC Credentials. + DAAC_Set_Credential() # setup arguments for URL. daterange <- c(from, to) # grab provider and concept id from CMR based on DOI. @@ -92,6 +86,12 @@ NASA_DAAC_download <- function(ul_lat, granules_href <- c(granules_href, sapply(granules, function(x) x$links[[1]]$href)) page <- page + 1 } + # detect existing files. + same.file.inds <- which(basename(granules_href) %in% list.files(outdir)) + if (length(same.file.inds) > 0) { + granules_href <- granules_href[-same.file.inds] + } + # if we need to download the data. if (length(granules_href) == 0) { return(NA) @@ -99,24 +99,56 @@ NASA_DAAC_download <- function(ul_lat, if (!just_path) { # printing out parallel environment. message("using ", ncore, " core") + message("start downloading ", length(granules_href), " files.") # download # if we have (or assign) more than one core to be allocated. if (ncore > 1) { # setup the foreach parallel computation. cl <- parallel::makeCluster(ncore) doParallel::registerDoParallel(cl) - message("start download") + # record progress. + doSNOW::registerDoSNOW(cl) + pb <- utils::txtProgressBar(min=1, max=length(granules_href), style=3) + progress <- function(n) utils::setTxtProgressBar(pb, n) + opts <- list(progress=progress) foreach::foreach( i = 1:length(granules_href), - .packages = "httr" + .packages=c("httr","Kendall"), + .options.snow=opts ) %dopar% { response <- httr::GET( granules_href[i], httr::write_disk(file.path(outdir, basename(granules_href)[i]), overwrite = T), - httr::config(netrc = TRUE, netrc_file = netrc_file), - httr::set_cookies("LC" = "cookies") + httr::authenticate(user = Sys.getenv("ed_un"), + password = Sys.getenv("ed_pw")) ) + # Check if we can successfully open the downloaded file. 
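+              # A partial or failed transfer usually leaves a file on disk that
+              # cannot be opened, so each download is verified by trying to open
+              # it with the matching reader (hdf5r for H5, terra for HDF4/GeoTIFF)
+              # and re-requested until it opens cleanly. Note that a granule that
+              # is persistently broken on the server side will retry forever.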
+              # if it's an H5 file.
+              if (grepl(pattern = ".h5", x = basename(granules_href)[i], fixed = T)) {
+                while ("try-error" %in% class(try(hdf5r::H5File$new(file.path(outdir, basename(granules_href)[i]), mode = "r"), silent = T))) {
+                  response <-
+                    httr::GET(
+                      granules_href[i],
+                      httr::write_disk(file.path(outdir, basename(granules_href)[i]), overwrite = T),
+                      httr::authenticate(user = Sys.getenv("ed_un"),
+                                         password = Sys.getenv("ed_pw"))
+                    )
+                }
+                # if it's an HDF4 or regular GeoTIFF file.
+              } else if (grepl(pattern = ".tif", x = basename(granules_href)[i], fixed = T) |
+                         grepl(pattern = ".tiff", x = basename(granules_href)[i], fixed = T) |
+                         grepl(pattern = ".hdf", x = basename(granules_href)[i], fixed = T)) {
+                while ("try-error" %in% class(try(terra::rast(file.path(outdir, basename(granules_href)[i])), silent = T))) {
+                  response <-
+                    httr::GET(
+                      granules_href[i],
+                      httr::write_disk(file.path(outdir, basename(granules_href)[i]), overwrite = T),
+                      httr::authenticate(user = Sys.getenv("ed_un"),
+                                         password = Sys.getenv("ed_pw"))
+                    )
+                }
+              }
           }
     parallel::stopCluster(cl)
     foreach::registerDoSEQ()
@@ -128,8 +160,8 @@ NASA_DAAC_download <- function(ul_lat,
           httr::GET(
             granules_href[i],
             httr::write_disk(file.path(outdir, basename(granules_href)[i]), overwrite = T),
-            httr::config(netrc = TRUE, netrc_file = netrc_file),
-            httr::set_cookies("LC" = "cookies")
+            httr::authenticate(user = Sys.getenv("ed_un"),
+                               password = Sys.getenv("ed_pw"))
           )
         }
       }
@@ -209,4 +241,24 @@ NASA_CMR_finder <- function(doi) {
   concept_id <- results$feed$entry %>% purrr::map("id") %>% unlist
   # return results.
   return(as.list(data.frame(cbind(provider, concept_id))))
+}
+
+#' Set NASA DAAC credentials to the current environment.
+#'
+#' @param replace Boolean: determine if we want to replace the current
+#' credentials from the environment.
+#'
+#' @author Dongchen Zhang
+DAAC_Set_Credential <- function(replace = FALSE) {
+  if (replace) {
+    PEcAn.logger::logger.info("Replacing previously stored NASA DAAC credentials.")
+  }
+  if (replace | nchar(Sys.getenv("ed_un")) == 0 | nchar(Sys.getenv("ed_pw")) == 0) {
+    Sys.setenv(ed_un = getPass::getPass(msg = "Enter NASA Earthdata Login Username \n (or create an account at urs.earthdata.nasa.gov) :"),
+               ed_pw = getPass::getPass(msg = "Enter NASA Earthdata Login Password:"))
+  }
+}
\ No newline at end of file
diff --git a/modules/data.remote/man/DAAC_Set_Credential.Rd b/modules/data.remote/man/DAAC_Set_Credential.Rd
new file mode 100644
index 0000000000..c64fbf51ef
--- /dev/null
+++ b/modules/data.remote/man/DAAC_Set_Credential.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/NASA_DAAC_download.R
+\name{DAAC_Set_Credential}
+\alias{DAAC_Set_Credential}
+\title{Set NASA DAAC credentials to the current environment.}
+\usage{
+DAAC_Set_Credential(replace = FALSE)
+}
+\arguments{
+\item{replace}{Boolean: determine if we want to replace the current
+credentials from the environment.}
+}
+\description{
+Set NASA DAAC credentials to the current environment.
+}
+\author{
+Dongchen Zhang
+}
diff --git a/modules/data.remote/man/NASA_DAAC_download.Rd b/modules/data.remote/man/NASA_DAAC_download.Rd
index b41cfc920c..a76ab85fe0 100644
--- a/modules/data.remote/man/NASA_DAAC_download.Rd
+++ b/modules/data.remote/man/NASA_DAAC_download.Rd
@@ -14,7 +14,6 @@ NASA_DAAC_download(
   to,
   outdir = getwd(),
   doi,
-  netrc_file = NULL,
   just_path = FALSE
 )
 }
@@ -42,8 +41,6 @@ downloaded files.
Default is the current work directory(getwd()).} directly from the NASA ORNL DAAC data portal (e.g., GEDI L4A through https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=2056).} -\item{netrc_file}{Character: path to the credential file, default is NULL.} - \item{just_path}{Boolean: if we just want the metadata and URL or proceed the actual download.} } \value{ From 193b2d05706cdd1f180fe0bfb4bc4643ee220ee8 Mon Sep 17 00:00:00 2001 From: Dongchen Zhang Date: Fri, 21 Feb 2025 10:54:45 -0500 Subject: [PATCH 2/9] Add SMAP prep function using DAAC server. --- modules/data.remote/R/SMAP_SMP_prep.R | 117 +++++++++++++++--- .../man/Prep.SMAP.CSV.from.DAAC.Rd | 39 ++++++ 2 files changed, 139 insertions(+), 17 deletions(-) create mode 100644 modules/data.remote/man/Prep.SMAP.CSV.from.DAAC.Rd diff --git a/modules/data.remote/R/SMAP_SMP_prep.R b/modules/data.remote/R/SMAP_SMP_prep.R index 5eca6d1675..31dca21f81 100644 --- a/modules/data.remote/R/SMAP_SMP_prep.R +++ b/modules/data.remote/R/SMAP_SMP_prep.R @@ -16,7 +16,7 @@ #' @author Dongchen Zhang #' @importFrom magrittr %>% SMAP_SMP_prep <- function(site_info, start_date, end_date, time_points, - outdir, search_window = 30, export_csv = TRUE, update_csv = FALSE){ + outdir, search_window = 30, export_csv = TRUE, update_csv = FALSE){ #note that, the SMAP_gee.csv file comes from Google Earth Engine (GEE) directly. #Code for generating this file can be found through this link: #https://code.earthengine.google.com/ecbeb770e576d8ef72f72f5f12da3496 @@ -37,20 +37,20 @@ SMAP_SMP_prep <- function(site_info, start_date, end_date, time_points, }else{ SMAP_CSV <- utils::read.csv(file.path(outdir, "SMAP_gee.csv"))[-1,2] %>% furrr::future_map(function(string){ - String <- strsplit(gsub(",", "", gsub("\\[|\\]", "", string)), " ")[[1]] - date <- as.Date(strsplit(String[1], "_")[[1]][5], "%Y%m%d") - lon <- as.numeric(String[2]) - lat <- as.numeric(String[3]) - smp <- as.numeric(String[5]) * 100 - sd <- 0.04 * 100 #From Daniel - - #Match current lon/lat with site_info - Longlat_matrix <- matrix(c(lon, site_info$lon, lat, site_info$lat), ncol=2) - Distance <- sp::spDistsN1(Longlat_matrix, Longlat_matrix[1,], longlat = TRUE)[-1] - distloc <- match(min(Distance), Distance) - site_id <- site_info$site_id[distloc] - list(date = date, site_id = site_id, lat = lat, lon = lon, smp = smp, sd = sd)#in date, id, lat, lon, smp, sd - }, .progress = T) %>% dplyr::bind_rows() + String <- strsplit(gsub(",", "", gsub("\\[|\\]", "", string)), " ")[[1]] + date <- as.Date(strsplit(String[1], "_")[[1]][5], "%Y%m%d") + lon <- as.numeric(String[2]) + lat <- as.numeric(String[3]) + smp <- as.numeric(String[5]) * 100 + sd <- 0.04 * 100 #From Daniel + + #Match current lon/lat with site_info + Longlat_matrix <- matrix(c(lon, site_info$lon, lat, site_info$lat), ncol=2) + Distance <- sp::spDistsN1(Longlat_matrix, Longlat_matrix[1,], longlat = TRUE)[-1] + distloc <- match(min(Distance), Distance) + site_id <- site_info$site_id[distloc] + list(date = date, site_id = site_id, lat = lat, lon = lon, smp = smp, sd = sd)#in date, id, lat, lon, smp, sd + }, .progress = T) %>% dplyr::bind_rows() #write out csv file. 
       if(as.logical((export_csv))){
         utils::write.csv(SMAP_CSV, file = file.path(outdir, "SMAP.csv"), row.names = F)
       }
     }
   }else{
     #TODO: When current SMAP.csv need to be updated
-    SMAP_CSV <- utils::read.csv(file.path(outdir, "SMAP.csv"))
-    Current_years <- sort(unique(lubridate::year(SMAP_CSV$date)))
+    SMAP_CSV <- utils::read.csv(file.path(outdir, "SMAP.csv"),
+                                colClasses = c(rep("character", 2), rep("numeric", 4)))
+    Current_years <- sort(unique(lubridate::year(as.Date(SMAP_CSV$date))))
     Required_years <- lubridate::year(start_date):lubridate::year(end_date)
     Required_years <- Required_years[which(Required_years>=2015)] #SMAP data only available after year 2015.
     if(sum(!Required_years%in%Current_years)){
@@ -96,4 +97,86 @@ SMAP_SMP_prep <- function(site_info, start_date, end_date, time_points,
   }
   PEcAn.logger::logger.info("SMAP SMP Prep Completed!")
   list(SMP_Output = SMAP_Output, time_points = time_points, var = "SoilMoistFrac")
+}
+
+#' Prepare SMAP soil moisture profile (SMP) data from the NASA DAAC server for the SDA workflow.
+#' The CSV file that works for the `SMAP_SMP_prep` function will be exported.
+#'
+#' @param site_info list: Bety list of site info including site_id, lon, and lat.
+#' @param extent numeric: a vector containing the bounding box that covers all sites (West longitude, East longitude, South latitude, North latitude).
+#' @param from character: the start time for searching the SMAP products.
+#' @param to character: the end time for searching the SMAP products.
+#' @param download.outdir character: where the downloaded SMAP files will be stored.
+#' @param csv.outdir character: where the final CSV file will be stored.
+#'
+#' @return A data frame containing SMP and sd for each site and each time step.
+#' @export
+#'
+#' @author Dongchen Zhang
+#' @importFrom magrittr %>%
+Prep.SMAP.CSV.from.DAAC <- function(site_info, extent, from, to, download.outdir, csv.outdir) {
+  # SMAP CRS, EPSG:6933.
+  smap.crs <- "+proj=cea +lat_ts=30 +lon_0=0 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs"
+  # load previous CSV file.
+  if (file.exists(file.path(csv.outdir, "SMAP.csv"))) {
+    previous.csv <- utils::read.csv(file.path(csv.outdir, "SMAP.csv"),
+                                    colClasses = c("character", rep("numeric", 5)))
+  } else {
+    previous.csv <- NULL
+  }
+  # reproject site locations to the SMAP projection.
+  site.ids <- site_info$site_id
+  site.locs <- cbind(site_info$lon, site_info$lat) %>%
+    `colnames<-`(c("lon","lat")) %>%
+    `rownames<-`(site.ids)
+  pts <- data.frame(lon = site.locs[,1], lat = site.locs[,2])
+  sp::coordinates(pts) <- ~lon+lat
+  sp::proj4string(pts) <- sp::CRS("+proj=longlat +datum=WGS84")
+  pts.reproj <- sp::spTransform(pts, smap.crs)
+  coords.reproj <- sp::coordinates(pts.reproj) %>% `colnames<-`(c("x", "y"))
+  # download data.
+  metadata <- NASA_DAAC_download(ul_lat = extent[4],
+                                 ul_lon = extent[1],
+                                 lr_lat = extent[3],
+                                 lr_lon = extent[2],
+                                 from = from,
+                                 to = to,
+                                 just_path = F,
+                                 outdir = download.outdir,
+                                 doi = "10.5067/LWJ6TF5SZRG3",
+                                 ncore = parallel::detectCores()-1)
+  smap.out <- metadata$path
+  file <- smap.out[1] # select the first file.
+  # grab SMAP extents; these come from the ArcGIS report for the SMAP H5 file.
+  smap.ext <- c(-17363027.292480, 17367529.945160, -7319045.227051, 7310037.171387) %>% terra::ext()
+  # convert h5 file to raster.
+  smap <- terra::rast(file)
+  # add extent and crs to the raster.
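+  # Only the first downloaded granule is read here (see `smap.out[1]` above).
+  # terra cannot recover georeferencing from the SMAP H5 file itself, so the
+  # EASE-Grid 2.0 (EPSG:6933) extent and CRS defined above are stamped onto
+  # the raster below before any values are extracted.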
+  terra::ext(smap) <- smap.ext
+  terra::crs(smap) <- smap.crs
+  # extract values for smp and std.
+  smp <- terra::extract(smap[["sm_profile_analysis"]], coords.reproj) * 100
+  std <- terra::extract(smap[["sm_profile_analysis_ensstd"]], coords.reproj) * 100
+  # construct final data frame.
+  outputs <- c()
+  for (i in seq_along(site.ids)) {
+    outputs <- rbind(outputs, data.frame(date = as.character(from),
+                                         site_id = site.ids[i],
+                                         lat = site.locs[i, "lat"],
+                                         lon = site.locs[i, "lon"],
+                                         smp = smp[i,],
+                                         sd = std[i,]))
+  }
+  # remove NAs.
+  outputs <- outputs[which(!is.na(outputs$smp)),]
+  # combine with previous CSV file.
+  if (!is.null(previous.csv)) {
+    outputs <- rbind(previous.csv, outputs)
+    outputs <- outputs[!duplicated(outputs),]
+  }
+  # write into CSV file.
+  utils::write.csv(outputs, file = file.path(csv.outdir, "SMAP.csv"), row.names = F)
+  # delete downloaded files.
+  unlink(list.files(download.outdir, full.names = T), recursive = T)
+  return(outputs)
+}
\ No newline at end of file
diff --git a/modules/data.remote/man/Prep.SMAP.CSV.from.DAAC.Rd b/modules/data.remote/man/Prep.SMAP.CSV.from.DAAC.Rd
new file mode 100644
index 0000000000..352b2616c6
--- /dev/null
+++ b/modules/data.remote/man/Prep.SMAP.CSV.from.DAAC.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SMAP_SMP_prep.R
+\name{Prep.SMAP.CSV.from.DAAC}
+\alias{Prep.SMAP.CSV.from.DAAC}
+\title{Prepare SMAP soil moisture profile (SMP) data from the NASA DAAC server for the SDA workflow.
+The CSV file that works for the `SMAP_SMP_prep` function will be exported.}
+\usage{
+Prep.SMAP.CSV.from.DAAC(
+  site_info,
+  extent,
+  from,
+  to,
+  download.outdir,
+  csv.outdir
+)
+}
+\arguments{
+\item{site_info}{list: Bety list of site info including site_id, lon, and lat.}
+
+\item{extent}{numeric: a vector containing the bounding box that covers all sites (West longitude, East longitude, South latitude, North latitude).}
+
+\item{from}{character: the start time for searching the SMAP products.}
+
+\item{to}{character: the end time for searching the SMAP products.}
+
+\item{download.outdir}{character: where the downloaded SMAP files will be stored.}
+
+\item{csv.outdir}{character: where the final CSV file will be stored.}
+}
+\value{
+A data frame containing SMP and sd for each site and each time step.
+}
+\description{
+Prepare SMAP soil moisture profile (SMP) data from the NASA DAAC server for the SDA workflow.
+The CSV file that works for the `SMAP_SMP_prep` function will be exported.
+}
+\author{
+Dongchen Zhang
+}
From 16335e423de0344a2290537d21a5873c6654c6ed Mon Sep 17 00:00:00 2001
From: Dongchen Zhang
Date: Fri, 21 Feb 2025 10:55:10 -0500
Subject: [PATCH 3/9] Add MODIS LAI prep function using the NASA DAAC server.

---
 modules/data.remote/R/MODIS_LAI_prep.R        | 178 ++++++++++++++++--
 .../man/Prep.MODIS.CSV.from.DAAC.Rd           |  37 ++++
 2 files changed, 204 insertions(+), 11 deletions(-)
 create mode 100644 modules/data.remote/man/Prep.MODIS.CSV.from.DAAC.Rd

diff --git a/modules/data.remote/R/MODIS_LAI_prep.R b/modules/data.remote/R/MODIS_LAI_prep.R
index 6982f19229..5709155165 100644
--- a/modules/data.remote/R/MODIS_LAI_prep.R
+++ b/modules/data.remote/R/MODIS_LAI_prep.R
@@ -1,19 +1,19 @@
 #' Prepare MODIS LAI data for the SDA workflow.
 #'
-#' @param site_info Bety list of site info including site_id, lon, and lat.
-#' @param time_points A vector contains each time point within the start and end date.
-#' @param outdir Where the final CSV file will be stored.
-#' @param search_window search window for locate available LAI values. -#' @param export_csv Decide if we want to export the CSV file. -#' @param sd_threshold Threshold for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL. +#' @param site_info list: Bety list of site info including site_id, lon, and lat. +#' @param time_points character: a vector contains each time point within the start and end date. +#' @param outdir character: where the final CSV file will be stored. +#' @param search_window numeric: search window for locate available LAI values. +#' @param export_csv boolean: decide if we want to export the CSV file. +#' @param sd_threshold numeric: for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL. +#' @param skip.download boolean: determine if we want to use existing LAI.csv file and skip the MODIS LAI download part. #' #' @return A data frame containing LAI and sd for each site and each time step. #' @export #' -#' @examples #' @author Dongchen Zhang #' @importFrom magrittr %>% -MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window = 30, export_csv = FALSE, sd_threshold = 20){ +MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window = 30, export_csv = FALSE, sd_threshold = 20, skip.download = TRUE){ #initialize future parallel computation. if (future::supportsMulticore()) { future::plan(future::multicore, workers = 10) @@ -37,7 +37,8 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window #if we have previous downloaded CSV file if(file.exists(file.path(outdir, "LAI.csv"))){ PEcAn.logger::logger.info("Extracting previous LAI file!") - Previous_CSV <- utils::read.csv(file.path(outdir, "LAI.csv")) + Previous_CSV <- utils::read.csv(file.path(outdir, "LAI.csv"), + colClasses = c(rep("character", 2), rep("numeric", 4), "character")) if (!is.null(sd_threshold)) { PEcAn.logger::logger.info("filtering out records with high standard errors!") ind.rm <- which(Previous_CSV$sd >= sd_threshold) @@ -77,7 +78,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window #only Site that has NA for any time points need to be downloaded. new_site_info <- site_info %>% purrr::map(function(x)x[!stats::complete.cases(LAI_Output)]) #TODO: only download data for specific date when we have missing data. - if(length(new_site_info$site_id) != 0){ + if(length(new_site_info$site_id) != 0 && !skip.download){ product <- "MCD15A3H" PEcAn.logger::logger.info("Extracting LAI mean products!") lai_mean <- split(as.data.frame(new_site_info), seq(nrow(as.data.frame(new_site_info)))) %>% @@ -159,7 +160,8 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window lat = new_site_info$lat[i], lon = new_site_info$lon[i], lai = lai_mean[[i]]$mean[j]*0.1, - sd = lai_std[[i]][j]*0.1)) + sd = lai_std[[i]][j]*0.1, + qc = lai_qc[[i]][j])) } } #Compare with existing CSV file. (We name the CSV file as LAI.csv) @@ -192,4 +194,158 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window } PEcAn.logger::logger.info("MODIS LAI Prep Completed!") list(LAI_Output = LAI_Output, time_points = time_points, var = "LAI") +} + +#' Prepare MODIS LAI data from the NASA DAAC server for the SDA workflow. +#' +#' @param site_info list: Bety list of site info including site_id, lon, and lat. 
+#' @param extent numeric: A vector contains the bounding box that covers all sites (West longitude, East longitude, South latitude ,North latitude).
+#' @param from character: the start time for searching the MODIS products.
+#' @param to character: the end time for searching the MODIS products.
+#' @param download.outdir character: Where the MODIS tiles will be stored.
+#' @param csv.outdir character: Where the final CSV file will be stored.
+#'
+#' @return A data frame containing LAI and sd for each site and each time step.
+#' @export
+#'
+#' @author Dongchen Zhang
+#' @importFrom magrittr %>%
+Prep.MODIS.CSV.from.DAAC <- function(site_info, extent, from, to, download.outdir, csv.outdir) {
+  # load previous CSV file.
+  if (file.exists(file.path(csv.outdir, "LAI.csv"))) {
+    previous.csv <- utils::read.csv(file.path(csv.outdir, "LAI.csv"),
+                                    colClasses = c("character", rep("numeric", 5), "character"))
+  } else {
+    previous.csv <- NULL
+  }
+  # setup the foreach parallel computation.
+  cl <- parallel::makeCluster(parallel::detectCores())
+  doParallel::registerDoParallel(cl)
+  # reproject site locations to the MODIS sinusoidal projection.
+  site.ids <- site_info$site_id
+  site.locs <- cbind(site_info$lon, site_info$lat) %>%
+    `colnames<-`(c("lon","lat")) %>%
+    `rownames<-`(site.ids)
+  # create vector based on coordinates and MODIS projection.
+  pts <- data.frame(lon = site.locs[,1], lat = site.locs[,2])
+  sp::coordinates(pts) <- ~lon+lat
+  sp::proj4string(pts) <- sp::CRS("+proj=longlat +datum=WGS84")
+  pts.reproj <- sp::spTransform(pts, "+proj=sinu +a=6371007.181 +b=6371007.181 +units=m")
+  coords.reproj <- sp::coordinates(pts.reproj) %>% `colnames<-`(c("x", "y"))
+  # download data.
+  metadata <- NASA_DAAC_download(ul_lat = extent[4],
+                                 ul_lon = extent[1],
+                                 lr_lat = extent[3],
+                                 lr_lon = extent[2],
+                                 from = from,
+                                 to = to,
+                                 just_path = F,
+                                 outdir = download.outdir,
+                                 doi = "10.5067/MODIS/MCD15A3H.061",
+                                 ncore = parallel::detectCores()-1)
+  # grab file paths for downloaded hdf files.
+  modis.out <- list.files(download.outdir, full.names = T, pattern = "*.hdf")
+  # grab id for each file.
+  ids <- basename(modis.out)
+  # split string.
+  splits <- strsplit(x = ids, split = ".", fixed = T)
+  # collect attributes table (date, ih, and iv).
+  dates <- ivs <- ihs <- c()
+  for (i in seq_along(ids)) {
+    temp <- splits[[i]]
+    # calculate dates based on year and day of year.
+    y.doy <- substr(temp[[2]], 2, 8)
+    year <- substr(y.doy, 1, 4)
+    doy <- substr(y.doy, 5, 7)
+    dates <- c(dates, as.character(as.Date(paste(year, doy), format="%Y %j")))
+    # extract tile id.
+    ihs <- c(ihs, substr(temp[[3]], 2, 3))
+    ivs <- c(ivs, substr(temp[[3]], 5, 6))
+  }
+  # create data frame of extents for MODIS tiles.
+  # record progress.
+  message("\nCreating information table for downloaded MODIS tiles.")
+  doSNOW::registerDoSNOW(cl)
+  pb <- utils::txtProgressBar(min=1, max=length(modis.out), style=3)
+  progress <- function(n) utils::setTxtProgressBar(pb, n)
+  opts <- list(progress=progress)
+  MODIS.tiles.extents <- foreach::foreach(
+    tile = modis.out,
+    .packages=c("terra", "Kendall"),
+    .options.snow=opts
+  ) %dopar% {
+    return(as.vector(terra::ext(terra::rast(tile))))
+  } %>% dplyr::bind_rows()
+  MODIS.tiles.info <- cbind(dates, ihs, ivs, MODIS.tiles.extents)
+  # filter dates.
+  unique.dates <- unique(MODIS.tiles.info$dates)
+  unique.dates <- unique.dates[which(as.Date(unique.dates) >= from &
+                                       as.Date(unique.dates) <= to)]
+  # find inds of file paths that match each point.
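+  # Rather than computing MODIS h/v tile indexes from the sinusoidal grid math,
+  # each site is matched to every downloaded tile whose projected extent contains
+  # the site's x/y coordinates within the requested date range. This assumes the
+  # site_id values are usable as row indexes into coords.reproj (i.e., 1..n).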
+ message("\nFinding indexes of file paths that match each point.") + pb <- utils::txtProgressBar(min=1, max=length(pts), style=3) + progress <- function(n) utils::setTxtProgressBar(pb, n) + opts <- list(progress=progress) + tile.ids <- foreach::foreach( + i = as.numeric(site.ids), + .packages="Kendall", + .options.snow=opts + ) %dopar% { + # filter tile position ids based on lat/lon range. + inds <- which(as.Date(MODIS.tiles.info$dates) >= from & + as.Date(MODIS.tiles.info$dates) <= to & + coords.reproj[i, "x"] > MODIS.tiles.info$xmin & + coords.reproj[i, "x"] < MODIS.tiles.info$xmax & + coords.reproj[i, "y"] > MODIS.tiles.info$ymin & + coords.reproj[i, "y"] < MODIS.tiles.info$ymax) + names(inds) <- rep(i,length(inds)) + return(inds) + } %>% unlist + # extract MODIS products. + # loop over tiles. + message("\nExtracting MODIS datasets by points.") + unique.tile.ids <- sort(unique(tile.ids)) + pb <- utils::txtProgressBar(min=1, max=length(unique.tile.ids), style=3) + progress <- function(n) utils::setTxtProgressBar(pb, n) + opts <- list(progress=progress) + outputs <- foreach::foreach( + i = unique.tile.ids, + .packages=c("purrr", "Kendall"), + .options.snow=opts + ) %dopar% { + temp.tile <- terra::rast(modis.out[i]) + points.inds <- as.numeric(names(tile.ids)[which(tile.ids == i)]) + temp.res <- terra::extract(temp.tile, data.frame(x = coords.reproj[points.inds, "x"], + y = coords.reproj[points.inds, "y"])) + # convert QC flag. + qc <- temp.res[,4] %>% purrr::map(function(v){ + qc_flag <- intToBits(as.integer(v)) # NB big-endian (ones place first) + qc_flag <- as.integer(rev(utils::head(qc_flag, 3))) # now ones place last + paste(qc_flag, collapse = "") + }) %>% unlist + # write into table. + temp.res <- data.frame(date = MODIS.tiles.info$dates[i], + site_id = points.inds, + lat = site.locs[points.inds, "lat"], + lon = site.locs[points.inds, "lon"], + lai = temp.res[,3], + sd = temp.res[,7], + qc = qc) + return(temp.res) + } %>% dplyr::bind_rows() + # stop parallel. + parallel::stopCluster(cl) + foreach::registerDoSEQ() + # combine with previous CSV file. + if (!is.null(previous.csv)) { + outputs <- rbind(previous.csv, outputs) + outputs <- outputs[!duplicated(outputs),] + } + # filter by QC band. + outputs <- outputs[which(outputs$qc %in% c("000", "001")),] + # write into CSV file. + write.csv(outputs, file = file.path(csv.outdir, "LAI.csv"), row.names = F) + # delete downloaded files. 
+ unlink(list.files(download.outdir, full.names = T), recursive = T) + return(outputs) } \ No newline at end of file diff --git a/modules/data.remote/man/Prep.MODIS.CSV.from.DAAC.Rd b/modules/data.remote/man/Prep.MODIS.CSV.from.DAAC.Rd new file mode 100644 index 0000000000..7aba4579b6 --- /dev/null +++ b/modules/data.remote/man/Prep.MODIS.CSV.from.DAAC.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MODIS_LAI_prep.R +\name{Prep.MODIS.CSV.from.DAAC} +\alias{Prep.MODIS.CSV.from.DAAC} +\title{Prepare MODIS LAI data from the NASA DAAC server for the SDA workflow.} +\usage{ +Prep.MODIS.CSV.from.DAAC( + site_info, + extent, + from, + to, + download.outdir, + csv.outdir +) +} +\arguments{ +\item{site_info}{list: Bety list of site info including site_id, lon, and lat.} + +\item{extent}{numeric: A vector contains the bounding box that covers all sites (West longitude, East longitude, South latitude ,North latitude).} + +\item{from}{character: the start time for searching the MODIS products.} + +\item{to}{character: the end time for searching the MODIS products.} + +\item{download.outdir}{character: Where the MODIS tiles will be stored.} + +\item{csv.outdir}{character: Where the final CSV file will be stored.} +} +\value{ +A data frame containing LAI and sd for each site and each time step. +} +\description{ +Prepare MODIS LAI data from the NASA DAAC server for the SDA workflow. +} +\author{ +Dongchen Zhang +} From d3aca4ffbb1c6d5aa9c7514085f5be9922fab007 Mon Sep 17 00:00:00 2001 From: Dongchen Zhang Date: Fri, 21 Feb 2025 10:55:28 -0500 Subject: [PATCH 4/9] Improve the AGB IC prep function. --- .../R/Prep_AGB_IC_from_2010_global.R | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/modules/data.remote/R/Prep_AGB_IC_from_2010_global.R b/modules/data.remote/R/Prep_AGB_IC_from_2010_global.R index 8b3a4c9b09..cd561a4024 100644 --- a/modules/data.remote/R/Prep_AGB_IC_from_2010_global.R +++ b/modules/data.remote/R/Prep_AGB_IC_from_2010_global.R @@ -20,24 +20,27 @@ Prep_AGB_IC_from_2010_global <- function(site_info, paths.list, ens) { future::plan(future::multisession) } ## get coordinates and provide spatial info - site_coords <- data.frame(site_info$lon, site_info$lat) - names(site_coords) <- c("Longitude","Latitude") - coords_latlong <- sp::SpatialPoints(site_coords) - sp::proj4string(coords_latlong) <- sp::CRS("+init=epsg:4326") + coords_latlong <- terra::vect(cbind(site_info$lon, site_info$lat), + crs = "+proj=longlat +datum=WGS84 +no_defs") ## load gridded AGB data - raster_data <- lapply(paths.list, raster::raster) + raster_data <- terra::rast(unlist(paths.list)) ## reproject Lat/Long site coords to AGB Albers Equal-Area - coords_AEA <- sp::spTransform(coords_latlong, - raster::crs(raster::raster(raster_data[[1]]))) - ## prepare product for extraction - stack requested years - raster_data_stack <- raster::stack(raster_data) + coords_AEA <- terra::project(coords_latlong, terra::crs(raster_data)) ## extract - agb_pixel <- raster::extract(x = raster_data_stack, - y = coords_AEA, buffer=0, fun=NULL, df=FALSE) - sampled_ic <- agb_pixel %>% furrr::future_map(function(pixel){ - ens_sample <- stats::rnorm(ens, pixel["mean"], pixel["uncertainty"]) - ens_sample[which(ens_sample<0)] <- 0 - ens_sample - }, .progress = T) %>% dplyr::bind_cols() %>% `colnames<-`(site_info$site_id) + agb_pixel <- terra::extract(x = raster_data, y = coords_AEA) %>% + `colnames<-`(c("site.id", "mean", "uncertainty")) + sampled_ic <- 
split(as.data.frame(agb_pixel[, c(2,3)]), + seq(nrow(as.data.frame(agb_pixel[, c(2,3)])))) %>% + furrr::future_map(function(pixel){ + # make sure there is never a zero uncertainty. + if (pixel[,"uncertainty"] == 0) { + uncertainty <- .1 + } else { + uncertainty <- pixel[,"uncertainty"] + } + ens_sample <- stats::rnorm(n = ens, mean = pixel[,"mean"], sd = uncertainty) + ens_sample[which(ens_sample<0)] <- 0 + ens_sample + }, .progress = T) %>% dplyr::bind_cols() %>% `colnames<-`(site_info$site_id) return(sampled_ic) } \ No newline at end of file From 43ffc00fdbcfe3bb2d71ba70b3b25a1f5fc38608 Mon Sep 17 00:00:00 2001 From: Dongchen Zhang Date: Fri, 21 Feb 2025 10:55:40 -0500 Subject: [PATCH 5/9] Update documentation. --- modules/data.remote/NAMESPACE | 2 ++ modules/data.remote/man/MODIS_LAI_prep.Rd | 17 ++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/modules/data.remote/NAMESPACE b/modules/data.remote/NAMESPACE index e54e6c5d77..53dee66ee5 100644 --- a/modules/data.remote/NAMESPACE +++ b/modules/data.remote/NAMESPACE @@ -5,6 +5,8 @@ export(Landtrendr_AGB_prep) export(MODIS_LAI_prep) export(MODIS_LC_prep) export(NASA_DAAC_download) +export(Prep.MODIS.CSV.from.DAAC) +export(Prep.SMAP.CSV.from.DAAC) export(Prep_AGB_IC_from_2010_global) export(SMAP_SMP_prep) export(call_MODIS) diff --git a/modules/data.remote/man/MODIS_LAI_prep.Rd b/modules/data.remote/man/MODIS_LAI_prep.Rd index d70b416b75..5c9b15410e 100644 --- a/modules/data.remote/man/MODIS_LAI_prep.Rd +++ b/modules/data.remote/man/MODIS_LAI_prep.Rd @@ -10,21 +10,24 @@ MODIS_LAI_prep( outdir = NULL, search_window = 30, export_csv = FALSE, - sd_threshold = 20 + sd_threshold = 20, + skip.download = TRUE ) } \arguments{ -\item{site_info}{Bety list of site info including site_id, lon, and lat.} +\item{site_info}{list: Bety list of site info including site_id, lon, and lat.} -\item{time_points}{A vector contains each time point within the start and end date.} +\item{time_points}{character: a vector contains each time point within the start and end date.} -\item{outdir}{Where the final CSV file will be stored.} +\item{outdir}{character: where the final CSV file will be stored.} -\item{search_window}{search window for locate available LAI values.} +\item{search_window}{numeric: search window for locate available LAI values.} -\item{export_csv}{Decide if we want to export the CSV file.} +\item{export_csv}{boolean: decide if we want to export the CSV file.} -\item{sd_threshold}{Threshold for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL.} +\item{sd_threshold}{numeric: for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL.} + +\item{skip.download}{boolean: determine if we want to use existing LAI.csv file and skip the MODIS LAI download part.} } \value{ A data frame containing LAI and sd for each site and each time step. From d76875f8b308644b3d62e505767d8e561bdc9e9d Mon Sep 17 00:00:00 2001 From: Dongchen Zhang Date: Fri, 21 Feb 2025 11:56:08 -0500 Subject: [PATCH 6/9] Replace message with pecan function. 
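Using the PEcAn logger keeps progress reporting consistent with the rest of
the package and lets messages be filtered by severity. The substitution is
mechanical, e.g.:

    message("\nExtracting MODIS datasets by points.")                    # before
    PEcAn.logger::logger.info("\nExtracting MODIS datasets by points.")  # after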
---
 modules/data.remote/R/MODIS_LAI_prep.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/modules/data.remote/R/MODIS_LAI_prep.R b/modules/data.remote/R/MODIS_LAI_prep.R
index 5709155165..18342e1172 100644
--- a/modules/data.remote/R/MODIS_LAI_prep.R
+++ b/modules/data.remote/R/MODIS_LAI_prep.R
@@ -264,7 +264,7 @@ Prep.MODIS.CSV.from.DAAC <- function(site_info, extent, from, to, download.outdi
   }
   # create data frame of extents for MODIS tiles.
   # record progress.
-  message("\nCreating information table for downloaded MODIS tiles.")
+  PEcAn.logger::logger.info("\nCreating information table for downloaded MODIS tiles.")
   doSNOW::registerDoSNOW(cl)
   pb <- utils::txtProgressBar(min=1, max=length(modis.out), style=3)
   progress <- function(n) utils::setTxtProgressBar(pb, n)
@@ -282,7 +282,7 @@ Prep.MODIS.CSV.from.DAAC <- function(site_info, extent, from, to, download.outdi
   unique.dates <- unique.dates[which(as.Date(unique.dates) >= from &
                                        as.Date(unique.dates) <= to)]
   # find inds of file paths that match each point.
-  message("\nFinding indexes of file paths that match each point.")
+  PEcAn.logger::logger.info("\nFinding indexes of file paths that match each point.")
   pb <- utils::txtProgressBar(min=1, max=length(pts), style=3)
   progress <- function(n) utils::setTxtProgressBar(pb, n)
   opts <- list(progress=progress)
@@ -303,7 +303,7 @@ Prep.MODIS.CSV.from.DAAC <- function(site_info, extent, from, to, download.outdi
   } %>% unlist
   # extract MODIS products.
   # loop over tiles.
-  message("\nExtracting MODIS datasets by points.")
+  PEcAn.logger::logger.info("\nExtracting MODIS datasets by points.")
   unique.tile.ids <- sort(unique(tile.ids))
   pb <- utils::txtProgressBar(min=1, max=length(unique.tile.ids), style=3)
   progress <- function(n) utils::setTxtProgressBar(pb, n)
From 458aa74bb88a6f0a5ceb48a1c10fef8f0ea8c34e Mon Sep 17 00:00:00 2001
From: Dongchen Zhang
Date: Fri, 28 Feb 2025 11:24:26 -0500
Subject: [PATCH 7/9] Add documentation and examples in the Create settings
 script for the updated MODIS LAI prep functions.

---
 book_source/03_topical_pages/03_pecan_xml.Rmd | 15 +++++++++++++--
 .../MultiSite-Exs/SDA/Create_Multi_settings.R |  6 +++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/book_source/03_topical_pages/03_pecan_xml.Rmd b/book_source/03_topical_pages/03_pecan_xml.Rmd
index 4e6e6715a6..9de19c75cd 100644
--- a/book_source/03_topical_pages/03_pecan_xml.Rmd
+++ b/book_source/03_topical_pages/03_pecan_xml.Rmd
@@ -806,6 +806,11 @@ The following tags can be used for state data assimilation. More detailed inform
     <unit>year</unit>
     <num>1</num>
   </timestep>
+  <sd_threshold>20</sd_threshold>
+  <boundary>
+    <upper_quantile>0.95</upper_quantile>
+    <lower_quantile>0.05</lower_quantile>
+  </boundary>
   <search_window>30</search_window>
@@ -909,7 +914,7 @@ The following tags can be used for observation preparation for the SDA workflow.
-      <AGB_indir>/projectnb/dietzelab/dongchen/Multi-site/download_500_sites/AGB</AGB_indir>
+      <AGB_indir>/projectnb/dietzelab/dongchen/Multi-site/download_500_sites/AGB</AGB_indir>
       <allow_download>TRUE</allow_download>
       <export_csv>TRUE</export_csv>
@@ -926,6 +931,11 @@ The following tags can be used for observation preparation for the SDA workflow.
        <unit>year</unit>
        <num>1</num>
      </timestep>
+     <sd_threshold>20</sd_threshold>
+     <boundary>
+       <upper_quantile>0.95</upper_quantile>
+       <lower_quantile>0.05</lower_quantile>
+     </boundary>
@@ -969,7 +979,8 @@ For the MODIS LAI and SMAP soil moisture observations, the `search_window` speci
 * **search_window** : [optional] This tag is used for LAI and SMAP observation preparation functions, which defines the search window within which the closest observation to the target date is selected. The default value is 30 days.
 * **update_csv** : [optional] This tag is used for SMAP preparation function only, which decides if we want to update the CSV file.
Normally, we will only set it to be TRUE if we redownload the SMAP_GEE csv file from Google Earth Engine; the tutorial can be found in the SMAP_SMP_prep function itself. The default value is FALSE.
* **run_parallel** : [optional] This tag defines if you want to run the MODIS LAI preparation in parallel; the default value is FALSE.
-
+* **sd_threshold** : [optional] This tag defines the maximum allowable standard error for the MODIS LAI records. Any record with a higher standard error will be removed.
+* **boundary** : [optional] This tag defines the upper and lower boundaries (in quantiles) used to filter noisy records out of the LAI time series.
 
 ### (experimental) Benchmarking {#xml-benchmarking}
diff --git a/modules/assim.sequential/inst/MultiSite-Exs/SDA/Create_Multi_settings.R b/modules/assim.sequential/inst/MultiSite-Exs/SDA/Create_Multi_settings.R
index 0202d56ced..13890d7d1c 100644
--- a/modules/assim.sequential/inst/MultiSite-Exs/SDA/Create_Multi_settings.R
+++ b/modules/assim.sequential/inst/MultiSite-Exs/SDA/Create_Multi_settings.R
@@ -31,6 +31,9 @@ LAI_search_window <- 30
 LAI_timestep <- list(unit="year", num=1)
 LAI_export_csv <- TRUE
 run_parallel <- TRUE
+sd_threshold <- 20
+upper_quantile <- 0.95
+lower_quantile <- 0.05
 
 #SMP
 SMP_search_window <- 30
@@ -128,7 +131,8 @@ template <- PEcAn.settings::Settings(list(
       Obs_Prep = structure(list(
         Landtrendr_AGB = structure(list(AGB_indir = AGB_indir, timestep = AGB_timestep,
                                         allow_download = allow_download, export_csv = AGB_export_csv)),
-        MODIS_LAI = structure(list(search_window = LAI_search_window, timestep = LAI_timestep, export_csv = LAI_export_csv, run_parallel = run_parallel)),
+        MODIS_LAI = structure(list(search_window = LAI_search_window, timestep = LAI_timestep, export_csv = LAI_export_csv, run_parallel = run_parallel,
+                                   sd_threshold = sd_threshold, boundary = structure(list(upper_quantile = upper_quantile, lower_quantile = lower_quantile)))),
         SMAP_SMP = structure(list(search_window = SMP_search_window, timestep = SMP_timestep, export_csv = SMP_export_csv, update_csv = update_csv)),
         Soilgrids_SoilC = structure(list(timestep = SoilC_timestep, export_csv = SoilC_export_csv)),
         start.date = obs_start_date,
From 32fe7e0ce6eddf745c725a52f2da33c9f86ba3f9 Mon Sep 17 00:00:00 2001
From: Dongchen Zhang
Date: Fri, 28 Feb 2025 11:24:53 -0500
Subject: [PATCH 8/9] Add the feature of time series filtering for the MODIS
 LAI records.

---
 modules/data.remote/NAMESPACE              |  1 +
 modules/data.remote/R/MODIS_LAI_prep.R     | 58 +++++++++++++++----
 modules/data.remote/man/MODIS_LAI_prep.Rd  |  7 ++-
 .../data.remote/man/MODIS_LAI_ts_filter.Rd  | 22 +++++++
 4 files changed, 76 insertions(+), 12 deletions(-)
 create mode 100644 modules/data.remote/man/MODIS_LAI_ts_filter.Rd

diff --git a/modules/data.remote/NAMESPACE b/modules/data.remote/NAMESPACE
index 53dee66ee5..ea83882a80 100644
--- a/modules/data.remote/NAMESPACE
+++ b/modules/data.remote/NAMESPACE
@@ -3,6 +3,7 @@
 export(GEDI_AGB_plot)
 export(Landtrendr_AGB_prep)
 export(MODIS_LAI_prep)
+export(MODIS_LAI_ts_filter)
 export(MODIS_LC_prep)
 export(NASA_DAAC_download)
 export(Prep.MODIS.CSV.from.DAAC)
diff --git a/modules/data.remote/R/MODIS_LAI_prep.R b/modules/data.remote/R/MODIS_LAI_prep.R
index 18342e1172..20dff5c7f4 100644
--- a/modules/data.remote/R/MODIS_LAI_prep.R
+++ b/modules/data.remote/R/MODIS_LAI_prep.R
@@ -5,16 +5,21 @@
 #' @param outdir character: where the final CSV file will be stored.
 #' @param search_window numeric: search window for locate available LAI values.
#' @param export_csv boolean: decide if we want to export the CSV file. -#' @param sd_threshold numeric: for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL. +#' @param sd_threshold numeric or character: for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL. #' @param skip.download boolean: determine if we want to use existing LAI.csv file and skip the MODIS LAI download part. +#' @param boundary numeric vector or list: the upper and lower quantiles for filtering out noisy LAI values (e.g., c(0.05, 0.95) or list(0.05, 0.95)). The default is NULL. #' #' @return A data frame containing LAI and sd for each site and each time step. #' @export #' #' @author Dongchen Zhang #' @importFrom magrittr %>% -MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window = 30, export_csv = FALSE, sd_threshold = 20, skip.download = TRUE){ - #initialize future parallel computation. +MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window = 30, export_csv = FALSE, sd_threshold = 20, skip.download = TRUE, boundary = NULL){ + # unlist boundary if it's passing from the assembler function. + if (is.list(boundary)) { + boundary <- as.numeric(unlist(boundary)) + } + # initialize future parallel computation. if (future::supportsMulticore()) { future::plan(future::multicore, workers = 10) } else { @@ -39,13 +44,18 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window PEcAn.logger::logger.info("Extracting previous LAI file!") Previous_CSV <- utils::read.csv(file.path(outdir, "LAI.csv"), colClasses = c(rep("character", 2), rep("numeric", 4), "character")) + # filter out high uncertain LAI records. if (!is.null(sd_threshold)) { PEcAn.logger::logger.info("filtering out records with high standard errors!") - ind.rm <- which(Previous_CSV$sd >= sd_threshold) + ind.rm <- which(Previous_CSV$sd >= as.numeric(sd_threshold)) if (length(ind.rm) > 0) { Previous_CSV <- Previous_CSV[-ind.rm,] } } + # filter out noisy lai records from time series. + if (!is.null(boundary)) { + Previous_CSV <- MODIS_LAI_ts_filter(Previous_CSV, boundary = boundary) + } LAI_Output <- matrix(NA, length(site_info$site_id), 2*length(time_points)+1) %>% `colnames<-`(c("site_id", paste0(time_points, "_LAI"), paste0(time_points, "_SD"))) %>% as.data.frame()#we need: site_id, LAI, std, target time point. 
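+    # LAI_Output is wide: one row per site, plus one "<time>_LAI" and one
+    # "<time>_SD" column per requested time point, so previously exported
+    # values can be filled in before deciding whether any download is needed.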
 LAI_Output$site_id <- site_info$site_id
@@ -92,7 +102,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
                                              start = dates$start_date,
                                              end = dates$end_date,
                                              progress = FALSE)))) {
-        return(list(mean = mean$value, date = mean$calendar_date))
+        return(list(mean = mean$value * as.numeric(mean$scale), date = mean$calendar_date))
       } else {
         return(NA)
       }
@@ -110,7 +120,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
                                              start = dates$start_date,
                                              end = dates$end_date,
                                              progress = FALSE)))) {
-        return(std$value)
+        return(std$value * as.numeric(std$scale))
       } else {
         return(NA)
       }
@@ -151,7 +161,7 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
           next
         }
         if (!is.null(sd_threshold)) {
-          if (lai_std[[i]][j] >= sd_threshold) {
+          if (lai_std[[i]][j] >= as.numeric(sd_threshold)) {
             next
           }
         }
@@ -159,11 +169,15 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
                                                site_id = new_site_info$site_id[i],
                                                lat = new_site_info$lat[i],
                                                lon = new_site_info$lon[i],
-                                               lai = lai_mean[[i]]$mean[j]*0.1,
-                                               sd = lai_std[[i]][j]*0.1,
+                                               lai = lai_mean[[i]]$mean[j],
+                                               sd = lai_std[[i]][j],
                                                qc = lai_qc[[i]][j]))
       }
     }
+    # filter out noisy lai records from time series.
+    if (!is.null(boundary)) {
+      Previous_CSV <- MODIS_LAI_ts_filter(Previous_CSV, boundary = boundary)
+    }
     #Compare with existing CSV file. (We name the CSV file as LAI.csv)
     if(as.logical(export_csv)){
       if(exists("Previous_CSV")){#we already read the csv file previously.
@@ -195,7 +209,31 @@ MODIS_LAI_prep <- function(site_info, time_points, outdir = NULL, search_window
   PEcAn.logger::logger.info("MODIS LAI Prep Completed!")
   list(LAI_Output = LAI_Output, time_points = time_points, var = "LAI")
 }
-
+#' Filter noisy records out of a MODIS LAI time series.
+#'
+#' @param lai.csv data frame: A data frame containing date, site id, lat, lon, lai, sd, and qc columns.
+#' @param boundary numeric: a vector containing the upper and lower quantiles for filtering LAI records. The default is c(0.05, 0.95).
+#'
+#' @return A data frame containing date, site id, lat, lon, lai, sd, and qc columns.
+#' @export
+#'
+#' @author Dongchen Zhang
+MODIS_LAI_ts_filter <- function(lai.csv, boundary = c(0.05, 0.95)) {
+  # get unique site ids.
+  site.ids <- sort(unique(lai.csv$site_id))
+  # loop over sites.
+  for (i in seq_along(site.ids)) {
+    # get the lai records for the current site.
+    inds <- which(lai.csv$site_id == site.ids[i])
+    temp.lai <- lai.csv$lai[inds]
+    # calculate the upper and lower boundaries for the current lai records.
+    minmax <- stats::quantile(temp.lai, boundary)
+    # find and remove outlier rows; skip if nothing falls outside the bounds.
+    inds.rm <- which(temp.lai < minmax[1] | temp.lai > minmax[2])
+    if (length(inds.rm) > 0) {
+      lai.csv <- lai.csv[-inds[inds.rm], ]
+    }
+  }
+  return(lai.csv)
+}
 #' Prepare MODIS LAI data from the NASA DAAC server for the SDA workflow.
 #'
 #' @param site_info list: Bety list of site info including site_id, lon, and lat.
diff --git a/modules/data.remote/man/MODIS_LAI_prep.Rd b/modules/data.remote/man/MODIS_LAI_prep.Rd index 5c9b15410e..25140470b0 100644 --- a/modules/data.remote/man/MODIS_LAI_prep.Rd +++ b/modules/data.remote/man/MODIS_LAI_prep.Rd @@ -11,7 +11,8 @@ MODIS_LAI_prep( search_window = 30, export_csv = FALSE, sd_threshold = 20, - skip.download = TRUE + skip.download = TRUE, + boundary = NULL ) } \arguments{ @@ -25,9 +26,11 @@ MODIS_LAI_prep( \item{export_csv}{boolean: decide if we want to export the CSV file.} -\item{sd_threshold}{numeric: for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL.} +\item{sd_threshold}{numeric or character: for filtering out any estimations with unrealistic high standard error, default is 20. The QC check will be skipped if it's set as NULL.} \item{skip.download}{boolean: determine if we want to use existing LAI.csv file and skip the MODIS LAI download part.} + +\item{boundary}{numeric vector or list: the upper and lower quantiles for filtering out noisy LAI values (e.g., c(0.05, 0.95) or list(0.05, 0.95)). The default is NULL.} } \value{ A data frame containing LAI and sd for each site and each time step. diff --git a/modules/data.remote/man/MODIS_LAI_ts_filter.Rd b/modules/data.remote/man/MODIS_LAI_ts_filter.Rd new file mode 100644 index 0000000000..84b45ebb87 --- /dev/null +++ b/modules/data.remote/man/MODIS_LAI_ts_filter.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MODIS_LAI_prep.R +\name{MODIS_LAI_ts_filter} +\alias{MODIS_LAI_ts_filter} +\title{Prepare MODIS LAI data from the NASA DAAC server for the SDA workflow.} +\usage{ +MODIS_LAI_ts_filter(lai.csv, boundary = c(0.05, 0.95)) +} +\arguments{ +\item{lai.csv}{data frame: A data frame containing date, site id, lat, lon, lai, sd, and qc columns.} + +\item{boundary}{numeric: A vector contains the upper and lower quantiles for filtering lai records. The default is c(0.05, 0.95).} +} +\value{ +A data frame containing date, site id, lat, lon, lai, sd, and qc columns. +} +\description{ +Prepare MODIS LAI data from the NASA DAAC server for the SDA workflow. +} +\author{ +Dongchen Zhang +} From b647bd20bafcc165895cea9cb1796ba89cfb2120 Mon Sep 17 00:00:00 2001 From: Dongchen Zhang Date: Fri, 28 Feb 2025 11:25:16 -0500 Subject: [PATCH 9/9] Some improvements for the NASA DAAC download function. --- modules/data.remote/R/NASA_DAAC_download.R | 71 +++++++++++-------- .../data.remote/man/DAAC_Set_Credential.Rd | 7 +- modules/data.remote/man/NASA_DAAC_download.Rd | 4 ++ 3 files changed, 51 insertions(+), 31 deletions(-) diff --git a/modules/data.remote/R/NASA_DAAC_download.R b/modules/data.remote/R/NASA_DAAC_download.R index 3cdf207f1a..5b115af4eb 100644 --- a/modules/data.remote/R/NASA_DAAC_download.R +++ b/modules/data.remote/R/NASA_DAAC_download.R @@ -11,6 +11,8 @@ #' "yyyy-mm-dd". #' @param outdir Character: path of the directory in which to save the #' downloaded files. Default is the current work directory(getwd()). +#' @param credential.folder Character: physical path to the folder that contains +#' the credential file. The default is NULL. #' @param doi Character: data DOI on the NASA DAAC server, it can be obtained #' directly from the NASA ORNL DAAC data portal (e.g., GEDI L4A through #' https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=2056). 
@@ -48,6 +50,7 @@ NASA_DAAC_download <- function(ul_lat,
                                from,
                                to,
                                outdir = getwd(),
+                               credential.folder = NULL,
                                doi,
                                just_path = FALSE) {
   # Determine if we have enough inputs.
@@ -56,7 +59,7 @@ NASA_DAAC_download <- function(ul_lat,
     return(0)
   }
   # setup DAAC Credentials.
-  DAAC_Set_Credential()
+  DAAC_Set_Credential(folder.path = credential.folder)
   # setup arguments for URL.
   daterange <- c(from, to)
   # grab provider and concept id from CMR based on DOI.
@@ -64,34 +67,38 @@ NASA_DAAC_download <- function(ul_lat,
   # setup page number and bounding box.
   page <- 1
   bbox <- paste(ul_lon, lr_lat, lr_lon, ul_lat, sep = ",")
-  # loop over page number.
   # initialize variable for storing data.
   granules_href <- entry <- c()
-  repeat {
-    request_url <- NASA_DAAC_URL(provider = provider_conceptID$provider[1],
-                                 concept_id = provider_conceptID$concept_id[1],
-                                 page = page,
-                                 bbox = bbox,
-                                 daterange = daterange)
-    response <- curl::curl_fetch_memory(request_url)
-    content <- rawToChar(response$content)
-    result <- jsonlite::parse_json(content)
-    entry <- c(entry, result$feed$entry)
-    if (response$status_code != 200) {
-      stop(paste("\n", result$errors, collapse = "\n"))
+  # loop over providers.
+  for (i in seq_along(provider_conceptID$concept_id)) {
+    # reset the page counter so each provider starts from page 1
+    # instead of wherever the previous provider stopped.
+    page <- 1
+    # loop over page number.
+    repeat {
+      request_url <- NASA_DAAC_URL(provider = provider_conceptID$provider[i],
+                                   concept_id = provider_conceptID$concept_id[i],
+                                   page = page,
+                                   bbox = bbox,
+                                   daterange = daterange)
+      response <- curl::curl_fetch_memory(request_url)
+      content <- rawToChar(response$content)
+      result <- jsonlite::parse_json(content)
+      entry <- c(entry, result$feed$entry)
+      if (response$status_code != 200) {
+        stop(paste("\n", result$errors, collapse = "\n"))
+      }
+      granules <- result$feed$entry
+      if (length(granules) == 0)
+        break
+      granules_href <- c(granules_href, sapply(granules, function(x) x$links[[1]]$href))
+      page <- page + 1
     }
-    granules <- result$feed$entry
-    if (length(granules) == 0)
-      break
-    granules_href <- c(granules_href, sapply(granules, function(x) x$links[[1]]$href))
-    page <- page + 1
   }
-  # detect existing files.
-  same.file.inds <- which(basename(granules_href) %in% list.files(outdir))
-  if (length(same.file.inds) > 0) {
-    granules_href <- granules_href[-same.file.inds]
+  # detect existing files if we want to download the files.
+  if (!just_path) {
+    same.file.inds <- which(basename(granules_href) %in% list.files(outdir))
+    if (length(same.file.inds) > 0) {
+      granules_href <- granules_href[-same.file.inds]
+    }
   }
   # if we need to download the data.
   if (length(granules_href) == 0) {
     return(NA)
@@ -168,7 +175,7 @@ NASA_DAAC_download <- function(ul_lat,
     # return paths of downloaded data and the associated metadata.
     return(list(metadata = entry, path = file.path(outdir, basename(granules_href))))
   } else {
-    return(basename(granules_href))
+    return(granules_href)
   }
 }
 #' Create URL that can be used to request data from NASA DAAC server.
@@ -245,14 +252,22 @@ NASA_CMR_finder <- function(doi) {
 #' Set NASA DAAC credentials to the current environment.
 #'
-#' @param replace Boolean: determine if we want to replace the current
-#' credentials from the environment.
+#' @param replace Boolean: determine if we want to replace the current credentials from the environment. The default is FALSE.
+#' @param folder.path Character: physical path to the folder that contains the credential file. The default is NULL.
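+#'
+#' @details The credential file is expected to be a plain-text file named
+#' `.nasadaacapirc` with two lines: the Earthdata username on the first line
+#' and the password on the second.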
 #'
 #' @author Dongchen Zhang
-DAAC_Set_Credential <- function(replace = FALSE) {
+DAAC_Set_Credential <- function(replace = FALSE, folder.path = NULL) {
   if (replace) {
     PEcAn.logger::logger.info("Replacing previously stored NASA DAAC credentials.")
   }
+  # if we have the credential file.
+  if (!is.null(folder.path)) {
+    if (file.exists(file.path(folder.path, ".nasadaacapirc"))) {
+      key <- readLines(file.path(folder.path, ".nasadaacapirc"))
+      Sys.setenv(ed_un = key[1], ed_pw = key[2])
+    }
+  }
+  # otherwise we will type the credentials manually.
   if (replace | nchar(Sys.getenv("ed_un")) == 0 | nchar(Sys.getenv("ed_pw")) == 0) {
     Sys.setenv(ed_un = getPass::getPass(msg = "Enter NASA Earthdata Login Username \n (or create an account at urs.earthdata.nasa.gov) :"),
diff --git a/modules/data.remote/man/DAAC_Set_Credential.Rd b/modules/data.remote/man/DAAC_Set_Credential.Rd
index c64fbf51ef..fd4f566c2c 100644
--- a/modules/data.remote/man/DAAC_Set_Credential.Rd
+++ b/modules/data.remote/man/DAAC_Set_Credential.Rd
@@ -4,11 +4,12 @@
 \alias{DAAC_Set_Credential}
 \title{Set NASA DAAC credentials to the current environment.}
 \usage{
-DAAC_Set_Credential(replace = FALSE)
+DAAC_Set_Credential(replace = FALSE, folder.path = NULL)
 }
 \arguments{
-\item{replace}{Boolean: determine if we want to replace the current
-credentials from the environment.}
+\item{replace}{Boolean: determine if we want to replace the current credentials from the environment. The default is FALSE.}
+
+\item{folder.path}{Character: physical path to the folder that contains the credential file. The default is NULL.}
 }
 \description{
 Set NASA DAAC credentials to the current environment.
diff --git a/modules/data.remote/man/NASA_DAAC_download.Rd b/modules/data.remote/man/NASA_DAAC_download.Rd
index a76ab85fe0..9c023186df 100644
--- a/modules/data.remote/man/NASA_DAAC_download.Rd
+++ b/modules/data.remote/man/NASA_DAAC_download.Rd
@@ -13,6 +13,7 @@ NASA_DAAC_download(
   from,
   to,
   outdir = getwd(),
+  credential.folder = NULL,
   doi,
   just_path = FALSE
 )
 }
@@ -37,6 +38,9 @@ NASA_DAAC_download(
 \item{outdir}{Character: path of the directory in which to save the
 downloaded files. Default is the current work directory(getwd()).}
 
+\item{credential.folder}{Character: physical path to the folder that contains
+the credential file. The default is NULL.}
+
 \item{doi}{Character: data DOI on the NASA DAAC server, it can be obtained
 directly from the NASA ORNL DAAC data portal (e.g., GEDI L4A through
 https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=2056).}
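
Taken together, the series lets the LAI and SMP prep steps pull directly from
the NASA DAAC. A minimal usage sketch (the directories and the two example
sites are hypothetical; the extent follows the West/East/South/North ordering
documented above, and site ids are kept as 1..n to match the row-index
assumption in the extraction code):

    library(PEcAn.data.remote)
    site_info <- list(site_id = c(1, 2),
                      lon = c(-72.17, -71.29),
                      lat = c(42.53, 42.54))
    extent <- c(-73, -70, 42, 43)  # W lon, E lon, S lat, N lat
    # MODIS LAI: downloads MCD15A3H tiles, extracts per-site values, writes LAI.csv.
    lai <- Prep.MODIS.CSV.from.DAAC(site_info, extent,
                                    from = "2022-06-01", to = "2022-06-30",
                                    download.outdir = "/tmp/modis",
                                    csv.outdir = "/tmp/csv")
    # SMAP SMP: downloads the L4 product (doi 10.5067/LWJ6TF5SZRG3), writes SMAP.csv.
    smp <- Prep.SMAP.CSV.from.DAAC(site_info, extent,
                                   from = "2022-06-01", to = "2022-06-30",
                                   download.outdir = "/tmp/smap",
                                   csv.outdir = "/tmp/csv")

Both calls prompt for Earthdata credentials unless `ed_un`/`ed_pw` are already
set in the environment or, after PATCH 9, a `.nasadaacapirc` file is supplied.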