From 244235422fead49eb899ad2b78a4150164648300 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Mon, 4 Dec 2023 20:54:43 -0700
Subject: [PATCH 1/8] Initial changes

---
 R/assign_study_data_types.R    | 90 +++++++++++++---------------------
 man/assign_study_data_types.Rd | 45 +++++++----------
 2 files changed, 50 insertions(+), 85 deletions(-)

diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R
index cfc0bd82..1a1860ba 100644
--- a/R/assign_study_data_types.R
+++ b/R/assign_study_data_types.R
@@ -1,67 +1,43 @@
-#' Summarize file annotations into a STRINGLIST column on a study table.
-#' @description Summarize fileview annotations into a string-list column on another table.
-#' @description For example, use this function to summarize all of the "dataType" annotations for a each study into a STRINGLIST annotation on the Study table of a portal. Overwrites whatever is currently in the target column.
-#' @param study_table_id The synapse id of the portal study table. Must have write access.
-#' @param fileview_id The Synapse ID of the portal fileview.
-#' @param group_colname The column name to group by and join on (such as the default = 'studyId')
-#' @param source_colname The column name to summarize and add to the study_table_id table. The column must exist in both schemas, and must be a STRINGLIST-type column in the "study_table_id" table.
-#' @param sep If any delimited values exist in the source_colname column, pass the delimiter here so that these cases are included.
-#' @param valid_values A vector of valid values for the source_colname. e.g. the output of running `get_valid_values_from_json_schema()`
-#' @param dry_run Default = TRUE. Skips upload to table and instead prints study tibble.
-#' @return If dry_run == T, returns study tibble and skips upload.
-#' @examples 
+#' Summarize data types for the study
+#'
+#' Data types are summarized, or "rolled-up", for the study based on its child file annotations.
+#' This summary is added as (and will overwrite the current) `dataType` annotation for the study.
+#' Contrast this with `update_study_annotations`, where study-level annotations are rolled down to child files.
+#'
+#' @param study_table_id Synapse ID of the portal study table/view that lists relevant studies in column `id` or `studyId`.
+#' @param fileview_id Synapse ID of the portal fileview.
+#' @param id_col Name of the id column in `study_table_id` and `fileview_id`.
+#' @param attribute The attribute that we are rolling up; defaults to `dataType`. Must be queryable in `fileview_id`.
+#' @param dry_run Default = TRUE. Skips updating the annotation and instead displays annotation object.
+#' @examples
 #' \dontrun{
-#' assign_study_data_types(study_table_id = 'syn16787123', 
-#'                        fileview_id = 'syn16858331', 
-#'                        group_colname = 'studyId', 
-#'                        source_colname = "dataType", 
-#'                        sep = ",", 
-#'                        valid_values = get_valid_values_from_json_schema(), 
-#'                        dry_run = T)
-#'}                        
+#' assign_study_data_types(study_table_id = 'syn52677631', # either syn52677631 or syn52694652 OK
+#'                         fileview_id = 'syn16858331',
+#'                         id_col = 'studyId',
+#'                         id_col = 'studyId',
+#'                         dry_run = T)
+#'}
 #' @export
-assign_study_data_types <- function(study_table_id, fileview_id, group_colname = "studyId",
-                                    source_colname = "dataType", sep = ",", valid_values, dry_run = TRUE){
+assign_study_data_types <- function(study_table_id,
+                                    fileview_id,
+                                    id_col = "studyId",
+                                    attribute = "dataType",
+                                    dry_run = TRUE) {
 
   .check_login()
 
-  ##query the study table
-  query <- .syn$tableQuery(glue::glue("select {group_colname}, {source_colname} from {study_table_id}", includeRowIdAndRowVersion=T))
+  # get studies from study table
+  studies <- .syn$tableQuery(glue::glue("select {id_col} from {study_table_id}", includeRowIdAndRowVersion=T))
 
-  studies <- query$filepath %>%
-    readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as columns
-
-  ##query the fileview
-  fv <- .syn$tableQuery(glue::glue('select {group_colname},{source_colname} from {fileview_id} where type = \'file\' and {group_colname} is not null and {source_colname} is not null'))$filepath %>%
+  # query the fileview
+  fv <- .syn$tableQuery(
+    glue::glue('select {id_col},{attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null'))$filepath %>%
     readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as column
 
-  #TODO:: add support for stringlist-ed values
-
-  ##make simplified data table for stringlist-ing
-  data_types <- fv %>%
-    dplyr::select(one_of({{group_colname}}, {{source_colname}})) %>%
-    dplyr::distinct() %>%
-    tidyr::separate_rows({{source_colname}}, sep = {{sep}}) %>% ##this handles comma seperated or other delimited values
-    dplyr::filter(!!rlang::sym(source_colname) %in% valid_values)
-
-  studies <- dplyr::select(studies, ROW_ID, ROW_VERSION, {{group_colname}})
-
-  ##create stringlisted data
-  ids <- data_types %>% dplyr::group_by_at(group_colname) %>%
-    dplyr::summarise(!!rlang::sym(source_colname) := jsonlite::toJSON(!!rlang::sym(source_colname)))
-
-  ##join study table to stringlisted values, filter NA rows out, we don't need to update those
-  studies_updated <- dplyr::left_join(studies, ids) %>%
-    dplyr::filter(!is.na(!!rlang::sym(source_colname)))
+  if(dry_run == FALSE){
 
-  #TODO: could add check here to report number of updated rows vs original...
+  } else{
 
-  if(dry_run == FALSE){
-    .update_table_data(table_id = study_table_id,
-                       new_data = studies_updated,
-                       etag = query$etag)
-  }else{
-    studies_updated
   }
 }
 
@@ -77,11 +53,11 @@ get_valid_values_from_json_schema <- function(schema_url = 'https://raw.githubus
                                               parent_context = 'bts'){
 
   parent_id <- paste0(parent_context, ':', parent_name)
-  
-  subclasses <- 
+
+  subclasses <-
     jsonlite::fromJSON(schema_url) %>%
     purrr::pluck("@graph") %>%
-    dplyr::filter(purrr::map_lgl(`rdfs:subClassOf`, ~ parent_id %in% .x)) %>% 
+    dplyr::filter(purrr::map_lgl(`rdfs:subClassOf`, ~ parent_id %in% .x)) %>%
     dplyr::pull(`sms:displayName`)
 
   subclasses
diff --git a/man/assign_study_data_types.Rd b/man/assign_study_data_types.Rd
index 51cf10e3..e43f7458 100644
--- a/man/assign_study_data_types.Rd
+++ b/man/assign_study_data_types.Rd
@@ -2,49 +2,38 @@
 % Please edit documentation in R/assign_study_data_types.R
 \name{assign_study_data_types}
 \alias{assign_study_data_types}
-\title{Summarize file annotations into a STRINGLIST column on a study table.}
+\title{Summarize data types for the study}
 \usage{
 assign_study_data_types(
   study_table_id,
   fileview_id,
-  group_colname = "studyId",
-  source_colname = "dataType",
-  sep = ",",
-  valid_values,
+  id_col = "studyId",
+  attribute = "dataType",
   dry_run = TRUE
 )
 }
 \arguments{
-\item{study_table_id}{The synapse id of the portal study table. Must have write access.}
+\item{study_table_id}{Synapse ID of the portal study table/view that lists relevant studies in column \code{id} or \code{studyId}.}
 
-\item{fileview_id}{The Synapse ID of the portal fileview.}
+\item{fileview_id}{Synapse ID of the portal fileview.}
 
-\item{group_colname}{The column name to group by and join on (such as the default = 'studyId')}
+\item{id_col}{Name of the id column in \code{study_table_id} and \code{fileview_id}.}
 
-\item{source_colname}{The column name to summarize and add to the study_table_id table. The column must exist in both schemas, and must be a STRINGLIST-type column in the "study_table_id" table.}
+\item{attribute}{The attribute that we are rolling up; defaults to \code{dataType}. Must be queryable in \code{fileview_id}.}
 
-\item{sep}{If any delimited values exist in the source_colname column, pass the delimiter here so that these cases are included.}
-
-\item{valid_values}{A vector of valid values for the source_colname. e.g. the output of running \code{get_valid_values_from_json_schema()}}
-
-\item{dry_run}{Default = TRUE. Skips upload to table and instead prints study tibble.}
-}
-\value{
-If dry_run == T, returns study tibble and skips upload.
+\item{dry_run}{Default = TRUE. Skips updating the annotation and instead displays annotation object.}
 }
 \description{
-Summarize fileview annotations into a string-list column on another table.
-
-For example, use this function to summarize all of the "dataType" annotations for a each study into a STRINGLIST annotation on the Study table of a portal. Overwrites whatever is currently in the target column.
+Data types are summarized, or "rolled-up", for the study based on its child file annotations.
+This summary is added as (and will overwrite the current) \code{dataType} annotation for the study.
+Contrast this with \code{update_study_annotations}, where study-level annotations are rolled down to child files.
 }
 \examples{
 \dontrun{
-assign_study_data_types(study_table_id = 'syn16787123', 
-                       fileview_id = 'syn16858331', 
-                       group_colname = 'studyId', 
-                       source_colname = "dataType", 
-                       sep = ",", 
-                       valid_values = get_valid_values_from_json_schema(), 
-                       dry_run = T)
-}                        
+assign_study_data_types(study_table_id = 'syn52677631', # either syn52677631 or syn52694652 OK
+                        fileview_id = 'syn16858331',
+                        id_col = 'studyId',
+                        attribute = 'dataType',
+                        dry_run = T)
+}
 }

From d2378a6b7b64f64138200470d0c87789a6d57d65 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Wed, 13 Dec 2023 20:19:50 -0700
Subject: [PATCH 2/8] Test and refine update

---
 R/assign_study_data_types.R    | 25 +++++++++++++++----------
 man/assign_study_data_types.Rd |  9 +++++----
 2 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R
index 1a1860ba..f00a5ff2 100644
--- a/R/assign_study_data_types.R
+++ b/R/assign_study_data_types.R
@@ -7,11 +7,12 @@
 #' @param study_table_id Synapse ID of the portal study table/view that lists relevant studies in column `id` or `studyId`.
 #' @param fileview_id Synapse ID of the portal fileview.
 #' @param id_col Name of the id column in `study_table_id` and `fileview_id`.
-#' @param attribute The attribute that we are rolling up; defaults to `dataType`. Must be queryable in `fileview_id`.
-#' @param dry_run Default = TRUE. Skips updating the annotation and instead displays annotation object.
+#' @param attribute The attribute that we are rolling up; name should not contain spaces.
+#' Defaults to `dataType`. Must be queryable in `fileview_id`.
+#' @param dry_run Default = TRUE. Skips updating the annotation and instead returns annotation object(s) list.
 #' @examples
 #' \dontrun{
-#' assign_study_data_types(study_table_id = 'syn52677631', # either syn52677631 or syn52694652 OK
+#' assign_study_data_types(study_table_id = 'syn52694652',
 #'                         fileview_id = 'syn16858331',
 #'                         id_col = 'studyId',
 #'                         id_col = 'studyId',
@@ -27,18 +28,22 @@ assign_study_data_types <- function(study_table_id,
   .check_login()
 
   # get studies from study table
-  studies <- .syn$tableQuery(glue::glue("select {id_col} from {study_table_id}", includeRowIdAndRowVersion=T))
+  studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist()
 
   # query the fileview
   fv <- .syn$tableQuery(
-    glue::glue('select {id_col},{attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null'))$filepath %>%
-    readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as column
-
-  if(dry_run == FALSE){
-
-  } else{
+    glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}"),
+    includeRowIdAndRowVersion = F)$asDataFrame()
+  meta <- lapply(fv[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # stray whitespaces occasional issue
+  names(meta) <- fv[[id_col]]
 
+  dry_list <- list()
+  for(study in names(meta)) {
+    study_meta <- .syn$get_annotations(study)
+    study_meta[attribute] <- meta[[study]]
+    if(dry_run) dry_list[[study]] <- study_meta else .syn$set_annotations(study_meta)
   }
+  if(dry_run) dry_list
 }
 
 #' Retrieve valid subclasses of a value in a JSON-LD schema
diff --git a/man/assign_study_data_types.Rd b/man/assign_study_data_types.Rd
index e43f7458..2b88c812 100644
--- a/man/assign_study_data_types.Rd
+++ b/man/assign_study_data_types.Rd
@@ -19,9 +19,10 @@ assign_study_data_types(
 
 \item{id_col}{Name of the id column in \code{study_table_id} and \code{fileview_id}.}
 
-\item{attribute}{The attribute that we are rolling up; defaults to \code{dataType}. Must be queryable in \code{fileview_id}.}
+\item{attribute}{The attribute that we are rolling up; name should not contain spaces.
+Defaults to \code{dataType}. Must be queryable in \code{fileview_id}.}
 
-\item{dry_run}{Default = TRUE. Skips updating the annotation and instead displays annotation object.}
+\item{dry_run}{Default = TRUE. Skips updating the annotation and instead returns annotation object(s) list.}
 }
 \description{
 Data types are summarized, or "rolled-up", for the study based on its child file annotations.
@@ -30,10 +31,10 @@ Contrast this with \code{update_study_annotations}, where study-level annotation
 }
 \examples{
 \dontrun{
-assign_study_data_types(study_table_id = 'syn52677631', # either syn52677631 or syn52694652 OK
+assign_study_data_types(study_table_id = 'syn52694652',
                         fileview_id = 'syn16858331',
                         id_col = 'studyId',
-                        attribute = 'dataType',
+                        id_col = 'studyId',
                         dry_run = T)
 }
 }

From 0188eb9be26dfc2dd3f0602aa4dac89adf7ce9e7 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 14 Dec 2023 12:57:20 -0700
Subject: [PATCH 3/8] Lint and add check

---
 R/assign_study_data_types.R | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R
index f00a5ff2..4fae36d7 100644
--- a/R/assign_study_data_types.R
+++ b/R/assign_study_data_types.R
@@ -1,15 +1,15 @@
 #' Summarize data types for the study
 #'
 #' Data types are summarized, or "rolled-up", for the study based on its child file annotations.
-#' This summary is added as (and will overwrite the current) `dataType` annotation for the study.
-#' Contrast this with `update_study_annotations`, where study-level annotations are rolled down to child files.
+#' Summary values are added back and overwrites the current `dataType` annotation for the study.
+#' See also `update_study_annotations`, where study-level annotations are *rolled down* to child files.
 #'
 #' @param study_table_id Synapse ID of the portal study table/view that lists relevant studies in column `id` or `studyId`.
 #' @param fileview_id Synapse ID of the portal fileview.
 #' @param id_col Name of the id column in `study_table_id` and `fileview_id`.
-#' @param attribute The attribute that we are rolling up; name should not contain spaces.
-#' Defaults to `dataType`. Must be queryable in `fileview_id`.
-#' @param dry_run Default = TRUE. Skips updating the annotation and instead returns annotation object(s) list.
+#' @param attribute The attribute that we are summarizing from `fileview_id`; name should not contain spaces. Defaults to `dataType`.
+#' @param dry_run Default = TRUE. Whether to update as well or just return list of annotation objects.
+#' @return List of annotations objects.
 #' @examples
 #' \dontrun{
 #' assign_study_data_types(study_table_id = 'syn52694652',
@@ -34,16 +34,26 @@ assign_study_data_types <- function(study_table_id,
   fv <- .syn$tableQuery(
     glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}"),
     includeRowIdAndRowVersion = F)$asDataFrame()
-  meta <- lapply(fv[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # stray whitespaces occasional issue
+  meta <- lapply(fv[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # stray whitespaces still occasional issue
   names(meta) <- fv[[id_col]]
 
-  dry_list <- list()
+  result_list <- list()
   for(study in names(meta)) {
     study_meta <- .syn$get_annotations(study)
     study_meta[attribute] <- meta[[study]]
-    if(dry_run) dry_list[[study]] <- study_meta else .syn$set_annotations(study_meta)
+    result_list[[study]] <- study_meta
+    if(!dry_run) {
+      # Also submit study_meta conditional on hard-coded check / roll-ups should not really exceed 50 values
+      if(length(meta[[study]] > 50)) {
+        warning(glue::glue("There are over 50 values. Since this exceeds typical length limits and might indicate data issues, skipping update for {study}."))
+      } else {
+        .syn$set_annotations(study_meta)
+        message(glue::glue("Updated {study} {attribute} summary"))
+      }
+    }
   }
-  if(dry_run) dry_list
+
+  invisible(result_list)
 }
 
 #' Retrieve valid subclasses of a value in a JSON-LD schema

From b1546ac81c8f6b7088087497b4f2d9cf215ce0fc Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 19 Dec 2023 10:25:38 -0700
Subject: [PATCH 4/8] Generalize initial update

---
 R/assign_study_data_types.R | 88 +++++++++++++++++++++++++++----------
 R/basic_utils.R             | 40 ++++++++---------
 2 files changed, 85 insertions(+), 43 deletions(-)

diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R
index 4fae36d7..8b03c179 100644
--- a/R/assign_study_data_types.R
+++ b/R/assign_study_data_types.R
@@ -1,13 +1,14 @@
 #' Summarize data types for the study
 #'
 #' Data types are summarized, or "rolled-up", for the study based on its child file annotations.
-#' Summary values are added back and overwrites the current `dataType` annotation for the study.
-#' See also `update_study_annotations`, where study-level annotations are *rolled down* to child files.
+#' Summary values are added back as and overwrites the current `dataType` annotation for the study.
+#' See also the related `update_study_annotations`, where study-level annotations are *rolled down* to child files.
+#' Note that under-the-hood this now wraps a generalized util `summarize_attribute`.
 #'
-#' @param study_table_id Synapse ID of the portal study table/view that lists relevant studies in column `id` or `studyId`.
-#' @param fileview_id Synapse ID of the portal fileview.
-#' @param id_col Name of the id column in `study_table_id` and `fileview_id`.
-#' @param attribute The attribute that we are summarizing from `fileview_id`; name should not contain spaces. Defaults to `dataType`.
+#' @param study_table_id Synapse ID of reference portal study table. Used to get study ids.
+#' @param fileview_id Synapse ID of the reference portal fileview.
+#' @param id_col Name of the study id column in `study_table_id` and `fileview_id`. Defaults to `studyId`.
+#' @param attribute Attribute being summarized using fileview. Defaults to `dataType`.
 #' @param dry_run Default = TRUE. Whether to update as well or just return list of annotation objects.
 #' @return List of annotations objects.
 #' @examples
@@ -15,7 +16,7 @@
 #' assign_study_data_types(study_table_id = 'syn52694652',
 #'                         fileview_id = 'syn16858331',
 #'                         id_col = 'studyId',
-#'                         id_col = 'studyId',
+#'                         attribute = 'dataType',
 #'                         dry_run = T)
 #'}
 #' @export
@@ -27,35 +28,76 @@ assign_study_data_types <- function(study_table_id,
 
   .check_login()
 
-  # get studies from study table
+  # get studies within scope from study table
   studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist()
 
   # query the fileview
-  fv <- .syn$tableQuery(
-    glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}"),
-    includeRowIdAndRowVersion = F)$asDataFrame()
-  meta <- lapply(fv[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # stray whitespaces still occasional issue
-  names(meta) <- fv[[id_col]]
+  query <- glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id}
+                      where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}")
+
+  check_fun <- function(values) {
+    if(length(values) > 50) {
+      warning(glue::glue("Over 50 values found, which will break the schema."))
+      return(FALSE)
+    } else {
+      return(TRUE)
+    }
+  }
+
+  summarize_attribute(summary_query = query,
+                      attribute = attribute,
+                      entity_id = id_col,
+                      dry_run = dry_run,
+                      check_fun = check_fun)
+}
+
+
+#' Helper summarization util
+#'
+#' Given some table X that provides values, summarize the values for an attribute and add summary as annotations on some entity.
+#' The entity could be X itself or another entity Y, e.g. a parent container entity.
+#' Example 1: With datasets, summarize `species` for all the files.
+#' Example 2: With projects, summarize `dataType` for all the files (in fact, see `assign_study_data_types`).
+#'
+#' @param summary_query Query (usually of a fileview) that returns appropriate aggregation per row.
+#' @param attribute Name of attribute to update as annotation.
+#' @param entity_id Either a single valid Synapse id of the entity for which to update the attribute *or* a column present in `summary_query` that stores ids.
+#' @param dry_run Default = `TRUE`. Whether to update as well or just return list of annotation objects.
+#' @param check_fun An optional custom check function to apply to the values being updated in order for update to go through. Should return a boolean. Used only if dry_run = `FALSE`.
+#' It can be tailored towards the attribute/entity being updated (i.e. taking into account the schema and valid values).
+#' @export
+summarize_attribute <- function(summary_query,
+                                attribute,
+                                entity_id = NULL,
+                                dry_run = TRUE,
+                                check_fun = NULL) {
+
+  values <- .syn$tableQuery(summary_query,includeRowIdAndRowVersion = F)$asDataFrame()
+  meta <- lapply(values[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # in case of stray whitespaces
+  if(is_valid_syn_id(entity_id)) {
+    names(meta) <- entity_id
+  } else {
+    names(meta) <- values[[entity_id]]
+  }
 
   result_list <- list()
-  for(study in names(meta)) {
-    study_meta <- .syn$get_annotations(study)
-    study_meta[attribute] <- meta[[study]]
-    result_list[[study]] <- study_meta
+  for(entity in names(meta)) {
+    entity_meta <- .syn$get_annotations(entity)
+    entity_meta[attribute] <- meta[[entity]]
+    result_list[[entity]] <- entity_meta
     if(!dry_run) {
-      # Also submit study_meta conditional on hard-coded check / roll-ups should not really exceed 50 values
-      if(length(meta[[study]] > 50)) {
-        warning(glue::glue("There are over 50 values. Since this exceeds typical length limits and might indicate data issues, skipping update for {study}."))
+      if(is.function(check_fun) && !check_fun(meta[[entity]])) {
+        message(glue::glue("Skipped update for {entity}."))
       } else {
-        .syn$set_annotations(study_meta)
-        message(glue::glue("Updated {study} {attribute} summary"))
+        .syn$set_annotations(entity_meta)
+        message(glue::glue("Updated {entity} {attribute}."))
       }
     }
   }
-
   invisible(result_list)
 }
 
+
 #' Retrieve valid subclasses of a value in a JSON-LD schema
 #' @description Retrieve valid subclasses of a value in a JSON-LD schema generated by schematic.
 #' @param schema_url Default: the NF-OSI JSON-LD schema.
diff --git a/R/basic_utils.R b/R/basic_utils.R
index 7ac0b487..572d5a56 100644
--- a/R/basic_utils.R
+++ b/R/basic_utils.R
@@ -1,5 +1,5 @@
 #' Create copy of entity
-#' 
+#'
 #' Create a copy of syn entity; mostly used to create a copy on which to test out changes.
 #' See https://python-docs.synapse.org/build/html/synapseutils.html?highlight=copy#synapseutils.copy_functions.copy
 #' @param entity Entity to copy.
@@ -7,28 +7,28 @@
 #' @param skip_copy_wiki_page Whether to skip copying wiki; defaults FALSE.
 #' @param skip_copy_annotations Whether to skip copying annotations; defaults FALSE.
 #' @keywords internal
-copy <- function(entity, 
-                 destination_id, 
-                 skip_copy_wiki_page = FALSE, 
+copy <- function(entity,
+                 destination_id,
+                 skip_copy_wiki_page = FALSE,
                  skip_copy_annotations = FALSE) {
-  
+
   .check_login()
   # load synapseutils as needed
-  
-  
-  synapseutils$copy(.syn, 
-                    entity = entity, 
-                    destinationId = destination_id, 
-                    skipCopyWikiPage = skip_copy_wiki_page, 
+
+
+  synapseutils$copy(.syn,
+                    entity = entity,
+                    destinationId = destination_id,
+                    skipCopyWikiPage = skip_copy_wiki_page,
                     skipCopyAnnotations = skip_copy_annotations)
-  
+
 }
 
 
 #' Download and read file to `data.table`
 #'
 #' Convenience function for reading a delimited local file or one on Synapse.
-#' 
+#'
 #' @keywords internal
 #' @import data.table
 dt_read <- function(file) {
@@ -46,13 +46,13 @@ dt_read <- function(file) {
 }
 
 #' Extract synapse id from URI or other string
-#' 
+#'
 #' @param uri URI or string containing embedded Synapse id.
 #' @keywords internal
 bare_syn_id <- function(uri) {
   not_na <- which(!is.na(uri))
   x <- uri[not_na]
-  syn <- regmatches(x, regexpr("syn[0-9]{8,9}", x))
+  syn <- regmatches(x, regexpr("syn[0-9]{8,12}", x))
   uri[not_na] <- syn
   return(uri)
 }
@@ -63,21 +63,21 @@ bare_syn_id <- function(uri) {
 #' @param id Id string.
 #' @keywords internal
 is_valid_syn_id <- function(id) {
-  result <- grepl("^syn[0-9]{8,9}$", id)
+  result <- grepl("^syn[0-9]{8,12}$", id)
   result
 }
 
 #' Walk through a directory
-#' 
+#'
 #' For now, an internal util imported from `synapseutils`.
 #' @param syn_id Synapse id of directory root to traverse.
-#' @param as_list 
-#' @return An R list or Py generator object. 
+#' @param as_list
+#' @return An R list or Py generator object.
 #' @keywords internal
 walk <- function(syn_id, as_list = TRUE) {
   .check_login()
   x <- synapseutils$walk(.syn, syn_id)
   if(as_list) reticulate::iterate(x) else x
-  
+
 }
 

From 24d2640ccf1664080a8c3a35eb87bfa05012eb84 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 19 Dec 2023 10:26:08 -0700
Subject: [PATCH 5/8] Update docs, exports

---
 NAMESPACE                      |  1 +
 man/assign_study_data_types.Rd | 21 ++++++++++++---------
 man/summarize_attribute.Rd     | 32 ++++++++++++++++++++++++++++++++
 man/walk.Rd                    |  2 --
 4 files changed, 45 insertions(+), 11 deletions(-)
 create mode 100644 man/summarize_attribute.Rd

diff --git a/NAMESPACE b/NAMESPACE
index a5b4842f..07871f65 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -80,6 +80,7 @@ export(register_study)
 export(register_study_files)
 export(remove_button)
 export(remove_wiki_subpage)
+export(summarize_attribute)
 export(summarize_file_access)
 export(swap_col)
 export(syn_login)
diff --git a/man/assign_study_data_types.Rd b/man/assign_study_data_types.Rd
index 2b88c812..82ab2264 100644
--- a/man/assign_study_data_types.Rd
+++ b/man/assign_study_data_types.Rd
@@ -13,28 +13,31 @@ assign_study_data_types(
 )
 }
 \arguments{
-\item{study_table_id}{Synapse ID of the portal study table/view that lists relevant studies in column \code{id} or \code{studyId}.}
+\item{study_table_id}{Synapse ID of reference portal study table. Used to get study ids.}
 
-\item{fileview_id}{Synapse ID of the portal fileview.}
+\item{fileview_id}{Synapse ID of the reference portal fileview.}
 
-\item{id_col}{Name of the id column in \code{study_table_id} and \code{fileview_id}.}
+\item{id_col}{Name of the study id column in \code{study_table_id} and \code{fileview_id}. Defaults to \code{studyId}.}
 
-\item{attribute}{The attribute that we are rolling up; name should not contain spaces.
-Defaults to \code{dataType}. Must be queryable in \code{fileview_id}.}
+\item{attribute}{Attribute being summarized using fileview. Defaults to \code{dataType}.}
 
-\item{dry_run}{Default = TRUE. Skips updating the annotation and instead returns annotation object(s) list.}
+\item{dry_run}{Default = TRUE. Whether to update as well or just return list of annotation objects.}
+}
+\value{
+List of annotations objects.
 }
 \description{
 Data types are summarized, or "rolled-up", for the study based on its child file annotations.
-This summary is added as (and will overwrite the current) \code{dataType} annotation for the study.
-Contrast this with \code{update_study_annotations}, where study-level annotations are rolled down to child files.
+Summary values are added back as and overwrites the current \code{dataType} annotation for the study.
+See also the related \code{update_study_annotations}, where study-level annotations are \emph{rolled down} to child files.
+Note that under-the-hood this now wraps a generalized util \code{summarize_attribute}.
 }
 \examples{
 \dontrun{
 assign_study_data_types(study_table_id = 'syn52694652',
                         fileview_id = 'syn16858331',
                         id_col = 'studyId',
-                        id_col = 'studyId',
+                        attribute = 'dataType',
                         dry_run = T)
 }
 }
diff --git a/man/summarize_attribute.Rd b/man/summarize_attribute.Rd
new file mode 100644
index 00000000..ac7d500a
--- /dev/null
+++ b/man/summarize_attribute.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/assign_study_data_types.R
+\name{summarize_attribute}
+\alias{summarize_attribute}
+\title{Helper summarization util}
+\usage{
+summarize_attribute(
+  summary_query,
+  attribute,
+  entity_id = NULL,
+  dry_run = TRUE,
+  check_fun = NULL
+)
+}
+\arguments{
+\item{summary_query}{Query (usually of a fileview) that returns appropriate aggregation per row.}
+
+\item{attribute}{Name of attribute to update as annotation.}
+
+\item{entity_id}{Either a single valid Synapse id of the entity for which to update the attribute \emph{or} a column present in \code{summary_query} that stores ids.}
+
+\item{dry_run}{Default = \code{TRUE}. Whether to update as well or just return list of annotation objects.}
+
+\item{check_fun}{An optional custom check function to apply to the values being updated in order for update to go through. Should return a boolean. Used only if dry_run = \code{FALSE}.
+It can be tailored towards the attribute/entity being updated (i.e. taking into account the schema and valid values).}
+}
+\description{
+Given some table X that provides values, summarize the values for an attribute and add summary as annotations on some entity.
+The entity could be X itself or another entity Y, e.g. a parent container entity.
+Example 1: With datasets, summarize \code{species} for all the files.
+Example 2: With projects, summarize \code{dataType} for all the files (in fact, see \code{assign_study_data_types}).
+}
diff --git a/man/walk.Rd b/man/walk.Rd
index 46e6c177..9f94b335 100644
--- a/man/walk.Rd
+++ b/man/walk.Rd
@@ -8,8 +8,6 @@ walk(syn_id, as_list = TRUE)
 }
 \arguments{
 \item{syn_id}{Synapse id of directory root to traverse.}
-
-\item{as_list}{}
 }
 \value{
 An R list or Py generator object.

From bded195ef19749a52ba01a6348dcde50b1be2bb9 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 19 Dec 2023 10:27:59 -0700
Subject: [PATCH 6/8] Update pkgdown index

---
 _pkgdown.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/_pkgdown.yml b/_pkgdown.yml
index 469f6c74..fc12b754 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -13,6 +13,7 @@ reference:
   - register_study
   - add_people_from_table
   - register_study_files
+  - summarize_attribute
 - subtitle: Lower-level table maintenance
 - contents:
   - adjust_view

From e147da4e86c09cb686baa42cae1a6d2d130daf9a Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <32753274+anngvu@users.noreply.github.com>
Date: Thu, 21 Dec 2023 12:36:30 -0700
Subject: [PATCH 7/8] Update R/assign_study_data_types.R

Co-authored-by: Robert Allaway <allaway@users.noreply.github.com>
---
 R/assign_study_data_types.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R
index 8b03c179..6cb92a56 100644
--- a/R/assign_study_data_types.R
+++ b/R/assign_study_data_types.R
@@ -59,7 +59,7 @@ assign_study_data_types <- function(study_table_id,
 #' Example 1: With datasets, summarize `species` for all the files.
 #' Example 2: With projects, summarize `dataType` for all the files (in fact, see `assign_study_data_types`).
 #'
-#' @param summary_query Query (usually of a fileview) that returns appropriate aggregation per row.
+#' @param summary_query Query (usually of a fileview) that returns appropriate aggregation per row. You may need to add `group_concat`, `distinct`, and/or `unnest` to the query to get the correct list of distinct values depending on your data (e.g. `select group_concat(distinct unnest(tumorType)) as tumorType from ...`).
 #' @param attribute Name of attribute to update as annotation.
 #' @param entity_id Either a single valid Synapse id of the entity for which to update the attribute *or* a column present in `summary_query` that stores ids.
 #' @param dry_run Default = `TRUE`. Whether to update as well or just return list of annotation objects.

From 37122872ac4835c648ca8173e94fdf53760addd7 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Fri, 22 Dec 2023 11:49:50 -0700
Subject: [PATCH 8/8] Docs

---
 R/basic_utils.R            | 3 ++-
 README.md                  | 2 ++
 man/dt_read.Rd             | 3 +++
 man/summarize_attribute.Rd | 2 +-
 man/walk.Rd                | 2 ++
 5 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/R/basic_utils.R b/R/basic_utils.R
index 572d5a56..99308fcb 100644
--- a/R/basic_utils.R
+++ b/R/basic_utils.R
@@ -29,6 +29,7 @@ copy <- function(entity,
 #'
 #' Convenience function for reading a delimited local file or one on Synapse.
 #'
+#' @param file File Synapse id or local path.
 #' @keywords internal
 #' @import data.table
 dt_read <- function(file) {
@@ -71,7 +72,7 @@ is_valid_syn_id <- function(id) {
 #'
 #' For now, an internal util imported from `synapseutils`.
 #' @param syn_id Synapse id of directory root to traverse.
-#' @param as_list
+#' @param as_list Whether to return as R list.
 #' @return An R list or Py generator object.
 #' @keywords internal
 walk <- function(syn_id, as_list = TRUE) {
diff --git a/README.md b/README.md
index 1e829e59..586697ab 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,8 @@ You can install `nfportalutils` from here:
 remotes::install_github("nf-osi/nfportalutils")
 ```
 
+
+
 ## Additional Notes for Users
 
 - View function reference on docs site at [Reference](https://nf-osi.github.io/nfportalutils/reference/index.html). 
diff --git a/man/dt_read.Rd b/man/dt_read.Rd
index 7ab29e28..fec9a5ea 100644
--- a/man/dt_read.Rd
+++ b/man/dt_read.Rd
@@ -6,6 +6,9 @@
 \usage{
 dt_read(file)
 }
+\arguments{
+\item{file}{File Synapse id or local path.}
+}
 \description{
 Convenience function for reading a delimited local file or one on Synapse.
 }
diff --git a/man/summarize_attribute.Rd b/man/summarize_attribute.Rd
index ac7d500a..1edb4c90 100644
--- a/man/summarize_attribute.Rd
+++ b/man/summarize_attribute.Rd
@@ -13,7 +13,7 @@ summarize_attribute(
 )
 }
 \arguments{
-\item{summary_query}{Query (usually of a fileview) that returns appropriate aggregation per row.}
+\item{summary_query}{Query (usually of a fileview) that returns appropriate aggregation per row. You may need to add \code{group_concat}, \code{distinct}, and/or \code{unnest} to the query to get the correct list of distinct values depending on your data (e.g. \verb{select group_concat(distinct unnest(tumorType)) as tumorType from ...}).}
 
 \item{attribute}{Name of attribute to update as annotation.}
 
diff --git a/man/walk.Rd b/man/walk.Rd
index 9f94b335..cf4529f3 100644
--- a/man/walk.Rd
+++ b/man/walk.Rd
@@ -8,6 +8,8 @@ walk(syn_id, as_list = TRUE)
 }
 \arguments{
 \item{syn_id}{Synapse id of directory root to traverse.}
+
+\item{as_list}{Whether to return as R list.}
 }
 \value{
 An R list or Py generator object.