From 244235422fead49eb899ad2b78a4150164648300 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Mon, 4 Dec 2023 20:54:43 -0700 Subject: [PATCH 1/8] Initial changes --- R/assign_study_data_types.R | 90 +++++++++++++--------------------- man/assign_study_data_types.Rd | 45 +++++++---------- 2 files changed, 50 insertions(+), 85 deletions(-) diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R index cfc0bd82..1a1860ba 100644 --- a/R/assign_study_data_types.R +++ b/R/assign_study_data_types.R @@ -1,67 +1,43 @@ -#' Summarize file annotations into a STRINGLIST column on a study table. -#' @description Summarize fileview annotations into a string-list column on another table. -#' @description For example, use this function to summarize all of the "dataType" annotations for a each study into a STRINGLIST annotation on the Study table of a portal. Overwrites whatever is currently in the target column. -#' @param study_table_id The synapse id of the portal study table. Must have write access. -#' @param fileview_id The Synapse ID of the portal fileview. -#' @param group_colname The column name to group by and join on (such as the default = 'studyId') -#' @param source_colname The column name to summarize and add to the study_table_id table. The column must exist in both schemas, and must be a STRINGLIST-type column in the "study_table_id" table. -#' @param sep If any delimited values exist in the source_colname column, pass the delimiter here so that these cases are included. -#' @param valid_values A vector of valid values for the source_colname. e.g. the output of running `get_valid_values_from_json_schema()` -#' @param dry_run Default = TRUE. Skips upload to table and instead prints study tibble. -#' @return If dry_run == T, returns study tibble and skips upload. -#' @examples +#' Summarize data types for the study +#' +#' Data types are summarized, or "rolled-up", for the study based on its child file annotations. 
+#' This summary is added as (and will overwrite the current) `dataType` annotation for the study. +#' Contrast this with `update_study_annotations`, where study-level annotations are rolled down to child files. +#' +#' @param study_table_id Synapse ID of the portal study table/view that lists relevant studies in column `id` or `studyId`. +#' @param fileview_id Synapse ID of the portal fileview. +#' @param id_col Name of the id column in `study_table_id` and `fileview_id`. +#' @param attribute The attribute that we are rolling up; defaults to `dataType`. Must be queryable in `fileview_id`. +#' @param dry_run Default = TRUE. Skips updating the annotation and instead displays annotation object. +#' @examples #' \dontrun{ -#' assign_study_data_types(study_table_id = 'syn16787123', -#' fileview_id = 'syn16858331', -#' group_colname = 'studyId', -#' source_colname = "dataType", -#' sep = ",", -#' valid_values = get_valid_values_from_json_schema(), -#' dry_run = T) -#'} +#' assign_study_data_types(study_table_id = 'syn52677631', # either syn52677631 or syn52694652 OK +#' fileview_id = 'syn16858331', +#' id_col = 'studyId', +#' id_col = 'studyId', +#' dry_run = T) +#'} #' @export -assign_study_data_types <- function(study_table_id, fileview_id, group_colname = "studyId", - source_colname = "dataType", sep = ",", valid_values, dry_run = TRUE){ +assign_study_data_types <- function(study_table_id, + fileview_id, + id_col = "studyId", + attribute = "dataType", + dry_run = TRUE) { .check_login() - ##query the study table - query <- .syn$tableQuery(glue::glue("select {group_colname}, {source_colname} from {study_table_id}", includeRowIdAndRowVersion=T)) + # get studies from study table + studies <- .syn$tableQuery(glue::glue("select {id_col} from {study_table_id}", includeRowIdAndRowVersion=T)) - studies <- query$filepath %>% - readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as columns - - 
##query the fileview - fv <- .syn$tableQuery(glue::glue('select {group_colname},{source_colname} from {fileview_id} where type = \'file\' and {group_colname} is not null and {source_colname} is not null'))$filepath %>% + # query the fileview + fv <- .syn$tableQuery( + glue::glue('select {id_col},{attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null'))$filepath %>% readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as column - #TODO:: add support for stringlist-ed values - - ##make simplified data table for stringlist-ing - data_types <- fv %>% - dplyr::select(one_of({{group_colname}}, {{source_colname}})) %>% - dplyr::distinct() %>% - tidyr::separate_rows({{source_colname}}, sep = {{sep}}) %>% ##this handles comma seperated or other delimited values - dplyr::filter(!!rlang::sym(source_colname) %in% valid_values) - - studies <- dplyr::select(studies, ROW_ID, ROW_VERSION, {{group_colname}}) - - ##create stringlisted data - ids <- data_types %>% dplyr::group_by_at(group_colname) %>% - dplyr::summarise(!!rlang::sym(source_colname) := jsonlite::toJSON(!!rlang::sym(source_colname))) - - ##join study table to stringlisted values, filter NA rows out, we don't need to update those - studies_updated <- dplyr::left_join(studies, ids) %>% - dplyr::filter(!is.na(!!rlang::sym(source_colname))) + if(dry_run == FALSE){ - #TODO: could add check here to report number of updated rows vs original... 
+ } else{ - if(dry_run == FALSE){ - .update_table_data(table_id = study_table_id, - new_data = studies_updated, - etag = query$etag) - }else{ - studies_updated } } @@ -77,11 +53,11 @@ get_valid_values_from_json_schema <- function(schema_url = 'https://raw.githubus parent_context = 'bts'){ parent_id <- paste0(parent_context, ':', parent_name) - - subclasses <- + + subclasses <- jsonlite::fromJSON(schema_url) %>% purrr::pluck("@graph") %>% - dplyr::filter(purrr::map_lgl(`rdfs:subClassOf`, ~ parent_id %in% .x)) %>% + dplyr::filter(purrr::map_lgl(`rdfs:subClassOf`, ~ parent_id %in% .x)) %>% dplyr::pull(`sms:displayName`) subclasses diff --git a/man/assign_study_data_types.Rd b/man/assign_study_data_types.Rd index 51cf10e3..e43f7458 100644 --- a/man/assign_study_data_types.Rd +++ b/man/assign_study_data_types.Rd @@ -2,49 +2,38 @@ % Please edit documentation in R/assign_study_data_types.R \name{assign_study_data_types} \alias{assign_study_data_types} -\title{Summarize file annotations into a STRINGLIST column on a study table.} +\title{Summarize data types for the study} \usage{ assign_study_data_types( study_table_id, fileview_id, - group_colname = "studyId", - source_colname = "dataType", - sep = ",", - valid_values, + id_col = "studyId", + attribute = "dataType", dry_run = TRUE ) } \arguments{ -\item{study_table_id}{The synapse id of the portal study table. Must have write access.} +\item{study_table_id}{Synapse ID of the portal study table/view that lists relevant studies in column \code{id} or \code{studyId}.} -\item{fileview_id}{The Synapse ID of the portal fileview.} +\item{fileview_id}{Synapse ID of the portal fileview.} -\item{group_colname}{The column name to group by and join on (such as the default = 'studyId')} +\item{id_col}{Name of the id column in \code{study_table_id} and \code{fileview_id}.} -\item{source_colname}{The column name to summarize and add to the study_table_id table. 
The column must exist in both schemas, and must be a STRINGLIST-type column in the "study_table_id" table.} +\item{attribute}{The attribute that we are rolling up; defaults to \code{dataType}. Must be queryable in \code{fileview_id}.} -\item{sep}{If any delimited values exist in the source_colname column, pass the delimiter here so that these cases are included.} - -\item{valid_values}{A vector of valid values for the source_colname. e.g. the output of running \code{get_valid_values_from_json_schema()}} - -\item{dry_run}{Default = TRUE. Skips upload to table and instead prints study tibble.} -} -\value{ -If dry_run == T, returns study tibble and skips upload. +\item{dry_run}{Default = TRUE. Skips updating the annotation and instead displays annotation object.} } \description{ -Summarize fileview annotations into a string-list column on another table. - -For example, use this function to summarize all of the "dataType" annotations for a each study into a STRINGLIST annotation on the Study table of a portal. Overwrites whatever is currently in the target column. +Data types are summarized, or "rolled-up", for the study based on its child file annotations. +This summary is added as (and will overwrite the current) \code{dataType} annotation for the study. +Contrast this with \code{update_study_annotations}, where study-level annotations are rolled down to child files. 
} \examples{ \dontrun{ -assign_study_data_types(study_table_id = 'syn16787123', - fileview_id = 'syn16858331', - group_colname = 'studyId', - source_colname = "dataType", - sep = ",", - valid_values = get_valid_values_from_json_schema(), - dry_run = T) -} +assign_study_data_types(study_table_id = 'syn52677631', # either syn52677631 or syn52694652 OK + fileview_id = 'syn16858331', + id_col = 'studyId', + attribute = 'dataType', + dry_run = T) +} } From d2378a6b7b64f64138200470d0c87789a6d57d65 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Wed, 13 Dec 2023 20:19:50 -0700 Subject: [PATCH 2/8] Test and refine update --- R/assign_study_data_types.R | 25 +++++++++++++++---------- man/assign_study_data_types.Rd | 9 +++++---- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R index 1a1860ba..f00a5ff2 100644 --- a/R/assign_study_data_types.R +++ b/R/assign_study_data_types.R @@ -7,11 +7,12 @@ #' @param study_table_id Synapse ID of the portal study table/view that lists relevant studies in column `id` or `studyId`. #' @param fileview_id Synapse ID of the portal fileview. #' @param id_col Name of the id column in `study_table_id` and `fileview_id`. -#' @param attribute The attribute that we are rolling up; defaults to `dataType`. Must be queryable in `fileview_id`. -#' @param dry_run Default = TRUE. Skips updating the annotation and instead displays annotation object. +#' @param attribute The attribute that we are rolling up; name should not contain spaces. +#' Defaults to `dataType`. Must be queryable in `fileview_id`. +#' @param dry_run Default = TRUE. Skips updating the annotation and instead returns annotation object(s) list. 
#' @examples #' \dontrun{ -#' assign_study_data_types(study_table_id = 'syn52677631', # either syn52677631 or syn52694652 OK +#' assign_study_data_types(study_table_id = 'syn52694652', #' fileview_id = 'syn16858331', #' id_col = 'studyId', #' id_col = 'studyId', @@ -27,18 +28,22 @@ assign_study_data_types <- function(study_table_id, .check_login() # get studies from study table - studies <- .syn$tableQuery(glue::glue("select {id_col} from {study_table_id}", includeRowIdAndRowVersion=T)) + studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist() # query the fileview fv <- .syn$tableQuery( - glue::glue('select {id_col},{attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null'))$filepath %>% - readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as column - - if(dry_run == FALSE){ - - } else{ + glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}"), + includeRowIdAndRowVersion = F)$asDataFrame() + meta <- lapply(fv[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # stray whitespaces occasional issue + names(meta) <- fv[[id_col]] + dry_list <- list() + for(study in names(meta)) { + study_meta <- .syn$get_annotations(study) + study_meta[attribute] <- meta[[study]] + if(dry_run) dry_list[[study]] <- study_meta else .syn$set_annotations(study_meta) } + if(dry_run) dry_list } #' Retrieve valid subclasses of a value in a JSON-LD schema diff --git a/man/assign_study_data_types.Rd b/man/assign_study_data_types.Rd index e43f7458..2b88c812 100644 --- a/man/assign_study_data_types.Rd +++ b/man/assign_study_data_types.Rd @@ -19,9 +19,10 @@ assign_study_data_types( \item{id_col}{Name of the id column in \code{study_table_id} and \code{fileview_id}.} 
-\item{attribute}{The attribute that we are rolling up; defaults to \code{dataType}. Must be queryable in \code{fileview_id}.} +\item{attribute}{The attribute that we are rolling up; name should not contain spaces. +Defaults to \code{dataType}. Must be queryable in \code{fileview_id}.} -\item{dry_run}{Default = TRUE. Skips updating the annotation and instead displays annotation object.} +\item{dry_run}{Default = TRUE. Skips updating the annotation and instead returns annotation object(s) list.} } \description{ Data types are summarized, or "rolled-up", for the study based on its child file annotations. @@ -30,10 +31,10 @@ Contrast this with \code{update_study_annotations}, where study-level annotation } \examples{ \dontrun{ -assign_study_data_types(study_table_id = 'syn52677631', # either syn52677631 or syn52694652 OK +assign_study_data_types(study_table_id = 'syn52694652', fileview_id = 'syn16858331', id_col = 'studyId', - attribute = 'dataType', + id_col = 'studyId', dry_run = T) } } From 0188eb9be26dfc2dd3f0602aa4dac89adf7ce9e7 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Thu, 14 Dec 2023 12:57:20 -0700 Subject: [PATCH 3/8] Lint and add check --- R/assign_study_data_types.R | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R index f00a5ff2..4fae36d7 100644 --- a/R/assign_study_data_types.R +++ b/R/assign_study_data_types.R @@ -1,15 +1,15 @@ #' Summarize data types for the study #' #' Data types are summarized, or "rolled-up", for the study based on its child file annotations. -#' This summary is added as (and will overwrite the current) `dataType` annotation for the study. -#' Contrast this with `update_study_annotations`, where study-level annotations are rolled down to child files. +#' Summary values are added back and overwrites the current `dataType` annotation for the study. 
+#' See also `update_study_annotations`, where study-level annotations are *rolled down* to child files. #' #' @param study_table_id Synapse ID of the portal study table/view that lists relevant studies in column `id` or `studyId`. #' @param fileview_id Synapse ID of the portal fileview. #' @param id_col Name of the id column in `study_table_id` and `fileview_id`. -#' @param attribute The attribute that we are rolling up; name should not contain spaces. -#' Defaults to `dataType`. Must be queryable in `fileview_id`. -#' @param dry_run Default = TRUE. Skips updating the annotation and instead returns annotation object(s) list. +#' @param attribute The attribute that we are summarizing from `fileview_id`; name should not contain spaces. Defaults to `dataType`. +#' @param dry_run Default = TRUE. Whether to update as well or just return list of annotation objects. +#' @return List of annotations objects. #' @examples #' \dontrun{ #' assign_study_data_types(study_table_id = 'syn52694652', @@ -34,16 +34,26 @@ assign_study_data_types <- function(study_table_id, fv <- .syn$tableQuery( glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}"), includeRowIdAndRowVersion = F)$asDataFrame() - meta <- lapply(fv[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # stray whitespaces occasional issue + meta <- lapply(fv[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # stray whitespaces still occasional issue names(meta) <- fv[[id_col]] - dry_list <- list() + result_list <- list() for(study in names(meta)) { study_meta <- .syn$get_annotations(study) study_meta[attribute] <- meta[[study]] - if(dry_run) dry_list[[study]] <- study_meta else .syn$set_annotations(study_meta) + result_list[[study]] <- study_meta + if(!dry_run) { + # Also submit study_meta conditional on hard-coded check / roll-ups should not 
really exceed 50 values + if(length(meta[[study]] > 50)) { + warning(glue::glue("There are over 50 values. Since this exceeds typical length limits and might indicate data issues, skipping update for {study}.")) + } else { + .syn$set_annotations(study_meta) + message(glue::glue("Updated {study} {attribute} summary")) + } + } } - if(dry_run) dry_list + + invisible(result_list) } #' Retrieve valid subclasses of a value in a JSON-LD schema From b1546ac81c8f6b7088087497b4f2d9cf215ce0fc Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Tue, 19 Dec 2023 10:25:38 -0700 Subject: [PATCH 4/8] Generalize initial update --- R/assign_study_data_types.R | 88 +++++++++++++++++++++++++++---------- R/basic_utils.R | 40 ++++++++--------- 2 files changed, 85 insertions(+), 43 deletions(-) diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R index 4fae36d7..8b03c179 100644 --- a/R/assign_study_data_types.R +++ b/R/assign_study_data_types.R @@ -1,13 +1,14 @@ #' Summarize data types for the study #' #' Data types are summarized, or "rolled-up", for the study based on its child file annotations. -#' Summary values are added back and overwrites the current `dataType` annotation for the study. -#' See also `update_study_annotations`, where study-level annotations are *rolled down* to child files. +#' Summary values are added back as and overwrites the current `dataType` annotation for the study. +#' See also the related `update_study_annotations`, where study-level annotations are *rolled down* to child files. +#' Note that under-the-hood this now wraps a generalized util `summarize_attribute`. #' -#' @param study_table_id Synapse ID of the portal study table/view that lists relevant studies in column `id` or `studyId`. -#' @param fileview_id Synapse ID of the portal fileview. -#' @param id_col Name of the id column in `study_table_id` and `fileview_id`. -#' @param attribute The attribute that we are summarizing from `fileview_id`; name should not contain spaces. 
Defaults to `dataType`. +#' @param study_table_id Synapse ID of reference portal study table. Used to get study ids. +#' @param fileview_id Synapse ID of the reference portal fileview. +#' @param id_col Name of the study id column in `study_table_id` and `fileview_id`. Defaults to `studyId`. +#' @param attribute Attribute being summarized using fileview. Defaults to `dataType`. #' @param dry_run Default = TRUE. Whether to update as well or just return list of annotation objects. #' @return List of annotations objects. #' @examples @@ -15,7 +16,7 @@ #' assign_study_data_types(study_table_id = 'syn52694652', #' fileview_id = 'syn16858331', #' id_col = 'studyId', -#' id_col = 'studyId', +#' attribute = 'dataType', #' dry_run = T) #'} #' @export @@ -27,35 +28,76 @@ assign_study_data_types <- function(study_table_id, .check_login() - # get studies from study table + # get studies within scope from study table studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist() # query the fileview - fv <- .syn$tableQuery( - glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id} where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}"), - includeRowIdAndRowVersion = F)$asDataFrame() - meta <- lapply(fv[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # stray whitespaces still occasional issue - names(meta) <- fv[[id_col]] + query <- glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id} + where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}") + + check_fun <- function(values) { + if(length(values) > 50) { + warning(glue::glue("Over 50 values found, which will break the schema.")) + return(FALSE) + } else { + return(TRUE) + } + } + + summarize_attribute(summary_query = query, + attribute = attribute, + entity_id = id_col, + dry_run = dry_run, + check_fun = check_fun) +} + + +#' 
Helper summarization util +#' +#' Given some table X that provides values, summarize the values for an attribute and add summary as annotations on some entity. +#' The entity could be X itself or another entity Y, e.g. a parent container entity. +#' Example 1: With datasets, summarize `species` for all the files. +#' Example 2: With projects, summarize `dataType` for all the files (in fact, see `assign_study_data_types`). +#' +#' @param summary_query Query (usually of a fileview) that returns appropriate aggregation per row. +#' @param attribute Name of attribute to update as annotation. +#' @param entity_id Either a single valid Synapse id of the entity for which to update the attribute *or* a column present in `summary_query` that stores ids. +#' @param dry_run Default = `TRUE`. Whether to update as well or just return list of annotation objects. +#' @param check_fun An optional custom check function to apply to the values being updated in order for update to go through. Should return a boolean. Used only if dry_run = `FALSE`. +#' It can be tailored towards the attribute/entity being updated (i.e. taking into account the schema and valid values). 
+#' @export +summarize_attribute <- function(summary_query, + attribute, + entity_id = NULL, + dry_run = TRUE, + check_fun = NULL) { + + values <- .syn$tableQuery(summary_query,includeRowIdAndRowVersion = F)$asDataFrame() + meta <- lapply(values[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # in case of stray whitespaces + if(is_valid_syn_id(entity_id)) { + names(meta) <- entity_id + } else { + names(meta) <- values[[entity_id]] + } result_list <- list() - for(study in names(meta)) { - study_meta <- .syn$get_annotations(study) - study_meta[attribute] <- meta[[study]] - result_list[[study]] <- study_meta + for(entity in names(meta)) { + entity_meta <- .syn$get_annotations(entity) + entity_meta[attribute] <- meta[[entity]] + result_list[[entity]] <- entity_meta if(!dry_run) { - # Also submit study_meta conditional on hard-coded check / roll-ups should not really exceed 50 values - if(length(meta[[study]] > 50)) { - warning(glue::glue("There are over 50 values. Since this exceeds typical length limits and might indicate data issues, skipping update for {study}.")) + if(is.function(check_fun)) { + if(check_fun(meta[[entity]])) .syn$set_annotations(entity_meta) else message("Skipped update for {entity}.") } else { - .syn$set_annotations(study_meta) - message(glue::glue("Updated {study} {attribute} summary")) + .syn$set_annotations(entity_meta) + message(glue::glue("Updated {entity} {attribute}.")) } } } - invisible(result_list) } + #' Retrieve valid subclasses of a value in a JSON-LD schema #' @description Retrieve valid subclasses of a value in a JSON-LD schema generated by schematic. #' @param schema_url Default: the NF-OSI JSON-LD schema. diff --git a/R/basic_utils.R b/R/basic_utils.R index 7ac0b487..572d5a56 100644 --- a/R/basic_utils.R +++ b/R/basic_utils.R @@ -1,5 +1,5 @@ #' Create copy of entity -#' +#' #' Create a copy of syn entity; mostly used to create a copy on which to test out changes. 
#' See https://python-docs.synapse.org/build/html/synapseutils.html?highlight=copy#synapseutils.copy_functions.copy #' @param entity Entity to copy. @@ -7,28 +7,28 @@ #' @param skip_copy_wiki_page Whether to skip copying wiki; defaults FALSE. #' @param skip_copy_annotations Whether to skip copying annotations; defaults FALSE. #' @keywords internal -copy <- function(entity, - destination_id, - skip_copy_wiki_page = FALSE, +copy <- function(entity, + destination_id, + skip_copy_wiki_page = FALSE, skip_copy_annotations = FALSE) { - + .check_login() # load synapseutils as needed - - - synapseutils$copy(.syn, - entity = entity, - destinationId = destination_id, - skipCopyWikiPage = skip_copy_wiki_page, + + + synapseutils$copy(.syn, + entity = entity, + destinationId = destination_id, + skipCopyWikiPage = skip_copy_wiki_page, skipCopyAnnotations = skip_copy_annotations) - + } #' Download and read file to `data.table` #' #' Convenience function for reading a delimited local file or one on Synapse. -#' +#' #' @keywords internal #' @import data.table dt_read <- function(file) { @@ -46,13 +46,13 @@ dt_read <- function(file) { } #' Extract synapse id from URI or other string -#' +#' #' @param uri URI or string containing embedded Synapse id. #' @keywords internal bare_syn_id <- function(uri) { not_na <- which(!is.na(uri)) x <- uri[not_na] - syn <- regmatches(x, regexpr("syn[0-9]{8,9}", x)) + syn <- regmatches(x, regexpr("syn[0-9]{8,12}", x)) uri[not_na] <- syn return(uri) } @@ -63,21 +63,21 @@ bare_syn_id <- function(uri) { #' @param id Id string. #' @keywords internal is_valid_syn_id <- function(id) { - result <- grepl("^syn[0-9]{8,9}$", id) + result <- grepl("^syn[0-9]{8,12}$", id) result } #' Walk through a directory -#' +#' #' For now, an internal util imported from `synapseutils`. #' @param syn_id Synapse id of directory root to traverse. -#' @param as_list -#' @return An R list or Py generator object. +#' @param as_list +#' @return An R list or Py generator object. 
#' @keywords internal walk <- function(syn_id, as_list = TRUE) { .check_login() x <- synapseutils$walk(.syn, syn_id) if(as_list) reticulate::iterate(x) else x - + } From 24d2640ccf1664080a8c3a35eb87bfa05012eb84 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Tue, 19 Dec 2023 10:26:08 -0700 Subject: [PATCH 5/8] Update docs, exports --- NAMESPACE | 1 + man/assign_study_data_types.Rd | 21 ++++++++++++--------- man/summarize_attribute.Rd | 32 ++++++++++++++++++++++++++++++++ man/walk.Rd | 2 -- 4 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 man/summarize_attribute.Rd diff --git a/NAMESPACE b/NAMESPACE index a5b4842f..07871f65 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -80,6 +80,7 @@ export(register_study) export(register_study_files) export(remove_button) export(remove_wiki_subpage) +export(summarize_attribute) export(summarize_file_access) export(swap_col) export(syn_login) diff --git a/man/assign_study_data_types.Rd b/man/assign_study_data_types.Rd index 2b88c812..82ab2264 100644 --- a/man/assign_study_data_types.Rd +++ b/man/assign_study_data_types.Rd @@ -13,28 +13,31 @@ assign_study_data_types( ) } \arguments{ -\item{study_table_id}{Synapse ID of the portal study table/view that lists relevant studies in column \code{id} or \code{studyId}.} +\item{study_table_id}{Synapse ID of reference portal study table. Used to get study ids.} -\item{fileview_id}{Synapse ID of the portal fileview.} +\item{fileview_id}{Synapse ID of the reference portal fileview.} -\item{id_col}{Name of the id column in \code{study_table_id} and \code{fileview_id}.} +\item{id_col}{Name of the study id column in \code{study_table_id} and \code{fileview_id}. Defaults to \code{studyId}.} -\item{attribute}{The attribute that we are rolling up; name should not contain spaces. -Defaults to \code{dataType}. Must be queryable in \code{fileview_id}.} +\item{attribute}{Attribute being summarized using fileview. Defaults to \code{dataType}.} -\item{dry_run}{Default = TRUE. 
Skips updating the annotation and instead returns annotation object(s) list.} +\item{dry_run}{Default = TRUE. Whether to update as well or just return list of annotation objects.} +} +\value{ +List of annotations objects. } \description{ Data types are summarized, or "rolled-up", for the study based on its child file annotations. -This summary is added as (and will overwrite the current) \code{dataType} annotation for the study. -Contrast this with \code{update_study_annotations}, where study-level annotations are rolled down to child files. +Summary values are added back as and overwrites the current \code{dataType} annotation for the study. +See also the related \code{update_study_annotations}, where study-level annotations are \emph{rolled down} to child files. +Note that under-the-hood this now wraps a generalized util \code{summarize_attribute}. } \examples{ \dontrun{ assign_study_data_types(study_table_id = 'syn52694652', fileview_id = 'syn16858331', id_col = 'studyId', - id_col = 'studyId', + attribute = 'dataType', dry_run = T) } } diff --git a/man/summarize_attribute.Rd b/man/summarize_attribute.Rd new file mode 100644 index 00000000..ac7d500a --- /dev/null +++ b/man/summarize_attribute.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assign_study_data_types.R +\name{summarize_attribute} +\alias{summarize_attribute} +\title{Helper summarization util} +\usage{ +summarize_attribute( + summary_query, + attribute, + entity_id = NULL, + dry_run = TRUE, + check_fun = NULL +) +} +\arguments{ +\item{summary_query}{Query (usually of a fileview) that returns appropriate aggregation per row.} + +\item{attribute}{Name of attribute to update as annotation.} + +\item{entity_id}{Either a single valid Synapse id of the entity for which to update the attribute \emph{or} a column present in \code{summary_query} that stores ids.} + +\item{dry_run}{Default = \code{TRUE}. 
Whether to update as well or just return list of annotation objects.} + +\item{check_fun}{An optional custom check function to apply to the values being updated in order for update to go through. Should return a boolean. Used only if dry_run = \code{FALSE}. +It can be tailored towards the attribute/entity being updated (i.e. taking into account the schema and valid values).} +} +\description{ +Given some table X that provides values, summarize the values for an attribute and add summary as annotations on some entity. +The entity could be X itself or another entity Y, e.g. a parent container entity. +Example 1: With datasets, summarize \code{species} for all the files. +Example 2: With projects, summarize \code{dataType} for all the files (in fact, see \code{assign_study_data_types}). +} diff --git a/man/walk.Rd b/man/walk.Rd index 46e6c177..9f94b335 100644 --- a/man/walk.Rd +++ b/man/walk.Rd @@ -8,8 +8,6 @@ walk(syn_id, as_list = TRUE) } \arguments{ \item{syn_id}{Synapse id of directory root to traverse.} - -\item{as_list}{} } \value{ An R list or Py generator object. 
From bded195ef19749a52ba01a6348dcde50b1be2bb9 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Tue, 19 Dec 2023 10:27:59 -0700 Subject: [PATCH 6/8] Update pkgdown index --- _pkgdown.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 469f6c74..fc12b754 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -13,6 +13,7 @@ reference: - register_study - add_people_from_table - register_study_files + - summarize_attribute - subtitle: Lower-level table maintenance - contents: - adjust_view From e147da4e86c09cb686baa42cae1a6d2d130daf9a Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu <32753274+anngvu@users.noreply.github.com> Date: Thu, 21 Dec 2023 12:36:30 -0700 Subject: [PATCH 7/8] Update R/assign_study_data_types.R Co-authored-by: Robert Allaway --- R/assign_study_data_types.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/assign_study_data_types.R b/R/assign_study_data_types.R index 8b03c179..6cb92a56 100644 --- a/R/assign_study_data_types.R +++ b/R/assign_study_data_types.R @@ -59,7 +59,7 @@ assign_study_data_types <- function(study_table_id, #' Example 1: With datasets, summarize `species` for all the files. #' Example 2: With projects, summarize `dataType` for all the files (in fact, see `assign_study_data_types`). #' -#' @param summary_query Query (usually of a fileview) that returns appropriate aggregation per row. +#' @param summary_query Query (usually of a fileview) that returns appropriate aggregation per row. You may need to add `group_concat`, `distinct`, and/or `unnest` to the query to get the correct list of distinct values depending on your data (e.g. `select group_concat(distinct unnest(tumorType)) as tumorType from ...`). #' @param attribute Name of attribute to update as annotation. #' @param entity_id Either a single valid Synapse id of the entity for which to update the attribute *or* a column present in `summary_query` that stores ids. #' @param dry_run Default = `TRUE`. 
Whether to update as well or just return list of annotation objects. From 37122872ac4835c648ca8173e94fdf53760addd7 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Fri, 22 Dec 2023 11:49:50 -0700 Subject: [PATCH 8/8] Docs --- R/basic_utils.R | 3 ++- README.md | 2 ++ man/dt_read.Rd | 3 +++ man/summarize_attribute.Rd | 2 +- man/walk.Rd | 2 ++ 5 files changed, 10 insertions(+), 2 deletions(-) diff --git a/R/basic_utils.R b/R/basic_utils.R index 572d5a56..99308fcb 100644 --- a/R/basic_utils.R +++ b/R/basic_utils.R @@ -29,6 +29,7 @@ copy <- function(entity, #' #' Convenience function for reading a delimited local file or one on Synapse. #' +#' @param file File Synapse id or local path. #' @keywords internal #' @import data.table dt_read <- function(file) { @@ -71,7 +72,7 @@ is_valid_syn_id <- function(id) { #' #' For now, an internal util imported from `synapseutils`. #' @param syn_id Synapse id of directory root to traverse. -#' @param as_list +#' @param as_list Whether to return as R list. #' @return An R list or Py generator object. #' @keywords internal walk <- function(syn_id, as_list = TRUE) { diff --git a/README.md b/README.md index 1e829e59..586697ab 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ You can install `nfportalutils` from here: remotes::install_github("nf-osi/nfportalutils") ``` + + ## Additional Notes for Users - View function reference on docs site at [Reference](https://nf-osi.github.io/nfportalutils/reference/index.html). diff --git a/man/dt_read.Rd b/man/dt_read.Rd index 7ab29e28..fec9a5ea 100644 --- a/man/dt_read.Rd +++ b/man/dt_read.Rd @@ -6,6 +6,9 @@ \usage{ dt_read(file) } +\arguments{ +\item{file}{File Synapse id or local path.} +} \description{ Convenience function for reading a delimited local file or one on Synapse. 
} diff --git a/man/summarize_attribute.Rd b/man/summarize_attribute.Rd index ac7d500a..1edb4c90 100644 --- a/man/summarize_attribute.Rd +++ b/man/summarize_attribute.Rd @@ -13,7 +13,7 @@ summarize_attribute( ) } \arguments{ -\item{summary_query}{Query (usually of a fileview) that returns appropriate aggregation per row.} +\item{summary_query}{Query (usually of a fileview) that returns appropriate aggregation per row. You may need to add \code{group_concat}, \code{distinct}, and/or \code{unnest} to the query to get the correct list of distinct values depending on your data (e.g. \verb{select group_concat(distinct unnest(tumorType)) as tumorType from ...}).} \item{attribute}{Name of attribute to update as annotation.} diff --git a/man/walk.Rd b/man/walk.Rd index 9f94b335..cf4529f3 100644 --- a/man/walk.Rd +++ b/man/walk.Rd @@ -8,6 +8,8 @@ walk(syn_id, as_list = TRUE) } \arguments{ \item{syn_id}{Synapse id of directory root to traverse.} + +\item{as_list}{Whether to return as R list.} } \value{ An R list or Py generator object.