#' Summarize data types for the study
#'
#' Data types are summarized, or "rolled up", for the study based on its child
#' file annotations.
#' Summary values are added back as annotations, overwriting the current
#' `dataType` annotation for the study.
#' See also the related `update_study_annotations`, where study-level
#' annotations are *rolled down* to child files.
#' Note that under the hood this now wraps the generalized util
#' `summarize_attribute`.
#'
#' @param study_table_id Synapse ID of reference portal study table. Used to get study ids.
#' @param fileview_id Synapse ID of the reference portal fileview.
#' @param id_col Name of the study id column in `study_table_id` and `fileview_id`. Defaults to `studyId`.
#' @param attribute Attribute being summarized using fileview. Defaults to `dataType`.
#' @param dry_run Default = TRUE. Whether to update as well or just return list of annotation objects.
#' @return List of annotation objects.
#' @examples
#' \dontrun{
#' assign_study_data_types(study_table_id = 'syn52694652',
#'                         fileview_id = 'syn16858331',
#'                         id_col = 'studyId',
#'                         attribute = 'dataType',
#'                         dry_run = TRUE)
#' }
#' @export
assign_study_data_types <- function(study_table_id,
                                    fileview_id,
                                    id_col = "studyId",
                                    attribute = "dataType",
                                    dry_run = TRUE) {

  .check_login()

  # Get studies within scope from the study table.
  # NOTE(review): `studies` is currently unused below -- the summary query
  # aggregates over the *fileview* only, so results are not restricted to the
  # ids in `study_table_id`. Kept for now; confirm whether scoping the query
  # to these ids was intended before removing.
  studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist()

  # Aggregate distinct attribute values per study directly in the fileview query
  query <- glue::glue(
    "select {id_col}, group_concat(distinct {attribute}) as {attribute} ",
    "from {fileview_id} ",
    "where type = 'file' and {attribute} is not null and {id_col} is not null ",
    "group by {id_col}"
  )

  # Guard: a STRINGLIST annotation with more than 50 values breaks the schema,
  # so skip the update for any study exceeding that limit
  check_fun <- function(values) {
    if (length(values) > 50) {
      warning("Over 50 values found, which will break the schema.", call. = FALSE)
      FALSE
    } else {
      TRUE
    }
  }

  summarize_attribute(summary_query = query,
                      attribute = attribute,
                      entity_id = id_col,
                      dry_run = dry_run,
                      check_fun = check_fun)
}


#' Helper summarization util
#'
#' Given some table X that provides values, summarize the values for an
#' attribute and add the summary as annotations on some entity.
#' The entity could be X itself or another entity Y, e.g. a parent container entity.
#' Example 1: With datasets, summarize `species` for all the files.
#' Example 2: With projects, summarize `dataType` for all the files
#' (in fact, see `assign_study_data_types`).
#'
#' @param summary_query Query (usually of a fileview) that returns appropriate aggregation per row. You may need to add `group_concat`, `distinct`, and/or `unnest` to the query to get the correct list of distinct values depending on your data (e.g. `select group_concat(distinct unnest(tumorType)) as tumorType from ...`).
#' @param attribute Name of attribute to update as annotation.
#' @param entity_id Either a single valid Synapse id of the entity for which to update the attribute *or* a column present in `summary_query` that stores ids.
#' @param dry_run Default = `TRUE`. Whether to update as well or just return list of annotation objects.
#' @param check_fun An optional custom check function to apply to the values being updated in order for update to go through. Should return a boolean. Used only if dry_run = `FALSE`.
#' It can be tailored towards the attribute/entity being updated (i.e. taking into account the schema and valid values).
#' @return (Invisibly) a named list of annotation objects, keyed by entity id.
#' @export
summarize_attribute <- function(summary_query,
                                attribute,
                                entity_id = NULL,
                                dry_run = TRUE,
                                check_fun = NULL) {

  # Fail fast instead of the obscure `if (logical(0))` error that the
  # unchecked default (NULL) would otherwise trigger below
  if (is.null(entity_id)) {
    stop("`entity_id` must be a Synapse id or the name of an id column in `summary_query`.",
         call. = FALSE)
  }

  values <- .syn$tableQuery(summary_query, includeRowIdAndRowVersion = FALSE)$asDataFrame()
  # Split the group_concat'd string and trim stray whitespace around each value
  meta <- lapply(values[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]])))
  if (length(entity_id) == 1 && is_valid_syn_id(entity_id)) {
    # Single target entity: all summarized values annotate that one entity
    names(meta) <- entity_id
  } else {
    # Otherwise treat `entity_id` as the id column of the query result
    names(meta) <- values[[entity_id]]
  }

  result_list <- list()
  for (entity in names(meta)) {
    entity_meta <- .syn$get_annotations(entity)
    entity_meta[attribute] <- meta[[entity]]
    result_list[[entity]] <- entity_meta
    if (!dry_run) {
      if (is.function(check_fun) && !check_fun(meta[[entity]])) {
        # fix: original passed the glue template straight to message(), so
        # "{entity}" was printed literally instead of being interpolated
        message(glue::glue("Skipped update for {entity}."))
      } else {
        .syn$set_annotations(entity_meta)
        # fix: also report success when a check_fun was supplied and passed,
        # consistent with the no-check_fun path
        message(glue::glue("Updated {entity} {attribute}."))
      }
    }
  }
  invisible(result_list)
}
#' See https://python-docs.synapse.org/build/html/synapseutils.html?highlight=copy#synapseutils.copy_functions.copy #' @param entity Entity to copy. @@ -7,28 +7,29 @@ #' @param skip_copy_wiki_page Whether to skip copying wiki; defaults FALSE. #' @param skip_copy_annotations Whether to skip copying annotations; defaults FALSE. #' @keywords internal -copy <- function(entity, - destination_id, - skip_copy_wiki_page = FALSE, +copy <- function(entity, + destination_id, + skip_copy_wiki_page = FALSE, skip_copy_annotations = FALSE) { - + .check_login() # load synapseutils as needed - - - synapseutils$copy(.syn, - entity = entity, - destinationId = destination_id, - skipCopyWikiPage = skip_copy_wiki_page, + + + synapseutils$copy(.syn, + entity = entity, + destinationId = destination_id, + skipCopyWikiPage = skip_copy_wiki_page, skipCopyAnnotations = skip_copy_annotations) - + } #' Download and read file to `data.table` #' #' Convenience function for reading a delimited local file or one on Synapse. -#' +#' +#' @param file File Synapse id or local path. #' @keywords internal #' @import data.table dt_read <- function(file) { @@ -46,13 +47,13 @@ dt_read <- function(file) { } #' Extract synapse id from URI or other string -#' +#' #' @param uri URI or string containing embedded Synapse id. #' @keywords internal bare_syn_id <- function(uri) { not_na <- which(!is.na(uri)) x <- uri[not_na] - syn <- regmatches(x, regexpr("syn[0-9]{8,9}", x)) + syn <- regmatches(x, regexpr("syn[0-9]{8,12}", x)) uri[not_na] <- syn return(uri) } @@ -63,21 +64,21 @@ bare_syn_id <- function(uri) { #' @param id Id string. #' @keywords internal is_valid_syn_id <- function(id) { - result <- grepl("^syn[0-9]{8,9}$", id) + result <- grepl("^syn[0-9]{8,12}$", id) result } #' Walk through a directory -#' +#' #' For now, an internal util imported from `synapseutils`. #' @param syn_id Synapse id of directory root to traverse. -#' @param as_list -#' @return An R list or Py generator object. 
+#' @param as_list Whether to return as R list. +#' @return An R list or Py generator object. #' @keywords internal walk <- function(syn_id, as_list = TRUE) { .check_login() x <- synapseutils$walk(.syn, syn_id) if(as_list) reticulate::iterate(x) else x - + } diff --git a/README.md b/README.md index 1e829e59..586697ab 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ You can install `nfportalutils` from here: remotes::install_github("nf-osi/nfportalutils") ``` + + ## Additional Notes for Users - View function reference on docs site at [Reference](https://nf-osi.github.io/nfportalutils/reference/index.html). diff --git a/_pkgdown.yml b/_pkgdown.yml index 469f6c74..fc12b754 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -13,6 +13,7 @@ reference: - register_study - add_people_from_table - register_study_files + - summarize_attribute - subtitle: Lower-level table maintenance - contents: - adjust_view diff --git a/man/assign_study_data_types.Rd b/man/assign_study_data_types.Rd index 51cf10e3..82ab2264 100644 --- a/man/assign_study_data_types.Rd +++ b/man/assign_study_data_types.Rd @@ -2,49 +2,42 @@ % Please edit documentation in R/assign_study_data_types.R \name{assign_study_data_types} \alias{assign_study_data_types} -\title{Summarize file annotations into a STRINGLIST column on a study table.} +\title{Summarize data types for the study} \usage{ assign_study_data_types( study_table_id, fileview_id, - group_colname = "studyId", - source_colname = "dataType", - sep = ",", - valid_values, + id_col = "studyId", + attribute = "dataType", dry_run = TRUE ) } \arguments{ -\item{study_table_id}{The synapse id of the portal study table. Must have write access.} +\item{study_table_id}{Synapse ID of reference portal study table. 
Used to get study ids.} -\item{fileview_id}{The Synapse ID of the portal fileview.} +\item{fileview_id}{Synapse ID of the reference portal fileview.} -\item{group_colname}{The column name to group by and join on (such as the default = 'studyId')} +\item{id_col}{Name of the study id column in \code{study_table_id} and \code{fileview_id}. Defaults to \code{studyId}.} -\item{source_colname}{The column name to summarize and add to the study_table_id table. The column must exist in both schemas, and must be a STRINGLIST-type column in the "study_table_id" table.} +\item{attribute}{Attribute being summarized using fileview. Defaults to \code{dataType}.} -\item{sep}{If any delimited values exist in the source_colname column, pass the delimiter here so that these cases are included.} - -\item{valid_values}{A vector of valid values for the source_colname. e.g. the output of running \code{get_valid_values_from_json_schema()}} - -\item{dry_run}{Default = TRUE. Skips upload to table and instead prints study tibble.} +\item{dry_run}{Default = TRUE. Whether to update as well or just return list of annotation objects.} } \value{ -If dry_run == T, returns study tibble and skips upload. +List of annotations objects. } \description{ -Summarize fileview annotations into a string-list column on another table. - -For example, use this function to summarize all of the "dataType" annotations for a each study into a STRINGLIST annotation on the Study table of a portal. Overwrites whatever is currently in the target column. +Data types are summarized, or "rolled-up", for the study based on its child file annotations. +Summary values are added back as and overwrites the current \code{dataType} annotation for the study. +See also the related \code{update_study_annotations}, where study-level annotations are \emph{rolled down} to child files. +Note that under-the-hood this now wraps a generalized util \code{summarize_attribute}. 
} \examples{ \dontrun{ -assign_study_data_types(study_table_id = 'syn16787123', - fileview_id = 'syn16858331', - group_colname = 'studyId', - source_colname = "dataType", - sep = ",", - valid_values = get_valid_values_from_json_schema(), - dry_run = T) -} +assign_study_data_types(study_table_id = 'syn52694652', + fileview_id = 'syn16858331', + id_col = 'studyId', + attribute = 'dataType', + dry_run = T) +} } diff --git a/man/dt_read.Rd b/man/dt_read.Rd index 7ab29e28..fec9a5ea 100644 --- a/man/dt_read.Rd +++ b/man/dt_read.Rd @@ -6,6 +6,9 @@ \usage{ dt_read(file) } +\arguments{ +\item{file}{File Synapse id or local path.} +} \description{ Convenience function for reading a delimited local file or one on Synapse. } diff --git a/man/summarize_attribute.Rd b/man/summarize_attribute.Rd new file mode 100644 index 00000000..1edb4c90 --- /dev/null +++ b/man/summarize_attribute.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assign_study_data_types.R +\name{summarize_attribute} +\alias{summarize_attribute} +\title{Helper summarization util} +\usage{ +summarize_attribute( + summary_query, + attribute, + entity_id = NULL, + dry_run = TRUE, + check_fun = NULL +) +} +\arguments{ +\item{summary_query}{Query (usually of a fileview) that returns appropriate aggregation per row. You may need to add \code{group_concat}, \code{distinct}, and or \code{unnest} to the query to get the correct list of distinct values depending on your data (e.g.\verb{select group_concat(distinct unnest(tumorType)) as tumorType from ...}).} + +\item{attribute}{Name of attribute to update as annotation.} + +\item{entity_id}{Either a single valid Synapse id of the entity for which to update the attribute \emph{or} a column present in \code{summary_query} that stores ids.} + +\item{dry_run}{Default = \code{TRUE}. 
Whether to update as well or just return list of annotation objects.} + +\item{check_fun}{An optional custom check function to apply to the values being updated in order for update to go through. Should return a boolean. Used only if dry_run = \code{FALSE}. +It can be tailored towards the attribute/entity being updated (i.e. taking into account the schema and valid values).} +} +\description{ +Given some table X that provides values, summarize the values for an attribute and add summary as annotations on some entity. +The entity could be X itself or another entity Y, e.g. a parent container entity. +Example 1: With datasets, summarize \code{species} for all the files. +Example 2: With projects, summarize \code{dataType} for all the files (in fact, see \code{assign_study_data_types}). +} diff --git a/man/walk.Rd b/man/walk.Rd index 46e6c177..cf4529f3 100644 --- a/man/walk.Rd +++ b/man/walk.Rd @@ -9,7 +9,7 @@ walk(syn_id, as_list = TRUE) \arguments{ \item{syn_id}{Synapse id of directory root to traverse.} -\item{as_list}{} +\item{as_list}{Whether to return as R list.} } \value{ An R list or Py generator object.