Summarize attributes as annotations (#163)
* Initial changes

* Test and refine update

* Lint and add check

* Generalize initial update

* Update docs, exports

* Update pkgdown index

* Update R/assign_study_data_types.R

Co-authored-by: Robert Allaway <allaway@users.noreply.github.com>

* Docs

---------

Co-authored-by: Robert Allaway <allaway@users.noreply.github.com>
anngvu and allaway authored Dec 22, 2023
1 parent b3e25c3 commit c432501
Showing 9 changed files with 168 additions and 102 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -80,6 +80,7 @@ export(register_study)
export(register_study_files)
export(remove_button)
export(remove_wiki_subpage)
export(summarize_attribute)
export(summarize_file_access)
export(swap_col)
export(syn_login)
143 changes: 88 additions & 55 deletions R/assign_study_data_types.R
@@ -1,70 +1,103 @@
#' Summarize file annotations into a STRINGLIST column on a study table.
#' @description Summarize fileview annotations into a string-list column on another table.
#' @description For example, use this function to summarize all of the "dataType" annotations for each study into a STRINGLIST annotation on the Study table of a portal. Overwrites whatever is currently in the target column.
#' @param study_table_id The synapse id of the portal study table. Must have write access.
#' @param fileview_id The Synapse ID of the portal fileview.
#' @param group_colname The column name to group by and join on (such as the default = 'studyId')
#' @param source_colname The column name to summarize and add to the study_table_id table. The column must exist in both schemas, and must be a STRINGLIST-type column in the "study_table_id" table.
#' @param sep If any delimited values exist in the source_colname column, pass the delimiter here so that these cases are included.
#' @param valid_values A vector of valid values for the source_colname. e.g. the output of running `get_valid_values_from_json_schema()`
#' @param dry_run Default = TRUE. Skips upload to table and instead prints study tibble.
#' @return If dry_run == T, returns study tibble and skips upload.
#' @examples
#' Summarize data types for the study
#'
#' Data types are summarized, or "rolled-up", for the study based on its child file annotations.
#' Summary values are added back as annotations, overwriting the current `dataType` annotation for the study.
#' See also the related `update_study_annotations`, where study-level annotations are *rolled down* to child files.
#' Note that under the hood this now wraps the generalized util `summarize_attribute`.
#'
#' @param study_table_id Synapse ID of reference portal study table. Used to get study ids.
#' @param fileview_id Synapse ID of the reference portal fileview.
#' @param id_col Name of the study id column in `study_table_id` and `fileview_id`. Defaults to `studyId`.
#' @param attribute Attribute being summarized using fileview. Defaults to `dataType`.
#' @param dry_run Default = TRUE. If TRUE, just return the list of annotation objects without updating; if FALSE, also apply the updates.
#' @return List of annotation objects.
#' @examples
#' \dontrun{
#' assign_study_data_types(study_table_id = 'syn16787123',
#' fileview_id = 'syn16858331',
#' group_colname = 'studyId',
#' source_colname = "dataType",
#' sep = ",",
#' valid_values = get_valid_values_from_json_schema(),
#' dry_run = T)
#'}
#' assign_study_data_types(study_table_id = 'syn52694652',
#' fileview_id = 'syn16858331',
#' id_col = 'studyId',
#' attribute = 'dataType',
#' dry_run = T)
#'}
#' @export
assign_study_data_types <- function(study_table_id, fileview_id, group_colname = "studyId",
source_colname = "dataType", sep = ",", valid_values, dry_run = TRUE){
assign_study_data_types <- function(study_table_id,
fileview_id,
id_col = "studyId",
attribute = "dataType",
dry_run = TRUE) {

.check_login()

##query the study table
query <- .syn$tableQuery(glue::glue("select {group_colname}, {source_colname} from {study_table_id}", includeRowIdAndRowVersion=T))
# get studies within scope from study table
studies <- table_query(table_id = study_table_id, columns = id_col) %>% unlist()

studies <- query$filepath %>%
readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as columns
# query the fileview
query <- glue::glue("select {id_col},group_concat(distinct {attribute}) as {attribute} from {fileview_id}
where type = \'file\' and {attribute} is not null and {id_col} is not null group by {id_col}")

##query the fileview
fv <- .syn$tableQuery(glue::glue('select {group_colname},{source_colname} from {fileview_id} where type = \'file\' and {group_colname} is not null and {source_colname} is not null'))$filepath %>%
readr::read_csv(na=character()) ##asDataFrame() & reticulate return rowIdAndRowVersion as concatenated rownames, read_csv reads them in as column

#TODO:: add support for stringlist-ed values

##make simplified data table for stringlist-ing
data_types <- fv %>%
dplyr::select(one_of({{group_colname}}, {{source_colname}})) %>%
dplyr::distinct() %>%
tidyr::separate_rows({{source_colname}}, sep = {{sep}}) %>% ##this handles comma seperated or other delimited values
dplyr::filter(!!rlang::sym(source_colname) %in% valid_values)
check_fun <- function(values) {
if(length(values) > 50) {
warning(glue::glue("Over 50 values found, which will break the schema."))
return(FALSE)
} else {
return(TRUE)
}
}

studies <- dplyr::select(studies, ROW_ID, ROW_VERSION, {{group_colname}})
summarize_attribute(summary_query = query,
attribute = attribute,
entity_id = id_col,
dry_run = dry_run,
check_fun = check_fun)
}
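
As a quick orientation to the new signature, a minimal usage sketch: it assumes an authenticated session (`syn_login()`) and reuses the table/fileview ids from the roxygen example above; the study id inspected at the end is hypothetical.

```r
# Preview the proposed dataType roll-up, then apply it.
preview <- assign_study_data_types(study_table_id = "syn52694652",
                                   fileview_id = "syn16858331",
                                   dry_run = TRUE)
preview[["syn11111111"]]  # hypothetical study id -- inspect its proposed annotations

# Apply for real once the preview looks right
assign_study_data_types(study_table_id = "syn52694652",
                        fileview_id = "syn16858331",
                        dry_run = FALSE)
```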

##create stringlisted data
ids <- data_types %>% dplyr::group_by_at(group_colname) %>%
dplyr::summarise(!!rlang::sym(source_colname) := jsonlite::toJSON(!!rlang::sym(source_colname)))

##join study table to stringlisted values, filter NA rows out, we don't need to update those
studies_updated <- dplyr::left_join(studies, ids) %>%
dplyr::filter(!is.na(!!rlang::sym(source_colname)))
#' Helper summarization util
#'
#' Given some table X that provides values, summarize the values for an attribute and add summary as annotations on some entity.
#' The entity could be X itself or another entity Y, e.g. a parent container entity.
#' Example 1: With datasets, summarize `species` for all the files.
#' Example 2: With projects, summarize `dataType` for all the files (in fact, see `assign_study_data_types`).
#'
#' @param summary_query Query (usually of a fileview) that returns the appropriate aggregation per row. You may need to add `group_concat`, `distinct`, and/or `unnest` to the query to get the correct list of distinct values depending on your data (e.g. `select group_concat(distinct unnest(tumorType)) as tumorType from ...`).
#' @param attribute Name of attribute to update as annotation.
#' @param entity_id Either a single valid Synapse id of the entity for which to update the attribute *or* a column present in `summary_query` that stores ids.
#' @param dry_run Default = `TRUE`. If `TRUE`, just return the list of annotation objects without updating; if `FALSE`, also apply the updates.
#' @param check_fun An optional custom check function applied to the values being updated; the update goes through only if it returns `TRUE`. Used only if dry_run = `FALSE`.
#' It can be tailored to the attribute/entity being updated (e.g. taking into account the schema and valid values).
#' @export
summarize_attribute <- function(summary_query,
attribute,
entity_id = NULL,
dry_run = TRUE,
check_fun = NULL) {

#TODO: could add check here to report number of updated rows vs original...
values <- .syn$tableQuery(summary_query, includeRowIdAndRowVersion = F)$asDataFrame()
meta <- lapply(values[[attribute]], function(x) unique(trimws(strsplit(x, split = ",")[[1]]))) # in case of stray whitespaces
if(is_valid_syn_id(entity_id)) {
names(meta) <- entity_id
} else {
names(meta) <- values[[entity_id]]
}

if(dry_run == FALSE){
.update_table_data(table_id = study_table_id,
new_data = studies_updated,
etag = query$etag)
}else{
studies_updated
result_list <- list()
for(entity in names(meta)) {
entity_meta <- .syn$get_annotations(entity)
entity_meta[attribute] <- meta[[entity]]
result_list[[entity]] <- entity_meta
if(!dry_run) {
if(is.function(check_fun) && !check_fun(meta[[entity]])) {
message(glue::glue("Skipped update for {entity}."))
} else {
.syn$set_annotations(entity_meta)
message(glue::glue("Updated {entity} {attribute}."))
}
}
}
invisible(result_list)
}
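
Since `summarize_attribute` is exported, it can also be called directly, e.g. for Example 1 above. A sketch, where the fileview id and the `datasetId`/`species` columns are hypothetical:

```r
# Roll up distinct species per dataset and stage annotations for each dataset.
qry <- "select datasetId, group_concat(distinct species) as species
        from syn00000000
        where type = 'file' and species is not null and datasetId is not null
        group by datasetId"

summarize_attribute(summary_query = qry,
                    attribute = "species",
                    entity_id = "datasetId",  # column in the result holding dataset ids
                    dry_run = TRUE)
```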


#' Retrieve valid subclasses of a value in a JSON-LD schema
#' @description Retrieve valid subclasses of a value in a JSON-LD schema generated by schematic.
#' @param schema_url Default: the NF-OSI JSON-LD schema.
@@ -77,11 +110,11 @@ get_valid_values_from_json_schema <- function(schema_url = 'https://raw.githubus
parent_context = 'bts'){

parent_id <- paste0(parent_context, ':', parent_name)
subclasses <-

subclasses <-
jsonlite::fromJSON(schema_url) %>%
purrr::pluck("@graph") %>%
dplyr::filter(purrr::map_lgl(`rdfs:subClassOf`, ~ parent_id %in% .x)) %>%
dplyr::filter(purrr::map_lgl(`rdfs:subClassOf`, ~ parent_id %in% .x)) %>%
dplyr::pull(`sms:displayName`)

subclasses
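These pieces compose: schema-derived valid values can back a custom `check_fun` so a live update only goes through when the summarized values conform. A sketch, assuming `query` is built as in `assign_study_data_types()` and the default NF-OSI schema applies:

```r
# Gate a live update on schema validity plus the 50-value limit noted above.
valid_values <- get_valid_values_from_json_schema()

check_data_types <- function(values) {
  all(values %in% valid_values) && length(values) <= 50
}

summarize_attribute(summary_query = query,  # as built in assign_study_data_types()
                    attribute = "dataType",
                    entity_id = "studyId",
                    dry_run = FALSE,
                    check_fun = check_data_types)
```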
41 changes: 21 additions & 20 deletions R/basic_utils.R
@@ -1,34 +1,35 @@
#' Create copy of entity
#'
#'
#' Create a copy of syn entity; mostly used to create a copy on which to test out changes.
#' See https://python-docs.synapse.org/build/html/synapseutils.html?highlight=copy#synapseutils.copy_functions.copy
#' @param entity Entity to copy.
#' @param destination_id Id of destination project/container that entity will be copied to.
#' @param skip_copy_wiki_page Whether to skip copying wiki; defaults FALSE.
#' @param skip_copy_annotations Whether to skip copying annotations; defaults FALSE.
#' @keywords internal
copy <- function(entity,
destination_id,
skip_copy_wiki_page = FALSE,
copy <- function(entity,
destination_id,
skip_copy_wiki_page = FALSE,
skip_copy_annotations = FALSE) {

.check_login()
# load synapseutils as needed
synapseutils$copy(.syn,
entity = entity,
destinationId = destination_id,
skipCopyWikiPage = skip_copy_wiki_page,


synapseutils$copy(.syn,
entity = entity,
destinationId = destination_id,
skipCopyWikiPage = skip_copy_wiki_page,
skipCopyAnnotations = skip_copy_annotations)

}
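
A sketch of a typical call; both ids are hypothetical, and since `copy` is internal it is reached with `:::` outside the package:

```r
# Copy an entity into a scratch project to test changes safely.
nfportalutils:::copy(entity = "syn11111111",
                     destination_id = "syn22222222",
                     skip_copy_wiki_page = TRUE,  # skip wikis for a lighter copy
                     skip_copy_annotations = FALSE)
```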


#' Download and read file to `data.table`
#'
#' Convenience function for reading a delimited local file or one on Synapse.
#'
#'
#' @param file File Synapse id or local path.
#' @keywords internal
#' @import data.table
dt_read <- function(file) {
@@ -46,13 +47,13 @@ dt_read <- function(file) {
}

#' Extract synapse id from URI or other string
#'
#'
#' @param uri URI or string containing embedded Synapse id.
#' @keywords internal
bare_syn_id <- function(uri) {
not_na <- which(!is.na(uri))
x <- uri[not_na]
syn <- regmatches(x, regexpr("syn[0-9]{8,9}", x))
syn <- regmatches(x, regexpr("syn[0-9]{8,12}", x))
uri[not_na] <- syn
return(uri)
}
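
For example, with an illustrative URL (NA elements pass through untouched):

```r
nfportalutils:::bare_syn_id("https://www.synapse.org/#!Synapse:syn16858331")
#> [1] "syn16858331"
```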
@@ -63,21 +64,21 @@ bare_syn_id <- function(uri) {
#' @param id Id string.
#' @keywords internal
is_valid_syn_id <- function(id) {
result <- grepl("^syn[0-9]{8,9}$", id)
result <- grepl("^syn[0-9]{8,12}$", id)
result
}
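
The corresponding check is anchored, so only a bare id passes:

```r
nfportalutils:::is_valid_syn_id("syn16858331")      #> TRUE
nfportalutils:::is_valid_syn_id("see syn16858331")  #> FALSE
```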

#' Walk through a directory
#'
#'
#' For now, an internal util imported from `synapseutils`.
#' @param syn_id Synapse id of directory root to traverse.
#' @param as_list
#' @return An R list or Py generator object.
#' @param as_list Whether to return as R list.
#' @return An R list or Py generator object.
#' @keywords internal
walk <- function(syn_id, as_list = TRUE) {
.check_login()
x <- synapseutils$walk(.syn, syn_id)
if(as_list) reticulate::iterate(x) else x

}
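
A traversal sketch with a hypothetical project id; per `synapseutils.walk` in the Python client, each element is a (dirpath, dirnames, filenames) triple:

```r
tree <- nfportalutils:::walk("syn11111111")
str(tree[[1]])  # first directory level of the tree
```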

2 changes: 2 additions & 0 deletions README.md
@@ -20,6 +20,8 @@ You can install `nfportalutils` from here:
remotes::install_github("nf-osi/nfportalutils")
```



## Additional Notes for Users

- View function reference on docs site at [Reference](https://nf-osi.github.io/nfportalutils/reference/index.html).
1 change: 1 addition & 0 deletions _pkgdown.yml
@@ -13,6 +13,7 @@ reference:
- register_study
- add_people_from_table
- register_study_files
- summarize_attribute
- subtitle: Lower-level table maintenance
- contents:
- adjust_view
45 changes: 19 additions & 26 deletions man/assign_study_data_types.Rd


3 changes: 3 additions & 0 deletions man/dt_read.Rd


32 changes: 32 additions & 0 deletions man/summarize_attribute.Rd

