Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add meta-qc utils per #87 (#123)

Merged
merged 11 commits into from
Sep 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Imports:
crayon,
emoji,
plyr,
readxl,
yaml
URL: https://github.com/nf-osi/nfportalutils
BugReports: https://github.com/nf-osi/nfportalutils/issues
Expand Down
9 changes: 7 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -39,32 +39,37 @@ export(copy_annotations)
export(data_curator_app_subpage)
export(delete_provenance)
export(find_child)
export(find_data_root)
export(find_in)
export(find_nf_asset)
export(from_pubmed)
export(get_by_prop_from_json_schema)
export(get_dependency_from_json_schema)
export(get_project_wiki)
export(get_valid_values_from_json_schema)
export(grant_specific_file_access)
export(infer_data_type)
export(key_label_to_id)
export(make_admin)
export(make_folder)
export(make_public)
export(make_public_viewable)
export(manifest_generate)
export(manifest_validate)
export(map_sample_input_ss)
export(map_sample_io)
export(map_sample_output_rnaseq)
export(map_sample_output_sarek)
export(meta_qc_dataset)
export(meta_qc_project)
export(missing_annotation_email)
export(new_dataset)
export(new_project)
export(nf_cnv_dataset)
export(nf_find_asset)
export(nf_sarek_datasets)
export(nf_star_salmon_datasets)
export(nf_workflow_version)
export(processing_flowchart)
export(qc_manifest)
export(register_study)
export(register_study_files)
export(remove_button)
Expand Down
260 changes: 260 additions & 0 deletions R/annotation_qc.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
#' Generate manifest via schematic service
#'
#' See [schematic manifest generation](https://schematic.api.sagebionetworks.org/v1/ui/#/Manifest%20Operations/schematic_api.api.routes.get_manifest_route).
#' Note that this uses the access token of the user, who should already be logged in with `syn_login`.
#'
#' @param data_type Data type of the manifest to generate (aka Component).
#' @param dataset_id Optional, if given this fills out manifest for existing dataset instead of generating a blank manifest.
#' @param title Optional, custom title.
#' @param schema_url Optional, defaults to main NF 'latest' data model.
#' @param asset_view Optional, defaults to main NF portal fileview.
#' @param output_format Format of 'excel', 'google_sheet', or 'dataframe'. Defaults to 'excel'.
#' @param use_annotations Use annotations if filling out manifest for existing dataset. Defaults to TRUE for NF.
#' @param service Service endpoint to use. Defaults to the schematic production endpoint.
#' @returns For excel, path to local file; for google_sheet, URL to sheet; for dataframe, JSON string of data.
#' @export
manifest_generate <- function(data_type,
                              dataset_id = NULL,
                              title = data_type,
                              schema_url = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld",
                              asset_view = "syn16858331",
                              output_format = "excel",
                              use_annotations = TRUE,
                              service = "https://schematic.api.sagebionetworks.org/v1/manifest/generate") {

  # Fail early on an unsupported output_format; previously an unknown value fell
  # through every branch and silently returned NULL after a wasted round-trip
  output_format <- match.arg(output_format, choices = c("excel", "google_sheet", "dataframe"))

  # yes, param has been re-encoded like this for 'dataframe'
  output_format_param <- if (output_format == "dataframe") "dataframe (only if getting existing manifests)" else output_format
  access_token <- .syn$credentials$secret
  # schematic expects lowercase "true"/"false" strings rather than R logicals
  use_annotations <- tolower(as.character(use_annotations))

  req <- httr::GET(service,
                   query = list(
                     schema_url = schema_url,
                     title = title,
                     data_type = data_type,
                     use_annotations = use_annotations,
                     dataset_id = dataset_id,
                     asset_view = asset_view,
                     output_format = output_format_param,
                     access_token = access_token
                   ))

  status <- httr::status_code(req)
  if (status != 200L) stop("Unsuccessful request, received status code: ", status)

  if (output_format == "excel") {
    # Response body is the binary .xlsx; persist locally and return the path
    file <- "manifest.xlsx"
    message(glue::glue("Manifest generated and saved as {file}"))
    bin <- httr::content(req, "raw")
    writeBin(bin, file)
    return(file)
  }

  if (output_format == "google_sheet") {
    message("Manifest generated as Googlesheet(s)")
    url <- httr::content(req)
    return(url)
  }

  # output_format == "dataframe" (only remaining possibility after match.arg)
  message("Manifest(s) generated as JSON doc")
  httr::content(req, "text", encoding = "UTF-8")
}


#' Validate manifest via schematic service
#'
#' See [schematic validation](https://schematic.api.sagebionetworks.org/v1/ui/#/Model%20Operations/schematic_api.api.routes.validate_manifest_route).
#' Get validation results from schematic service. Downstream utils can consume these results for custom display/report.
#'
#' @inheritParams manifest_generate
#' @param json_str JSON string representing metadata.
#' @param file_name Path to file, has to be `.csv`. Ignored if `json_str` is given.
#' @param restrict_rules Use only basic schematic validation instead of extended validation with Great Expectations, default `FALSE`.
#' @returns Parsed response content from the schematic validation endpoint.
#' @export
manifest_validate <- function(data_type,
                              json_str = NULL,
                              file_name = NULL,
                              restrict_rules = FALSE,
                              schema_url = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld",
                              service = "https://schematic.api.sagebionetworks.org/v1/model/validate") {

  # schematic expects lowercase "true"/"false" strings rather than R logicals
  restrict_rules <- tolower(as.character(restrict_rules))

  # json_str is sent as a query (URL) parameter, so large payloads overflow
  # typical server limits (nginx header buffers are usually 4-8 KiB);
  # direct anything sizable to the file-upload route instead
  if (!is.null(json_str)) {
    if (object.size(json_str) > 40000) stop("Data of this size should be submitted as a file instead of JSON string")
    req <- httr::POST(service,
                      query = list(
                        json_str = json_str,
                        schema_url = schema_url,
                        data_type = data_type,
                        restrict_rules = restrict_rules))
  } else if (!is.null(file_name)) {
    if (!file.exists(file_name)) stop("Trying to validate ", file_name, " that can't be found.")
    req <- httr::POST(service,
                      query = list(
                        schema_url = schema_url,
                        data_type = data_type,
                        restrict_rules = restrict_rules),
                      # `type` is an argument of upload_file(), not a form field;
                      # previously it was sent as a stray "type" body element
                      body = list(file_name = httr::upload_file(file_name, type = "text/csv")))
  } else {
    stop("Must provide manifest data as either JSON or local file path.")
  }

  status <- httr::status_code(req)
  if (status != 200L) stop("Unsuccessful request, received status code: ", status)
  result <- httr::content(req)
  result
}

#' Terse error messages please
#'
#' Condense a schematic error entry (row, column, message, value) into a short
#' human-readable phrase suitable for summary notes.
#'
#' @param error An error object from schematic.
#' @keywords internal
tersely <- function(error) {
  column <- error[[2]]
  full_message <- error[[3]]
  value <- error[[4]]

  # "is not one of [...]" messages enumerate every valid value and get long;
  # rephrase those, pass anything else through untouched
  if (grepl("is not one of", full_message)) {
    paste(shQuote(value), "is not a valid value for", column)
  } else {
    full_message
  }
}

#' Provide a pass/fail summary result
#'
#' Summarize schematic validation output as a simple pass/fail with notes.
#'
#' @param result Result list data from schematic service.
#' @returns List of structure `list(result = result, notes = notes)`, where `result`
#' is `TRUE`/`FALSE` for whether the manifest passed validation.
#' @keywords internal
manifest_passed <- function(result) {

  # Explicit length comparison instead of relying on integer truthiness
  if (length(result$errors) > 0) {
    # De-duplicate terse messages since the same error often repeats across rows;
    # vapply guarantees a character vector even for a single error
    messages <- unique(vapply(result$errors, tersely, character(1)))
    notes <- paste(messages, collapse = ", ")
    return(list(result = FALSE, notes = notes))
  } else {
    return(list(result = TRUE, notes = "No errors"))
  }
}

#' Infer data type of a dataset folder
#'
#' Infer the data type by checking the first few files.
#' TODO: Check `dataType` instead of Component and derive Component
#' because some older files do not have Component explicitly.
#'
#' @inheritParams manifest_generate
#' @return List of structure `list(result = result, notes = notes)`, where `result` can be `NA`.
#' @export
infer_data_type <- function(dataset_id) {

  children <- .syn$getChildren(dataset_id)
  children <- reticulate::iterate(children)
  if (length(children) == 0) return(list(result = NA, notes = "Empty dataset folder"))
  # Sample at most the first 3 children. Use base head() -- the previous
  # first(children, 3) depended on data.table's first() being attached
  # (dplyr::first's second argument is a default value, not a count)
  children <- head(children, 3)
  data_type <- c()
  for (entity in children) {
    e <- .syn$getAnnotations(entity)
    # append() with a NULL value is a no-op, so files lacking a Component
    # annotation are simply skipped
    data_type <- append(data_type, e$Component)
  }
  data_type <- unique(data_type)
  if (is.null(data_type)) return(list(result = NA, notes = "Metadata insufficient to infer data type."))
  if (length(data_type) > 1) return(list(result = NA, notes = "Conflicting data types observed."))
  return(list(result = data_type, notes = ""))
}


#' QC dataset metadata with pass/fail result
#'
#' R wrapper for validation workflow with schematic. Because there is no validation-only service endpoint,
#' we move metadata around twice (generating manifest from server and submitting back to server),
#' so once schematic has a validation-only service endpoint that would be much more efficient.
#' A dataset in this context is a folder, usually tagged with `contentType` = "dataset".
#'
#' Note that we prefer to wrap the schematic web API over a local installation because:
#' - Will not require user to go through local schematic setup for this to be functional
#' - API more likely reflects an up-to-date version of schematic and consistent with current DCA deployment
#'
#' When `data_type` can't be inferred based on annotations, this is treated as a fail.
#'
#' Status: alpha and likely to change based on changes in `schematic`.
#'
#' @param dataset_id Id of folder that represents a dataset, not actual Synapse dataset entity -- see details.
#' @param data_type A specific data type to validate against, otherwise tries to infer based on annotations. See details.
#' @param asset_view A reference view, defaults to the main NF portal fileview.
#' @param schema_url Schema URL, points by default to 'latest' main NF schema, can change to use a specific released version.
#' @param cleanup Whether to automatically remove reconstituted manifests once done. Default `TRUE`.
#' @returns List of structure `list(result = result, notes = notes)`,
#' where `result` indicates passing or `NA` if no data or if couldn't be validated for other reasons.
#' @export
meta_qc_dataset <- function(dataset_id,
                            data_type = NULL,
                            asset_view = "syn16787123",
                            schema_url = "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/NF.jsonld",
                            cleanup = TRUE) {

  files <- reticulate::iterate(.syn$getChildren(dataset_id))
  if (length(files) == 0) return(list(result = NA, notes = "Empty dataset"))

  if (is.null(data_type)) {
    data_type <- infer_data_type(dataset_id)$result
    if (is.na(data_type)) return(list(result = FALSE, notes = "Metadata quality insufficient to even infer data type"))
  }

  # Reconstitute metadata manifest -- using excel as the safest option for now
  tryCatch({
    # Forward schema_url and asset_view so QC uses the caller's model and view;
    # previously these params were accepted but never passed through
    xl <- manifest_generate(data_type, dataset_id, output_format = "excel",
                            schema_url = schema_url, asset_view = asset_view)
    csv <- readxl::read_excel(xl, sheet = 1)
    # row.names = FALSE: a row-number column is not part of the manifest
    # schema and would be flagged by validation
    write.csv(csv, file = "manifest.csv", row.names = FALSE)
    # Validate against the same schema used for generation
    results <- manifest_validate(data_type = data_type, file_name = "manifest.csv",
                                 schema_url = schema_url)
    if (cleanup) file.remove(xl, "manifest.csv")
    manifest_passed(results)
  }, error = function(e) {
    # Any failure in the round-trip is reported, not thrown, so project-level
    # QC can continue with the remaining datasets
    list(result = NA, notes = e$message)
  })
}


#' QC metadata at the project level with pass/fail result
#'
#' For projects with a relatively standard structure that also corresponds to what the DCA expects,
#' this is an adequate wrapper to go through the datasets and do basic QC in a one-stop-shop manner.
#' For selective validation or other (e.g. milestone-based or ad hoc) structures, look at `meta_qc_dataset`.
#'
#' @param project_id Synapse project id.
#' @return A table with a row per dataset QC'd: dataset id, name, TRUE/FALSE pass result, and summary;
#' otherwise `NA`.
#' @export
meta_qc_project <- function(project_id) {

  data_root <- find_data_root(project_id)
  if (is.null(data_root)) {
    message("Data root could not be located. Project structure may require custom assessment and dropping down to `meta_qc_dataset`.")
    return(NA)
  }

  children <- reticulate::iterate(.syn$getChildren(data_root))
  # Datasets are represented as folders; plain files at this level are ignored
  datasets <- Filter(function(child) child$type == "org.sagebionetworks.repo.model.Folder", children)

  if (!length(datasets)) {
    message("No datasets found under data root.")
    return(NA)
  }

  dataset_names <- vapply(datasets, function(d) d$name, character(1))
  dataset_ids <- vapply(datasets, function(d) d$id, character(1))
  message("Datasets for QC:\n", glue::glue_collapse(dataset_names, sep = "\n"))

  # One pass/fail row per dataset, annotated with name and id for the report
  report <- rbindlist(lapply(dataset_ids, meta_qc_dataset))
  report$dataset_name <- dataset_names
  report$dataset_id <- dataset_ids
  report
}

26 changes: 0 additions & 26 deletions R/annotations.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,29 +62,3 @@ copy_annotations <- function(entity_from,
}
}


#' QC a derived manifest
#'
#' Check missing annotations, usually because input files are missing values so nothing could be copied.
#' It might also be helpful to visualize the manifest via a package like `naniar::vis_miss(manifest)`.
#'
#' @param manifest A manifest (data.table -- `.()` and `:=` syntax is used on it), usually from one of the `annotate_*` functions.
#' @param sample_io Input/output mapping used; only used if missing annotations detected.
#' @return NULL if no problems, otherwise a table of entity ids affected, the attributes missing, and inputs used.
#' @export
qc_manifest <- function(manifest, sample_io) {
# Count NAs per column (attribute) across the whole manifest
missing <- apply(manifest, 2, function(x) sum(is.na(x)))
if(all(missing == 0)) {
message("All values are present in manifest")
return()
} else {
message("Some values not present in manifest...")
# Per row: names of the attributes that are NA (list, one character vector per row)
na_attrs <- apply(manifest, 1, function(x) names(x[is.na(x)]))
# Keep only rows with at least one missing attribute; retain just entityId
na_subset <- manifest[lengths(na_attrs) > 0, .(entityId)]
# Collapse each row's missing attribute names into one comma-separated string
na_subset[, attributes := sapply(na_attrs[lengths(na_attrs) > 0], paste, collapse = ",")]
# Join in provenance (entityId matches sample_io$output_id) so missing values
# can be traced back to the inputs they should have been copied from
na_subset <- merge(na_subset, sample_io[, .(output_id, output_name, input_id, sample)], by.x = "entityId", by.y = "output_id")
return(na_subset)
}
}


18 changes: 16 additions & 2 deletions R/find.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,20 @@ find_child <- function(child_name, parent) {
child_id
}

#' Find data folder
#'
#' Convenience function to find the data folder, which can have slight name variations, in a project.
#'
#' @param project_id Synapse project id.
#' @return Id of the data folder if found, otherwise `NULL`.
#' @export
find_data_root <- function(project_id) {

  # Try conventional folder names in order of preference; stop at the first hit
  for (folder_name in c("Data", "Raw Data")) {
    found <- find_child(folder_name, parent = project_id)
    if (!is.null(found)) return(found)
  }
  NULL
}


# Find nextflow assets --------------------------------------------------------- #

# Convenience functions for getting Synapse ids of nextflow assets
Expand All @@ -56,7 +70,7 @@ find_child <- function(child_name, parent) {
#' @param asset Name of asset to find.
#' @returns Id of samplesheet.
#' @export
nf_find_asset <- function(syn_out,
find_nf_asset <- function(syn_out,
asset = c("software_versions", "multiqc_report", "samplesheet", "samtools_stats")) {

asset <- match.arg(asset)
Expand All @@ -75,7 +89,7 @@ nf_find_asset <- function(syn_out,

#' Return workflow version according to workflow meta
#'
#' @inheritParams nf_find_asset
#' @inheritParams find_nf_asset
#' @returns Version string.
#' @export
nf_workflow_version <- function(syn_out) {
Expand Down
Loading