From c569cfff41656a9af1553cfa1cd20160b3897739 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Thu, 13 Feb 2025 16:22:17 -0800 Subject: [PATCH 1/5] Update source --- R/cbioportal.R | 5 ++--- R/cboilerplate.R | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/R/cbioportal.R b/R/cbioportal.R index 13504f30..7300f4d5 100644 --- a/R/cbioportal.R +++ b/R/cbioportal.R @@ -224,12 +224,11 @@ cbp_add_cna <- function(cna_data, verbose = TRUE) { #' #' This should be run in an existing dataset package root. #' Note that there are a number of different options generated by the STAR Salmon pipeline. -#' cBioPortal has confirmed that they prefer normalized counts `gene_tpm.tsv` and, -#' _though not used_, find it helpful to also have raw counts `gene_counts.tsv`. +#' cBioPortal has confirmed that they prefer normalized counts `gene_tpm.tsv`. #' #' @inheritParams cbp_new_study #' @param expression_data Syn id of normalized gene counts results (default to TPM). See details. -#' @param expression_data_raw (Optional) Syn id of raw counts results. See details. +#' @param expression_data_raw (Optional) Syn id of raw counts if curators explicitly ask for it. #' @export cbp_add_expression <- function(expression_data, expression_data_raw = NULL, diff --git a/R/cboilerplate.R b/R/cboilerplate.R index b4791302..6e326a51 100644 --- a/R/cboilerplate.R +++ b/R/cboilerplate.R @@ -65,11 +65,13 @@ get_cbio_filename <- function(clinical_type = c("SAMPLE", "PATIENT")) { #' `df` is expected to be a table containing clinical data available, and maybe even some irrelevant data #' (since NF data is not well-normalized and there is a single table with everything). #' -#' This relies on a `ref_map` specification to know which clinical data to include for cBioPortal -#' and how to segregate the clinical attributes into the right files. 
-#' For example, say `df` contains clinical variables A-X, but mappings are only specified for -#' variables A-C, L-M and others are not meant to be surfaced/made public. This will subset the `df` to what's specified in the mapping. -#' Conversely, if there is a mapping for variable Z that is _not_ in the clinical data, this _will_ throw error. +#' This depends on a `ref_map` specification to know which clinical data to include for cBioPortal +#' and how to segregate the clinical attributes into the right files. +#' Basically, `ref_map` decides what variables can be made public and how they should be represented in cBioPortal. +#' For example, given a table `T` on Synapse with variables A-Z and mappings in `ref_map` for A-C + L-M, +#' we take the intersection of variables present. +#' But first, check that *required* variables in *ref_map* are present. +#' So first the subset `df` is created from `T`. #' #' @inheritParams use_ref_map #' @inheritParams make_cbio_clinical_header @@ -92,7 +94,9 @@ write_cbio_clinical <- function(df, # Attribute checks message("Clinical attributes present are: ", paste(present, collapse = ", ")) if(!all(required %in% present)) stop("Missing required clinical element(s):", paste(setdiff(required, present), collapse = ", ")) - if(!all(present %in% attributes)) stop("Missing mapping for:", paste(setdiff(present, attributes), collapse = ",")) + if(!all(present %in% attributes)) { + warning("Variables not mapped will be ignored (potentially non-public/non-clinical data): ", paste(setdiff(present, attributes), collapse = ",")) + } # Take care of list columns and NA .df <- data.table::copy(df) @@ -108,13 +112,15 @@ write_cbio_clinical <- function(df, files <- list() m <- split(m, by = "attribute_type") if("individualID" %in% names(.df)) { - patient_df <- unique(.df[, c(names(.df) %in% m$PATIENT$source)]) + patient_df <- .df[, c(names(.df) %in% m$PATIENT$source)] + patient_df <- unique(patient_df) header <- 
make_cbio_clinical_header(patient_df, m$PATIENT) patient_df <- rbind(header, patient_df) files[["PATIENT"]] <- patient_df } { sample_df <- .df[, c(names(.df) %in% m$SAMPLE$source)] + sample_df <- unique(sample_df) header <- make_cbio_clinical_header(sample_df, m$SAMPLE) sample_df <- rbind(header, sample_df) files[["SAMPLE"]] <- sample_df From 715777e67ea4ad6b9a7e5ab4b8a5ed573d796da3 Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Thu, 13 Feb 2025 16:22:35 -0800 Subject: [PATCH 2/5] Update vignette --- .../bringing-portal-data-to-other-platforms-cbioportal.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd index 9e65f185..aac5fee0 100644 --- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd +++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd @@ -117,7 +117,7 @@ cbp_add_expression(mrna_data, ```{r add_clinical, eval=FALSE} clinical_data <- "select * from syn43278088" # query when the table already contains just the releasable patients -ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal.yaml" +ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal/cBioPortal.yaml" cbp_add_clinical(clinical_data, ref_map) ``` From a46eab76d11fc237db2505d85370b738667c13ae Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Thu, 13 Feb 2025 16:23:19 -0800 Subject: [PATCH 3/5] Regenerate docs --- man/cbp_add_expression.Rd | 5 ++--- man/write_cbio_clinical.Rd | 10 ++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/man/cbp_add_expression.Rd b/man/cbp_add_expression.Rd index 348994d7..994df550 100644 --- a/man/cbp_add_expression.Rd +++ b/man/cbp_add_expression.Rd @@ -9,13 +9,12 @@ cbp_add_expression(expression_data, expression_data_raw = NULL, verbose = TRUE) \arguments{ 
\item{expression_data}{Syn id of normalized gene counts results (default to TPM). See details.} -\item{expression_data_raw}{(Optional) Syn id of raw counts results. See details.} +\item{expression_data_raw}{(Optional) Syn id of raw counts if curators explicitly ask for it.} \item{verbose}{Whether to be chatty.} } \description{ This should be run in an existing dataset package root. Note that there are a number of different options generated by the STAR Salmon pipeline. -cBioPortal has confirmed that they prefer normalized counts \code{gene_tpm.tsv} and, -\emph{though not used}, find it helpful to also have raw counts \code{gene_counts.tsv}. +cBioPortal has confirmed that they prefer normalized counts \code{gene_tpm.tsv}. } diff --git a/man/write_cbio_clinical.Rd b/man/write_cbio_clinical.Rd index 9e0e1b93..3d8f7598 100644 --- a/man/write_cbio_clinical.Rd +++ b/man/write_cbio_clinical.Rd @@ -33,10 +33,12 @@ The PATIENT file is actually optional, so there are only checks for making sure (since NF data is not well-normalized and there is a single table with everything). } \details{ -This relies on a \code{ref_map} specification to know which clinical data to include for cBioPortal +This depends on a \code{ref_map} specification to know which clinical data to include for cBioPortal and how to segregate the clinical attributes into the right files. -For example, say \code{df} contains clinical variables A-X, but mappings are only specified for -variables A-C, L-M and others are not meant to be surfaced/made public. This will subset the \code{df} to what's specified in the mapping. -Conversely, if there is a mapping for variable Z that is \emph{not} in the clinical data, this \emph{will} throw error. +Basically, \code{ref_map} decides what variables can be made public and how they should be represented in cBioPortal. 
+For example, given a table \code{T} on Synapse with variables A-Z and mappings in \code{ref_map} for A-C + L-M, +we take the intersection of variables present. +But first, check that \emph{required} variables in \emph{ref_map} are present. +So first the subset \code{df} is created from \code{T}. } \keyword{internal} From a14a1a38a6cce1a3dd163574340e16ced959e6ae Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Thu, 13 Feb 2025 18:23:59 -0800 Subject: [PATCH 4/5] Update source --- R/cbioportal.R | 3 +++ R/cboilerplate.R | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/R/cbioportal.R b/R/cbioportal.R index 7300f4d5..0b8694f2 100644 --- a/R/cbioportal.R +++ b/R/cbioportal.R @@ -188,6 +188,9 @@ cbp_add_maf <- function(maf_data, verbose = TRUE) { if(verbose) checked_message("Making maf meta file") make_meta_maf(cancer_study_identifier, verbose = verbose) + + if(verbose) checked_message("Making required _sequenced case list for mutation data") + make_case_list_maf(cancer_study_identifier, verbose = verbose) if(verbose) checked_message("Done with adding MAF data") diff --git a/R/cboilerplate.R b/R/cboilerplate.R index 6e326a51..969441dc 100644 --- a/R/cboilerplate.R +++ b/R/cboilerplate.R @@ -411,6 +411,31 @@ make_meta_study_generic <- function(cancer_study_identifier, return(meta) }
+#--- Generating case list files ------------------------------------------------ #
+
+#' Case lists for mutation samples
+#'
+#' Reads `data_mutations.txt` from the working directory (the study root) and writes the `_sequenced` case list required for mutation data to `case_lists/case-list.txt`, creating `case_lists/` if needed. Stops if no sample ids are found. See https://docs.cbioportal.org/file-formats/#case-lists
+#' @keywords internal
+make_case_list_maf <- function(cancer_study_identifier, verbose = TRUE) {
+  mut <- data.table::fread("data_mutations.txt")
+  mut_samples <- unique(mut$Tumor_Sample_Barcode)
+  # NA or blank barcodes are not sample ids and must not be written to the case list
+  mut_samples <- mut_samples[!is.na(mut_samples) & nzchar(mut_samples)]
+  n <- length(mut_samples)
+  if(n == 0) stop("No sample ids found in Tumor_Sample_Barcode of data_mutations.txt")
+  case_list_ids <- paste(mut_samples, collapse = "\t")
+  meta <- glue::glue("cancer_study_identifier: {cancer_study_identifier}") %>%
+    append_kv("stable_id", paste0(cancer_study_identifier, "_sequenced")) %>%
+    append_kv("case_list_name", "Samples with mutation data from sequencing") %>%
+    append_kv("case_list_description", paste0("Samples with mutation data from sequencing ", "(", n, ")")) %>%
+    append_kv("case_list_ids", case_list_ids)
+  if(!dir.exists("case_lists")) {
+    if(verbose) checked_message("Creating case_lists study directory")
+    dir.create("case_lists")
+  }
+  writeLines(meta, "case_lists/case-list.txt")
+}
 # --- Other utils -------------------------------------------------------------- # @@ -443,3 +468,4 @@ use_ref_map <- function(ref_map, as_dt = TRUE) { return(ref_map_ls) } } + From 3690afc5b77185ac3f4e3d3f9a7f6902cf39ea1c Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Thu, 13 Feb 2025 18:24:36 -0800 Subject: [PATCH 5/5] Documentation --- man/make_case_list_maf.Rd | 12 +++++++ ...tal-data-to-other-platforms-cbioportal.Rmd | 34 +++++++++++-------- 2 files changed, 31 insertions(+), 15 deletions(-) create mode 100644 man/make_case_list_maf.Rd diff --git a/man/make_case_list_maf.Rd b/man/make_case_list_maf.Rd new file mode 100644 index 00000000..ff1ffb1d --- /dev/null +++ b/man/make_case_list_maf.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cboilerplate.R +\name{make_case_list_maf} +\alias{make_case_list_maf} +\title{Case lists for mutation samples} +\usage{ +make_case_list_maf(cancer_study_identifier, verbose = TRUE) +} +\description{ +Reads \code{data_mutations.txt} from the working directory (the study root) and writes the \code{_sequenced} case list required for mutation data to \code{case_lists/case-list.txt}, creating \code{case_lists/} if needed. Stops if no sample ids are found. See https://docs.cbioportal.org/file-formats/#case-lists +} +\keyword{internal} diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd index aac5fee0..94b317f4 100644 --- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd +++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd @@ -14,12 +14,17 @@ knitr::opts_chunk$set( ) ``` -**Document Status:** Draft +**Document Status:** Working **Estimated Reading Time:** 8 min ## Special acknowledgments -Functionality demonstrated in this vignette benefited greatly 
from code originally written by [hhunterzinck](https://github.com/hhunterzinck). +Utils demonstrated in this vignette benefited greatly from code originally written by [hhunterzinck](https://github.com/hhunterzinck). + +## Important note + +The requirements for cBioPortal change, just like with any software or database. +The package is updated to keep up on a yearly submission basis, but there may be occasional points in time when the workflow is out-of-date with this external system. ## Intro @@ -47,7 +52,10 @@ syn_login() ## Create a new study dataset First create the study dataset "package" where we can put together the data. -Each study dataset combines multiple data types -- clinical, gene expression, gene variants, etc. +Each study dataset combines multiple data types -- clinical, gene expression, gene variants, etc. +Meta can be edited after the file has been created. +This will also set the working directory to the new study directory. + ```{r cbp_new_study, eval=FALSE} @@ -64,15 +72,15 @@ These functions download data files and create the meta for them. Note that: -- These should be run with the working directory set to the study dataset directory as set up above to ensure consistent metadata. +- These should be run with the working directory set to the study directory as set up above to ensure consistent metadata. - **Defaults are for known NF-OSI processed data outputs**. - If these defaults don't apply because of changes in the scenario, take a look at the lower-level utils `make_meta_*` or edit the files manually after. - Data types can vary in how much additional work is needed in remapping, reformatting, custom sanity checks, etc. ### Add mutations data -- `maf_data` references a final merged maf output file from the NF-OSI processing pipeline OK for public release. -- This data file type requires no further modifications except renaming. 
+- `maf_data` references a final merged maf output file from the NF-OSI processing pipeline (vcf2maf) OK for public release. +- Under the hood, a required case list file is also generated. ```{r add_maf, eval=FALSE} @@ -109,10 +117,8 @@ cbp_add_expression(mrna_data, ### Add clinical data -- Clinical data **should be added last**, after all other data has been added, for sample checks to work properly. - `clinical_data` is prepared from an existing Synapse table. The table can be a subsetted version of those released in the study dataset, or pass in a query that can be used for getting the subset. For example, the full clinical cohort comprises patients 1-50, but the dataset can only release data for patients 1-20 for expression data and data patients 15-20 for cna data. Here, `clinical_data` can be a smaller table of just those 1-30, or it can be the original table but pass in a suitable additional filter, e.g. `where release = 'batch1'`. - Clinical data requires mapping to be as consistent with other public datasets as possible. `ref_map` defines the mapping of clinical variables from the NF-OSI data dictionary to cBioPortal's. Only variables in the mapping are exported to cBioPortal. Follow link below to inspect the default file and format used. -- Clinical data **should be added last**, after all other data has been added, for sample checks to work properly. ```{r add_clinical, eval=FALSE} @@ -124,15 +130,13 @@ cbp_add_clinical(clinical_data, ref_map) ## Validation -There are additional steps such as generating case lists and validation that have to be done _outside_ of the package with a cBioPortal backend, where each portal may have specific configurations (such as genomic reference) to validate against. -See the [general docs for dataset validation](https://docs.cbioportal.org/using-the-dataset-validator/). +Validation has to be done with a cBioPortal instance. Each portal may have specific configurations (such as genomic reference) to validate against. 
-For the _public_ portal, the suggested step using the public server is given below. - -Assuming your present working directory is `~/datahub/public` and a study folder called `npst_nfosi_ntap_2022` has been placed into it, mount the dataset into the container and run validation like: +For a simple example of *offline* validation, assuming your working directory is `~/datahub/public` and a study folder called `npst_nfosi_ntap_2022` has been placed into it, mount the dataset into the container and run validation like: ``` STUDY=npst_nfosi_ntap_2022 -sudo docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:5.4.7 validateStudies.py -d /datahub -l $STUDY -u http://cbioportal.org -html /datahub/$STUDY/html_report +sudo docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:6.0.25 validateData.py -s /datahub/$STUDY -n -v ``` -The html report will list issues by data types to help with any corrections needed. +**See the [general docs for dataset validation](https://docs.cbioportal.org/using-the-dataset-validator) for more examples.** +