
Maintenance updates to cBioPortal workflow #214

Merged
merged 5 commits on Feb 18, 2025
8 changes: 5 additions & 3 deletions R/cbioportal.R
@@ -188,6 +188,9 @@ cbp_add_maf <- function(maf_data, verbose = TRUE) {

if(verbose) checked_message("Making maf meta file")
make_meta_maf(cancer_study_identifier, verbose = verbose)

if(verbose) checked_message("Making required _sequenced case list for mutation data")
make_case_list_maf(cancer_study_identifier)

if(verbose) checked_message("Done with adding MAF data")

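A minimal usage sketch of the updated function (the Synapse id below is a placeholder, and `cbp_add_maf` is assumed to be run from the study package root set up by `cbp_new_study`):

```r
# Placeholder syn id for a final merged MAF file OK for public release
maf_data <- "syn00000000"

# Downloads the MAF, writes meta files, and now also generates
# the required _sequenced case list under case_lists/
cbp_add_maf(maf_data, verbose = TRUE)
```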
@@ -224,12 +227,11 @@ cbp_add_cna <- function(cna_data, verbose = TRUE) {
#'
#' This should be run in an existing dataset package root.
#' Note that there are a number of different options generated by the STAR Salmon pipeline.
#' cBioPortal has confirmed that they prefer normalized counts `gene_tpm.tsv` and,
#' _though not used_, find it helpful to also have raw counts `gene_counts.tsv`.
#' cBioPortal has confirmed that they prefer normalized counts `gene_tpm.tsv`.
#'
#' @inheritParams cbp_new_study
#' @param expression_data Syn id of normalized gene counts results (default to TPM). See details.
#' @param expression_data_raw (Optional) Syn id of raw counts results. See details.
#' @param expression_data_raw (Optional) Syn id of raw counts if curators explicitly ask for it.
#' @export
cbp_add_expression <- function(expression_data,
expression_data_raw = NULL,
46 changes: 39 additions & 7 deletions R/cboilerplate.R
@@ -65,11 +65,13 @@ get_cbio_filename <- function(clinical_type = c("SAMPLE", "PATIENT")) {
#' `df` is expected to be a table containing clinical data available, and maybe even some irrelevant data
#' (since NF data is not well-normalized and there is a single table with everything).
#'
#' This relies on a `ref_map` specification to know which clinical data to include for cBioPortal
#' and how to segregate the clinical attributes into the right files.
#' For example, say `df` contains clinical variables A-X, but mappings are only specified for
#' variables A-C, L-M and others are not meant to be surfaced/made public. This will subset the `df` to what's specified in the mapping.
#' Conversely, if there is a mapping for variable Z that is _not_ in the clinical data, this _will_ throw error.
#' This depends on a `ref_map` specification to know which clinical data to include for cBioPortal
#' and how to segregate the clinical attributes into the right files.
Essentially, `ref_map` decides which variables can be made public and how they should be represented in cBioPortal.
For example, given a table `T` on Synapse with variables A-Z and mappings in `ref_map` for only A-C and L-M,
the subset `df` is created from `T` by first checking that all *required* variables in `ref_map` are present,
then taking the intersection of the variables present and mapped.
#'
#' @inheritParams use_ref_map
#' @inheritParams make_cbio_clinical_header
@@ -92,7 +94,9 @@ write_cbio_clinical <- function(df,
# Attribute checks
message("Clinical attributes present are: ", paste(present, collapse = ", "))
if(!all(required %in% present)) stop("Missing required clinical element(s):", paste(setdiff(required, present), collapse = ", "))
if(!all(present %in% attributes)) stop("Missing mapping for:", paste(setdiff(present, attributes), collapse = ","))
if(!all(present %in% attributes)) {
warning("Variables not mapped will be ignored (potentially non-public/non-clinical data): ", paste(setdiff(present, attributes), collapse = ", "))
}

# Take care of list columns and NA
.df <- data.table::copy(df)
@@ -108,13 +112,15 @@
files <- list()
m <- split(m, by = "attribute_type")
if("individualID" %in% names(.df)) {
patient_df <- unique(.df[, c(names(.df) %in% m$PATIENT$source)])
patient_df <- .df[, c(names(.df) %in% m$PATIENT$source)]
patient_df <- unique(patient_df)
header <- make_cbio_clinical_header(patient_df, m$PATIENT)
patient_df <- rbind(header, patient_df)
files[["PATIENT"]] <- patient_df
}
{
sample_df <- .df[, c(names(.df) %in% m$SAMPLE$source)]
sample_df <- unique(sample_df)
header <- make_cbio_clinical_header(sample_df, m$SAMPLE)
sample_df <- rbind(header, sample_df)
files[["SAMPLE"]] <- sample_df
@@ -405,6 +411,31 @@ make_meta_study_generic <- function(cancer_study_identifier,
return(meta)
}

# --- Generating case list files ------------------------------------------------ #

#' Case lists for mutation samples
#'
#' https://docs.cbioportal.org/file-formats/#case-lists
#' @keywords internal
make_case_list_maf <- function(cancer_study_identifier, verbose = TRUE) {

mut <- data.table::fread("data_mutations.txt")
mut_samples <- unique(mut$Tumor_Sample_Barcode)
n <- length(mut_samples)
case_list_ids <- paste(mut_samples, collapse = "\t")
meta <- glue::glue("cancer_study_identifier: {cancer_study_identifier}") %>%
append_kv("stable_id", paste0(cancer_study_identifier, "_sequenced")) %>%
append_kv("case_list_name", "Samples with mutation data from sequencing") %>%
append_kv("case_list_description", paste0("Samples with mutation data from sequencing (", n, ")")) %>%
append_kv("case_list_ids", case_list_ids)

if(!dir.exists("case_lists")) {
if(verbose) checked_message("Creating case_lists study directory")
dir.create("case_lists")
}

writeLines(meta, "case_lists/case-list.txt")
}
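For reference, the case list file generated above follows cBioPortal's key-value meta format. A sketch of the output, assuming a hypothetical study id and two sample barcodes (`case_list_ids` values are tab-separated):

```
cancer_study_identifier: npst_nfosi_ntap_2022
stable_id: npst_nfosi_ntap_2022_sequenced
case_list_name: Samples with mutation data from sequencing
case_list_description: Samples with mutation data from sequencing (2)
case_list_ids: SAMPLE-1	SAMPLE-2
```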

# --- Other utils -------------------------------------------------------------- #

@@ -437,3 +468,4 @@ use_ref_map <- function(ref_map, as_dt = TRUE) {
return(ref_map_ls)
}
}

5 changes: 2 additions & 3 deletions man/cbp_add_expression.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions man/make_case_list_maf.Rd


10 changes: 6 additions & 4 deletions man/write_cbio_clinical.Rd


36 changes: 20 additions & 16 deletions vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -14,12 +14,17 @@ knitr::opts_chunk$set(
)
```

**Document Status:** Draft
**Document Status:** Working
**Estimated Reading Time:** 8 min

## Special acknowledgments

Functionality demonstrated in this vignette benefited greatly from code originally written by [hhunterzinck](https://github.com/hhunterzinck).
Utils demonstrated in this vignette benefited greatly from code originally written by [hhunterzinck](https://github.com/hhunterzinck).

## Important note

Like any software or database, cBioPortal's requirements change over time.
The package is updated around the yearly submission cycle to keep up, but there may be occasional points when this workflow is out of date with the external system.

## Intro

@@ -47,7 +52,10 @@ syn_login()
## Create a new study dataset

First create the study dataset "package" where we can put together the data.
Each study dataset combines multiple data types -- clinical, gene expression, gene variants, etc.
Meta can be edited after the file has been created.
This will also set the working directory to the new study directory.


```{r cbp_new_study, eval=FALSE}

@@ -64,15 +72,15 @@ These functions download data files and create the meta for them.

Note that:

- These should be run with the working directory set to the study dataset directory as set up above to ensure consistent metadata.
- These should be run with the working directory set to the study directory as set up above to ensure consistent metadata.
- **Defaults are for known NF-OSI processed data outputs**.
- If these defaults don't apply because of changes in the scenario, take a look at the lower-level utils `make_meta_*` or edit the files manually after.
- Data types can vary in how much additional work is needed in remapping, reformatting, custom sanity checks, etc.

### Add mutations data

- `maf_data` references a final merged maf output file from the NF-OSI processing pipeline OK for public release.
- This data file type requires no further modifications except renaming.
- `maf_data` references a final merged maf output file from the NF-OSI processing pipeline (vcf2maf) OK for public release.
- Under the hood, a required case list file is also generated.

```{r add_maf, eval=FALSE}

@@ -109,30 +117,26 @@ cbp_add_expression(mrna_data,

### Add clinical data

- `clinical_data` is prepared from an existing Synapse table. The table can be a subsetted version of those released in the study dataset, or you can pass in a query that gets the subset. For example, say the full clinical cohort comprises patients 1-50, but the dataset can only release expression data for patients 1-20 and CNA data for patients 15-20. Here, `clinical_data` can be a smaller table covering just patients 1-20, or it can be the original table with a suitable additional filter passed in, e.g. `where release = 'batch1'`.
- Clinical data requires mapping to be as consistent with other public datasets as possible. `ref_map` defines the mapping of clinical variables from the NF-OSI data dictionary to cBioPortal's. Only variables in the mapping are exported to cBioPortal. Follow link below to inspect the default file and format used.
- Clinical data **should be added last**, after all other data has been added, for sample checks to work properly.

```{r add_clinical, eval=FALSE}

clinical_data <- "select * from syn43278088" # query when the table already contains just the releasable patients
ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal.yaml"
ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal/cBioPortal.yaml"

cbp_add_clinical(clinical_data, ref_map)
```
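As a sketch of the filter-based variant described above (reusing the table id from the chunk above; the `release` column and `'batch1'` value are illustrative placeholders):

```r
# Pass a query with an additional filter instead of pre-subsetting the table
clinical_data <- "select * from syn43278088 where release = 'batch1'"
ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal/cBioPortal.yaml"

cbp_add_clinical(clinical_data, ref_map)
```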

## Validation

There are additional steps such as generating case lists and validation that have to be done _outside_ of the package with a cBioPortal backend, where each portal may have specific configurations (such as genomic reference) to validate against.
See the [general docs for dataset validation](https://docs.cbioportal.org/using-the-dataset-validator/).
Validation has to be done with a cBioPortal instance. Each portal may have specific configurations (such as genomic reference) to validate against.

For the _public_ portal, the suggested step using the public server is given below.

Assuming your present working directory is `~/datahub/public` and a study folder called `npst_nfosi_ntap_2022` has been placed into it, mount the dataset into the container and run validation like:
For a simple *offline* validation example, assuming you are at `~/datahub/public` and a study folder called `npst_nfosi_ntap_2022` has been placed into it, mount the dataset into the container and run validation like:
```
STUDY=npst_nfosi_ntap_2022
sudo docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:5.4.7 validateStudies.py -d /datahub -l $STUDY -u http://cbioportal.org -html /datahub/$STUDY/html_report
sudo docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:6.0.25 validateData.py -s datahub/$STUDY -n -v
```

The html report will list issues by data types to help with any corrections needed.
**See the [general docs for dataset validation](https://docs.cbioportal.org/using-the-dataset-validator) for more examples.**
