From c569cfff41656a9af1553cfa1cd20160b3897739 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 13 Feb 2025 16:22:17 -0800
Subject: [PATCH 1/5] Update source

---
 R/cbioportal.R   |  5 ++---
 R/cboilerplate.R | 20 +++++++++++++-------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/R/cbioportal.R b/R/cbioportal.R
index 13504f30..7300f4d5 100644
--- a/R/cbioportal.R
+++ b/R/cbioportal.R
@@ -224,12 +224,11 @@ cbp_add_cna <- function(cna_data, verbose = TRUE) {
 #'
 #' This should be run in an existing dataset package root.
 #' Note that there are a number of different options generated by the STAR Salmon pipeline.
-#' cBioPortal has confirmed that they prefer normalized counts `gene_tpm.tsv` and,
-#' _though not used_, find it helpful to also have raw counts `gene_counts.tsv`.
+#' cBioPortal has confirmed that they prefer normalized counts `gene_tpm.tsv`.
 #'
 #' @inheritParams cbp_new_study
 #' @param expression_data Syn id of normalized gene counts results (default to TPM). See details.
-#' @param expression_data_raw (Optional) Syn id of raw counts results. See details.
+#' @param expression_data_raw (Optional) Syn id of raw counts if curators explicitly ask for it.
 #' @export
 cbp_add_expression <- function(expression_data,
                                expression_data_raw = NULL,
diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index b4791302..6e326a51 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -65,11 +65,13 @@ get_cbio_filename <- function(clinical_type = c("SAMPLE", "PATIENT")) {
 #' `df` is expected to be a table containing clinical data available, and maybe even some irrelevant data
 #' (since NF data is not well-normalized and there is a single table with everything).
 #'
-#' This relies on a `ref_map` specification to know which clinical data to include for cBioPortal
-#' and how to segregate the clinical attributes into the right files.
-#' For example, say `df` contains clinical variables A-X, but mappings are only specified for
-#' variables A-C, L-M and others are not meant to be surfaced/made public. This will subset the `df` to what's specified in the mapping.
-#' Conversely, if there is a mapping for variable Z that is _not_ in the clinical data, this _will_ throw error.
+#' This depends on a `ref_map` specification to know which clinical data to include for cBioPortal
+#' and how to segregate the clinical attributes into the right files. 
+#' Basically, `ref_map` decides what variables can be made public and how they should be represented in cBioPortal.
+#' For example, given a table `T` on Synapse with variables A-Z and mappings in `ref_map` for A-C + L-M,
+#' only the mapped variables that are present (A-C + L-M) are exported; unmapped variables are ignored with a warning.
+#' Before that, *required* variables in `ref_map` are checked to be present, and a missing one is an error.
+#' Note that `df` is the subset created from `T` beforehand.
 #'
 #' @inheritParams use_ref_map
 #' @inheritParams make_cbio_clinical_header
@@ -92,7 +94,9 @@ write_cbio_clinical <- function(df,
   # Attribute checks
   message("Clinical attributes present are: ", paste(present, collapse = ", "))
   if(!all(required %in% present)) stop("Missing required clinical element(s):", paste(setdiff(required, present), collapse = ", "))
-  if(!all(present %in% attributes)) stop("Missing mapping for:", paste(setdiff(present, attributes), collapse = ","))
+  if(!all(present %in% attributes)) {
+    warning("Variables not mapped will be ignored (potentially non-public/non-clinical data): ", paste(setdiff(present, attributes), collapse = ","))
+  }
 
   # Take care of list columns and NA
   .df <- data.table::copy(df)
@@ -108,13 +112,15 @@ write_cbio_clinical <- function(df,
   files <- list()
   m <- split(m, by = "attribute_type")
   if("individualID" %in% names(.df)) {
-    patient_df <- unique(.df[, c(names(.df) %in% m$PATIENT$source)])
+    patient_df <- .df[, c(names(.df) %in% m$PATIENT$source)]
+    patient_df <- unique(patient_df)
     header <- make_cbio_clinical_header(patient_df, m$PATIENT)
     patient_df <- rbind(header, patient_df)
     files[["PATIENT"]] <- patient_df
   }
   {
     sample_df <- .df[, c(names(.df) %in% m$SAMPLE$source)]
+    sample_df <- unique(sample_df)
     header <- make_cbio_clinical_header(sample_df, m$SAMPLE)
     sample_df <- rbind(header, sample_df)
     files[["SAMPLE"]] <- sample_df

From 715777e67ea4ad6b9a7e5ab4b8a5ed573d796da3 Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 13 Feb 2025 16:22:35 -0800
Subject: [PATCH 2/5] Update vignette

---
 .../bringing-portal-data-to-other-platforms-cbioportal.Rmd      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
index 9e65f185..aac5fee0 100644
--- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
+++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -117,7 +117,7 @@ cbp_add_expression(mrna_data,
 ```{r add_clinical, eval=FALSE}
 
 clinical_data <- "select * from syn43278088" # query when the table already contains just the releasable patients
-ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal.yaml"
+ref_map <- "https://raw.githubusercontent.com/nf-osi/nf-metadata-dictionary/main/mappings/cBioPortal/cBioPortal.yaml"
 
 cbp_add_clinical(clinical_data, ref_map)
 ```

From a46eab76d11fc237db2505d85370b738667c13ae Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 13 Feb 2025 16:23:19 -0800
Subject: [PATCH 3/5] Regenerate docs

---
 man/cbp_add_expression.Rd  |  5 ++---
 man/write_cbio_clinical.Rd | 10 ++++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/man/cbp_add_expression.Rd b/man/cbp_add_expression.Rd
index 348994d7..994df550 100644
--- a/man/cbp_add_expression.Rd
+++ b/man/cbp_add_expression.Rd
@@ -9,13 +9,12 @@ cbp_add_expression(expression_data, expression_data_raw = NULL, verbose = TRUE)
 \arguments{
 \item{expression_data}{Syn id of normalized gene counts results (default to TPM). See details.}
 
-\item{expression_data_raw}{(Optional) Syn id of raw counts results. See details.}
+\item{expression_data_raw}{(Optional) Syn id of raw counts if curators explicitly ask for it.}
 
 \item{verbose}{Whether to be chatty.}
 }
 \description{
 This should be run in an existing dataset package root.
 Note that there are a number of different options generated by the STAR Salmon pipeline.
-cBioPortal has confirmed that they prefer normalized counts \code{gene_tpm.tsv} and,
-\emph{though not used}, find it helpful to also have raw counts \code{gene_counts.tsv}.
+cBioPortal has confirmed that they prefer normalized counts \code{gene_tpm.tsv}.
 }
diff --git a/man/write_cbio_clinical.Rd b/man/write_cbio_clinical.Rd
index 9e0e1b93..3d8f7598 100644
--- a/man/write_cbio_clinical.Rd
+++ b/man/write_cbio_clinical.Rd
@@ -33,10 +33,12 @@ The PATIENT file is actually optional, so there are only checks for making sure
 (since NF data is not well-normalized and there is a single table with everything).
 }
 \details{
-This relies on a \code{ref_map} specification to know which clinical data to include for cBioPortal
+This depends on a \code{ref_map} specification to know which clinical data to include for cBioPortal
 and how to segregate the clinical attributes into the right files.
-For example, say \code{df} contains clinical variables A-X, but mappings are only specified for
-variables A-C, L-M and others are not meant to be surfaced/made public. This will subset the \code{df} to what's specified in the mapping.
-Conversely, if there is a mapping for variable Z that is \emph{not} in the clinical data, this \emph{will} throw error.
+Basically, \code{ref_map} decides what variables can be made public and how they should be represented in cBioPortal.
+For example, given a table \code{T} on Synapse with variables A-Z and mappings in \code{ref_map} for A-C + L-M,
+only the mapped variables that are present (A-C + L-M) are exported; unmapped variables are ignored with a warning.
+Before that, \emph{required} variables in \code{ref_map} are checked to be present, and a missing one is an error.
+Note that \code{df} is the subset created from \code{T} beforehand.
 }
 \keyword{internal}

From a14a1a38a6cce1a3dd163574340e16ced959e6ae Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 13 Feb 2025 18:23:59 -0800
Subject: [PATCH 4/5] Update source

---
 R/cbioportal.R   |  3 +++
 R/cboilerplate.R | 26 ++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/R/cbioportal.R b/R/cbioportal.R
index 7300f4d5..0b8694f2 100644
--- a/R/cbioportal.R
+++ b/R/cbioportal.R
@@ -188,6 +188,9 @@ cbp_add_maf <- function(maf_data, verbose = TRUE) {
 
   if(verbose) checked_message("Making maf meta file")
   make_meta_maf(cancer_study_identifier, verbose = verbose)
+  
+  if(verbose) checked_message("Making required _sequenced case list for mutation data")
+  make_case_list_maf(cancer_study_identifier, verbose = verbose)
 
   if(verbose) checked_message("Done with adding MAF data")
 
diff --git a/R/cboilerplate.R b/R/cboilerplate.R
index 6e326a51..969441dc 100644
--- a/R/cboilerplate.R
+++ b/R/cboilerplate.R
@@ -411,6 +411,31 @@ make_meta_study_generic <- function(cancer_study_identifier,
   return(meta)
 }
 
+#--- Generating case list files ------------------------------------------------ #
+
+#' Case lists for mutation samples
+#'
+#' Writes `case_lists/case-list.txt` from the unique `Tumor_Sample_Barcode` values in `data_mutations.txt`.
+#' See https://docs.cbioportal.org/file-formats/#case-lists
+#' @param cancer_study_identifier Study identifier written to the file; also the prefix of the `_sequenced` stable id.
+#' @param verbose Whether to report creation of the `case_lists` directory.
+#' @keywords internal
+make_case_list_maf <- function(cancer_study_identifier, verbose = TRUE) {
+  mut_samples <- unique(data.table::fread("data_mutations.txt")$Tumor_Sample_Barcode)
+  n <- length(mut_samples)
+  # Without sample ids `case_list_ids` would be written empty; fail with the reason instead
+  if(n == 0) stop("No Tumor_Sample_Barcode values found in data_mutations.txt", call. = FALSE)
+  meta <- glue::glue("cancer_study_identifier: {cancer_study_identifier}") %>%
+    append_kv("stable_id", paste0(cancer_study_identifier, "_sequenced")) %>%
+    append_kv("case_list_name", "Samples with mutation data from sequencing") %>%
+    append_kv("case_list_description", paste0("Samples with mutation data from sequencing ", "(", n, ")")) %>%
+    append_kv("case_list_ids", paste(mut_samples, collapse = "\t"))
+  if(!dir.exists("case_lists")) {
+    if(verbose) checked_message("Creating case_lists study directory")
+    dir.create("case_lists")
+  }
+  writeLines(meta, "case_lists/case-list.txt")
+}
 
 # --- Other utils -------------------------------------------------------------- #
 
@@ -443,3 +468,4 @@ use_ref_map <- function(ref_map, as_dt = TRUE) {
     return(ref_map_ls)
   }
 }
+

From 3690afc5b77185ac3f4e3d3f9a7f6902cf39ea1c Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Thu, 13 Feb 2025 18:24:36 -0800
Subject: [PATCH 5/5] Documentation

---
 man/make_case_list_maf.Rd                     | 12 +++++++
 ...tal-data-to-other-platforms-cbioportal.Rmd | 34 +++++++++++--------
 2 files changed, 31 insertions(+), 15 deletions(-)
 create mode 100644 man/make_case_list_maf.Rd

diff --git a/man/make_case_list_maf.Rd b/man/make_case_list_maf.Rd
new file mode 100644
index 00000000..ff1ffb1d
--- /dev/null
+++ b/man/make_case_list_maf.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cboilerplate.R
+\name{make_case_list_maf}
+\alias{make_case_list_maf}
+\title{Case lists for mutation samples}
+\usage{
+make_case_list_maf(cancer_study_identifier, verbose = TRUE)
+}
+\description{
+https://docs.cbioportal.org/file-formats/#case-lists
+}
+\keyword{internal}
diff --git a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
index aac5fee0..94b317f4 100644
--- a/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
+++ b/vignettes/bringing-portal-data-to-other-platforms-cbioportal.Rmd
@@ -14,12 +14,17 @@ knitr::opts_chunk$set(
 )
 ```
 
-**Document Status:** Draft  
+**Document Status:** Working  
 **Estimated Reading Time:** 8 min
 
 ## Special acknowledgments 
 
-Functionality demonstrated in this vignette benefited greatly from code originally written by [hhunterzinck](https://github.com/hhunterzinck). 
+Utils demonstrated in this vignette benefited greatly from code originally written by [hhunterzinck](https://github.com/hhunterzinck). 
+
+## Important note
+
+The requirements for cBioPortal change over time, just like with any software or database. 
+The package is updated to keep up roughly once a year, around submission time, so there may be occasional points in time when the workflow is out-of-date with this external system. 
 
 ## Intro
 
@@ -47,7 +52,10 @@ syn_login()
 ## Create a new study dataset
 
 First create the study dataset "package" where we can put together the data. 
-Each study dataset combines multiple data types -- clinical, gene expression, gene variants, etc.
+Each study dataset combines multiple data types -- clinical, gene expression, gene variants, etc. 
+The study meta file can be edited after it has been created. 
+Creating the study will also set the working directory to the new study directory.
+
 
 ```{r cbp_new_study, eval=FALSE}
 
@@ -64,15 +72,15 @@ These functions download data files and create the meta for them.
 
 Note that:
 
-- These should be run with the working directory set to the study dataset directory as set up above to ensure consistent metadata.
+- These should be run with the working directory set to the study directory as set up above to ensure consistent metadata.
 - **Defaults are for known NF-OSI processed data outputs**. 
 - If these defaults don't apply because of changes in the scenario, take a look at the lower-level utils `make_meta_*` or edit the files manually after.
 - Data types can vary in how much additional work is needed in remapping, reformatting, custom sanity checks, etc.
 
 ### Add mutations data
 
-- `maf_data` references a final merged maf output file from the NF-OSI processing pipeline OK for public release. 
-- This data file type requires no further modifications except renaming.
+- `maf_data` references a final merged maf output file from the NF-OSI processing pipeline (vcf2maf) OK for public release. 
+- Under the hood, a required case list file is also generated.
 
 ```{r add_maf, eval=FALSE}
 
@@ -109,10 +117,8 @@ cbp_add_expression(mrna_data,
 
 ### Add clinical data
 
-- Clinical data **should be added last**, after all other data has been added, for sample checks to work properly. 
 - `clinical_data` is prepared from an existing Synapse table. The table can be a subsetted version of those released in the study dataset, or pass in a query that can be used for getting the subset. For example, the full clinical cohort comprises patients 1-50, but the dataset can only release data for patients 1-20 for expression data and data patients 15-20 for cna data. Here, `clinical_data` can be a smaller table of just those 1-30, or it can be the original table but pass in a suitable additional filter, e.g. `where release = 'batch1'`.
 - Clinical data requires mapping to be as consistent with other public datasets as possible. `ref_map` defines the mapping of clinical variables from the NF-OSI data dictionary to cBioPortal's. Only variables in the mapping are exported to cBioPortal. Follow link below to inspect the default file and format used.
-- Clinical data **should be added last**, after all other data has been added, for sample checks to work properly.
 
 ```{r add_clinical, eval=FALSE}
 
@@ -124,15 +130,13 @@ cbp_add_clinical(clinical_data, ref_map)
 
 ## Validation
 
-There are additional steps such as generating case lists and validation that have to be done _outside_ of the package with a cBioPortal backend, where each portal may have specific configurations (such as genomic reference) to validate against.
-See the [general docs for dataset validation](https://docs.cbioportal.org/using-the-dataset-validator/).
+Validation has to be done with a cBioPortal instance. Each portal may have specific configurations (such as genomic reference) to validate against.
 
-For the _public_ portal, the suggested step using the public server is given below.  
-
-Assuming your present working directory is `~/datahub/public` and a study folder called `npst_nfosi_ntap_2022` has been placed into it, mount the dataset into the container and run validation like:  
+For a simple example of *offline* validation, assume you are at `~/datahub/public` and a study folder called `npst_nfosi_ntap_2022` has been placed into it; mount the dataset into the container and run validation like:  
 ```
 STUDY=npst_nfosi_ntap_2022
-sudo docker run --rm -v $(pwd):/datahub cbioportal/cbioportal:5.4.7 validateStudies.py -d /datahub -l $STUDY -u http://cbioportal.org -html /datahub/$STUDY/html_report
+sudo docker run --rm -v "$(pwd)":/datahub cbioportal/cbioportal:6.0.25 validateData.py -s /datahub/$STUDY -n -v
 ```
 
-The html report will list issues by data types to help with any corrections needed.  
+**See the [general docs for dataset validation](https://docs.cbioportal.org/using-the-dataset-validator) for more examples.**
+