From 383391d546ca7df85273364b16f7e7592ad59cd0 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Wed, 2 Nov 2022 20:32:40 -0700 Subject: [PATCH] Prepare release 14.3.0 (#486) * [GEN-174] allow for patch public release versions to be specified (#481) * Use fileview and allow for patch releases to be specified * Add 0 release in function * lint * [GEN-185] Fusions entrez gene must be integers (#483) * Add check for entrez gene id * Make sure entrez gene id is only integers * Lint * [GEN-33] Support sv format (#480) * Support sv format * Add SV as a file that is released * Fix f-interpolation * Update synid * Add in check for duplicated rows * Lint * Fix tests * Lint * Update genie_registry/structural_variant.py * Update tests/test_sv.py * [GEN-76] No longer need to include the release version in the release files (#484) * don't add release name to filenames * Don't rename files * Remove files * Don't remove files for now * No longer need to rewrite metadata files * Comment out some lines of code * Remove all existing files * Add in HACK docstring * Don't add release version * Fix typo * Remove arg * Comment out unnecessary code * Lint * Fix tests * Lint * Remove cohort for seg * Push new version --- bin/consortium_to_public.py | 46 ++++---- bin/database_to_staging.py | 48 ++++---- genie/__init__.py | 2 +- genie/consortium_to_public.py | 138 +++++++++++++--------- genie/database_to_staging.py | 164 +++++++++++++++++++-------- genie_registry/fusions.py | 6 + genie_registry/structural_variant.py | 72 +++++++----- tests/test_database_to_staging.py | 8 +- tests/test_fusions.py | 8 +- tests/test_sv.py | 17 ++- 10 files changed, 322 insertions(+), 187 deletions(-) diff --git a/bin/consortium_to_public.py b/bin/consortium_to_public.py index 5e1693e1..41cba9b1 100644 --- a/bin/consortium_to_public.py +++ b/bin/consortium_to_public.py @@ -85,6 +85,9 @@ def generate_data_guide( def main(args): + # HACK: Delete all existing files first + process_functions.rmFiles(database_to_staging.GENIE_RELEASE_DIR) + cbioValidatorPath = os.path.join( args.cbioportalPath, "core/src/main/scripts/importer/validateData.py" ) @@ -120,18 +123,23 @@ def main(args): public_synid = databaseSynIdMappingDf["Id"][ databaseSynIdMappingDf["Database"] == "public" ].values[0] - + # Use release folder fileview releaseSynId = databaseSynIdMappingDf["Id"][ - databaseSynIdMappingDf["Database"] == "release" + databaseSynIdMappingDf["Database"] == "releaseFolder" ].values[0] - - officialPublic = consortium_to_public.get_public_to_consortium_synid_mapping( - syn, releaseSynId, test=args.test - ) - - assert ( - args.genieVersion in officialPublic.keys() - ), "genieVersion must be one of these: {}.".format(", ".join(officialPublic.keys())) + # TEST run of the infrastructure will always + # Map to a specific folder + if args.test: + officialPublic = {"TESTpublic": "syn12299959"} + else: + officialPublic = consortium_to_public.get_public_to_consortium_synid_mapping( + syn, releaseSynId + ) + if args.genieVersion not in officialPublic.keys(): + allowed_public_release_names = ", ".join(officialPublic.keys()) + raise ValueError( + f"genieVersion must be one of these: {allowed_public_release_names}." + ) args.releaseId = officialPublic[args.genieVersion] if not args.test and not args.staging: @@ -155,9 +163,7 @@ def main(args): publicReleaseCutOff=args.publicReleaseCutOff, ) - database_to_staging.revise_metadata_files( - syn, args.staging, public_synid, args.genieVersion - ) + database_to_staging.revise_metadata_files(syn, public_synid, args.genieVersion) logger.info("CBIO VALIDATION") # Must be exit 0 because the validator sometimes fails, @@ -182,13 +188,13 @@ def main(args): cbioLog.write(cbio_decoded_output) syn.store(synapseclient.File(cbio_log_file, parentId=log_folder_synid)) os.remove(cbio_log_file) - logger.info("REMOVING OLD FILES") - process_functions.rmFiles(database_to_staging.CASE_LIST_PATH) - seg_meta_file = "{}/genie_public_meta_cna_hg19_seg.txt".format( - database_to_staging.GENIE_RELEASE_DIR - ) - if os.path.exists(seg_meta_file): - os.unlink(seg_meta_file) + # logger.info("REMOVING OLD FILES") + # process_functions.rmFiles(database_to_staging.CASE_LIST_PATH) + # seg_meta_file = "{}/genie_public_meta_cna_hg19_seg.txt".format( + # database_to_staging.GENIE_RELEASE_DIR + # ) + # if os.path.exists(seg_meta_file): + # os.unlink(seg_meta_file) logger.info("CREATING LINK VERSION") folders = database_to_staging.create_link_version( diff --git a/bin/database_to_staging.py b/bin/database_to_staging.py index 73cde326..a0397deb 100644 --- a/bin/database_to_staging.py +++ b/bin/database_to_staging.py @@ -120,13 +120,16 @@ def main( debug: Synapse debug flag skip_mutationsincis: Skip mutation in cis filter """ + # HACK: Delete all existing files first + process_functions.rmFiles(database_to_staging.GENIE_RELEASE_DIR) + syn = process_functions.synLogin(pemfile, debug=debug) genie_user = os.environ.get("GENIE_USER") if pemfile is not None: genie_pass = process_functions.get_password(pemfile) else: genie_pass = None - + # HACK: Use project id instead of this... if test: databaseSynIdMappingId = "syn11600968" genie_version = "TESTING" @@ -214,11 +217,11 @@ def main( os.remove(os.path.join(database_to_staging.CASE_LIST_PATH, caselist)) clinical_path = os.path.join( database_to_staging.GENIE_RELEASE_DIR, - "data_clinical_{}.txt".format(genie_version), + "data_clinical.txt", ) assay_information_path = os.path.join( database_to_staging.GENIE_RELEASE_DIR, - "assay_information_{}.txt".format(genie_version), + "assay_information.txt", ) create_case_lists.main( clinical_path, @@ -237,20 +240,18 @@ def main( ) logger.info("REMOVING UNNECESSARY FILES") - genie_files = os.listdir(database_to_staging.GENIE_RELEASE_DIR) - for genie_file in genie_files: - if ( - genie_version not in genie_file - and "meta" not in genie_file - and "case_lists" not in genie_file - ): - os.remove(os.path.join(database_to_staging.GENIE_RELEASE_DIR, genie_file)) - os.remove(clinical_path) + # genie_files = os.listdir(database_to_staging.GENIE_RELEASE_DIR) + # for genie_file in genie_files: + # if ( + # genie_version not in genie_file + # and "meta" not in genie_file + # and "case_lists" not in genie_file + # ): + # os.remove(os.path.join(database_to_staging.GENIE_RELEASE_DIR, genie_file)) + # os.remove(clinical_path) logger.info("REVISE METADATA FILES") - database_to_staging.revise_metadata_files( - syn, staging, consortiumSynId, genie_version - ) + database_to_staging.revise_metadata_files(syn, consortiumSynId, genie_version) logger.info("CBIO VALIDATION") @@ -276,14 +277,15 @@ def main( cbio_log.write(cbioOutput.decode("utf-8")) syn.store(synapseclient.File(cbio_validator_log, parentId=log_folder_synid)) os.remove(cbio_validator_log) - logger.info("REMOVING OLD FILES") - - process_functions.rmFiles(database_to_staging.CASE_LIST_PATH) - private_cna_meta_path = os.path.join( - database_to_staging.GENIE_RELEASE_DIR, "genie_private_meta_cna_hg19_seg.txt" - ) - if os.path.exists(private_cna_meta_path): - os.unlink(private_cna_meta_path) + # HACK: Instead of doing this, files should be written to a tempdir... + # logger.info("REMOVING OLD FILES") + + # process_functions.rmFiles(database_to_staging.CASE_LIST_PATH) + # private_cna_meta_path = os.path.join( + # database_to_staging.GENIE_RELEASE_DIR, "genie_private_meta_cna_hg19_seg.txt" + # ) + # if os.path.exists(private_cna_meta_path): + # os.unlink(private_cna_meta_path) logger.info("CREATING LINK VERSION") # Returns release and case list folder diff --git a/genie/__init__.py b/genie/__init__.py index bb15863a..8c113d81 100644 --- a/genie/__init__.py +++ b/genie/__init__.py @@ -7,6 +7,6 @@ # create version in __init__.py # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ -__version__ = "14.2.0" +__version__ = "14.3.0" __all__ = ["__version__"] diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py index 3b3f464f..0746753a 100644 --- a/genie/consortium_to_public.py +++ b/genie/consortium_to_public.py @@ -56,36 +56,34 @@ def consortiumToPublic( databaseSynIdMappingDf, publicReleaseCutOff=365, ): - cna_path = os.path.join( - database_to_staging.GENIE_RELEASE_DIR, "data_CNA_%s.txt" % genie_version - ) + cna_path = os.path.join(database_to_staging.GENIE_RELEASE_DIR, "data_CNA.txt") clinical_path = os.path.join( - database_to_staging.GENIE_RELEASE_DIR, "data_clinical_%s.txt" % genie_version + database_to_staging.GENIE_RELEASE_DIR, "data_clinical.txt" ) clinical_sample_path = os.path.join( database_to_staging.GENIE_RELEASE_DIR, - "data_clinical_sample_%s.txt" % genie_version, + "data_clinical_sample.txt", ) clinicl_patient_path = os.path.join( database_to_staging.GENIE_RELEASE_DIR, - "data_clinical_patient_%s.txt" % genie_version, + "data_clinical_patient.txt", ) data_gene_panel_path = os.path.join( - database_to_staging.GENIE_RELEASE_DIR, "data_gene_matrix_%s.txt" % genie_version + database_to_staging.GENIE_RELEASE_DIR, "data_gene_matrix.txt" ) mutations_path = os.path.join( database_to_staging.GENIE_RELEASE_DIR, - "data_mutations_extended_%s.txt" % genie_version, + "data_mutations_extended.txt", ) fusions_path = os.path.join( - database_to_staging.GENIE_RELEASE_DIR, "data_fusions_%s.txt" % genie_version + database_to_staging.GENIE_RELEASE_DIR, "data_fusions.txt" ) seg_path = os.path.join( database_to_staging.GENIE_RELEASE_DIR, - "genie_public_data_cna_hg19_%s.seg" % genie_version, + "data_cna_hg19.seg", ) combined_bed_path = os.path.join( - database_to_staging.GENIE_RELEASE_DIR, "genie_combined_%s.bed" % genie_version + database_to_staging.GENIE_RELEASE_DIR, "genie_combined.bed" ) if not os.path.exists(database_to_staging.GENIE_RELEASE_DIR): @@ -331,7 +329,7 @@ def consortiumToPublic( seg_path, public_release_preview, genie_version, - name="genie_public_data_cna_hg19.seg", + name="data_cna_hg19.seg", ) elif entName == "genomic_information.txt": bed = syn.get(entId, followLink=True) @@ -345,24 +343,24 @@ def consortiumToPublic( genie_version, name="genomic_information.txt", ) - elif entName.startswith("data_gene_panel"): - genePanel = syn.get(entId, followLink=True) - # Create new gene panel naming and store - fileName = os.path.basename(genePanel.path) - newFileList = fileName.split("_") - newFileList[-1] = genie_version + ".txt" - newFileName = "_".join(newFileList) - genePanelPath = os.path.join( - database_to_staging.GENIE_RELEASE_DIR, newFileName - ) - shutil.copy(genePanel.path, genePanelPath) - del newFileList[-1] - entName = "_".join(newFileList) - entName = entName + ".txt" - genepanel_ent = storeFile( - syn, genePanelPath, public_release_preview, genie_version, name=entName - ) - genePanelEntities.append(genepanel_ent) + # elif entName.startswith("data_gene_panel"): + # genePanel = syn.get(entId, followLink=True) + # # Create new gene panel naming and store + # fileName = os.path.basename(genePanel.path) + # # newFileList = fileName.split("_") + # # newFileList[-1] = genie_version + ".txt" + # # newFileName = "_".join(newFileList) + # genePanelPath = os.path.join( + # database_to_staging.GENIE_RELEASE_DIR, fileName + # ) + # shutil.copy(genePanel.path, genePanelPath) + # # del newFileList[-1] + # # entName = "_".join(newFileList) + # # entName = entName + ".txt" + # genepanel_ent = storeFile( + # syn, genePanelPath, public_release_preview, genie_version, name=entName + # ) + # genePanelEntities.append(genepanel_ent) else: ent = syn.get(entId, followLink=True, downloadFile=False) copiedId = synapseutils.copy( @@ -377,32 +375,64 @@ def consortiumToPublic( copiedEnt = syn.get(copiedId[ent.id], downloadFile=False) # Set version comment copiedEnt.versionComment = genie_version - syn.store(copiedEnt, forceVersion=False) + copiedEnt = syn.store(copiedEnt, forceVersion=False) + # There was a time when gene panel files had to be renamed + # with an appended genie version. But... GEN-76 + # No longer appending genie verison to release files + # So just need to track gene panel entities + if entName.startswith("data_gene_panel"): + genePanelEntities.append(copiedEnt) + return caseListEntities, genePanelEntities -def get_public_to_consortium_synid_mapping(syn, releaseSynId, test=False): +def get_public_to_consortium_synid_mapping( + syn: synapseclient.Synapse, release_synid: str +) -> dict: """ - Gets the mapping between public version name - and its Synapse ids (Can probably be replaced with folder view) + Gets the mapping between potential public release names and + the consortium release folder + + Args: + syn (Synapse): Synapse connection + release_synid (str): Release folder fileview + + Returns: + dict: Mapping between potential public release and consortium + release synapse id """ - temp = synapseutils.walk(syn, releaseSynId) - officialPublic = dict() - for dirpath, dirnames, filenames in temp: - release = os.path.basename(dirpath[0]) - # checkRelease = release.split(".") - final = [i.split("-") for i in release.split(".")] - checkRelease = [] - for i in final: - checkRelease.extend(i) - if test: - officialPublic["TESTpublic"] = "syn12299959" - else: - if len(checkRelease) == 3 and checkRelease[0] != "0": - if int(checkRelease[1]) > 0: - if checkRelease[0] in ["1", "2"]: - public_release_name = str(int(checkRelease[0]) + 1) + ".0.0" - else: - public_release_name = str(int(checkRelease[0])) + ".0-public" - officialPublic[public_release_name] = dirpath[1] - return officialPublic + # This dict contains the mapping between public release name and + # consortium release folder + public_to_consortium_map = dict() + # release_files = synapseutils.walk(syn, releaseSynId) + # TODO: fix the database to mapping table + consortium_release_folders = syn.tableQuery( + f"SELECT name, id FROM {release_synid} WHERE " + "name NOT LIKE 'Release %' " + "and name NOT LIKE '%-public' " + "and name NOT IN ('case_lists', 'potential_artifacts')" + "ORDER BY name" + ) + consortium_release_folders_df = consortium_release_folders.asDataFrame() + # Get major release version + consortium_release_folders_df["major_release"] = [ + release.split(".")[0] for release in consortium_release_folders_df["name"] + ] + # only keep the latest consortium release for the public release + consortium_release_folders_df.drop_duplicates( + "major_release", keep="last", inplace=True + ) + + for _, release_info in consortium_release_folders_df.iterrows(): + major_release = release_info["major_release"] + # add support for potential patch releases + for num in [0, 1, 2, 3]: + # This has to exist because the the first three GENIE releases + # used semantic versioning + if release_info["major_release"] in ["0", "1", "2"]: + public_release_name = f"{int(major_release) + 1}.{num}.0" + public_to_consortium_map[public_release_name] = release_info["id"] + else: + public_release_name = f"{major_release}.{num}-public" + public_to_consortium_map[public_release_name] = release_info["id"] + return public_to_consortium_map diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index c521bc83..ed640eea 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -8,6 +8,7 @@ import re import subprocess import time +from typing import List import pandas as pd import pyranges @@ -33,7 +34,8 @@ GENIE_RELEASE_DIR, "data_mutations_extended_%s.txt" ) FUSIONS_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_fusions_%s.txt") -SEG_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "genie_data_cna_hg19_%s.seg") +SEG_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_cna_hg19_%s.seg") +SV_CENTER_PATH = os.path.join(GENIE_RELEASE_DIR, "data_sv_%s.txt") BED_DIFFS_SEQASSAY_PATH = os.path.join(GENIE_RELEASE_DIR, "diff_%s.csv") @@ -703,14 +705,10 @@ def store_gene_panel_files( for synId in genePanelDf["id"]: genePanel = syn.get(synId) genePanelName = os.path.basename(genePanel.path) - newGenePanelPath = os.path.join( - GENIE_RELEASE_DIR, genePanelName.replace(".txt", f"_{genieVersion}.txt") - ) - print(genePanelName.replace(".txt", "").replace("data_gene_panel_", "")) - if ( - genePanelName.replace(".txt", "").replace("data_gene_panel_", "") - in panelNames - ): + newGenePanelPath = os.path.join(GENIE_RELEASE_DIR, genePanelName) + gene_panel = genePanelName.replace(".txt", "").replace("data_gene_panel_", "") + print(gene_panel) + if gene_panel in panelNames: os.rename(genePanel.path, newGenePanelPath) genePanelEntities.append( store_file( @@ -802,7 +800,7 @@ def store_fusion_files( ] # FusionsDf.to_csv(FUSIONS_PATH, sep="\t", index=False) fusionText = process_functions.removePandasDfFloat(FusionsDf) - fusions_path = os.path.join(GENIE_RELEASE_DIR, f"data_fusions_{genie_version}.txt") + fusions_path = os.path.join(GENIE_RELEASE_DIR, "data_fusions.txt") with open(fusions_path, "w") as fusion_file: fusion_file.write(fusionText) store_file( @@ -815,6 +813,87 @@ def store_fusion_files( ) +def store_sv_files( + syn: synapseclient.Synapse, + release_synid: str, + genie_version: str, + synid: str, + keep_for_center_consortium_samples: List[str], + keep_for_merged_consortium_samples: List[str], + current_release_staging: str, + center_mappingdf: pd.DataFrame, +): + """ + Create, filter, configure, and store structural variant file + + Args: + syn: Synapse object + release_synid: Synapse id to store release file + genie_version: GENIE version (ie. v6.1-consortium) + synid: SV database synid + keep_for_center_consortium_samples: Samples to keep for center files + keep_for_merged_consortium_samples: Samples to keep for merged file + current_release_staging: Staging flag + center_mappingdf: Center mapping dataframe + """ + logger.info("MERING, FILTERING, STORING FUSION FILES") + sv_df = process_functions.get_syntabledf( + syn, + f"select * from {synid}", + ) + version = syn.create_snapshot_version(synid, comment=genie_version) + + # sv_df["ENTREZ_GENE_ID"].mask( + # sv_df["ENTREZ_GENE_ID"] == 0, float("nan"), inplace=True + # ) + + if not current_release_staging: + sv_staging_df = sv_df[ + sv_df["SAMPLE_ID"].isin(keep_for_center_consortium_samples) + ] + for center in center_mappingdf.center: + center_fusion = sv_staging_df[sv_staging_df["CENTER"] == center] + if not center_fusion.empty: + center_fusion.to_csv(SV_CENTER_PATH % center, sep="\t", index=False) + store_file( + syn, + SV_CENTER_PATH % center, + genieVersion=genie_version, + parent=center_mappingdf["stagingSynId"][ + center_mappingdf["center"] == center + ][0], + ) + + sv_df = sv_df[sv_df["SAMPLE_ID"].isin(keep_for_merged_consortium_samples)] + # sv_df = sv_df.rename( + # columns={ + # "HUGO_SYMBOL": "Hugo_Symbol", + # "ENTREZ_GENE_ID": "Entrez_Gene_Id", + # "CENTER": "Center", + # "TUMOR_SAMPLE_BARCODE": "Tumor_Sample_Barcode", + # "FUSION": "Fusion", + # "DNA_SUPPORT": "DNA_support", + # "RNA_SUPPORT": "RNA_support", + # "METHOD": "Method", + # "FRAME": "Frame", + # } + # ) + + # FusionsDf.to_csv(FUSIONS_PATH, sep="\t", index=False) + sv_text = process_functions.removePandasDfFloat(sv_df) + sv_path = os.path.join(GENIE_RELEASE_DIR, "data_sv.txt") + with open(sv_path, "w") as sv_file: + sv_file.write(sv_text) + store_file( + syn, + sv_path, + parent=release_synid, + genieVersion=genie_version, + name="data_sv.txt", + used=f"{synid}.{version}", + ) + + def append_or_create_release_maf(dataframe: pd.DataFrame, filepath: str): """Creates a file with the dataframe or appends to a existing file. @@ -868,9 +947,7 @@ def store_maf_files( "select id from {} where name like '%mutation%'".format(flatfiles_view_synid) ) centerMafSynIdsDf = centerMafSynIds.asDataFrame() - mutations_path = os.path.join( - GENIE_RELEASE_DIR, "data_mutations_extended_%s.txt" % genie_version - ) + mutations_path = os.path.join(GENIE_RELEASE_DIR, "data_mutations_extended.txt") with open(mutations_path, "w"): pass # Create maf file per center for their staging directory @@ -1047,9 +1124,7 @@ def store_assay_info_files( List of whole exome sequencing SEQ_ASSAY_IDs """ logger.info("Creates assay information file") - assay_info_path = os.path.join( - GENIE_RELEASE_DIR, f"assay_information_{genie_version}.txt" - ) + assay_info_path = os.path.join(GENIE_RELEASE_DIR, "assay_information.txt") seq_assay_str = "','".join(clinicaldf["SEQ_ASSAY_ID"].unique()) version = syn.create_snapshot_version(assay_info_synid, comment=genie_version) assay_infodf = process_functions.get_syntabledf( @@ -1216,15 +1291,9 @@ def store_clinical_files( # mapping to generate the headers of the clinical file mapping_table = syn.tableQuery("SELECT * FROM syn9621600") mapping = mapping_table.asDataFrame() - clinical_path = os.path.join( - GENIE_RELEASE_DIR, "data_clinical_%s.txt" % genie_version - ) - clinical_sample_path = os.path.join( - GENIE_RELEASE_DIR, "data_clinical_sample_%s.txt" % genie_version - ) - clinical_patient_path = os.path.join( - GENIE_RELEASE_DIR, "data_clinical_patient_%s.txt" % genie_version - ) + clinical_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical.txt") + clinical_sample_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_sample.txt") + clinical_patient_path = os.path.join(GENIE_RELEASE_DIR, "data_clinical_patient.txt") process_functions.addClinicalHeaders( clinicaldf, mapping, @@ -1284,7 +1353,7 @@ def store_cna_files( list: CNA samples """ logger.info("MERING, FILTERING, STORING CNA FILES") - cna_path = os.path.join(GENIE_RELEASE_DIR, f"data_CNA_{genie_version}.txt") + cna_path = os.path.join(GENIE_RELEASE_DIR, "data_CNA.txt") query_str = ("select id from {} " "where name like 'data_CNA%'").format( flatfiles_view_synid ) @@ -1404,9 +1473,7 @@ def store_seg_files( current_release_staging: Staging flag """ logger.info("MERING, FILTERING, STORING SEG FILES") - seg_path = os.path.join( - GENIE_RELEASE_DIR, f"genie_private_data_cna_hg19_{genie_version}.seg" - ) + seg_path = os.path.join(GENIE_RELEASE_DIR, "data_cna_hg19.seg") version = syn.create_snapshot_version(seg_synid, comment=genie_version) seg = syn.tableQuery( @@ -1449,7 +1516,7 @@ def store_seg_files( seg_path, parent=release_synid, genieVersion=genie_version, - name="genie_private_data_cna_hg19.seg", + name="data_cna_hg19.seg", used=f"{seg_synid}.{version}", ) @@ -1477,9 +1544,7 @@ def store_data_gene_matrix( pandas.DataFrame: data gene matrix dataframe """ logger.info("STORING DATA GENE MATRIX FILE") - data_gene_matrix_path = os.path.join( - GENIE_RELEASE_DIR, "data_gene_matrix_%s.txt" % genie_version - ) + data_gene_matrix_path = os.path.join(GENIE_RELEASE_DIR, "data_gene_matrix.txt") # Samples have already been removed data_gene_matrix = pd.DataFrame(columns=["SAMPLE_ID", "SEQ_ASSAY_ID"]) data_gene_matrix = pd.concat( @@ -1536,9 +1601,7 @@ def store_bed_files( release_synid: Synapse id to store release file """ logger.info("STORING COMBINED BED FILE") - combined_bed_path = os.path.join( - GENIE_RELEASE_DIR, f"genomic_information_{genie_version}.txt" - ) + combined_bed_path = os.path.join(GENIE_RELEASE_DIR, "genomic_information.txt") if not current_release_staging: for seq_assay in beddf["SEQ_ASSAY_ID"].unique(): bed_seq_df = beddf[beddf["SEQ_ASSAY_ID"] == seq_assay] @@ -1634,6 +1697,9 @@ def stagingToCbio( fusionSynId = databaseSynIdMappingDf["Id"][ databaseSynIdMappingDf["Database"] == "fusions" ][0] + sv_synid = databaseSynIdMappingDf["Id"][databaseSynIdMappingDf["Database"] == "sv"][ + 0 + ] # Grab assay information assay_info_ind = databaseSynIdMappingDf["Database"] == "assayinfo" assay_info_synid = databaseSynIdMappingDf["Id"][assay_info_ind][0] @@ -1787,6 +1853,17 @@ def stagingToCbio( CENTER_MAPPING_DF, ) + store_sv_files( + syn, + consortiumReleaseSynId, + genieVersion, + sv_synid, + keepForCenterConsortiumSamples, + keepForMergedConsortiumSamples, + current_release_staging, + CENTER_MAPPING_DF, + ) + store_seg_files( syn, genieVersion, @@ -1841,13 +1918,12 @@ def update_process_trackingdf( syn.store(synapseclient.Table(process_trackerdb_synid, process_trackerdf)) -def revise_metadata_files(syn, staging, consortiumid, genie_version=None): +def revise_metadata_files(syn, consortiumid, genie_version=None): """ Rewrite metadata files with the correct GENIE version Args: syn: Synapse object - staging: staging flag consortiumid: Synapse id of consortium release folder genie_version: GENIE version, Default to None """ @@ -1869,11 +1945,6 @@ def revise_metadata_files(syn, staging, consortiumid, genie_version=None): version = re.search(".+GENIE.+v(.+)", meta_text).group(1) # Fix this line genie_version = version if genie_version is None else genie_version - version_on_file = re.search(".+data_(.+)[.]txt", meta_text) - if version_on_file is None: - version_on_file = re.search(".+data_(.+)[.]seg", meta_text) - if version_on_file is not None: - version_on_file = version_on_file.group(1).split("_")[-1] if version != genie_version: meta_text = meta_text.replace( @@ -1885,15 +1956,10 @@ def revise_metadata_files(syn, staging, consortiumid, genie_version=None): "GENIE v{}".format(version), "GENIE v{}".format(genie_version) ) - if version_on_file is not None: - meta_text = meta_text.replace(version_on_file, genie_version) - meta_text = meta_text.replace(version_on_file, genie_version) meta.seek(0) meta.write(meta_text) meta.truncate() - store_file( - syn, meta_ent.path, parent=consortiumid, genieVersion=genie_version - ) + store_file(syn, meta_ent.path, parent=consortiumid, genieVersion=genie_version) def search_and_create_folder(syn, parentid, folder_name): diff --git a/genie_registry/fusions.py b/genie_registry/fusions.py index 5ec30a88..6eceef2e 100644 --- a/genie_registry/fusions.py +++ b/genie_registry/fusions.py @@ -151,6 +151,12 @@ def _validate(self, fusionDF, nosymbol_check): "Your fusion file should not have any NA/blank Hugo Symbols.\n" ) # fusionDF = fusionDF.drop_duplicates("HUGO_SYMBOL").apply(lambda x: validateSymbol(x, bedDf), axis=1) + # Check if ENTREZ_GENE_ID is + if process_functions.checkColExist(fusionDF, "ENTREZ_GENE_ID"): + if fusionDF["ENTREZ_GENE_ID"].dtype not in [int, float]: + total_error += ( + "Your fusion file must have integers for ENTREZ_GENE_ID.\n" + ) if process_functions.checkColExist(fusionDF, "TUMOR_SAMPLE_BARCODE"): error = process_functions.validate_genie_identifier( diff --git a/genie_registry/structural_variant.py b/genie_registry/structural_variant.py index 1ef8f5be..a1ff6318 100644 --- a/genie_registry/structural_variant.py +++ b/genie_registry/structural_variant.py @@ -49,17 +49,22 @@ def _validate(self, sv_df): if not have_sample_col: total_error.write("Structural Variant: Must have SAMPLE_ID column.\n") else: - if sv_df["SAMPLE_ID"].duplicated().any(): - total_error.write( - "Structural Variant: No duplicated SAMPLE_ID allowed.\n" - ) + # if sv_df["SAMPLE_ID"].duplicated().any(): + # total_error.write( + # "Structural Variant: No duplicated SAMPLE_ID allowed.\n" + # ) # TODO: switch to validate_genie_identifier function # After GH-444 is merged - if not all(sv_df["SAMPLE_ID"].str.startswith(f"GENIE-{self.center}")): - total_error.write( - "Structural Variant: SAMPLE_ID must start with " - f"GENIE-{self.center}\n" - ) + errors = process_functions.validate_genie_identifier( + identifiers=sv_df["SAMPLE_ID"], + center=self.center, + filename="Structural Variant", + col="SAMPLE_ID", + ) + total_error.write(errors) + + if sv_df.duplicated().any(): + total_error.write("Structural Variant: No duplicated rows allowed.\n") warn, error = process_functions.check_col_and_values( sv_df, @@ -71,13 +76,18 @@ def _validate(self, sv_df): total_warning.write(warn) total_error.write(error) + have_hugo_1 = process_functions.checkColExist(sv_df, "SITE1_HUGO_SYMBOL") + have_hugo_2 = process_functions.checkColExist(sv_df, "SITE2_HUGO_SYMBOL") + have_entrez_1 = process_functions.checkColExist(sv_df, "SITE1_ENTREZ_GENE_ID") + have_entrez_2 = process_functions.checkColExist(sv_df, "SITE2_ENTREZ_GENE_ID") + + if not ((have_hugo_1 or have_entrez_1) and (have_hugo_2 or have_entrez_2)): + total_error.write( + "Structural Variant: Either SITE1_HUGO_SYMBOL/SITE1_ENTREZ_GENE_ID " + "or SITE2_HUGO_SYMBOL/SITE2_ENTREZ_GENE_ID is required.\n" + ) + # optional_columns = [ - # "SITE1_HUGO_SYMBOL", - # "SITE2_HUGO_SYMBOL", - # "SITE1_ENSEMBL_TRANSCRIPT_ID", - # "SITE2_ENSEMBL_TRANSCRIPT_ID", - # "SITE1_ENTREZ_GENE_ID", - # "SITE2_ENTREZ_GENE_ID", # "SITE1_REGION_NUMBER", # "SITE2_REGION_NUMBER", # "SITE1_REGION", @@ -112,6 +122,8 @@ def _validate(self, sv_df): # ] # Check for columns that should be integar columsn int_cols = [ + "SITE1_ENTREZ_GENE_ID", + "SITE2_ENTREZ_GENE_ID", "SITE1_REGION_NUMBER", "SITE2_REGION_NUMBER", "SITE1_POSITION", @@ -138,17 +150,21 @@ def _validate(self, sv_df): "column(s): {}.\n".format(", ".join(non_ints)) ) - # region_allow_vals = [ - # "5_PRIME_UTR", "3_PRIME_UTR", "PROMOTER", "EXON", "INTRON" - # ] - # warn, error = process_functions.check_col_and_values( - # sv_df, "SITE1_REGION", region_allow_vals, - # "Structural Variant", required=False - # ) - # warn, error = process_functions.check_col_and_values( - # sv_df, "SITE2_REGION", region_allow_vals, - # "Structural Variant", required=False - # ) + region_allow_vals = ["5_PRIME_UTR", "3_PRIME_UTR", "PROMOTER", "EXON", "INTRON"] + warn, error = process_functions.check_col_and_values( + sv_df, + "SITE1_REGION", + region_allow_vals, + "Structural Variant", + required=False, + ) + warn, error = process_functions.check_col_and_values( + sv_df, + "SITE2_REGION", + region_allow_vals, + "Structural Variant", + required=False, + ) warn, error = process_functions.check_col_and_values( sv_df, "NCBI_BUILD", @@ -180,13 +196,13 @@ def _validate(self, sv_df): total_error.write(error) warn, error = process_functions.check_col_and_values( - sv_df, "DNA_SUPPORT", ["Yes", "No"], "Structural Variant", required=False + sv_df, "DNA_SUPPORT", ["yes", "no"], "Structural Variant", required=False ) # total_warning.write(warn) total_error.write(error) warn, error = process_functions.check_col_and_values( - sv_df, "RNA_SUPPORT", ["Yes", "No"], "Structural Variant", required=False + sv_df, "RNA_SUPPORT", ["yes", "no"], "Structural Variant", required=False ) # total_warning.write(warn) total_error.write(error) diff --git a/tests/test_database_to_staging.py b/tests/test_database_to_staging.py index e791c95d..2ae1bafe 100644 --- a/tests/test_database_to_staging.py +++ b/tests/test_database_to_staging.py @@ -61,7 +61,7 @@ def test_store_gene_panel_files(): assert patch_syn_table_query.call_count == 2 patch_storefile.assert_called_once_with( SYN, - os.path.join(database_to_staging.GENIE_RELEASE_DIR, "PANEL1_vTEST.txt"), + os.path.join(database_to_staging.GENIE_RELEASE_DIR, "PANEL1.txt"), parent=CONSORTIUM_SYNID, genieVersion=GENIE_VERSION, name="PANEL1.txt", @@ -72,7 +72,7 @@ def test_store_gene_panel_files(): patch_syn_get.assert_called_once_with("syn3333") patch_os_rename.assert_called_once_with( "/foo/bar/PANEL1.txt", - os.path.join(database_to_staging.GENIE_RELEASE_DIR, "PANEL1_vTEST.txt"), + os.path.join(database_to_staging.GENIE_RELEASE_DIR, "PANEL1.txt"), ) @@ -81,9 +81,7 @@ def test_store_assay_info_files(): assay_infodf = pd.DataFrame({"library_strategy": ["WXS"], "SEQ_ASSAY_ID": ["A"]}) clinicaldf = pd.DataFrame({"SEQ_ASSAY_ID": ["A"]}) database_to_staging.GENIE_RELEASE_DIR = "./" - path = os.path.join( - database_to_staging.GENIE_RELEASE_DIR, "assay_information_vTEST.txt" - ) + path = os.path.join(database_to_staging.GENIE_RELEASE_DIR, "assay_information.txt") with patch.object( SYN, "create_snapshot_version", return_value=2 ) as patch_create_version, patch.object( diff --git a/tests/test_fusions.py b/tests/test_fusions.py index d0ea8bbe..7031f096 100644 --- a/tests/test_fusions.py +++ b/tests/test_fusions.py @@ -115,20 +115,21 @@ def test_validation_perfect_invalid(fusionClass): fusionDf = pd.DataFrame( { "HUGO_SYMBOL": [float("nan"), "AAK1", "AAAS"], + "ENTREZ_GENE_ID": ["asdfa", 1, 2], "CENTER": ["SAGE", "SAGE", "SAGE"], "TUMOR_SAMPLE_BARCODE": ["ID1-1", "ID2-1", "ID1-3"], "FUSION": ["AAED-AAK1", "AAAS-AAK1", "AAAS-AAK1"], "DNA_SUPPORT": ["foo", "foo", "foo"], "RNA_SUPPORT": ["foo", "foo", "foo"], "METHOD": ["foo", "foo", "foo"], - "FRAME": ["foo", "foo", "foo"], } ) with patch.object(syn, "get", return_value=ENTITY): error, warning = fusionClass._validate(fusionDf, False) expectedErrors = ( - "Your fusion file must at least have these headers: ENTREZ_GENE_ID.\n" + "Your fusion file must at least have these headers: FRAME.\n" "Your fusion file should not have any NA/blank Hugo Symbols.\n" + "Your fusion file must have integers for ENTREZ_GENE_ID.\n" "fusion: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n" ) @@ -138,7 +139,8 @@ def test_validation_perfect_invalid(fusionClass): with patch.object(syn, "get", return_value=ENTITY): error, warning = fusionClass._validate(fusionDf, True) expectedErrors = ( - "Your fusion file must at least have these headers: ENTREZ_GENE_ID.\n" + "Your fusion file must at least have these headers: FRAME.\n" + "Your fusion file must have integers for ENTREZ_GENE_ID.\n" "fusion: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n" ) assert error == expectedErrors diff --git a/tests/test_sv.py b/tests/test_sv.py index c6ffea51..efd5ca36 100644 --- a/tests/test_sv.py +++ b/tests/test_sv.py @@ -45,19 +45,21 @@ def test_validation_sample_error(self): { "sample_id": ["GENIE-SAGE-ID1-1", "GENIE-SAGE-ID1-1", "ID3-1"], "SV_STATUS": ["SOMATIC", "SOMATIC", "GERMLINE"], + "SITE1_HUGO_SYMBOL": ["af", "af", "ff"], + "SITE2_HUGO_SYMBOL": ["af", "af", "ff"], } ) error, warning = self.sv_cls._validate(sv_df) assert error == ( - "Structural Variant: No duplicated SAMPLE_ID allowed.\n" "Structural Variant: SAMPLE_ID must start with GENIE-SAGE\n" + "Structural Variant: No duplicated rows allowed.\n" ) assert warning == "" def test_validation_missing_required_cols(self): sv_df = pd.DataFrame( { - "test": ["GENIE-SAGE-ID1-1", "GENIE-SAGE-ID1-1", "ID3-1"], + "test": ["GENIE-SAGE-ID1-1", "GENIE-SAGE-ID1-2", "ID3-1"], "foo": ["SOMATIC", "SOMATIC", "GERMLINE"], } ) @@ -65,6 +67,8 @@ def test_validation_missing_required_cols(self): assert error == ( "Structural Variant: Must have SAMPLE_ID column.\n" "Structural Variant: Must have SV_STATUS column.\n" + "Structural Variant: Either SITE1_HUGO_SYMBOL/SITE1_ENTREZ_GENE_ID or " + "SITE2_HUGO_SYMBOL/SITE2_ENTREZ_GENE_ID is required.\n" ) assert warning == "" @@ -73,6 +77,8 @@ def test_validation_integer_check(self): { "sample_id": ["GENIE-SAGE-ID1-1", "GENIE-SAGE-ID2-1"], "SV_STATUS": ["SOMATIC", "GERMLINE"], + "SITE1_ENTREZ_GENE_ID": [1, "foo"], + "SITE2_ENTREZ_GENE_ID": [1, "foo"], "SITE1_REGION_NUMBER": [1, "foo"], "SITE2_REGION_NUMBER": [1, "foo"], "SITE1_POSITION": [1, "foo"], @@ -91,6 +97,7 @@ def test_validation_integer_check(self): error, warning = self.sv_cls._validate(sv_df) assert error == ( "Structural Variant: Only integers allowed in these column(s): " + "SITE1_ENTREZ_GENE_ID, SITE2_ENTREZ_GENE_ID, " "SITE1_REGION_NUMBER, SITE2_REGION_NUMBER, SITE1_POSITION, SITE2_POSITION, " "TUMOR_SPLIT_READ_COUNT, TUMOR_PAIRED_END_READ_COUNT, SV_LENGTH, " "NORMAL_READ_COUNT, TUMOR_READ_COUNT, NORMAL_VARIANT_COUNT, " @@ -104,12 +111,14 @@ def test_validation_no_errors(self): { "sample_id": ["GENIE-SAGE-ID1-1", "GENIE-SAGE-ID2-1"], "SV_STATUS": ["SOMATIC", "GERMLINE"], + "SITE1_ENTREZ_GENE_ID": [1, 2], + "SITE2_ENTREZ_GENE_ID": [1, 3], "SITE1_REGION_NUMBER": [1, 2], "NCBI_BUILD": ["GRCh38", "GRCh37"], "BREAKPOINT_TYPE": ["PRECISE", "IMPRECISE"], "CONNECTION_TYPE": ["3to5", "5to5"], - "DNA_SUPPORT": ["Yes", "No"], - "RNA_Support": ["No", "No"], + "DNA_SUPPORT": ["yes", "no"], + "RNA_Support": ["yes", "no"], } ) error, warning = self.sv_cls._validate(sv_df)