diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index 4df66698..db6766ac 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -22,7 +22,7 @@ jobs: run: | sudo apt-get install -y bedtools python -m pip install --upgrade pip - pip install flake8 pytest pytest-cov mock + pip install flake8 pytest pytest-cov pip install . if [ -f requirements.txt ]; then pip install -r requirements.txt; fi # - name: Lint with flake8 diff --git a/bin/consortium_to_public.py b/bin/consortium_to_public.py new file mode 100644 index 00000000..d255d74e --- /dev/null +++ b/bin/consortium_to_public.py @@ -0,0 +1,265 @@ +import argparse +import datetime +import logging +import os +import synapseclient +import subprocess +import time + +from genie import dashboard_table_updater +from genie import process_functions +from genie import consortium_to_public +from genie import database_to_staging + +logging.basicConfig() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +PWD = os.path.dirname(os.path.abspath(__file__)) + + +# TODO: Move to genie.database_to_staging.py +def generate_dashboard_html(genie_version, staging=False, + genie_user=None, + genie_pass=None): + """Generates dashboard html writeout that gets uploaded to the + release folder + + Args: + syn: Synapse connection + genie_version: GENIE release + staging: Use staging files. Default is False + genie_user: GENIE synapse username + genie_pass: GENIE synapse password + + """ + markdown_render_cmd = ['Rscript', + os.path.join(PWD, '../genie/dashboard_markdown_generator.R'), + genie_version, + '--template_path', + os.path.join(PWD, '../genie/dashboardTemplate.Rmd')] + + if genie_user is not None and genie_pass is not None: + markdown_render_cmd.extend(['--syn_user', genie_user, + '--syn_pass', genie_pass]) + if staging: + markdown_render_cmd.append('--staging') + subprocess.check_call(markdown_render_cmd) + + +# TODO: Move to genie.database_to_staging.py +def generate_data_guide(genie_version, oncotree_version=None, + database_mapping=None, genie_user=None, + genie_pass=None): + """Generates the GENIE data guide""" + + template_path = os.path.join(PWD, '../data_guide/data_guide_template.Rnw') + with open(template_path, 'r') as template_file: + template_str = template_file.read() + + replacements = {"{{release}}": genie_version, + "{{database_synid}}": database_mapping, + "{{oncotree}}": oncotree_version.replace("_", "\\_"), + "{{username}}": genie_user, + "{{password}}": genie_pass, + "{{genie_banner}}": os.path.join(PWD, + "../genie_banner.png")} + + for search in replacements: + replacement = replacements[search] + # If no replacement value is passed in, don't replace + if replacement is not None: + template_str = template_str.replace(search, replacement) + + with open(os.path.join(PWD, "data_guide.Rnw"), "w") as data_guide_file: + data_guide_file.write(template_str) + + subprocess.check_call(['R', 'CMD', 'Sweave', '--pdf', + os.path.join(PWD, "data_guide.Rnw")]) + return "data_guide.pdf" + + +def main(args): + cbioValidatorPath = os.path.join( + args.cbioportalPath, + "core/src/main/scripts/importer/validateData.py") + assert os.path.exists(cbioValidatorPath), \ + "Please specify correct cbioportalPath" + assert not (args.test and args.staging), \ + "You can only specify --test or --staging, not both" + try: + processingDate = datetime.datetime.strptime( + args.processingDate, '%b-%Y') + except ValueError: + raise ValueError( + "Process date must be in the format " + "abbreviated_month-YEAR ie. Oct-2017") + + syn = process_functions.synLogin(args.pemFile, debug=args.debug) + genie_user = os.environ.get('GENIE_USER') + if args.pemFile is not None: + genie_pass = process_functions.get_password(args.pemFile) + else: + genie_pass = None + + # Get all the possible public releases + # Get configuration + if args.test: + databaseSynIdMappingId = 'syn11600968' + args.genieVersion = "TESTpublic" + elif args.staging: + databaseSynIdMappingId = 'syn12094210' + else: + databaseSynIdMappingId = 'syn10967259' + databaseSynIdMapping = syn.tableQuery( + 'select * from %s' % databaseSynIdMappingId) + databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame() + public_synid = databaseSynIdMappingDf['Id'][ + databaseSynIdMappingDf['Database'] == 'public'].values[0] + + releaseSynId = databaseSynIdMappingDf['Id'][ + databaseSynIdMappingDf['Database'] == 'release'].values[0] + + officialPublic = \ + consortium_to_public.get_public_to_consortium_synid_mapping( + syn, releaseSynId, test=args.test) + + assert args.genieVersion in officialPublic.keys(), \ + "genieVersion must be one of these: {}.".format( + ", ".join(officialPublic.keys())) + + args.releaseId = officialPublic[args.genieVersion] + if not args.test and not args.staging: + processTrackerSynId = databaseSynIdMappingDf['Id'][ + databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] + processTracker = syn.tableQuery( + "SELECT timeStartProcessing FROM %s where center = 'SAGE' " + "and processingType = 'public'" % processTrackerSynId) + processTrackerDf = processTracker.asDataFrame() + processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) + syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf)) + + caseListEntities, genePanelEntities = \ + consortium_to_public.consortiumToPublic( + syn, processingDate, args.genieVersion, + args.releaseId, databaseSynIdMappingDf, + publicReleaseCutOff=args.publicReleaseCutOff) + + database_to_staging.revise_metadata_files(syn, + args.staging, + public_synid, + args.genieVersion) + + logger.info("CBIO VALIDATION") + # Must be exit 0 because the validator sometimes fails, + # but we still want to capture the output + command = ['python', cbioValidatorPath, '-s', + database_to_staging.GENIE_RELEASE_DIR, '-n', '; exit 0'] + cbio_output = subprocess.check_output(" ".join(command), shell=True) + cbio_decoded_output = cbio_output.decode("utf-8") + logger.info(cbio_decoded_output) + if not args.test and not args.staging: + log_folder_synid = databaseSynIdMappingDf['Id'][ + databaseSynIdMappingDf['Database'] == 'logs'].values[0] + # Use tempfiles + cbio_log_file = "cbioValidatorLogsPublic_{}.txt".format( + args.genieVersion) + with open(cbio_log_file, "w") as cbioLog: + cbioLog.write(cbio_decoded_output) + syn.store(synapseclient.File(cbio_log_file, parentId=log_folder_synid)) + os.remove(cbio_log_file) + logger.info("REMOVING OLD FILES") + process_functions.rmFiles(database_to_staging.CASE_LIST_PATH) + seg_meta_file = '{}/genie_public_meta_cna_hg19_seg.txt'.format( + database_to_staging.GENIE_RELEASE_DIR) + if os.path.exists(seg_meta_file): + os.unlink(seg_meta_file) + + logger.info("CREATING LINK VERSION") + folders = database_to_staging.create_link_version( + syn, args.genieVersion, caseListEntities, + genePanelEntities, databaseSynIdMappingDf, + release_type="public" + ) + # Don't update process tracker is testing or staging + if not args.test and not args.staging: + processTracker = syn.tableQuery( + "SELECT timeEndProcessing FROM %s where center = 'SAGE' and " + "processingType = 'public'" % processTrackerSynId) + processTrackerDf = processTracker.asDataFrame() + processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000)) + syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf)) + + if not args.test: + logger.info("DASHBOARD UPDATE") + dashboard_table_updater.run_dashboard(syn, databaseSynIdMappingDf, + args.genieVersion, + staging=args.staging) + generate_dashboard_html(args.genieVersion, staging=args.staging, + genie_user=genie_user, + genie_pass=genie_pass) + logger.info("DASHBOARD UPDATE COMPLETE") + logger.info("AUTO GENERATE DATA GUIDE") + + onco_link = databaseSynIdMappingDf['Id'][ + databaseSynIdMappingDf['Database'] == 'oncotreeLink' + ].values[0] + onco_link_ent = syn.get(onco_link) + oncotree_link = onco_link_ent.externalURL + oncotree_version = oncotree_link.split("=")[1] + + data_guide_pdf = generate_data_guide( + args.genieVersion, + oncotree_version=oncotree_version, + database_mapping=databaseSynIdMappingId, + genie_user=genie_user, + genie_pass=genie_pass + ) + data_guide_ent = synapseclient.File(data_guide_pdf, + parent=folders['release_folder']) + syn.store(data_guide_ent) + logger.info("COMPLETED CONSORTIUM TO PUBLIC") + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("processingDate", + type=str, + metavar="Jan-2017", + help="The process date of GENIE in Month-Year format " + "(ie. Apr-2017)") + + parser.add_argument("cbioportalPath", + type=str, + metavar="/path/to/cbioportal", + help="Make sure you clone the cbioportal github: " + "git clone https://github.com/cBioPortal/cbioportal.git") + + parser.add_argument("genieVersion", + type=str, + help="GENIE public release version") + + parser.add_argument("--publicReleaseCutOff", + type=int, + default=366, + help="Public release cut off time in days (Must " + "account for leap year, 366)") + + parser.add_argument("--staging", + action='store_true', + help="Store into staging folder") + + parser.add_argument("--test", + action='store_true', + help="Store into staging folder") + + parser.add_argument("--pemFile", + type=str, + help="Path to PEM file (genie.pem)") + + parser.add_argument("--debug", + action='store_true', + help="Synapse debug feature") + args = parser.parse_args() + main(args) diff --git a/genie/consortium_to_public.py b/genie/consortium_to_public.py index 34a1fbe4..7d6a322a 100644 --- a/genie/consortium_to_public.py +++ b/genie/consortium_to_public.py @@ -1,11 +1,9 @@ -import argparse -import datetime +"""Converts consortium release files to public release files""" +#! /usr/bin/env python3 + import logging import os -import re import shutil -import subprocess -import time import synapseclient import synapseutils @@ -14,386 +12,312 @@ from . import process_functions from . import database_to_staging from . import create_case_lists -from . import dashboard_table_updater logging.basicConfig() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -def storeFile(syn, filePath, parentId, anonymizeCenterDf, - genie_version, name=None): - #process.center_anon(filePath, anonymizeCenterDf) + +def storeFile(syn, filePath, parentId, genie_version, name=None): + """Stores file with genie version as comment + + Args: + syn: Synapse object + filePath: Path to file + parentId: Synapse id of folder + + Returns: + Stored Entity + """ if name is None: name = os.path.basename(filePath) - return(syn.store(synapseclient.File(filePath, name=name, parent = parentId, versionComment=genie_version))) - #process.center_convert_back(filePath, anonymizeCenterDf) + file_ent = synapseclient.File(filePath, name=name, parent=parentId, + versionComment=genie_version) + file_ent = syn.store(file_ent) + return file_ent + -#This is the only filter that returns mutation columns to keep def commonVariantFilter(mafDf): + ''' + This filter returns variants to keep + + Args: + mafDf: Maf dataframe + ''' mafDf['FILTER'] = mafDf['FILTER'].fillna("") - toKeep = ["common_variant" not in i for i in mafDf['FILTER']] - mafDf = mafDf[toKeep] - return(mafDf) - -def consortiumToPublic(syn, processingDate, genie_version, releaseId, databaseSynIdMappingDf, publicReleaseCutOff=365, staging=False): - - ANONYMIZE_CENTER = syn.tableQuery('SELECT * FROM syn10170510') - ANONYMIZE_CENTER_DF = ANONYMIZE_CENTER.asDataFrame() - CNA_PATH = os.path.join(database_to_staging.GENIE_RELEASE_DIR,"data_CNA_%s.txt" % genie_version) - CLINICAL_PATH = os.path.join(database_to_staging.GENIE_RELEASE_DIR,'data_clinical_%s.txt' % genie_version) - CLINICAL_SAMPLE_PATH = os.path.join(database_to_staging.GENIE_RELEASE_DIR,'data_clinical_sample_%s.txt' % genie_version) - CLINICAL_PATIENT_PATH = os.path.join(database_to_staging.GENIE_RELEASE_DIR,'data_clinical_patient_%s.txt' % genie_version) - DATA_GENE_PANEL_PATH = os.path.join(dbTodatabase_to_stagingstaging.GENIE_RELEASE_DIR,'data_gene_matrix_%s.txt' % genie_version) - MUTATIONS_PATH = os.path.join(database_to_staging.GENIE_RELEASE_DIR,'data_mutations_extended_%s.txt' % genie_version) - FUSIONS_PATH = os.path.join(database_to_staging.GENIE_RELEASE_DIR,'data_fusions_%s.txt' % genie_version) - SEG_PATH = os.path.join(database_to_staging.GENIE_RELEASE_DIR,'genie_public_data_cna_hg19_%s.seg' % genie_version) - COMBINED_BED_PATH = os.path.join(database_to_staging.GENIE_RELEASE_DIR,'genomic_information_%s.txt' % genie_version) + to_keep = ["common_variant" not in i for i in mafDf['FILTER']] + mafDf = mafDf[to_keep] + return mafDf + + +def consortiumToPublic(syn, processingDate, genie_version, + releaseId, databaseSynIdMappingDf, + publicReleaseCutOff=365): + cna_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + "data_CNA_%s.txt" % genie_version) + clinical_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + 'data_clinical_%s.txt' % genie_version) + clinical_sample_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + 'data_clinical_sample_%s.txt' % genie_version) + clinicl_patient_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + 'data_clinical_patient_%s.txt' % genie_version) + data_gene_panel_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + 'data_gene_matrix_%s.txt' % genie_version) + mutations_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + 'data_mutations_extended_%s.txt' % genie_version) + fusions_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + 'data_fusions_%s.txt' % genie_version) + seg_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + 'genie_public_data_cna_hg19_%s.seg' % genie_version) + combined_bed_path = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, + 'genie_combined_%s.bed' % genie_version) if not os.path.exists(database_to_staging.GENIE_RELEASE_DIR): os.mkdir(database_to_staging.GENIE_RELEASE_DIR) if not os.path.exists(database_to_staging.CASE_LIST_PATH): os.mkdir(database_to_staging.CASE_LIST_PATH) - # if staging: - # #public release staging - # PUBLIC_RELEASE_PREVIEW = "syn7871696" - # PUBLIC_RELEASE_PREVIEW_CASELIST = "syn9689659" - # else: - #public release preview - PUBLIC_RELEASE_PREVIEW = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'public'].values[0] - PUBLIC_RELEASE_PREVIEW_CASELIST = database_to_staging.find_caselistid(syn, PUBLIC_RELEASE_PREVIEW) - - ############################################################################################################################## - ## Sponsored projects filter - ############################################################################################################################## - ## if before release date -> go into staging consortium - ## if after date -> go into public + # public release preview + public_release_preview = databaseSynIdMappingDf['Id'][ + databaseSynIdMappingDf['Database'] == 'public'].values[0] + public_release_preview_caselist = \ + database_to_staging.find_caselistid(syn, public_release_preview) + + ####################################################################### + # Sponsored projects filter + ####################################################################### + # if before release date -> go into staging consortium + # if after date -> go into public # sponsoredReleaseDate = syn.tableQuery('SELECT * FROM syn8545108') # sponsoredReleaseDateDf = sponsoredReleaseDate.asDataFrame() # sponsoredProjectSamples = syn.tableQuery('SELECT * FROM syn8545106') # sponsoredProjectSamplesDf = sponsoredProjectSamples.asDataFrame() - # sponsoredProjectsDf = sponsoredProjectSamplesDf.merge(sponsoredReleaseDateDf, left_on="sponsoredProject", right_on="sponsoredProjects") - # dates = sponsoredProjectsDf['releaseDate'].apply(lambda date: datetime.datetime.strptime(date, '%b-%Y')) - # publicReleaseSamples = sponsoredProjectsDf['genieSampleId'][dates < processingDate] - ############################################################################################################################## - + # sponsoredProjectsDf = sponsoredProjectSamplesDf.merge( + # sponsoredReleaseDateDf, left_on="sponsoredProject", + # right_on="sponsoredProjects") + # dates = sponsoredProjectsDf['releaseDate'].apply( + # lambda date: datetime.datetime.strptime(date, '%b-%Y')) + # publicReleaseSamples = sponsoredProjectsDf['genieSampleId'][ + # dates < processingDate] + ####################################################################### + # SEQ_DATE filter - # Jun-2015, given processing date (today) -> public release (processing date - Jun-2015 > 12 months) + # Jun-2015, given processing date (today) -> public release + # (processing date - Jun-2015 > 12 months) consortiumReleaseWalk = synapseutils.walk(syn, releaseId) consortiumRelease = next(consortiumReleaseWalk) - clinical = [syn.get(synid, followLink=True) for filename, synid in consortiumRelease[2] if filename == "data_clinical.txt"][0] - gene_matrix = [syn.get(synid, followLink=True) for filename, synid in consortiumRelease[2] if filename == "data_gene_matrix.txt"][0] + clinical = [syn.get(synid, followLink=True) + for filename, synid in consortiumRelease[2] + if filename == "data_clinical.txt"][0] + gene_matrix = [syn.get(synid, followLink=True) + for filename, synid in consortiumRelease[2] + if filename == "data_gene_matrix.txt"][0] clinicalDf = pd.read_csv(clinical.path, sep="\t", comment="#") gene_matrixdf = pd.read_csv(gene_matrix.path, sep="\t") - removeForPublicSamples = process_functions.seqDateFilter(clinicalDf,processingDate,publicReleaseCutOff) - #comment back in when public release filter back on - #publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples) - #Make sure all null oncotree codes are removed + removeForPublicSamples = process_functions.seqDateFilter( + clinicalDf, processingDate, publicReleaseCutOff) + # comment back in when public release filter back on + # publicReleaseSamples = publicReleaseSamples.append(keepForPublicSamples) + # Make sure all null oncotree codes are removed clinicalDf = clinicalDf[~clinicalDf['ONCOTREE_CODE'].isnull()] - publicReleaseSamples = clinicalDf.SAMPLE_ID[~clinicalDf.SAMPLE_ID.isin(removeForPublicSamples)] + publicReleaseSamples = clinicalDf.SAMPLE_ID[ + ~clinicalDf.SAMPLE_ID.isin(removeForPublicSamples)] - logger.info("SEQ_DATES for public release: " + ", ".join(set(clinicalDf.SEQ_DATE[clinicalDf.SAMPLE_ID.isin(publicReleaseSamples)].astype(str)))) + existing_seq_dates = \ + clinicalDf.SEQ_DATE[clinicalDf.SAMPLE_ID.isin(publicReleaseSamples)] - #Clinical release scope filter - #If consortium -> Don't release to public - clinicalReleaseScope = syn.tableQuery("SELECT * FROM syn8545211 where releaseScope = 'public'") + logger.info("SEQ_DATES for public release: " + + ", ".join(set(existing_seq_dates.astype(str)))) + + # Clinical release scope filter + # If consortium -> Don't release to public + clinicalReleaseScope = syn.tableQuery( + "SELECT * FROM syn8545211 where releaseScope = 'public'") publicRelease = clinicalReleaseScope.asDataFrame() allClin = clinicalDf[clinicalDf['SAMPLE_ID'].isin(publicReleaseSamples)] - allClin.to_csv(CLINICAL_PATH, sep="\t", index=False) + allClin.to_csv(clinical_path, sep="\t", index=False) + + gene_matrixdf = \ + gene_matrixdf[gene_matrixdf['SAMPLE_ID'].isin(publicReleaseSamples)] + gene_matrixdf.to_csv(data_gene_panel_path, sep="\t", index=False) + storeFile(syn, data_gene_panel_path, public_release_preview, genie_version, + name="data_gene_matrix.txt") + storeFile(syn, clinical_path, public_release_preview, genie_version, + name="data_clinical.txt") - gene_matrixdf = gene_matrixdf[gene_matrixdf['SAMPLE_ID'].isin(publicReleaseSamples)] - gene_matrixdf.to_csv(DATA_GENE_PANEL_PATH,sep="\t",index=False) - storeFile(syn, DATA_GENE_PANEL_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="data_gene_matrix.txt") - storeFile(syn, CLINICAL_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="data_clinical.txt") - - create_case_lists.main(CLINICAL_PATH, DATA_GENE_PANEL_PATH, database_to_staging.CASE_LIST_PATH, "genie_public") + create_case_lists.main(clinical_path, data_gene_panel_path, + database_to_staging.CASE_LIST_PATH, "genie_public") caseListFiles = os.listdir(database_to_staging.CASE_LIST_PATH) caseListEntities = [] for casePath in caseListFiles: casePath = os.path.join(database_to_staging.CASE_LIST_PATH, casePath) - caseListEntities.append(storeFile(syn, casePath, PUBLIC_RELEASE_PREVIEW_CASELIST, ANONYMIZE_CENTER_DF, genie_version)) + caseListEntities.append( + storeFile(syn, casePath, public_release_preview_caselist, + genie_version)) - #Grab mapping table to fill in clinical headers + # Grab mapping table to fill in clinical headers mapping_table = syn.tableQuery('SELECT * FROM syn9621600') mapping = mapping_table.asDataFrame() genePanelEntities = [] for entName, entId in consortiumRelease[2]: - if "data_linear" in entName or "meta_" in entName: + # skip files to convert + if (entName.startswith("data_linear") + or "meta_" in entName + or entName.endswith(".html") + or entName in ["data_clinical_sample.txt", + "data_gene_matrix.txt", + "data_clinical_patient.txt", + "data_guide.pdf", + "release_notes.pdf"]): + # data_gene_matrix was processed above because it had to be + # used for generating caselists continue - elif entName == "data_clinical.txt": - patientCols = publicRelease['fieldName'][publicRelease['level'] == "patient"].tolist() + if entName == "data_clinical.txt": + patientCols = publicRelease['fieldName'][ + publicRelease['level'] == "patient"].tolist() sampleCols = ["PATIENT_ID"] - sampleCols.extend(publicRelease['fieldName'][publicRelease['level'] == "sample"].tolist()) - #clinicalDf is defined on line 36 - # clinicalDf['AGE_AT_SEQ_REPORT'] = [int(math.floor(int(float(i))/365.25)) if process.checkInt(i) else i for i in clinicalDf['AGE_AT_SEQ_REPORT']] - # clinicalDf['AGE_AT_SEQ_REPORT'][clinicalDf['AGE_AT_SEQ_REPORT'] == ">32485"] = ">89" - # clinicalDf['AGE_AT_SEQ_REPORT'][clinicalDf['AGE_AT_SEQ_REPORT'] == "<6570"] = "<18" + sampleCols.extend(publicRelease['fieldName'][ + publicRelease['level'] == "sample"].tolist()) + # clinicalDf is defined on line 127 + clinicalDf = \ + clinicalDf[clinicalDf['SAMPLE_ID'].isin(publicReleaseSamples)] - clinicalDf = clinicalDf[clinicalDf['SAMPLE_ID'].isin(publicReleaseSamples)] - - #Delete columns that are private scope + # Delete columns that are private scope # for private in privateRelease: # del clinicalDf[private] - process_functions.addClinicalHeaders(clinicalDf, mapping, patientCols, sampleCols, CLINICAL_SAMPLE_PATH, CLINICAL_PATIENT_PATH) + process_functions.addClinicalHeaders(clinicalDf, mapping, + patientCols, sampleCols, + clinical_sample_path, + clinicl_patient_path) - storeFile(syn, CLINICAL_SAMPLE_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="data_clinical_sample.txt") - storeFile(syn, CLINICAL_PATIENT_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="data_clinical_patient.txt") + storeFile(syn, clinical_sample_path, public_release_preview, + genie_version, name="data_clinical_sample.txt") + storeFile(syn, clinicl_patient_path, public_release_preview, + genie_version, name="data_clinical_patient.txt") elif "mutation" in entName: mutation = syn.get(entId, followLink=True) mutationDf = pd.read_csv(mutation.path, sep="\t", comment="#") mutationDf = commonVariantFilter(mutationDf) mutationDf['FILTER'] = "PASS" - mutationDf = mutationDf[mutationDf['Tumor_Sample_Barcode'].isin(publicReleaseSamples)] + mutationDf = mutationDf[ + mutationDf['Tumor_Sample_Barcode'].isin(publicReleaseSamples)] text = process_functions.removeFloat(mutationDf) - with open(MUTATIONS_PATH, 'w') as f: + with open(mutations_path, 'w') as f: f.write(text) - storeFile(syn, MUTATIONS_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="data_mutations_extended.txt") + storeFile(syn, mutations_path, public_release_preview, + genie_version, name="data_mutations_extended.txt") elif "fusion" in entName: fusion = syn.get(entId, followLink=True) fusionDf = pd.read_csv(fusion.path, sep="\t") - #remove = ["Entrez_Gene_Id","Method"] - #fusionDf = fusionDf[fusionDf.columns[~fusionDf.columns.isin(remove)]] - fusionDf = fusionDf[fusionDf['Tumor_Sample_Barcode'].isin(publicReleaseSamples)] - fusionDf.to_csv(FUSIONS_PATH,sep="\t",index=False) - storeFile(syn, FUSIONS_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="data_fusions.txt") + fusionDf = fusionDf[ + fusionDf['Tumor_Sample_Barcode'].isin(publicReleaseSamples)] + fusionDf.to_csv(fusions_path, sep="\t", index=False) + storeFile( + syn, fusions_path, public_release_preview, + genie_version, name="data_fusions.txt") elif "CNA" in entName: cna = syn.get(entId, followLink=True) cnaDf = pd.read_csv(cna.path, sep="\t") - cnaDf = cnaDf[cnaDf.columns[cnaDf.columns.isin(publicReleaseSamples.append(pd.Series("Hugo_Symbol")))]] + cna_columns = publicReleaseSamples.append(pd.Series("Hugo_Symbol")) + # parse out the CNA columns to keep + cnaDf = cnaDf[cnaDf.columns[cnaDf.columns.isin(cna_columns)]] text = process_functions.removeFloat(cnaDf) - text = text.replace("\t\t","\tNA\t").replace("\t\t","\tNA\t").replace('\t\n',"\tNA\n") - with open(CNA_PATH, "w") as cnaFile: + text = text.replace( + "\t\t", "\tNA\t").replace( + "\t\t", "\tNA\t").replace( + '\t\n', "\tNA\n") + with open(cna_path, "w") as cnaFile: cnaFile.write(text) - storeFile(syn, CNA_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="data_CNA.txt") + storeFile( + syn, cna_path, public_release_preview, + genie_version, name="data_CNA.txt") elif entName.endswith(".seg"): seg = syn.get(entId, followLink=True) segDf = pd.read_csv(seg.path, sep="\t") segDf = segDf[segDf['ID'].isin(publicReleaseSamples)] text = process_functions.removeFloat(segDf) - with open(SEG_PATH, "w") as segFile: + with open(seg_path, "w") as segFile: segFile.write(text) - storeFile(syn, SEG_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="genie_public_data_cna_hg19.seg") - elif entName == "data_gene_matrix.txt": - pass - # This file was processed above because it had to be used for generating caselists - # panel = syn.get(entId, followLink=True) - # panelDf = pd.read_csv(panel.path, sep="\t") - # panelDf = panelDf[panelDf['SAMPLE_ID'].isin(publicReleaseSamples)] - # panelDf.to_csv(DATA_GENE_PANEL_PATH,sep="\t",index=False) - # storeFile(syn, DATA_GENE_PANEL_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="data_gene_matrix.txt") + storeFile(syn, seg_path, public_release_preview, genie_version, + name="genie_public_data_cna_hg19.seg") elif entName == "genomic_information.txt": bed = syn.get(entId, followLink=True) bedDf = pd.read_csv(bed.path, sep="\t") bedDf = bedDf[bedDf.SEQ_ASSAY_ID.isin(allClin.SEQ_ASSAY_ID)] - bedDf.to_csv(COMBINED_BED_PATH,sep="\t",index=False) - storeFile(syn, COMBINED_BED_PATH, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name="genomic_information.txt") - elif entName in ["data_clinical_sample.txt", "data_clinical_patient.txt"] or entName.endswith(".html"): - continue + bedDf.to_csv(combined_bed_path,sep="\t",index=False) + storeFile(syn, combined_bed_path, public_release_preview, + genie_version, name="genomic_information.txt") elif entName.startswith("data_gene_panel"): genePanel = syn.get(entId, followLink=True) - #Create new gene panel naming and store + # Create new gene panel naming and store fileName = os.path.basename(genePanel.path) newFileList = fileName.split("_") newFileList[-1] = genie_version + ".txt" newFileName = "_".join(newFileList) - genePanelPath = os.path.join(database_to_staging.GENIE_RELEASE_DIR, newFileName) + genePanelPath = os.path.join( + database_to_staging.GENIE_RELEASE_DIR, newFileName) shutil.copy(genePanel.path, genePanelPath) del newFileList[-1] entName = "_".join(newFileList) entName = entName + ".txt" - genePanelEntities.append(storeFile(syn, genePanelPath, PUBLIC_RELEASE_PREVIEW, ANONYMIZE_CENTER_DF, genie_version, name=entName)) + genepanel_ent = storeFile( + syn, genePanelPath, public_release_preview, + genie_version, name=entName) + genePanelEntities.append(genepanel_ent) else: ent = syn.get(entId, followLink=True, downloadFile=False) - copiedId = synapseutils.copy(syn, ent, PUBLIC_RELEASE_PREVIEW, version=ent.versionNumber, updateExisting=True, setProvenance = None, skipCopyAnnotations=True) - copiedEnt = syn.get(copiedId[ent.id],downloadFile=False) - #Set version comment - copiedEnt.versionComment=genie_version + copiedId = synapseutils.copy( + syn, ent, public_release_preview, + version=ent.versionNumber, + updateExisting=True, setProvenance=None, + skipCopyAnnotations=True) + copiedEnt = syn.get(copiedId[ent.id], downloadFile=False) + # Set version comment + copiedEnt.versionComment = genie_version syn.store(copiedEnt, forceVersion=False) - return((caseListEntities,genePanelEntities)) - -def perform_consortiumToPublic(syn, args, databaseSynIdMappingDf): - try: - processingDate = datetime.datetime.strptime(args.processingDate, '%b-%Y') - except ValueError as e: - raise ValueError("Process date must be in the format abbreviated_month-YEAR ie. Oct-2017") - return(consortiumToPublic(syn, processingDate, args.genieVersion, args.releaseId, databaseSynIdMappingDf, publicReleaseCutOff = args.publicReleaseCutOff, staging = args.staging)) - -def command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf): - reviseMetadataFiles(syn, args.staging, databaseSynIdMappingDf, args.genieVersion) - -def reviseMetadataFiles(syn, staging, databaseSynIdMappingDf, genieVersion=None): - ANONYMIZE_CENTER = syn.tableQuery('SELECT * FROM syn10170510') - ANONYMIZE_CENTER_DF = ANONYMIZE_CENTER.asDataFrame() - # if staging: - # parent = "syn7871696" - # else: - parent = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'public'].values[0] - allFiles = syn.getChildren(parent) - metadataEnts = [syn.get(i['id'], downloadLocation=database_to_staging.GENIE_RELEASE_DIR, ifcollision="overwrite.local") for i in allFiles if 'meta' in i['name']] - for metaEnt in metadataEnts: - with open(metaEnt.path, "r+") as meta: - metaText = meta.read() - if "meta_study" not in metaEnt.path: - version = '' - else: - version = re.search(".+GENIE.+v(.+)", metaText).group(1) - #Fix this line - genieVersion = version if genieVersion is None else genieVersion - dataFileVersion = re.search(".+data_(.+)[.]txt",metaText) - if dataFileVersion is None: - dataFileVersion = re.search(".+data_(.+)[.]seg",metaText) - if dataFileVersion is not None: - dataFileVersion = dataFileVersion.group(1).split("_")[-1] - - if version != genieVersion: - metaText = metaText.replace("GENIE Cohort v%s" % version,"GENIE Cohort v%s" % genieVersion) - metaText = metaText.replace("GENIE v%s" % version,"GENIE v%s" % genieVersion) - if dataFileVersion is not None: - metaText = metaText.replace(dataFileVersion, genieVersion) - metaText = metaText.replace(dataFileVersion, genieVersion) - meta.seek(0) - meta.write(metaText) - meta.truncate() - - storeFile(syn, metaEnt.path, parent, ANONYMIZE_CENTER_DF, genieVersion) - - -def createLinkVersion(syn, genie_version, caseListEntities, genePanelEntities, databaseSynIdMappingDf): - versioning = genie_version.split(".") - logger.info(genie_version) - main = versioning[0] - releaseSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'release'].values[0] - publicSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'public'].values[0] - #second = ".".join(versioning[1:]) - releases = synapseutils.walk(syn, releaseSynId) - mainReleaseFolders = next(releases)[1] - releaseFolderSynId = [synId for folderName, synId in mainReleaseFolders if folderName == "Release %s" % main] - if len(releaseFolderSynId) > 0: - secondRelease = synapseutils.walk(syn, releaseFolderSynId[0]) - secondReleaseFolders = next(secondRelease)[1] - secondReleaseFolderSynIdList = [synId for folderName, synId in secondReleaseFolders if folderName == genie_version] - if len(secondReleaseFolderSynIdList) > 0: - secondReleaseFolderSynId = secondReleaseFolderSynIdList[0] - else: - secondReleaseFolderSynId = syn.store(synapseclient.Folder(genie_version, parent = releaseFolderSynId[0])).id - else: - mainReleaseFolderId = syn.store(synapseclient.Folder("Release %s" % main, parent = releaseSynId)).id - secondReleaseFolderSynId = syn.store(synapseclient.Folder(genie_version, parent = mainReleaseFolderId)).id - - caselistId = database_to_staging.find_caselistid(syn, secondReleaseFolderSynId) - - publicRelease = syn.getChildren(publicSynId) - [syn.store(synapseclient.Link(ents['id'], parent=secondReleaseFolderSynId, targetVersion=ents['versionNumber'])) for ents in publicRelease if ents['type'] != "org.sagebionetworks.repo.model.Folder" and ents['name'] != "data_clinical.txt" and not ents['name'].startswith("data_gene_panel")] - [syn.store(synapseclient.Link(ents.id, parent=caselistId, targetVersion=ents.versionNumber)) for ents in caseListEntities] - #Store gene panels - [syn.store(synapseclient.Link(ents.id, parent=secondReleaseFolderSynId, targetVersion=ents.versionNumber)) for ents in genePanelEntities] - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument("processingDate", type=str, metavar="Jan-2017", - help="The process date of GENIE in Month-Year format (ie. Apr-2017)") - parser.add_argument("cbioportalPath", type=str, metavar="/path/to/cbioportal", - help="Make sure you clone the cbioportal github: git clone https://github.com/cBioPortal/cbioportal.git") - parser.add_argument("genieVersion", type=str, - help="GENIE public release version") - parser.add_argument("--publicReleaseCutOff", type=int, metavar=366, default=366, - help="Public release cut off time in days (Must account for leap year, 366)") - parser.add_argument("--staging", action='store_true', - help="Store into staging folder") - parser.add_argument("--test", action='store_true', - help="Store into staging folder") - parser.add_argument("--pemFile", type=str, - help="Path to PEM file (genie.pem)") - parser.add_argument("--debug", action='store_true', - help="Synapse debug feature") - args = parser.parse_args() - cbioValidatorPath = os.path.join(args.cbioportalPath,"core/src/main/scripts/importer/validateData.py") - assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath" - assert not (args.test and args.staging), "You can only specify --test or --staging, not both" - - syn = process_functions.synLogin(args.pemFile, debug=args.debug) - #Get all the possible public releases - if args.test: - databaseSynIdMappingId = 'syn11600968' - args.genieVersion = "TESTpublic" - elif args.staging: - databaseSynIdMappingId = 'syn12094210' - else: - databaseSynIdMappingId = 'syn10967259' - databaseSynIdMapping = syn.tableQuery('select * from %s' % databaseSynIdMappingId) - databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame() - releaseSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'release'].values[0] - + return caseListEntities, genePanelEntities + + +def get_public_to_consortium_synid_mapping(syn, releaseSynId, test=False): + ''' + Gets the mapping between public version name + and its Synapse ids (Can probably be replaced with folder view) + ''' temp = synapseutils.walk(syn, releaseSynId) officialPublic = dict() for dirpath, dirnames, filenames in temp: release = os.path.basename(dirpath[0]) - #checkRelease = release.split(".") + # checkRelease = release.split(".") final = [i.split("-") for i in release.split(".")] checkRelease = [] for i in final: checkRelease.extend(i) - if args.test: + if test: officialPublic['TESTpublic'] = "syn12299959" else: if len(checkRelease) == 3 and checkRelease[0] != "0": if int(checkRelease[1]) > 0: - if checkRelease[0] in ['1','2']: - officialPublic[str(int(checkRelease[0])+1)+".0.0"] = dirpath[1] + if checkRelease[0] in ['1', '2']: + public_release_name = \ + str(int(checkRelease[0])+1)+".0.0" else: - officialPublic[str(int(checkRelease[0]))+".0-public"] = dirpath[1] - assert args.genieVersion in officialPublic.keys(), "genieVersion must be one of these: %s." % ", ".join(officialPublic.keys()) - args.releaseId = officialPublic[args.genieVersion] - if not args.test and not args.staging: - processTrackerSynId = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'processTracker'].values[0] - processTracker = syn.tableQuery("SELECT timeStartProcessing FROM %s where center = 'SAGE' and processingType = 'public'" % processTrackerSynId) - processTrackerDf = processTracker.asDataFrame() - processTrackerDf['timeStartProcessing'][0] = str(int(time.time()*1000)) - syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) - - caseListEntities, genePanelEntities = perform_consortiumToPublic(syn, args, databaseSynIdMappingDf) - command_reviseMetadataFiles(syn, args, databaseSynIdMappingDf) - logger.info("CBIO VALIDATION") - #Must be exit 0 because the validator sometimes fails, but we still want to capture the output - command = ['python',cbioValidatorPath, '-s', database_to_staging.GENIE_RELEASE_DIR, '-n','; exit 0'] - cbioOutput = subprocess.check_output(" ".join(command), shell=True) - logger.info(cbioOutput.decode("utf-8")) - if not args.test and not args.staging: - log_folder_synid = databaseSynIdMappingDf['Id'][databaseSynIdMappingDf['Database'] == 'logs'].values[0] - with open("cbioValidatorLogsPublic_%s.txt" % args.genieVersion, "w") as cbioLog: - cbioLog.write(cbioOutput.decode("utf-8")) - syn.store(synapseclient.File("cbioValidatorLogsPublic_%s.txt" % args.genieVersion, parentId=log_folder_synid)) - os.remove("cbioValidatorLogsPublic_%s.txt" % args.genieVersion) - logger.info("REMOVING OLD FILES") - process_functions.rmFiles(database_to_staging.CASE_LIST_PATH) - if os.path.exists('%s/genie_public_meta_cna_hg19_seg.txt' % database_to_staging.GENIE_RELEASE_DIR): - os.unlink('%s/genie_public_meta_cna_hg19_seg.txt' % database_to_staging.GENIE_RELEASE_DIR) - - logger.info("CREATING LINK VERSION") - createLinkVersion(syn, args.genieVersion, caseListEntities, genePanelEntities, databaseSynIdMappingDf) - #Don't update process tracker is testing or staging - if not args.test and not args.staging: - processTracker = syn.tableQuery("SELECT timeEndProcessing FROM %s where center = 'SAGE' and processingType = 'public'" % processTrackerSynId) - processTrackerDf = processTracker.asDataFrame() - processTrackerDf['timeEndProcessing'][0] = str(int(time.time()*1000)) - syn.store(synapseclient.Table(processTrackerSynId,processTrackerDf)) - - if not args.test: - logger.info("DASHBOARD UPDATE") - dashboard_table_updater.run_dashboard(syn, databaseSynIdMappingDf, args.genieVersion, staging=args.staging, public=True) - dashboard_markdown_html_commands = ['Rscript', os.path.join(os.path.dirname(os.path.abspath(__file__)),'dashboard_markdown_generator.R'), args.genieVersion] - if args.staging: - dashboard_markdown_html_commands.append('--staging') - subprocess.check_call(dashboard_markdown_html_commands) - logger.info("DASHBOARD UPDATE COMPLETE") - - logger.info("COMPLETED CONSORTIUM TO PUBLIC") + public_release_name = \ + str(int(checkRelease[0]))+".0-public" + officialPublic[public_release_name] = dirpath[1] + return officialPublic diff --git a/genie/dashboard_table_updater.py b/genie/dashboard_table_updater.py index 1d472787..d98f400f 100644 --- a/genie/dashboard_table_updater.py +++ b/genie/dashboard_table_updater.py @@ -1,15 +1,12 @@ """Updates dashboard tables""" +import argparse import datetime import logging import os -import argparse import pandas as pd import synapseclient -try: - from synapseclient.core.utils import to_unix_epoch_time -except ModuleNotFoundError: - from synapseclient.utils import to_unix_epoch_time +from synapseclient.core.utils import to_unix_epoch_time from genie import process_functions diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py index 7b899c15..4f82ba82 100644 --- a/genie/database_to_staging.py +++ b/genie/database_to_staging.py @@ -642,7 +642,8 @@ def store_maf_files(syn, f.write(header) # with open(mafEnt.path,"r") as newMafFile: # newMafFile.readline() - center = mafEnt.path.split("_")[3] + # In case filename isn't named correctly. + center = mafEnt.path.split("_")[3].replace(".txt", "") # Make sure to only write the centers that release = True if center in center_mappingdf.center.tolist(): for row in mafFile: @@ -1499,7 +1500,8 @@ def search_and_create_folder(syn, parentid, folder_name): def create_link_version(syn, genie_version, case_list_entities, - gene_panel_entities, database_synid_mappingdf): + gene_panel_entities, database_synid_mappingdf, + release_type="consortium"): ''' Create release links from the actual entity and version @@ -1512,6 +1514,7 @@ def create_link_version(syn, genie_version, case_list_entities, gene_panel_entities: Gene panel entities database_synid_mappingdf: dataframe containing database to synapse id mapping + release_type: 'consortium' or 'public' release ''' # Grab major release numbers (ie 1,2,3 ...) major_release = genie_version.split(".")[0] @@ -1536,39 +1539,48 @@ def create_link_version(syn, genie_version, case_list_entities, # caselistId = findCaseListId(syn, release_folder_synid) consortium_synid = database_synid_mappingdf['Id'][ - database_synid_mappingdf['Database'] == 'consortium'].values[0] + database_synid_mappingdf['Database'] == release_type].values[0] consortium_release_files = syn.getChildren(consortium_synid) - # data_clinical.txt MUST be pulled in because the clinical file is - # needed in the consortium_to_public.py + for release_file in consortium_release_files: - if release_file['type'] != "org.sagebionetworks.repo.model.Folder" \ - and not release_file['name'].startswith("data_gene_panel"): + not_folder = release_file['type'] != "org.sagebionetworks.repo.model.Folder" + # data_clinical.txt MUST be pulled in when doing consortium release + not_public = (release_file['name'] != "data_clinical.txt" or + release_type == "consortium") + is_gene_panel = release_file['name'].startswith("data_gene_panel") + + if not_folder and not_public and not is_gene_panel: syn.store(synapseclient.Link( release_file['id'], parent=release_folder_synid, - targetVersion=release_file['versionNumber'])) + targetVersion=release_file['versionNumber'] + )) release_files = syn.getChildren(release_folder_synid) - clinical_ent = [ - ents['id'] - for ents in release_files - if ents['name'] == "data_clinical.txt"][0] - # Set private permission for the data_clinical.txt link - syn.setPermissions(clinical_ent, principalId=3346558, accessType=[]) - syn.setPermissions(clinical_ent, principalId=3326313, accessType=[]) + clinical_ent = [ents['id'] + for ents in release_files + if ents['name'] == "data_clinical.txt"] + if clinical_ent: + # Set private permission for the data_clinical.txt link + syn.setPermissions(clinical_ent[0], principalId=3346558, + accessType=[]) + syn.setPermissions(clinical_ent[0], principalId=3326313, + accessType=[]) for ents in case_list_entities: syn.store(synapseclient.Link( ents.id, parent=caselist_folder_synid, - targetVersion=ents.versionNumber)) + targetVersion=ents.versionNumber + )) # Store gene panels for ents in gene_panel_entities: syn.store(synapseclient.Link( ents.id, parent=release_folder_synid, - targetVersion=ents.versionNumber)) + targetVersion=ents.versionNumber + )) return {"release_folder": release_folder_synid, "caselist_folder": caselist_folder_synid} diff --git a/genie/input_to_database.py b/genie/input_to_database.py index efedd60c..0fd70fba 100644 --- a/genie/input_to_database.py +++ b/genie/input_to_database.py @@ -7,10 +7,7 @@ from typing import List import synapseclient -try: - from synapseclient.core.utils import to_unix_epoch_time -except ModuleNotFoundError: - from synapseclient.utils import to_unix_epoch_time +from synapseclient.core.utils import to_unix_epoch_time import synapseutils import pandas as pd diff --git a/genie/maf.py b/genie/maf.py index 0702de56..9fda1db1 100644 --- a/genie/maf.py +++ b/genie/maf.py @@ -4,10 +4,7 @@ import pandas as pd import synapseclient -try: - from synapseclient.core.exceptions import SynapseHTTPError -except ModuleNotFoundError: - from synapseclient.exceptions import SynapseHTTPError +from synapseclient.core.exceptions import SynapseHTTPError from .example_filetype_format import FileTypeFormat from . import process_functions diff --git a/genie/sampleRetraction.py b/genie/sampleRetraction.py index 31f45e61..72d11dc3 100644 --- a/genie/sampleRetraction.py +++ b/genie/sampleRetraction.py @@ -4,10 +4,7 @@ import pandas as pd import synapseclient -try: - from synapseclient.core.utils import to_unix_epoch_time -except ModuleNotFoundError: - from synapseclient.utils import to_unix_epoch_time +from synapseclient.core.utils import to_unix_epoch_time from .example_filetype_format import FileTypeFormat from . import process_functions diff --git a/genie/validate.py b/genie/validate.py index 54a11e7b..97e3bfbc 100755 --- a/genie/validate.py +++ b/genie/validate.py @@ -5,10 +5,7 @@ import sys import synapseclient -try: - from synapseclient.core.exceptions import SynapseHTTPError -except ModuleNotFoundError: - from synapseclient.exceptions import SynapseHTTPError +from synapseclient.core.exceptions import SynapseHTTPError from . import config from . import example_filetype_format diff --git a/tests/test_assay.py b/tests/test_assay.py index 0566a5c9..396ca2e2 100644 --- a/tests/test_assay.py +++ b/tests/test_assay.py @@ -1,10 +1,10 @@ """Test assay information validation and processing""" import copy -from mock import patch, create_autospec -import pytest +from unittest.mock import patch, create_autospec -import synapseclient import pandas as pd +import pytest +import synapseclient from genie.assay import Assayinfo from genie import process_functions diff --git a/tests/test_bed.py b/tests/test_bed.py index a49ae165..e792705f 100644 --- a/tests/test_bed.py +++ b/tests/test_bed.py @@ -1,13 +1,13 @@ """Test GENIE Bed class""" import tempfile import shutil -import mock -from mock import patch -import pytest +from unittest import mock +from unittest.mock import patch -import synapseclient import pandas as pd from pandas.testing import assert_frame_equal +import pytest +import synapseclient import genie.bed from genie.bed import bed diff --git a/tests/test_clinical.py b/tests/test_clinical.py index b2cbb64c..3878a067 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -1,8 +1,8 @@ import datetime -import mock -import pytest +from unittest import mock import pandas as pd +import pytest import synapseclient from genie.clinical import clinical diff --git a/tests/test_clinicalSP.py b/tests/test_clinicalSP.py index 8928dd4a..41e0e732 100644 --- a/tests/test_clinicalSP.py +++ b/tests/test_clinicalSP.py @@ -1,7 +1,7 @@ -import mock -import pytest +from unittest import mock import pandas as pd +import pytest import synapseclient from genie.clinicalSP import clinicalSP diff --git a/tests/test_cna.py b/tests/test_cna.py index 26988d52..45952223 100644 --- a/tests/test_cna.py +++ b/tests/test_cna.py @@ -1,5 +1,5 @@ -import mock -from mock import patch +from unittest import mock +from unittest.mock import patch import pytest import pandas as pd diff --git a/tests/test_create_case_lists.py b/tests/test_create_case_lists.py index ca0c9a01..2b7c787a 100644 --- a/tests/test_create_case_lists.py +++ b/tests/test_create_case_lists.py @@ -1,4 +1,5 @@ import os + import pytest from genie import create_case_lists diff --git a/tests/test_database_to_staging.py b/tests/test_database_to_staging.py index 33cefb1a..572cf77f 100644 --- a/tests/test_database_to_staging.py +++ b/tests/test_database_to_staging.py @@ -1,8 +1,8 @@ """Tests database to staging functions""" import os +from unittest import mock +from unittest.mock import patch -import mock -from mock import patch import pandas as pd import synapseclient diff --git a/tests/test_filters.py b/tests/test_filters.py index efed9153..c01cdc27 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -4,8 +4,8 @@ import datetime import os import sys +from unittest.mock import patch -from mock import patch import pandas as pd from genie.process_functions import seqDateFilter diff --git a/tests/test_fusions.py b/tests/test_fusions.py index f419d96d..0760fef0 100644 --- a/tests/test_fusions.py +++ b/tests/test_fusions.py @@ -1,8 +1,8 @@ -import mock -from mock import patch -import pytest +from unittest import mock +from unittest.mock import patch import pandas as pd +import pytest import synapseclient from genie.fusions import fusions diff --git a/tests/test_input_to_database.py b/tests/test_input_to_database.py index 62a4af99..8bb4e225 100644 --- a/tests/test_input_to_database.py +++ b/tests/test_input_to_database.py @@ -1,10 +1,10 @@ from datetime import datetime -import mock -from mock import patch import os -import pytest +from unittest import mock +from unittest.mock import patch import pandas as pd +import pytest import synapseclient import synapseutils diff --git a/tests/test_maf.py b/tests/test_maf.py index 72c300e6..be0e99aa 100644 --- a/tests/test_maf.py +++ b/tests/test_maf.py @@ -1,7 +1,7 @@ -import mock -import pytest +from unittest import mock import pandas as pd +import pytest import synapseclient from genie.maf import maf diff --git a/tests/test_mutationsInCis.py b/tests/test_mutationsInCis.py index 760dc6cc..beda1065 100644 --- a/tests/test_mutationsInCis.py +++ b/tests/test_mutationsInCis.py @@ -1,8 +1,8 @@ -import mock -from mock import patch -import pytest +from unittest import mock +from unittest.mock import patch import pandas as pd +import pytest import synapseclient from genie.mutationsInCis import mutationsInCis diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py index 581fbea3..fea672b3 100644 --- a/tests/test_process_functions.py +++ b/tests/test_process_functions.py @@ -1,8 +1,8 @@ -import mock -from mock import patch -import pytest +from unittest import mock +from unittest.mock import patch import pandas as pd +import pytest import synapseclient import genie.process_functions @@ -17,6 +17,21 @@ DATABASE_DF.index = ['1_3', '2_3', '3_5'] ENTITY = synapseclient.Project("foo", annotations={"dbMapping": ["syn1234"]}) +@pytest.mark.parametrize("input_str,output", [ + ("1.0\t", "1\t"), + ("1.0\n", "1\n"), + ("1.5\t", "1.5\t"), + ("1\t", "1\t"), + ("0\t", "0\t"), + ("'a'\t'b'\n1.0\t2.0\n", "'a'\t'b'\n1\t2\n"), + ]) +def test_removeStringFloat(input_str, output): + """Remove string float - will always assume that there is a \n + at the end. This is because if a value was 2.01, we dont want to + remove the .0 from this.""" + assert genie.process_functions.removeStringFloat(input_str) == output + + def test_valid__check_valid_df(): genie.process_functions._check_valid_df(DATABASE_DF, "test") diff --git a/tests/test_retraction.py b/tests/test_retraction.py index ba9a0da5..21a11d24 100644 --- a/tests/test_retraction.py +++ b/tests/test_retraction.py @@ -1,7 +1,7 @@ -import mock -import pytest +from unittest import mock import pandas as pd +import pytest import synapseclient from genie.sampleRetraction import sampleRetraction diff --git a/tests/test_seg.py b/tests/test_seg.py index a750c6d1..c8a1fdba 100644 --- a/tests/test_seg.py +++ b/tests/test_seg.py @@ -1,7 +1,7 @@ -import mock -import pytest +from unittest import mock import pandas as pd +import pytest import synapseclient from genie.seg import seg diff --git a/tests/test_validate.py b/tests/test_validate.py index 1f41d54a..64c46998 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,13 +1,11 @@ """Tests validate.py""" -import mock -from mock import patch +from unittest import mock +from unittest.mock import patch + import pandas as pd import pytest import synapseclient -try: - from synapseclient.exceptions import SynapseHTTPError -except ModuleNotFoundError: - from synapseclient.core.exceptions import SynapseHTTPError +from synapseclient.core.exceptions import SynapseHTTPError from genie import validate, clinical, process_functions diff --git a/tests/test_vcf.py b/tests/test_vcf.py index 49ff262c..e9d97e78 100644 --- a/tests/test_vcf.py +++ b/tests/test_vcf.py @@ -1,7 +1,7 @@ -import mock -import pytest +from unittest import mock import pandas as pd +import pytest import synapseclient from genie.vcf import vcf diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 9136292b..0914f2f6 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -1,6 +1,6 @@ -import mock -import pytest +from unittest import mock +import pytest import synapseclient from genie.workflow import workflow