Skip to content

Commit

Permalink
Prepare release 14.3.0 (#486)
Browse files Browse the repository at this point in the history
* [GEN-174] allow for patch public release versions to be specified (#481)

* Use fileview and allow for patch releases to be specified

* Add 0 release in function

* lint

* [GEN-185] Fusions entrez gene must be integers (#483)

* Add check for entrez gene id

* Make sure entrez gene id is only integers

* Lint

* [GEN-33] Support sv format (#480)

* Support sv format

* Add SV as a file that is released

* Fix f-interpolation

* Update synid

* Add in check for duplicated rows

* Lint

* Fix tests

* Lint

* Update genie_registry/structural_variant.py

* Update tests/test_sv.py

* [GEN-76] No longer need to include the release version in the release files (#484)

* don't add release name to filenames

* Don't rename files

* Remove files

* Don't remove files for now

* No longer need to rewrite metadata files

* Comment out some lines of code

* Remove all existing files

* Add in HACK docstring

* Don't add release version

* Fix typo

* Remove arg

* Comment out unnecessary code

* Lint

* Fix tests

* Lint

* Remove cohort for seg

* Push new version
  • Loading branch information
thomasyu888 authored Nov 3, 2022
1 parent e4801e5 commit 383391d
Show file tree
Hide file tree
Showing 10 changed files with 322 additions and 187 deletions.
46 changes: 26 additions & 20 deletions bin/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ def generate_data_guide(


def main(args):
# HACK: Delete all existing files first
process_functions.rmFiles(database_to_staging.GENIE_RELEASE_DIR)

cbioValidatorPath = os.path.join(
args.cbioportalPath, "core/src/main/scripts/importer/validateData.py"
)
Expand Down Expand Up @@ -120,18 +123,23 @@ def main(args):
public_synid = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "public"
].values[0]

# Use release folder fileview
releaseSynId = databaseSynIdMappingDf["Id"][
databaseSynIdMappingDf["Database"] == "release"
databaseSynIdMappingDf["Database"] == "releaseFolder"
].values[0]

officialPublic = consortium_to_public.get_public_to_consortium_synid_mapping(
syn, releaseSynId, test=args.test
)

assert (
args.genieVersion in officialPublic.keys()
), "genieVersion must be one of these: {}.".format(", ".join(officialPublic.keys()))
# A TEST run of the infrastructure always
# maps to a specific folder
if args.test:
officialPublic = {"TESTpublic": "syn12299959"}
else:
officialPublic = consortium_to_public.get_public_to_consortium_synid_mapping(
syn, releaseSynId
)
if args.genieVersion not in officialPublic.keys():
allowed_public_release_names = ", ".join(officialPublic.keys())
raise ValueError(
f"genieVersion must be one of these: {allowed_public_release_names}."
)

args.releaseId = officialPublic[args.genieVersion]
if not args.test and not args.staging:
Expand All @@ -155,9 +163,7 @@ def main(args):
publicReleaseCutOff=args.publicReleaseCutOff,
)

database_to_staging.revise_metadata_files(
syn, args.staging, public_synid, args.genieVersion
)
database_to_staging.revise_metadata_files(syn, public_synid, args.genieVersion)

logger.info("CBIO VALIDATION")
# Must be exit 0 because the validator sometimes fails,
Expand All @@ -182,13 +188,13 @@ def main(args):
cbioLog.write(cbio_decoded_output)
syn.store(synapseclient.File(cbio_log_file, parentId=log_folder_synid))
os.remove(cbio_log_file)
logger.info("REMOVING OLD FILES")
process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
seg_meta_file = "{}/genie_public_meta_cna_hg19_seg.txt".format(
database_to_staging.GENIE_RELEASE_DIR
)
if os.path.exists(seg_meta_file):
os.unlink(seg_meta_file)
# logger.info("REMOVING OLD FILES")
# process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
# seg_meta_file = "{}/genie_public_meta_cna_hg19_seg.txt".format(
# database_to_staging.GENIE_RELEASE_DIR
# )
# if os.path.exists(seg_meta_file):
# os.unlink(seg_meta_file)

logger.info("CREATING LINK VERSION")
folders = database_to_staging.create_link_version(
Expand Down
48 changes: 25 additions & 23 deletions bin/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,16 @@ def main(
debug: Synapse debug flag
skip_mutationsincis: Skip mutation in cis filter
"""
# HACK: Delete all existing files first
process_functions.rmFiles(database_to_staging.GENIE_RELEASE_DIR)

syn = process_functions.synLogin(pemfile, debug=debug)
genie_user = os.environ.get("GENIE_USER")
if pemfile is not None:
genie_pass = process_functions.get_password(pemfile)
else:
genie_pass = None

# HACK: Use project id instead of this...
if test:
databaseSynIdMappingId = "syn11600968"
genie_version = "TESTING"
Expand Down Expand Up @@ -214,11 +217,11 @@ def main(
os.remove(os.path.join(database_to_staging.CASE_LIST_PATH, caselist))
clinical_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR,
"data_clinical_{}.txt".format(genie_version),
"data_clinical.txt",
)
assay_information_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR,
"assay_information_{}.txt".format(genie_version),
"assay_information.txt",
)
create_case_lists.main(
clinical_path,
Expand All @@ -237,20 +240,18 @@ def main(
)

logger.info("REMOVING UNNECESSARY FILES")
genie_files = os.listdir(database_to_staging.GENIE_RELEASE_DIR)
for genie_file in genie_files:
if (
genie_version not in genie_file
and "meta" not in genie_file
and "case_lists" not in genie_file
):
os.remove(os.path.join(database_to_staging.GENIE_RELEASE_DIR, genie_file))
os.remove(clinical_path)
# genie_files = os.listdir(database_to_staging.GENIE_RELEASE_DIR)
# for genie_file in genie_files:
# if (
# genie_version not in genie_file
# and "meta" not in genie_file
# and "case_lists" not in genie_file
# ):
# os.remove(os.path.join(database_to_staging.GENIE_RELEASE_DIR, genie_file))
# os.remove(clinical_path)

logger.info("REVISE METADATA FILES")
database_to_staging.revise_metadata_files(
syn, staging, consortiumSynId, genie_version
)
database_to_staging.revise_metadata_files(syn, consortiumSynId, genie_version)

logger.info("CBIO VALIDATION")

Expand All @@ -276,14 +277,15 @@ def main(
cbio_log.write(cbioOutput.decode("utf-8"))
syn.store(synapseclient.File(cbio_validator_log, parentId=log_folder_synid))
os.remove(cbio_validator_log)
logger.info("REMOVING OLD FILES")

process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
private_cna_meta_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR, "genie_private_meta_cna_hg19_seg.txt"
)
if os.path.exists(private_cna_meta_path):
os.unlink(private_cna_meta_path)
# HACK: Instead of doing this, files should be written to a tempdir...
# logger.info("REMOVING OLD FILES")

# process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
# private_cna_meta_path = os.path.join(
# database_to_staging.GENIE_RELEASE_DIR, "genie_private_meta_cna_hg19_seg.txt"
# )
# if os.path.exists(private_cna_meta_path):
# os.unlink(private_cna_meta_path)

logger.info("CREATING LINK VERSION")
# Returns release and case list folder
Expand Down
2 changes: 1 addition & 1 deletion genie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@

# create version in __init__.py
# https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
__version__ = "14.2.0"
__version__ = "14.3.0"

__all__ = ["__version__"]
138 changes: 84 additions & 54 deletions genie/consortium_to_public.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,36 +56,34 @@ def consortiumToPublic(
databaseSynIdMappingDf,
publicReleaseCutOff=365,
):
cna_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR, "data_CNA_%s.txt" % genie_version
)
cna_path = os.path.join(database_to_staging.GENIE_RELEASE_DIR, "data_CNA.txt")
clinical_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR, "data_clinical_%s.txt" % genie_version
database_to_staging.GENIE_RELEASE_DIR, "data_clinical.txt"
)
clinical_sample_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR,
"data_clinical_sample_%s.txt" % genie_version,
"data_clinical_sample.txt",
)
clinicl_patient_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR,
"data_clinical_patient_%s.txt" % genie_version,
"data_clinical_patient.txt",
)
data_gene_panel_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR, "data_gene_matrix_%s.txt" % genie_version
database_to_staging.GENIE_RELEASE_DIR, "data_gene_matrix.txt"
)
mutations_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR,
"data_mutations_extended_%s.txt" % genie_version,
"data_mutations_extended.txt",
)
fusions_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR, "data_fusions_%s.txt" % genie_version
database_to_staging.GENIE_RELEASE_DIR, "data_fusions.txt"
)
seg_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR,
"genie_public_data_cna_hg19_%s.seg" % genie_version,
"data_cna_hg19.seg",
)
combined_bed_path = os.path.join(
database_to_staging.GENIE_RELEASE_DIR, "genie_combined_%s.bed" % genie_version
database_to_staging.GENIE_RELEASE_DIR, "genie_combined.bed"
)

if not os.path.exists(database_to_staging.GENIE_RELEASE_DIR):
Expand Down Expand Up @@ -331,7 +329,7 @@ def consortiumToPublic(
seg_path,
public_release_preview,
genie_version,
name="genie_public_data_cna_hg19.seg",
name="data_cna_hg19.seg",
)
elif entName == "genomic_information.txt":
bed = syn.get(entId, followLink=True)
Expand All @@ -345,24 +343,24 @@ def consortiumToPublic(
genie_version,
name="genomic_information.txt",
)
elif entName.startswith("data_gene_panel"):
genePanel = syn.get(entId, followLink=True)
# Create new gene panel naming and store
fileName = os.path.basename(genePanel.path)
newFileList = fileName.split("_")
newFileList[-1] = genie_version + ".txt"
newFileName = "_".join(newFileList)
genePanelPath = os.path.join(
database_to_staging.GENIE_RELEASE_DIR, newFileName
)
shutil.copy(genePanel.path, genePanelPath)
del newFileList[-1]
entName = "_".join(newFileList)
entName = entName + ".txt"
genepanel_ent = storeFile(
syn, genePanelPath, public_release_preview, genie_version, name=entName
)
genePanelEntities.append(genepanel_ent)
# elif entName.startswith("data_gene_panel"):
# genePanel = syn.get(entId, followLink=True)
# # Create new gene panel naming and store
# fileName = os.path.basename(genePanel.path)
# # newFileList = fileName.split("_")
# # newFileList[-1] = genie_version + ".txt"
# # newFileName = "_".join(newFileList)
# genePanelPath = os.path.join(
# database_to_staging.GENIE_RELEASE_DIR, fileName
# )
# shutil.copy(genePanel.path, genePanelPath)
# # del newFileList[-1]
# # entName = "_".join(newFileList)
# # entName = entName + ".txt"
# genepanel_ent = storeFile(
# syn, genePanelPath, public_release_preview, genie_version, name=entName
# )
# genePanelEntities.append(genepanel_ent)
else:
ent = syn.get(entId, followLink=True, downloadFile=False)
copiedId = synapseutils.copy(
Expand All @@ -377,32 +375,64 @@ def consortiumToPublic(
copiedEnt = syn.get(copiedId[ent.id], downloadFile=False)
# Set version comment
copiedEnt.versionComment = genie_version
syn.store(copiedEnt, forceVersion=False)
copiedEnt = syn.store(copiedEnt, forceVersion=False)
# There was a time when gene panel files had to be renamed
# with an appended genie version. As of GEN-76, the genie
# version is no longer appended to release files,
# so we just need to track the gene panel entities
if entName.startswith("data_gene_panel"):
genePanelEntities.append(copiedEnt)

return caseListEntities, genePanelEntities


def get_public_to_consortium_synid_mapping(syn, releaseSynId, test=False):
def get_public_to_consortium_synid_mapping(
syn: synapseclient.Synapse, release_synid: str
) -> dict:
"""
Gets the mapping between public version name
and its Synapse ids (Can probably be replaced with folder view)
Gets the mapping between potential public release names and
the consortium release folder
Args:
syn (Synapse): Synapse connection
release_synid (str): Release folder fileview
Returns:
dict: Mapping between potential public release and consortium
release synapse id
"""
temp = synapseutils.walk(syn, releaseSynId)
officialPublic = dict()
for dirpath, dirnames, filenames in temp:
release = os.path.basename(dirpath[0])
# checkRelease = release.split(".")
final = [i.split("-") for i in release.split(".")]
checkRelease = []
for i in final:
checkRelease.extend(i)
if test:
officialPublic["TESTpublic"] = "syn12299959"
else:
if len(checkRelease) == 3 and checkRelease[0] != "0":
if int(checkRelease[1]) > 0:
if checkRelease[0] in ["1", "2"]:
public_release_name = str(int(checkRelease[0]) + 1) + ".0.0"
else:
public_release_name = str(int(checkRelease[0])) + ".0-public"
officialPublic[public_release_name] = dirpath[1]
return officialPublic
# This dict contains the mapping between public release name and
# consortium release folder
public_to_consortium_map = dict()
# release_files = synapseutils.walk(syn, releaseSynId)
# TODO: fix the database to mapping table
consortium_release_folders = syn.tableQuery(
f"SELECT name, id FROM {release_synid} WHERE "
"name NOT LIKE 'Release %' "
"and name NOT LIKE '%-public' "
"and name NOT IN ('case_lists', 'potential_artifacts')"
"ORDER BY name"
)
consortium_release_folders_df = consortium_release_folders.asDataFrame()
# Get major release version
consortium_release_folders_df["major_release"] = [
release.split(".")[0] for release in consortium_release_folders_df["name"]
]
# only keep the latest consortium release for the public release
consortium_release_folders_df.drop_duplicates(
"major_release", keep="last", inplace=True
)

for _, release_info in consortium_release_folders_df.iterrows():
major_release = release_info["major_release"]
# add support for potential patch releases
for num in [0, 1, 2, 3]:
# This has to exist because the first three GENIE releases
# used semantic versioning
if release_info["major_release"] in ["0", "1", "2"]:
public_release_name = f"{int(major_release) + 1}.{num}.0"
public_to_consortium_map[public_release_name] = release_info["id"]
else:
public_release_name = f"{major_release}.{num}-public"
public_to_consortium_map[public_release_name] = release_info["id"]
return public_to_consortium_map
Loading

0 comments on commit 383391d

Please sign in to comment.