Skip to content

Commit

Permalink
Merge pull request #244 from Sage-Bionetworks/fix-bed-preprocess
Browse files Browse the repository at this point in the history
Since the file is no longer renamed after download, must pass in entity name
  • Loading branch information
thomasyu888 authored Mar 11, 2020
2 parents bf08a41 + fa0b5a3 commit a8286ca
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 24 deletions.
6 changes: 3 additions & 3 deletions genie/bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,17 +482,17 @@ def _process(self, beddf, seq_assay_id, newpath,
gene_panel_path, parentid)
return final_bed

def preprocess(self, filepath):
def preprocess(self, newpath):
"""
Standardize and grab seq assay id from the bed file path
Args:
filePath: bed file path
newpath: bed file path
Returns:
dict: GENIE seq assay id
"""
seq_assay_id = os.path.basename(filepath).replace(".bed", "")
seq_assay_id = os.path.basename(newpath).replace(".bed", "")
seq_assay_id = seq_assay_id.upper().replace("_", "-")
return {'seq_assay_id': seq_assay_id}

Expand Down
7 changes: 4 additions & 3 deletions genie/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ def _process(self, clinical, clinicalTemplate):

return(clinicalRemapped)

def preprocess(self, filepath):
def preprocess(self, newpath):
'''
Gather preprocess parameters
Expand All @@ -207,6 +207,7 @@ def preprocess(self, filepath):
dict with keys - 'clinicalTemplate', 'sample', 'patient',
'patientCols', 'sampleCols'
'''
entity_name = os.path.basename(newpath)
# These synapse ids for the clinical tier release scope are
# hardcoded because they never change
patientColsTable = self.syn.tableQuery(
Expand All @@ -218,11 +219,11 @@ def preprocess(self, filepath):
'and inClinicalDb is True')
sampleCols = sampleColsTable.asDataFrame()['fieldName'].tolist()

if "patient" in filepath.lower():
if "patient" in entity_name.lower():
clinicalTemplate = pd.DataFrame(columns=patientCols)
sample = False
patient = True
elif "sample" in filepath.lower():
elif "sample" in entity_name.lower():
clinicalTemplate = pd.DataFrame(columns=sampleCols)
sample = True
patient = False
Expand Down
24 changes: 12 additions & 12 deletions genie/database_to_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -1146,23 +1146,23 @@ def store_bed_files(syn, genie_version, beddf, seq_assay_ids,
release_synid: Synapse id to store release file
'''
logger.info("STORING COMBINED BED FILE")
combined_bed_path = os.path.join(
GENIE_RELEASE_DIR, 'genomic_information_%s.txt' % genie_version)
combined_bed_path = os.path.join(GENIE_RELEASE_DIR,
'genomic_information_%s.txt' % genie_version) # pylint: disable=line-too-long
if not current_release_staging:
for seq_assay in beddf['SEQ_ASSAY_ID'].unique():
bed_seq_df = beddf[beddf['SEQ_ASSAY_ID'] == seq_assay]
center = seq_assay.split("-")[0]
bed_seq_df = \
bed_seq_df[bed_seq_df['Hugo_Symbol'] != bed_seq_df['ID']]
bed_seq_df = bed_seq_df[bed_seq_df['Hugo_Symbol'] != bed_seq_df['ID']] # pylint: disable=line-too-long
# There should always be a match here, because there should never
# be a SEQ_ASSAY_ID that does not start with the center name.
# If there is, check the bed db for SEQ_ASSAY_ID
center_ind = center_mappingdf['center'] == center
if not bed_seq_df.empty:
bed_seq_df.to_csv(
BED_DIFFS_SEQASSAY_PATH % seq_assay,
index=False)
store_file(
syn, BED_DIFFS_SEQASSAY_PATH % seq_assay,
genieVersion=genie_version,
parent=center_mappingdf['stagingSynId'][
center_mappingdf['center'] == center][0])
bed_seq_df.to_csv(BED_DIFFS_SEQASSAY_PATH % seq_assay,
index=False)
store_file(syn, BED_DIFFS_SEQASSAY_PATH % seq_assay,
genieVersion=genie_version,
parent=center_mappingdf['stagingSynId'][center_ind][0]) # pylint: disable=line-too-long
# This clinicalDf is already filtered through most of the filters
beddf = beddf[beddf['SEQ_ASSAY_ID'].isin(seq_assay_ids)]
beddf.to_csv(combined_bed_path, sep="\t", index=False)
Expand Down
13 changes: 7 additions & 6 deletions genie/example_filetype_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,16 @@ def process_steps(self, df, **kwargs):
'''
pass

def preprocess(self, filePath):
def preprocess(self, newpath):
'''
This is for any preprocessing that has to occur to the filepath name
to add to kwargs for processing.
This is for any preprocessing that has to occur to the entity name
to add to kwargs for processing. The entity name is included in
the new path.
Args:
filePath: Path to file
newpath: Path to file
'''
return(dict())
return dict()

def process(self, filePath, **kwargs):
'''
Expand All @@ -104,7 +105,7 @@ def process(self, filePath, **kwargs):
Returns:
str: file path of processed file
'''
preprocess_args = self.preprocess(filePath)
preprocess_args = self.preprocess(kwargs.get('newPath'))
kwargs.update(preprocess_args)
mykwargs = {}
for required_parameter in self._process_kwargs:
Expand Down

0 comments on commit a8286ca

Please sign in to comment.