Merge pull request #244 from Sage-Bionetworks/fix-bed-preprocess

Since the file is no longer renamed after download, the entity name must be passed in.
Sage-Bionetworks · Mar 11, 2020 · a8286ca · a8286ca
2 parents bf08a41 + fa0b5a3
commit a8286ca
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 24 deletions.
diff --git a/genie/bed.py b/genie/bed.py
@@ -482,17 +482,17 @@ def _process(self, beddf, seq_assay_id, newpath,
                                    gene_panel_path, parentid)
         return final_bed
 
-    def preprocess(self, filepath):
+    def preprocess(self, newpath):
         """
         Standardize and grab seq assay id from the bed file path
 
         Args:
-            filePath: bed file path
+            newpath: bed file path
 
         Returns:
             dict: GENIE seq assay id
         """
-        seq_assay_id = os.path.basename(filepath).replace(".bed", "")
+        seq_assay_id = os.path.basename(newpath).replace(".bed", "")
         seq_assay_id = seq_assay_id.upper().replace("_", "-")
         return {'seq_assay_id': seq_assay_id}
 

diff --git a/genie/clinical.py b/genie/clinical.py
@@ -196,7 +196,7 @@ def _process(self, clinical, clinicalTemplate):
 
         return(clinicalRemapped)
 
-    def preprocess(self, filepath):
+    def preprocess(self, newpath):
         '''
         Gather preprocess parameters
 
@@ -207,6 +207,7 @@ def preprocess(self, filepath):
             dict with keys - 'clinicalTemplate', 'sample', 'patient',
                              'patientCols', 'sampleCols'
         '''
+        entity_name = os.path.basename(newpath)
         # These synapse ids for the clinical tier release scope is
         # hardcoded because it never changes
         patientColsTable = self.syn.tableQuery(
@@ -218,11 +219,11 @@ def preprocess(self, filepath):
             'and inClinicalDb is True')
         sampleCols = sampleColsTable.asDataFrame()['fieldName'].tolist()
 
-        if "patient" in filepath.lower():
+        if "patient" in entity_name.lower():
             clinicalTemplate = pd.DataFrame(columns=patientCols)
             sample = False
             patient = True
-        elif "sample" in filepath.lower():
+        elif "sample" in entity_name.lower():
             clinicalTemplate = pd.DataFrame(columns=sampleCols)
             sample = True
             patient = False

diff --git a/genie/database_to_staging.py b/genie/database_to_staging.py
@@ -1146,23 +1146,23 @@ def store_bed_files(syn, genie_version, beddf, seq_assay_ids,
         release_synid: Synapse id to store release file
     '''
     logger.info("STORING COMBINED BED FILE")
-    combined_bed_path = os.path.join(
-        GENIE_RELEASE_DIR, 'genomic_information_%s.txt' % genie_version)
+    combined_bed_path = os.path.join(GENIE_RELEASE_DIR,
+                                     'genomic_information_%s.txt' % genie_version)  # pylint: disable=line-too-long
     if not current_release_staging:
         for seq_assay in beddf['SEQ_ASSAY_ID'].unique():
             bed_seq_df = beddf[beddf['SEQ_ASSAY_ID'] == seq_assay]
             center = seq_assay.split("-")[0]
-            bed_seq_df = \
-                bed_seq_df[bed_seq_df['Hugo_Symbol'] != bed_seq_df['ID']]
+            bed_seq_df = bed_seq_df[bed_seq_df['Hugo_Symbol'] != bed_seq_df['ID']]  # pylint: disable=line-too-long
+            # There should always be a match here, because a SEQ_ASSAY_ID
+            # should never start with anything other than its center name.
+            # If there is no match, check the bed db for the SEQ_ASSAY_ID.
+            center_ind = center_mappingdf['center'] == center
             if not bed_seq_df.empty:
-                bed_seq_df.to_csv(
-                    BED_DIFFS_SEQASSAY_PATH % seq_assay,
-                    index=False)
-                store_file(
-                    syn, BED_DIFFS_SEQASSAY_PATH % seq_assay,
-                    genieVersion=genie_version,
-                    parent=center_mappingdf['stagingSynId'][
-                        center_mappingdf['center'] == center][0])
+                bed_seq_df.to_csv(BED_DIFFS_SEQASSAY_PATH % seq_assay,
+                                  index=False)
+                store_file(syn, BED_DIFFS_SEQASSAY_PATH % seq_assay,
+                           genieVersion=genie_version,
+                           parent=center_mappingdf['stagingSynId'][center_ind][0])  # pylint: disable=line-too-long
     # This clinicalDf is already filtered through most of the filters
     beddf = beddf[beddf['SEQ_ASSAY_ID'].isin(seq_assay_ids)]
     beddf.to_csv(combined_bed_path, sep="\t", index=False)

diff --git a/genie/example_filetype_format.py b/genie/example_filetype_format.py
@@ -83,15 +83,16 @@ def process_steps(self, df, **kwargs):
         '''
         pass
 
-    def preprocess(self, filePath):
+    def preprocess(self, newpath):
         '''
-        This is for any preprocessing that has to occur to the filepath name
-        to add to kwargs for processing.
+        This is for any preprocessing that has to occur to the entity name
+        to add to kwargs for processing. The entity name is included in
+        the new path.
 
         Args:
-            filePath: Path to file
+            newpath: Path to file
         '''
-        return(dict())
+        return dict()
 
     def process(self, filePath, **kwargs):
         '''
@@ -104,7 +105,7 @@ def process(self, filePath, **kwargs):
         Returns:
             str: file path of processed file
         '''
-        preprocess_args = self.preprocess(filePath)
+        preprocess_args = self.preprocess(kwargs.get('newPath'))
         kwargs.update(preprocess_args)
         mykwargs = {}
         for required_parameter in self._process_kwargs: