Improve maf processing (#475)

* Update number of rows read
* Revise function to move and configure maf
* Fix tests
* Create patch version
Sage-Bionetworks · Jul 17, 2022 · ec009b5 · ec009b5
1 parent 57d02ca
commit ec009b5
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 66 deletions.
diff --git a/genie/__init__.py b/genie/__init__.py
@@ -7,6 +7,6 @@
 
 # create version in __init__.py
 # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
-__version__ = "14.0.1"
+__version__ = "14.1.0"
 
 __all__ = ["__version__"]
diff --git a/genie/process_mutation.py b/genie/process_mutation.py
@@ -106,24 +106,10 @@
     "HGNC_ID",
     "PUBMED",
     "PICK",
+    "Exon_Number",
 ]
 
 
-def _rename_column_headers(dataframe: pd.DataFrame, col_map: dict) -> pd.DataFrame:
-    """Rename dataframe column headers
-
-    Args:
-        dataframe: pandas dataframe
-        col_map: Column Mapping {column_name: new_column_name,}
-
-    Returns:
-        Dataframe with new columns
-
-    """
-    dataframe = dataframe.rename(columns=col_map)
-    return dataframe
-
-
 def _convert_to_str_dtype(column_types, known_string_cols):
     """Sometimes the determined dtype is incorrect based off the first
     100 rows, update the incorrect dtypes.
@@ -136,28 +122,35 @@ def _convert_to_str_dtype(column_types, known_string_cols):
 
 def determine_dtype(path: str):
     """Reads in a dataframe partially and determines the dtype of columns"""
-    subset_df = pd.read_csv(path, nrows=100, sep="\t")
-    dtypes = subset_df.dtypes
-    colnames = dtypes.index
-    types = [i.name for i in dtypes.values]
-    column_types = dict(zip(colnames, types))
+    # Read the first 5000 rows (previously 100) so the inferred dtypes better reflect the file
+    subset_df = pd.read_csv(path, nrows=5000, sep="\t")
+    column_types = subset_df.dtypes.to_dict()
     return column_types
 
 
-def move_maf(mutation_path, input_files_dir):
+def move_and_configure_maf(mutation_path: str, input_files_dir: str) -> str:
     """Moves maf files into processing directory. Maf file's column headers
-    are renamed if necessary"""
-    header_df = pd.read_csv(mutation_path, sep="\t", index_col=0, nrows=1, comment="#")
+    are renamed if necessary and .0 are stripped.
+
+    Args:
+        mutation_path (str): Mutation file path
+        input_files_dir (str): Input file directory
+
+    Returns:
+        str: Filepath to moved and configured maf
+    """
+    filename = os.path.basename(mutation_path)
+    new_filepath = os.path.join(input_files_dir, filename)
+    column_types = determine_dtype(mutation_path)
+    new_column_types = _convert_to_str_dtype(column_types, KNOWN_STRING_COLS)
+    mafdf = pd.read_csv(mutation_path, sep="\t", dtype=new_column_types)
     # If any column headers need to be remapped, remap
-    if sum(header_df.columns.isin(MAF_COL_MAPPING.keys())) > 0:
-        filename = os.path.basename(mutation_path)
-        column_types = determine_dtype(mutation_path)
-        new_column_types = _convert_to_str_dtype(column_types, KNOWN_STRING_COLS)
-        mafdf = pd.read_csv(mutation_path, sep="\t", dtype=new_column_types)
-        mafdf = _rename_column_headers(mafdf, col_map=MAF_COL_MAPPING)
-        mafdf.to_csv(os.path.join(input_files_dir, filename), sep="\t", index=False)
-    else:
-        shutil.copy(mutation_path, input_files_dir)
+    mafdf = mafdf.rename(columns=MAF_COL_MAPPING)
+    # Must remove floating .0 or else processing will fail for genome nexus
+    maf_text = process_functions.removePandasDfFloat(mafdf)
+    with open(new_filepath, "w") as new_maf_f:
+        new_maf_f.write(maf_text)
+    return new_filepath
 
 
 def move_mutation(mutation_path, input_files_dir):
@@ -166,7 +159,7 @@ def move_mutation(mutation_path, input_files_dir):
     if mutation_path.endswith(".vcf"):
         shutil.copy(mutation_path, input_files_dir)
     else:
-        move_maf(mutation_path, input_files_dir)
+        move_and_configure_maf(mutation_path, input_files_dir)
 
 
 def process_mutation_workflow(

diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py
@@ -1,4 +1,5 @@
 """Test process mutation functions"""
+from distutils.command.build import build
 import shutil
 import subprocess
 import tempfile
@@ -38,26 +39,13 @@ def test_format_maf():
     assert expected_mafdf.equals(formatted_mafdf[expected_mafdf.columns])
 
 
-def test__rename_column_headers():
-    """Tests the renaming of column headers"""
-    testdf = pd.DataFrame({"foo": ["bar"], "bar": ["baz"]})
-    col_map = {"foo": "new_foo", "bar": "new_bar"}
-    newdf = process_mutation._rename_column_headers(testdf, col_map)
-    assert all(newdf.columns == ["new_foo", "new_bar"])
-
-
 class TestDtype:
     def setup(self):
         self.testdf = pd.DataFrame({"foo": [1], "bar": ["baz"]})
         self.column_types = {"foo": "int64", "bar": "object"}
         self.mutation_path = "/path/test.csv"
         self.input_dir = "/my/dir/here"
-
-    def test__rename_column_headers(self):
-        """Tests the renaming of column headers"""
-        col_map = {"foo": "new_foo", "bar": "new_bar"}
-        newdf = process_mutation._rename_column_headers(self.testdf, col_map)
-        assert all(newdf.columns == ["new_foo", "new_bar"])
+        self.final_maf_path = "/my/dir/here/test.csv"
 
     def test_determine_dtype(self):
         """Tests determining dtype"""
@@ -72,17 +60,6 @@ def test__convert_to_str_dtype(self):
         )
         assert new_column_types == {"foo": "object", "bar": "object"}
 
-    def test_move_maf_copy(self):
-        """Test moving mafs that don't need to rename columns"""
-        with patch.object(
-            pd, "read_csv", return_value=self.testdf
-        ) as patch_read, patch.object(shutil, "copy") as patch_copy:
-            process_mutation.move_maf(self.mutation_path, self.input_dir)
-            patch_read.assert_called_once_with(
-                self.mutation_path, sep="\t", index_col=0, nrows=1, comment="#"
-            )
-            patch_copy.assert_called_once_with(self.mutation_path, self.input_dir)
-
     def test_move_maf_rename(self):
         """Test moving mafs when maf column headers need to be remapped"""
         testdf = pd.DataFrame({"CHROMOSOME": [1]})
@@ -91,18 +68,22 @@ def test_move_maf_rename(self):
         ) as patch_determine, patch.object(
             process_mutation, "_convert_to_str_dtype", return_value=self.column_types
         ) as patch_convert, patch.object(
-            process_mutation, "_rename_column_headers"
-        ) as patch_rename, patch.object(
-            testdf, "to_csv"
-        ):
-            process_mutation.move_maf(self.mutation_path, self.input_dir)
+            testdf, "rename"
+        ) as patch_rename, patch(
+            "builtins.open"
+        ) as patch_open:
+            moved_maf = process_mutation.move_and_configure_maf(
+                self.mutation_path, self.input_dir
+            )
             patch_determine.assert_called_once_with(self.mutation_path)
             patch_convert.assert_called_once_with(
                 self.column_types, process_mutation.KNOWN_STRING_COLS
             )
             patch_rename.assert_called_once_with(
-                testdf, col_map=process_mutation.MAF_COL_MAPPING
+                columns=process_mutation.MAF_COL_MAPPING
             )
+            patch_open.assert_called_once_with(self.final_maf_path, "w")
+            assert moved_maf == self.final_maf_path
 
     def test_move_mutation_vcf(self):
         """Test moving vcfs"""
@@ -112,7 +93,7 @@ def test_move_mutation_vcf(self):
 
     def test_move_mutation_maf(self):
         """Test moving maf files"""
-        with patch.object(process_mutation, "move_maf") as patch_move:
+        with patch.object(process_mutation, "move_and_configure_maf") as patch_move:
             process_mutation.move_mutation(self.mutation_path, self.input_dir)
             patch_move.assert_called_once_with(self.mutation_path, self.input_dir)