From ec009b53d6f7c363b9758c008de07dfeb08f1408 Mon Sep 17 00:00:00 2001 From: Thomas Yu Date: Sun, 17 Jul 2022 15:19:09 -0700 Subject: [PATCH] Improve maf processing (#475) * Update number of rows read * Revise function to move and configure maf * Fix tests * Create patch version --- genie/__init__.py | 2 +- genie/process_mutation.py | 59 +++++++++++++++------------------- tests/test_process_mutation.py | 45 ++++++++------------------ 3 files changed, 40 insertions(+), 66 deletions(-) diff --git a/genie/__init__.py b/genie/__init__.py index f68b09d2..200246ee 100644 --- a/genie/__init__.py +++ b/genie/__init__.py @@ -7,6 +7,6 @@ # create version in __init__.py # https://packaging.python.org/en/latest/guides/single-sourcing-package-version/ -__version__ = "14.0.1" +__version__ = "14.1.0" __all__ = ["__version__"] diff --git a/genie/process_mutation.py b/genie/process_mutation.py index 047f229e..40f6a48a 100644 --- a/genie/process_mutation.py +++ b/genie/process_mutation.py @@ -106,24 +106,10 @@ "HGNC_ID", "PUBMED", "PICK", + "Exon_Number", ] -def _rename_column_headers(dataframe: pd.DataFrame, col_map: dict) -> pd.DataFrame: - """Rename dataframe column headers - - Args: - dataframe: pandas dataframe - col_map: Column Mapping {column_name: new_column_name,} - - Returns: - Dataframe with new columns - - """ - dataframe = dataframe.rename(columns=col_map) - return dataframe - - def _convert_to_str_dtype(column_types, known_string_cols): """Sometimes the deteremined dtype is incorrect based off the first 100 rows, update the incorrect dtypes. @@ -136,28 +122,35 @@ def _convert_to_str_dtype(column_types, known_string_cols): def determine_dtype(path: str): """Reads in a dataframe partially and determines the dtype of columns""" - subset_df = pd.read_csv(path, nrows=100, sep="\t") - dtypes = subset_df.dtypes - colnames = dtypes.index - types = [i.name for i in dtypes.values] - column_types = dict(zip(colnames, types)) + # Change this nrows to 5000 so that it better encapsulates the types + subset_df = pd.read_csv(path, nrows=5000, sep="\t") + column_types = subset_df.dtypes.to_dict() return column_types -def move_maf(mutation_path, input_files_dir): +def move_and_configure_maf(mutation_path: str, input_files_dir: str) -> str: """Moves maf files into processing directory. Maf file's column headers - are renamed if necessary""" - header_df = pd.read_csv(mutation_path, sep="\t", index_col=0, nrows=1, comment="#") + are renamed if necessary and .0 are stripped. + + Args: + mutation_path (str): Mutation file path + input_files_dir (str): Input file directory + + Returns: + str: Filepath to moved and configured maf + """ + filename = os.path.basename(mutation_path) + new_filepath = os.path.join(input_files_dir, filename) + column_types = determine_dtype(mutation_path) + new_column_types = _convert_to_str_dtype(column_types, KNOWN_STRING_COLS) + mafdf = pd.read_csv(mutation_path, sep="\t", dtype=new_column_types) # If any column headers need to be remapped, remap - if sum(header_df.columns.isin(MAF_COL_MAPPING.keys())) > 0: - filename = os.path.basename(mutation_path) - column_types = determine_dtype(mutation_path) - new_column_types = _convert_to_str_dtype(column_types, KNOWN_STRING_COLS) - mafdf = pd.read_csv(mutation_path, sep="\t", dtype=new_column_types) - mafdf = _rename_column_headers(mafdf, col_map=MAF_COL_MAPPING) - mafdf.to_csv(os.path.join(input_files_dir, filename), sep="\t", index=False) - else: - shutil.copy(mutation_path, input_files_dir) + mafdf = mafdf.rename(columns=MAF_COL_MAPPING) + # Must remove floating .0 or else processing will fail for genome nexus + maf_text = process_functions.removePandasDfFloat(mafdf) + with open(new_filepath, "w") as new_maf_f: + new_maf_f.write(maf_text) + return new_filepath def move_mutation(mutation_path, input_files_dir): @@ -166,7 +159,7 @@ def move_mutation(mutation_path, input_files_dir): if mutation_path.endswith(".vcf"): shutil.copy(mutation_path, input_files_dir) else: - move_maf(mutation_path, input_files_dir) + move_and_configure_maf(mutation_path, input_files_dir) def process_mutation_workflow( diff --git a/tests/test_process_mutation.py b/tests/test_process_mutation.py index 35906ec1..deb09d52 100644 --- a/tests/test_process_mutation.py +++ b/tests/test_process_mutation.py @@ -1,4 +1,5 @@ """Test process mutation functions""" +from distutils.command.build import build import shutil import subprocess import tempfile @@ -38,26 +39,13 @@ def test_format_maf(): assert expected_mafdf.equals(formatted_mafdf[expected_mafdf.columns]) -def test__rename_column_headers(): - """Tests the renaming of column headers""" - testdf = pd.DataFrame({"foo": ["bar"], "bar": ["baz"]}) - col_map = {"foo": "new_foo", "bar": "new_bar"} - newdf = process_mutation._rename_column_headers(testdf, col_map) - assert all(newdf.columns == ["new_foo", "new_bar"]) - - class TestDtype: def setup(self): self.testdf = pd.DataFrame({"foo": [1], "bar": ["baz"]}) self.column_types = {"foo": "int64", "bar": "object"} self.mutation_path = "/path/test.csv" self.input_dir = "/my/dir/here" - - def test__rename_column_headers(self): - """Tests the renaming of column headers""" - col_map = {"foo": "new_foo", "bar": "new_bar"} - newdf = process_mutation._rename_column_headers(self.testdf, col_map) - assert all(newdf.columns == ["new_foo", "new_bar"]) + self.final_maf_path = "/my/dir/here/test.csv" def test_determine_dtype(self): """Tests determining dtype""" @@ -72,17 +60,6 @@ def test__convert_to_str_dtype(self): ) assert new_column_types == {"foo": "object", "bar": "object"} - def test_move_maf_copy(self): - """Test moving mafs that don't need to rename columns""" - with patch.object( - pd, "read_csv", return_value=self.testdf - ) as patch_read, patch.object(shutil, "copy") as patch_copy: - process_mutation.move_maf(self.mutation_path, self.input_dir) - patch_read.assert_called_once_with( - self.mutation_path, sep="\t", index_col=0, nrows=1, comment="#" - ) - patch_copy.assert_called_once_with(self.mutation_path, self.input_dir) - def test_move_maf_rename(self): """Test moving mafs when maf column headers need to be remapped""" testdf = pd.DataFrame({"CHROMOSOME": [1]}) @@ -91,18 +68,22 @@ def test_move_maf_rename(self): ) as patch_determine, patch.object( process_mutation, "_convert_to_str_dtype", return_value=self.column_types ) as patch_convert, patch.object( - process_mutation, "_rename_column_headers" - ) as patch_rename, patch.object( - testdf, "to_csv" - ): - process_mutation.move_maf(self.mutation_path, self.input_dir) + testdf, "rename" + ) as patch_rename, patch( + "builtins.open" + ) as patch_open: + moved_maf = process_mutation.move_and_configure_maf( + self.mutation_path, self.input_dir + ) patch_determine.assert_called_once_with(self.mutation_path) patch_convert.assert_called_once_with( self.column_types, process_mutation.KNOWN_STRING_COLS ) patch_rename.assert_called_once_with( - testdf, col_map=process_mutation.MAF_COL_MAPPING + columns=process_mutation.MAF_COL_MAPPING ) + patch_open.assert_called_once_with(self.final_maf_path, "w") + assert moved_maf == self.final_maf_path def test_move_mutation_vcf(self): """Test moving vcfs""" @@ -112,7 +93,7 @@ def test_move_mutation_vcf(self): def test_move_mutation_maf(self): """Test moving maf files""" - with patch.object(process_mutation, "move_maf") as patch_move: + with patch.object(process_mutation, "move_and_configure_maf") as patch_move: process_mutation.move_mutation(self.mutation_path, self.input_dir) patch_move.assert_called_once_with(self.mutation_path, self.input_dir)