Skip to content

Commit

Permalink
Improve maf processing (#475)
Browse files Browse the repository at this point in the history
* Update number of rows read

* Revise function to move and configure maf

* Fix tests

* Create patch version
  • Loading branch information
thomasyu888 authored Jul 17, 2022
1 parent 57d02ca commit ec009b5
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 66 deletions.
2 changes: 1 addition & 1 deletion genie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@

# create version in __init__.py
# https://packaging.python.org/en/latest/guides/single-sourcing-package-version/
__version__ = "14.0.1"
__version__ = "14.1.0"

__all__ = ["__version__"]
59 changes: 26 additions & 33 deletions genie/process_mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,24 +106,10 @@
"HGNC_ID",
"PUBMED",
"PICK",
"Exon_Number",
]


def _rename_column_headers(dataframe: pd.DataFrame, col_map: dict) -> pd.DataFrame:
"""Rename dataframe column headers
Args:
dataframe: pandas dataframe
col_map: Column Mapping {column_name: new_column_name,}
Returns:
Dataframe with new columns
"""
dataframe = dataframe.rename(columns=col_map)
return dataframe


def _convert_to_str_dtype(column_types, known_string_cols):
"""Sometimes the determined dtype is incorrect based off the first
100 rows, update the incorrect dtypes.
Expand All @@ -136,28 +122,35 @@ def _convert_to_str_dtype(column_types, known_string_cols):

def determine_dtype(path: str):
"""Reads in a dataframe partially and determines the dtype of columns"""
subset_df = pd.read_csv(path, nrows=100, sep="\t")
dtypes = subset_df.dtypes
colnames = dtypes.index
types = [i.name for i in dtypes.values]
column_types = dict(zip(colnames, types))
# Change this nrows to 5000 so that it better encapsulates the types
subset_df = pd.read_csv(path, nrows=5000, sep="\t")
column_types = subset_df.dtypes.to_dict()
return column_types


def move_maf(mutation_path, input_files_dir):
def move_and_configure_maf(mutation_path: str, input_files_dir: str) -> str:
"""Moves maf files into processing directory. Maf file's column headers
are renamed if necessary"""
header_df = pd.read_csv(mutation_path, sep="\t", index_col=0, nrows=1, comment="#")
are renamed if necessary and floating .0 suffixes are removed.
Args:
mutation_path (str): Mutation file path
input_files_dir (str): Input file directory
Returns:
str: Filepath to moved and configured maf
"""
filename = os.path.basename(mutation_path)
new_filepath = os.path.join(input_files_dir, filename)
column_types = determine_dtype(mutation_path)
new_column_types = _convert_to_str_dtype(column_types, KNOWN_STRING_COLS)
mafdf = pd.read_csv(mutation_path, sep="\t", dtype=new_column_types)
# If any column headers need to be remapped, remap
if sum(header_df.columns.isin(MAF_COL_MAPPING.keys())) > 0:
filename = os.path.basename(mutation_path)
column_types = determine_dtype(mutation_path)
new_column_types = _convert_to_str_dtype(column_types, KNOWN_STRING_COLS)
mafdf = pd.read_csv(mutation_path, sep="\t", dtype=new_column_types)
mafdf = _rename_column_headers(mafdf, col_map=MAF_COL_MAPPING)
mafdf.to_csv(os.path.join(input_files_dir, filename), sep="\t", index=False)
else:
shutil.copy(mutation_path, input_files_dir)
mafdf = mafdf.rename(columns=MAF_COL_MAPPING)
# Must remove floating .0 or else processing will fail for genome nexus
maf_text = process_functions.removePandasDfFloat(mafdf)
with open(new_filepath, "w") as new_maf_f:
new_maf_f.write(maf_text)
return new_filepath


def move_mutation(mutation_path, input_files_dir):
Expand All @@ -166,7 +159,7 @@ def move_mutation(mutation_path, input_files_dir):
if mutation_path.endswith(".vcf"):
shutil.copy(mutation_path, input_files_dir)
else:
move_maf(mutation_path, input_files_dir)
move_and_configure_maf(mutation_path, input_files_dir)


def process_mutation_workflow(
Expand Down
45 changes: 13 additions & 32 deletions tests/test_process_mutation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test process mutation functions"""
from distutils.command.build import build
import shutil
import subprocess
import tempfile
Expand Down Expand Up @@ -38,26 +39,13 @@ def test_format_maf():
assert expected_mafdf.equals(formatted_mafdf[expected_mafdf.columns])


def test__rename_column_headers():
"""Tests the renaming of column headers"""
testdf = pd.DataFrame({"foo": ["bar"], "bar": ["baz"]})
col_map = {"foo": "new_foo", "bar": "new_bar"}
newdf = process_mutation._rename_column_headers(testdf, col_map)
assert all(newdf.columns == ["new_foo", "new_bar"])


class TestDtype:
def setup(self):
self.testdf = pd.DataFrame({"foo": [1], "bar": ["baz"]})
self.column_types = {"foo": "int64", "bar": "object"}
self.mutation_path = "/path/test.csv"
self.input_dir = "/my/dir/here"

def test__rename_column_headers(self):
"""Tests the renaming of column headers"""
col_map = {"foo": "new_foo", "bar": "new_bar"}
newdf = process_mutation._rename_column_headers(self.testdf, col_map)
assert all(newdf.columns == ["new_foo", "new_bar"])
self.final_maf_path = "/my/dir/here/test.csv"

def test_determine_dtype(self):
"""Tests determining dtype"""
Expand All @@ -72,17 +60,6 @@ def test__convert_to_str_dtype(self):
)
assert new_column_types == {"foo": "object", "bar": "object"}

def test_move_maf_copy(self):
"""Test moving mafs that don't need to rename columns"""
with patch.object(
pd, "read_csv", return_value=self.testdf
) as patch_read, patch.object(shutil, "copy") as patch_copy:
process_mutation.move_maf(self.mutation_path, self.input_dir)
patch_read.assert_called_once_with(
self.mutation_path, sep="\t", index_col=0, nrows=1, comment="#"
)
patch_copy.assert_called_once_with(self.mutation_path, self.input_dir)

def test_move_maf_rename(self):
"""Test moving mafs when maf column headers need to be remapped"""
testdf = pd.DataFrame({"CHROMOSOME": [1]})
Expand All @@ -91,18 +68,22 @@ def test_move_maf_rename(self):
) as patch_determine, patch.object(
process_mutation, "_convert_to_str_dtype", return_value=self.column_types
) as patch_convert, patch.object(
process_mutation, "_rename_column_headers"
) as patch_rename, patch.object(
testdf, "to_csv"
):
process_mutation.move_maf(self.mutation_path, self.input_dir)
testdf, "rename"
) as patch_rename, patch(
"builtins.open"
) as patch_open:
moved_maf = process_mutation.move_and_configure_maf(
self.mutation_path, self.input_dir
)
patch_determine.assert_called_once_with(self.mutation_path)
patch_convert.assert_called_once_with(
self.column_types, process_mutation.KNOWN_STRING_COLS
)
patch_rename.assert_called_once_with(
testdf, col_map=process_mutation.MAF_COL_MAPPING
columns=process_mutation.MAF_COL_MAPPING
)
patch_open.assert_called_once_with(self.final_maf_path, "w")
assert moved_maf == self.final_maf_path

def test_move_mutation_vcf(self):
"""Test moving vcfs"""
Expand All @@ -112,7 +93,7 @@ def test_move_mutation_vcf(self):

def test_move_mutation_maf(self):
"""Test moving maf files"""
with patch.object(process_mutation, "move_maf") as patch_move:
with patch.object(process_mutation, "move_and_configure_maf") as patch_move:
process_mutation.move_mutation(self.mutation_path, self.input_dir)
patch_move.assert_called_once_with(self.mutation_path, self.input_dir)

Expand Down

0 comments on commit ec009b5

Please sign in to comment.