Skip to content

Commit

Permalink
Merge pull request #530 from Sage-Bionetworks/gen-756-fix-maf-validation
Browse files Browse the repository at this point in the history
* update maf read_csv
* black
* add fix for mixed dtypes, add tests
* add tests to catch exceptions/warnings
* add comments about mixed dtypes handling in pandas
* black
* remove line of code
* correct warnings reset placement
* add large mixed dtype csv for testing

---------

Co-authored-by: Jake VanCampen <jake.vancampen7@gmail.com>
Co-authored-by: Rixing Xu <rxu@w197.local>
  • Loading branch information
3 people authored Jul 10, 2023
2 parents b0e8cf8 + e42af60 commit 4805754
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 14 deletions.
23 changes: 23 additions & 0 deletions genie/transform.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""This module contains all the transformation functions used throughout the GENIE
package"""
import warnings

import pandas as pd
from pandas.api.types import is_integer_dtype, is_float_dtype
Expand Down Expand Up @@ -41,3 +42,25 @@ def _convert_float_col_with_nas_to_int(df: pd.DataFrame, col: str) -> list:
return new_vals
else:
return df[col].tolist()


def _convert_df_with_mixed_dtypes(read_csv_params: dict) -> pd.DataFrame:
"""This checks if a dataframe read in normally comes out with mixed data types (which happens
when low_memory = True because read_csv parses in chunks and guesses dtypes by chunk) and
converts a dataframe with mixed datatypes to one datatype.
Args:
read_csv_params (dict): of input params and values to pandas's read_csv function.
needs to include filepath to dataset to be read in
Returns:
pd.DataFrame : The dataset read in
"""
warnings.simplefilter("error", pd.errors.DtypeWarning)
try:
df = pd.read_csv(**read_csv_params, low_memory=True)
except pd.errors.DtypeWarning:
# setting engine to c as that is the only engine that works with low_memory=False
df = pd.read_csv(**read_csv_params, low_memory=False, engine="c")
warnings.resetwarnings()
return df
23 changes: 12 additions & 11 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from io import StringIO
import os
import logging
import warnings

import pandas as pd

from genie.example_filetype_format import FileTypeFormat
from genie import process_functions, validate
from genie import process_functions, transform, validate

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -295,13 +296,12 @@ def _get_dataframe(self, filePathList):
"Number of fields in a line do not match the "
"expected number of columns"
)

mutationdf = pd.read_csv(
filePathList[0],
sep="\t",
comment="#",
read_csv_params = {
"filepath_or_buffer": filePathList[0],
"sep": "\t",
"comment": "#",
# Keep the value 'NA'
na_values=[
"na_values": [
"-1.#IND",
"1.#QNAN",
"1.#IND",
Expand All @@ -317,12 +317,13 @@ def _get_dataframe(self, filePathList):
"-nan",
"",
],
keep_default_na=False,
"keep_default_na": False,
# This is to check if people write files
# with R, quote=T
quoting=3,
"quoting": 3,
# Retain completely blank lines so that
# validator will cause the file to fail
skip_blank_lines=False,
)
"skip_blank_lines": False,
}
mutationdf = transform._convert_df_with_mixed_dtypes(read_csv_params)
return mutationdf
59 changes: 58 additions & 1 deletion tests/test_maf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from unittest.mock import patch
from io import BytesIO, StringIO
from unittest.mock import mock_open, patch

import pandas as pd
import pytest
Expand Down Expand Up @@ -271,3 +272,59 @@ def test_valid__check_tsa1_tsa2(df):
"""Test valid TSA1 and TSA2"""
error = genie_registry.maf._check_tsa1_tsa2(df)
assert error == ""


@pytest.mark.parametrize(
"test_input,expected",
[
(
(
"Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome\n"
"TEST 3845 TEST GRCh37 12"
),
pd.DataFrame(
{
"Hugo_Symbol": ["TEST"],
"Entrez_Gene_Id": [3845],
"Center": ["TEST"],
"NCBI_Build": ["GRCh37"],
"Chromosome": [12],
}
),
),
(
(
"#Something Something else\n"
"Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome\n"
"TEST 3845 TEST GRCh37 12"
),
pd.DataFrame(
{
"Hugo_Symbol": ["TEST"],
"Entrez_Gene_Id": [3845],
"Center": ["TEST"],
"NCBI_Build": ["GRCh37"],
"Chromosome": [12],
}
),
),
],
ids=["no_pound_sign", "pound_sign"],
)
def test_that__get_dataframe_returns_expected_result(maf_class, test_input, expected):
with patch("builtins.open", mock_open(read_data=test_input)) as mock_file:
test = maf_class._get_dataframe(["some_path"])
pd.testing.assert_frame_equal(test, expected)


def test_that__get_dataframe_throws_value_error(maf_class):
file = (
"#Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome\n"
"TEST 3845 TEST GRCh37 12"
)
with patch("builtins.open", mock_open(read_data=file)) as patch_open:
with pytest.raises(
ValueError,
match="Number of fields in a line do not match the expected number of columns",
):
maf_class._get_dataframe(["some_path"])
118 changes: 116 additions & 2 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
"""Test genie.transform module"""
import os
from io import BytesIO
from unittest.mock import patch

import pandas as pd
from pandas.api.types import (
is_object_dtype,
)
import pytest

from genie import transform
from unittest import mock


class TestConvertCols:
Expand All @@ -17,8 +23,12 @@ class TestConvertCols:
pd.DataFrame({"some_col": ["Val1", float("nan")]}),
["Val1", float("nan")],
),
(
pd.DataFrame({"some_col": ["Val1", float("nan"), 1, "1"]}),
["Val1", float("nan"), "1", "1"],
),
],
ids=["float_w_na", "int_w_na", "string_w_na"],
ids=["float_w_na", "int_w_na", "string_w_na", "mixed_w_na"],
)
def test_that__convert_col_with_nas_to_str_keep_na_for_any_data_type(
self, test_input, expected
Expand All @@ -35,8 +45,12 @@ def test_that__convert_col_with_nas_to_str_keep_na_for_any_data_type(
pd.DataFrame({"some_col": ["Val1", "Val2"]}),
["Val1", "Val2"],
),
(
pd.DataFrame({"some_col": ["Val1", 1, "2"]}),
["Val1", "1", "2"],
),
],
ids=["float_no_na", "string_no_na"],
ids=["float_no_na", "string_no_na", "mixed_no_na"],
)
def test_that__convert_col_with_nas_to_str_returns_correct_vals_with_no_na_data(
self, test_input, expected
Expand Down Expand Up @@ -73,3 +87,103 @@ def test_that__convert_float_col_with_nas_to_int_does_nothing_with_str_data(self
result = transform._convert_float_col_with_nas_to_int(test_input, "some_col")
assert result[0] == "Val1"
assert pd.isna(result[1])


def read_csv_side_effect(**kwargs):
if kwargs["low_memory"] == True:
raise pd.errors.DtypeWarning("some_warning")


class TestConvertMixedDtypes:
@pytest.fixture(scope="session")
def test_input(self):
input = pd.DataFrame({"some_col": [1, "Val2", "1"]})
output = BytesIO()
input.to_csv(output, index=False)
output.seek(0)
yield output

@pytest.fixture(scope="session")
def test_mixed_dtype_input(self):
input = pd.DataFrame(
{
"some_col": ([1.0] * 100000 + ["a"] * 100000 + [float("nan")] * 100000),
"some_col2": (
[1.0] * 100000 + ["b"] * 100000 + [float("nan")] * 100000
),
}
)
input.to_csv("test_mixed_dtype_input.csv", index=False)
yield "test_mixed_dtype_input.csv"
os.remove("test_mixed_dtype_input.csv")

def test_that__convert_df_with_mixed_dtypes_gets_expected_output_with_normal_input(
self, test_input
):
df = transform._convert_df_with_mixed_dtypes(
{"filepath_or_buffer": test_input, "index_col": False}
)
pd.testing.assert_frame_equal(
df.reset_index(drop=True), pd.DataFrame({"some_col": ["1", "Val2", "1"]})
)
assert is_object_dtype(df["some_col"])

def test_that__convert_df_with_mixed_dtypes_gets_expected_output_with_large_mixed_dtype_input(
self, test_mixed_dtype_input
):
df = transform._convert_df_with_mixed_dtypes(
{"filepath_or_buffer": test_mixed_dtype_input, "index_col": False}
)
pd.testing.assert_frame_equal(
df.reset_index(drop=True),
pd.DataFrame(
{
"some_col": (
["1.0"] * 100000 + ["a"] * 100000 + [float("nan")] * 100000
),
"some_col2": (
["1.0"] * 100000 + ["b"] * 100000 + [float("nan")] * 100000
),
}
),
)
assert is_object_dtype(df["some_col"])

def test_that__convert_df_with_mixed_dtypes_calls_read_csv_once_if_no_exception(
self, test_input
):
with mock.patch.object(pd, "read_csv") as mock_read_csv:
transform._convert_df_with_mixed_dtypes(
{"filepath_or_buffer": test_input, "index_col": False}
)
mock_read_csv.assert_called_once_with(
filepath_or_buffer=test_input,
index_col=False,
low_memory=True,
)

def test_that__convert_df_with_mixed_dtypes_catches_mixed_dtype_exception(
self, test_input
):
with mock.patch.object(
pd, "read_csv", side_effect=read_csv_side_effect
) as mock_read_csv:
transform._convert_df_with_mixed_dtypes(
{"filepath_or_buffer": test_input, "index_col": False}
)
mock_read_csv.assert_has_calls(
[
mock.call(
filepath_or_buffer=test_input,
index_col=False,
low_memory=True,
),
mock.call(
filepath_or_buffer=test_input,
index_col=False,
low_memory=False,
engine="c",
),
],
any_order=False,
)

0 comments on commit 4805754

Please sign in to comment.