Merge pull request #530 from Sage-Bionetworks/gen-756-fix-maf-validation

* update maf read_csv * black * add fix for mixed dtypes, add tests * add tests to catch exceptions/warnings * add comments about mixed dtypes handling in pandas * black * remove line of code * correct warnings reset placement * add large mixed dtype csv for testing --------- Co-authored-by: Jake VanCampen <jake.vancampen7@gmail.com> Co-authored-by: Rixing Xu <rxu@w197.local>
Sage-Bionetworks · Jul 10, 2023 · 4805754 · 4805754
2 parents b0e8cf8 + e42af60
commit 4805754
Show file tree

Hide file tree

Showing 4 changed files with 209 additions and 14 deletions.
diff --git a/genie/transform.py b/genie/transform.py
@@ -1,5 +1,6 @@
 """This module contains all the transformation functions used throughout the GENIE
 package"""
+import warnings
 
 import pandas as pd
 from pandas.api.types import is_integer_dtype, is_float_dtype
@@ -41,3 +42,25 @@ def _convert_float_col_with_nas_to_int(df: pd.DataFrame, col: str) -> list:
         return new_vals
     else:
         return df[col].tolist()
+
+
+def _convert_df_with_mixed_dtypes(read_csv_params: dict) -> pd.DataFrame:
+    """This checks if a dataframe read in normally comes out with mixed data types (which happens
+    when low_memory = True because read_csv parses in chunks and guesses dtypes by chunk) and
+    converts a dataframe with mixed datatypes to one datatype.
+
+    Args:
+        read_csv_params (dict): of input params and values to pandas's read_csv function.
+            needs to include filepath to dataset to be read in
+
+    Returns:
+        pd.DataFrame : The dataset read in
+    """
+    warnings.simplefilter("error", pd.errors.DtypeWarning)
+    try:
+        df = pd.read_csv(**read_csv_params, low_memory=True)
+    except pd.errors.DtypeWarning:
+        # setting engine to c as that is the only engine that works with low_memory=False
+        df = pd.read_csv(**read_csv_params, low_memory=False, engine="c")
+    warnings.resetwarnings()
+    return df
diff --git a/genie_registry/maf.py b/genie_registry/maf.py
@@ -1,11 +1,12 @@
 from io import StringIO
 import os
 import logging
+import warnings
 
 import pandas as pd
 
 from genie.example_filetype_format import FileTypeFormat
-from genie import process_functions, validate
+from genie import process_functions, transform, validate
 
 logger = logging.getLogger(__name__)
 
@@ -295,13 +296,12 @@ def _get_dataframe(self, filePathList):
                 "Number of fields in a line do not match the "
                 "expected number of columns"
             )
-
-        mutationdf = pd.read_csv(
-            filePathList[0],
-            sep="\t",
-            comment="#",
+        read_csv_params = {
+            "filepath_or_buffer": filePathList[0],
+            "sep": "\t",
+            "comment": "#",
             # Keep the value 'NA'
-            na_values=[
+            "na_values": [
                 "-1.#IND",
                 "1.#QNAN",
                 "1.#IND",
@@ -317,12 +317,13 @@ def _get_dataframe(self, filePathList):
                 "-nan",
                 "",
             ],
-            keep_default_na=False,
+            "keep_default_na": False,
             # This is to check if people write files
             # with R, quote=T
-            quoting=3,
+            "quoting": 3,
             # Retain completely blank lines so that
             # validator will cause the file to fail
-            skip_blank_lines=False,
-        )
+            "skip_blank_lines": False,
+        }
+        mutationdf = transform._convert_df_with_mixed_dtypes(read_csv_params)
         return mutationdf
diff --git a/tests/test_maf.py b/tests/test_maf.py
@@ -1,4 +1,5 @@
-from unittest.mock import patch
+from io import BytesIO, StringIO
+from unittest.mock import mock_open, patch
 
 import pandas as pd
 import pytest
@@ -271,3 +272,59 @@ def test_valid__check_tsa1_tsa2(df):
     """Test valid TSA1 and TSA2"""
     error = genie_registry.maf._check_tsa1_tsa2(df)
     assert error == ""
+
+
+@pytest.mark.parametrize(
+    "test_input,expected",
+    [
+        (
+            (
+                "Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome\n"
+                "TEST	3845	TEST	GRCh37	12"
+            ),
+            pd.DataFrame(
+                {
+                    "Hugo_Symbol": ["TEST"],
+                    "Entrez_Gene_Id": [3845],
+                    "Center": ["TEST"],
+                    "NCBI_Build": ["GRCh37"],
+                    "Chromosome": [12],
+                }
+            ),
+        ),
+        (
+            (
+                "#Something Something else\n"
+                "Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome\n"
+                "TEST	3845	TEST	GRCh37	12"
+            ),
+            pd.DataFrame(
+                {
+                    "Hugo_Symbol": ["TEST"],
+                    "Entrez_Gene_Id": [3845],
+                    "Center": ["TEST"],
+                    "NCBI_Build": ["GRCh37"],
+                    "Chromosome": [12],
+                }
+            ),
+        ),
+    ],
+    ids=["no_pound_sign", "pound_sign"],
+)
+def test_that__get_dataframe_returns_expected_result(maf_class, test_input, expected):
+    with patch("builtins.open", mock_open(read_data=test_input)) as mock_file:
+        test = maf_class._get_dataframe(["some_path"])
+        pd.testing.assert_frame_equal(test, expected)
+
+
+def test_that__get_dataframe_throws_value_error(maf_class):
+    file = (
+        "#Hugo_Symbol	Entrez_Gene_Id	Center	NCBI_Build	Chromosome\n"
+        "TEST	3845	TEST	GRCh37	12"
+    )
+    with patch("builtins.open", mock_open(read_data=file)) as patch_open:
+        with pytest.raises(
+            ValueError,
+            match="Number of fields in a line do not match the expected number of columns",
+        ):
+            maf_class._get_dataframe(["some_path"])
diff --git a/tests/test_transform.py b/tests/test_transform.py
@@ -1,10 +1,16 @@
 """Test genie.transform module"""
+import os
+from io import BytesIO
 from unittest.mock import patch
 
 import pandas as pd
+from pandas.api.types import (
+    is_object_dtype,
+)
 import pytest
 
 from genie import transform
+from unittest import mock
 
 
 class TestConvertCols:
@@ -17,8 +23,12 @@ class TestConvertCols:
                 pd.DataFrame({"some_col": ["Val1", float("nan")]}),
                 ["Val1", float("nan")],
             ),
+            (
+                pd.DataFrame({"some_col": ["Val1", float("nan"), 1, "1"]}),
+                ["Val1", float("nan"), "1", "1"],
+            ),
         ],
-        ids=["float_w_na", "int_w_na", "string_w_na"],
+        ids=["float_w_na", "int_w_na", "string_w_na", "mixed_w_na"],
     )
     def test_that__convert_col_with_nas_to_str_keep_na_for_any_data_type(
         self, test_input, expected
@@ -35,8 +45,12 @@ def test_that__convert_col_with_nas_to_str_keep_na_for_any_data_type(
                 pd.DataFrame({"some_col": ["Val1", "Val2"]}),
                 ["Val1", "Val2"],
             ),
+            (
+                pd.DataFrame({"some_col": ["Val1", 1, "2"]}),
+                ["Val1", "1", "2"],
+            ),
         ],
-        ids=["float_no_na", "string_no_na"],
+        ids=["float_no_na", "string_no_na", "mixed_no_na"],
     )
     def test_that__convert_col_with_nas_to_str_returns_correct_vals_with_no_na_data(
         self, test_input, expected
@@ -73,3 +87,103 @@ def test_that__convert_float_col_with_nas_to_int_does_nothing_with_str_data(self
         result = transform._convert_float_col_with_nas_to_int(test_input, "some_col")
         assert result[0] == "Val1"
         assert pd.isna(result[1])
+
+
+def read_csv_side_effect(**kwargs):
+    if kwargs["low_memory"] == True:
+        raise pd.errors.DtypeWarning("some_warning")
+
+
+class TestConvertMixedDtypes:
+    @pytest.fixture(scope="session")
+    def test_input(self):
+        input = pd.DataFrame({"some_col": [1, "Val2", "1"]})
+        output = BytesIO()
+        input.to_csv(output, index=False)
+        output.seek(0)
+        yield output
+
+    @pytest.fixture(scope="session")
+    def test_mixed_dtype_input(self):
+        input = pd.DataFrame(
+            {
+                "some_col": ([1.0] * 100000 + ["a"] * 100000 + [float("nan")] * 100000),
+                "some_col2": (
+                    [1.0] * 100000 + ["b"] * 100000 + [float("nan")] * 100000
+                ),
+            }
+        )
+        input.to_csv("test_mixed_dtype_input.csv", index=False)
+        yield "test_mixed_dtype_input.csv"
+        os.remove("test_mixed_dtype_input.csv")
+
+    def test_that__convert_df_with_mixed_dtypes_gets_expected_output_with_normal_input(
+        self, test_input
+    ):
+        df = transform._convert_df_with_mixed_dtypes(
+            {"filepath_or_buffer": test_input, "index_col": False}
+        )
+        pd.testing.assert_frame_equal(
+            df.reset_index(drop=True), pd.DataFrame({"some_col": ["1", "Val2", "1"]})
+        )
+        assert is_object_dtype(df["some_col"])
+
+    def test_that__convert_df_with_mixed_dtypes_gets_expected_output_with_large_mixed_dtype_input(
+        self, test_mixed_dtype_input
+    ):
+        df = transform._convert_df_with_mixed_dtypes(
+            {"filepath_or_buffer": test_mixed_dtype_input, "index_col": False}
+        )
+        pd.testing.assert_frame_equal(
+            df.reset_index(drop=True),
+            pd.DataFrame(
+                {
+                    "some_col": (
+                        ["1.0"] * 100000 + ["a"] * 100000 + [float("nan")] * 100000
+                    ),
+                    "some_col2": (
+                        ["1.0"] * 100000 + ["b"] * 100000 + [float("nan")] * 100000
+                    ),
+                }
+            ),
+        )
+        assert is_object_dtype(df["some_col"])
+
+    def test_that__convert_df_with_mixed_dtypes_calls_read_csv_once_if_no_exception(
+        self, test_input
+    ):
+        with mock.patch.object(pd, "read_csv") as mock_read_csv:
+            transform._convert_df_with_mixed_dtypes(
+                {"filepath_or_buffer": test_input, "index_col": False}
+            )
+            mock_read_csv.assert_called_once_with(
+                filepath_or_buffer=test_input,
+                index_col=False,
+                low_memory=True,
+            )
+
+    def test_that__convert_df_with_mixed_dtypes_catches_mixed_dtype_exception(
+        self, test_input
+    ):
+        with mock.patch.object(
+            pd, "read_csv", side_effect=read_csv_side_effect
+        ) as mock_read_csv:
+            transform._convert_df_with_mixed_dtypes(
+                {"filepath_or_buffer": test_input, "index_col": False}
+            )
+            mock_read_csv.assert_has_calls(
+                [
+                    mock.call(
+                        filepath_or_buffer=test_input,
+                        index_col=False,
+                        low_memory=True,
+                    ),
+                    mock.call(
+                        filepath_or_buffer=test_input,
+                        index_col=False,
+                        low_memory=False,
+                        engine="c",
+                    ),
+                ],
+                any_order=False,
+            )