[ENH] Coverage for data writers (#2127)

* remove unnecessary ExponentTransform import
* coverage for data writers
aeon-toolkit · Oct 6, 2024 · 5884f4c
1 parent 5ec4693
commit 5884f4c
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 199 deletions.
diff --git a/aeon/datasets/__init__.py b/aeon/datasets/__init__.py
@@ -16,7 +16,6 @@
     "load_human_activity_segmentation_datasets",
     # Write functions
     "write_to_tsfile",
-    "write_to_tsf_file",
     "write_to_arff_file",
     "write_results_to_uea_format",
     # Single problem loaders
@@ -67,7 +66,6 @@
 from aeon.datasets._data_writers import (
     write_results_to_uea_format,
     write_to_arff_file,
-    write_to_tsf_file,
     write_to_tsfile,
 )
 from aeon.datasets._dataframe_loaders import (

diff --git a/aeon/datasets/_data_writers.py b/aeon/datasets/_data_writers.py
@@ -107,7 +107,7 @@ def _write_data_to_tsfile(
     """
     # ensure data provided is a ndarray
     if not isinstance(X, np.ndarray) and not isinstance(X, list):
-        raise TypeError("Data provided must be a ndarray or a list")
+        raise ValueError("Data provided must be a ndarray or a list")
     class_labels = None
     if y is not None:
         # ensure number of cases is same as the class value list
@@ -245,201 +245,6 @@ def _write_header(
     return file
 
 
-def write_to_tsf_file(
-    X,
-    path,
-    y=None,
-    problem_name="sample_data.tsf",
-    header=None,
-    attribute=None,
-    frequency=None,
-    horizon=0,
-):
-    """Write an aeon collection of time series to text file in .tsf format.
-
-    Write metadata and data stored in aeon compatible data set to file.
-    A description of the tsf format is in examples/load_data.ipynb.
-
-    Parameters
-    ----------
-    X : pd.DataFrame, each cell a pd.Series
-        Collection of time series: univariate, multivariate, equal or unequal length.
-    path : string.
-        Location of the directory to write file
-    y: None or pd.Series, default = None
-        Response variable, discrete for classification, continuous for regression
-        None if clustering.
-    problem_name : string, default = "sample_data"
-        The file is written to <path>/<problem_name>/<problem_name>.tsf
-    header: string, default = None
-        Optional text at the top of the file that is ignored when loading.
-    """
-    if not (isinstance(X, pd.DataFrame)):
-        raise TypeError(f" Wrong input data type {type(X)} convert to pd.DataFrame")
-
-    # See if passed file name contains .tsf extension or not
-    split = problem_name.split(".")
-    if split[-1] != "tsf":
-        problem_name = problem_name + ".tsf"
-
-    _write_dataframe_to_tsf_file(
-        X,
-        path,
-        y=None,
-        problem_name=problem_name,
-        attribute=attribute,
-        frequency=frequency,
-        horizon=horizon,
-        comment=header,
-    )
-
-
-def _write_dataframe_to_tsf_file(
-    X,
-    path,
-    y=None,
-    problem_name="sample_data",
-    comment=None,
-    attribute=None,
-    frequency=None,
-    horizon=0,
-):
-    # ensure data provided is a dataframe
-    if not isinstance(X, pd.DataFrame):
-        raise ValueError(f"Data provided must be a DataFrame, passed a {type(X)}")
-    # See if passed file name contains .tsf extension or not
-    split = problem_name.split(".")
-    if split[-1] != "tsf":
-        problem_name = problem_name + ".tsf"
-    equal_length = not X.isnull().values.any()
-    missing = X.isnull().values.any()
-    if frequency is None:
-        frequency = calculate_frequency(X)
-    if attribute is None:
-        attribute = {"series_name": "string", "start_timestamp": "date"}
-
-    file = _write_header_tsf(
-        path,
-        problem_name=problem_name,
-        attribute=attribute,
-        equal_length=equal_length,
-        frequency=frequency,
-        horizon=horizon,
-        missing=missing,
-        comment=comment,
-    )
-
-    X = X.reset_index(drop=False, inplace=False)
-
-    n_cases, n_channels = X.shape
-
-    for j in range(1, n_channels):
-        column_name = X.columns[j]
-        file.write(f"{str(column_name)}:")
-
-        # Find the index of the first non-empty value in the column
-        first_non_empty_index = X.iloc[:, j].first_valid_index()
-        start_timestamp_index = None
-
-        if first_non_empty_index is not None:
-            start_timestamp_index = X.index[first_non_empty_index]
-            start_timestamp = X.iloc[start_timestamp_index, 0].strftime(
-                "%Y-%m-%d %H-%M-%S"
-            )
-        file.write(f"{str(start_timestamp)}:")
-
-        for i in range(start_timestamp_index, n_cases - 1):
-            series = X.iloc[i, j]
-            # Check if the value is NaN
-            if pd.notna(series):
-                series_str = str(series)
-            else:
-                series_str = "?"  # Replace NaN with a ?
-
-            # Write the series string to the file
-            file.write(f"{series_str},")
-
-        series = X.iloc[-1, j]
-        # Check if he value is NaN
-        if pd.notna(series):
-            series_str = str(series)
-        else:
-            series_str = "?"  # Replace NaN with a ?
-        # Write the series string to the file
-        file.write(f"{series_str}")
-
-        # Check if y is not None before accessing its elements
-        if y is not None:
-            file.write(f"{y[i]}\n")
-        else:
-            file.write("\n")  # Write a newline if y is None
-    file.close()
-
-
-def _write_header_tsf(
-    path,
-    problem_name,
-    attribute,
-    equal_length=True,
-    frequency=None,
-    horizon=0,
-    missing=False,
-    comment=None,
-):
-    if not os.path.exists(path):
-        os.makedirs(path)
-    # See if passed file name contains .tsf extension or not
-    split = problem_name.split(".")
-    if split[-1] != "tsf":
-        problem_name = problem_name + ".tsf"
-    load_path = f"{path}/{problem_name}"
-
-    file = open(load_path, "w")
-
-    if comment is not None:
-        file.write("\n# ".join(textwrap.wrap("# " + comment)))
-        file.write("\n")
-
-    file.write(f"@relation {str(split[0]).lower()}\n")
-    # Write attribute metadata for each column
-    if attribute is not None:
-        for attr in attribute:
-            file.write(f"@attribute {str(attr)} {str(attribute[attr])}\n")
-    file.write(f"@frequency {str(frequency).lower()}\n")
-    file.write(f"@horizon {str(horizon).lower()}\n")
-    file.write(f"@missing {str(missing).lower()}\n")
-    file.write(f"@equallength {str(equal_length).lower()}\n")
-    file.write("@data\n")
-
-    return file
-
-
-def calculate_frequency(df):
-    # Convert timestamps to DateTime format
-    df["Timestamp"] = pd.to_datetime(df.index)
-
-    # Calculate time differences
-    time_diffs = df["Timestamp"].diff().dropna()
-
-    # Calculate median time difference
-    median_diff = time_diffs.median()
-
-    # Determine frequency based on median time difference
-    if median_diff <= pd.Timedelta(days=1):
-        frequency = "daily"
-    elif median_diff <= pd.Timedelta(weeks=1):
-        frequency = "weekly"
-    elif median_diff <= pd.Timedelta(days=30):
-        frequency = "monthly"
-    elif median_diff <= pd.Timedelta(days=365):
-        frequency = "yearly"
-    else:
-        frequency = "other"  # You can define more granular frequencies as needed
-    df.drop("Timestamp", axis=1, inplace=True)
-
-    return frequency
-
-
 def write_to_arff_file(
     X,
     y,

diff --git a/aeon/datasets/tests/test_data_writers.py b/aeon/datasets/tests/test_data_writers.py
@@ -104,7 +104,7 @@ def test_write_data_to_tsfile_invalid():
     """Test function to check the handling of invalid inputs by write_to_tsfile."""
     with pytest.raises(TypeError, match="Wrong input data type"):
         write_to_tsfile("A string", "path")
-    with pytest.raises(TypeError, match="Data provided must be a ndarray or a list"):
+    with pytest.raises(ValueError, match="Data provided must be a ndarray or a list"):
         _write_data_to_tsfile("AFC", "49", "undefeated")
     X, _ = make_example_3d_numpy(n_cases=6, n_timepoints=10, n_channels=1)
     y = np.ndarray([0, 1, 1, 0, 1])
@@ -141,6 +141,37 @@ def test_write_dataframe_to_ts(tsfile_writer):
         np.testing.assert_array_almost_equal(newy.astype(int), y)
 
 
+def test_write_inputs():
+    """Tests whether error thrown if wrong input."""
+    # load an example dataset
+    problem_name = "Testy.ts"
+    with tempfile.TemporaryDirectory() as tmp:
+        # output the dataframe in a ts file
+        X, y = make_example_nested_dataframe(min_n_timepoints=12)
+        X2, y2 = make_example_3d_numpy()
+        X3, y3 = make_example_3d_numpy_list()
+        with pytest.raises(ValueError, match="Data provided must be a ndarray"):
+            _write_data_to_tsfile(
+                X=X,
+                path=tmp,
+                y=y,
+                problem_name=problem_name,
+            )
+        _write_data_to_tsfile(X=X3, path=tmp, y=y3, problem_name=problem_name)
+        with pytest.raises(ValueError, match="Data provided must be a DataFrame"):
+            _write_dataframe_to_tsfile(
+                X=X2,
+                path=tmp,
+                y=y2,
+                problem_name=problem_name,
+            )
+        with pytest.raises(TypeError, match="Wrong input data type"):
+            write_to_arff_file(X, y, tmp)
+        X2, y2 = make_example_3d_numpy(n_cases=5, n_channels=2)
+        with pytest.raises(ValueError, match="must be a 3D array with shape"):
+            write_to_arff_file(X2, y2, tmp)
+
+
 def test_write_header():
     """Test _write_header."""
     with tempfile.TemporaryDirectory() as tmp:
@@ -186,6 +217,18 @@ def test_write_results_to_uea_format():
             write_results_to_uea_format(
                 "HC", "Testy", y_pred=y_pred, y_true=y_true, output_path=tmp
             )
+        with pytest.raises(ValueError, match="Unknown 'split' value"):
+            write_results_to_uea_format(
+                "HC",
+                "Testy",
+                y_pred=y_pred,
+                output_path=tmp,
+                full_path=False,
+                split="FOO",
+                timing_type="seconds",
+                first_line_comment="Hello",
+            )
+
         y_true = np.array([0, 1, 1, 0])
         write_results_to_uea_format(
             "HC",
@@ -211,3 +254,14 @@ def test_write_results_to_uea_format():
             first_line_comment="Hello",
             predicted_probs=probs,
         )
+        write_results_to_uea_format(
+            "HC",
+            "Testy2",
+            y_pred=y_pred,
+            output_path=tmp,
+            full_path=False,
+            split="TEST",
+            timing_type="seconds",
+            first_line_comment="Hello",
+            predicted_probs=probs,
+        )