Skip to content

Commit

Permalink
[ENH] Coverage for data writers (#2127)
Browse files Browse the repository at this point in the history
* remove unnecessary ExponentTransform import

* coverage data writers
  • Loading branch information
TonyBagnall authored Oct 6, 2024
1 parent 5ec4693 commit 5884f4c
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 199 deletions.
2 changes: 0 additions & 2 deletions aeon/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
"load_human_activity_segmentation_datasets",
# Write functions
"write_to_tsfile",
"write_to_tsf_file",
"write_to_arff_file",
"write_results_to_uea_format",
# Single problem loaders
Expand Down Expand Up @@ -67,7 +66,6 @@
from aeon.datasets._data_writers import (
write_results_to_uea_format,
write_to_arff_file,
write_to_tsf_file,
write_to_tsfile,
)
from aeon.datasets._dataframe_loaders import (
Expand Down
197 changes: 1 addition & 196 deletions aeon/datasets/_data_writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def _write_data_to_tsfile(
"""
# ensure data provided is a ndarray
if not isinstance(X, np.ndarray) and not isinstance(X, list):
raise TypeError("Data provided must be a ndarray or a list")
raise ValueError("Data provided must be a ndarray or a list")
class_labels = None
if y is not None:
# ensure number of cases is same as the class value list
Expand Down Expand Up @@ -245,201 +245,6 @@ def _write_header(
return file


def write_to_tsf_file(
    X,
    path,
    y=None,
    problem_name="sample_data.tsf",
    header=None,
    attribute=None,
    frequency=None,
    horizon=0,
):
    """Write a DataFrame of time series to a text file in .tsf format.

    Checks the input type, makes sure the file name ends in ``.tsf`` and
    delegates the actual writing to ``_write_dataframe_to_tsf_file``.

    Parameters
    ----------
    X : pd.DataFrame
        Data to write. Any other type is rejected.
    path : str
        Location of the directory to write the file to.
    y : None or pd.Series, default=None
        Currently unused: the underlying writer is always called with
        ``y=None``, whatever is passed here.
    problem_name : str, default="sample_data.tsf"
        Name of the file. ``.tsf`` is appended when the text after the last
        ``.`` is not ``tsf``.
    header : str, default=None
        Optional text forwarded to the writer as ``comment``.
    attribute : default=None
        Forwarded unchanged to the writer.
    frequency : default=None
        Forwarded unchanged to the writer.
    horizon : default=0
        Forwarded unchanged to the writer.

    Raises
    ------
    TypeError
        If ``X`` is not a pd.DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
        raise TypeError(f" Wrong input data type {type(X)} convert to pd.DataFrame")

    # Only the text after the final "." is treated as the extension.
    extension = problem_name.rsplit(".", 1)[-1]
    file_name = problem_name if extension == "tsf" else problem_name + ".tsf"

    writer_options = {
        "problem_name": file_name,
        "attribute": attribute,
        "frequency": frequency,
        "horizon": horizon,
        "comment": header,
    }
    _write_dataframe_to_tsf_file(X, path, y=None, **writer_options)


def _write_dataframe_to_tsf_file(
    X,
    path,
    y=None,
    problem_name="sample_data",
    comment=None,
    attribute=None,
    frequency=None,
    horizon=0,
):
    """Write every column of a DataFrame as one series line of a .tsf file.

    The header is produced by ``_write_header_tsf``. After it, each data
    column is written as ``<column name>:<start timestamp>:<v1>,<v2>,...``
    where the values start at the first non-missing entry of the column and
    missing values are written as ``?``.

    Parameters
    ----------
    X : pd.DataFrame
        Data to write, one series per column. The index supplies the start
        timestamps, so its values must provide ``strftime``.
    path : str
        Directory the file is written to.
    y : sequence or None, default=None
        Optional labels, one per data column. Entry ``k`` is appended to the
        line of the ``k``-th column.
    problem_name : str, default="sample_data"
        File name; ``.tsf`` is appended when it is not already the extension.
    comment : str, default=None
        Optional text written at the top of the file.
    attribute : dict, default=None
        Attribute name -> type pairs for the header. Defaults to
        ``{"series_name": "string", "start_timestamp": "date"}``.
    frequency : str, default=None
        Value for the ``@frequency`` tag. When None it is obtained from
        ``calculate_frequency(X)``.
    horizon : int, default=0
        Value for the ``@horizon`` tag.

    Raises
    ------
    ValueError
        If ``X`` is not a pd.DataFrame.
    """
    # ensure data provided is a dataframe
    if not isinstance(X, pd.DataFrame):
        raise ValueError(f"Data provided must be a DataFrame, passed a {type(X)}")
    # See if passed file name contains .tsf extension or not
    if problem_name.split(".")[-1] != "tsf":
        problem_name = problem_name + ".tsf"
    # Any NaN marks the data as both "missing" and "not equal length".
    missing = X.isnull().values.any()
    equal_length = not missing
    if frequency is None:
        frequency = calculate_frequency(X)
    if attribute is None:
        attribute = {"series_name": "string", "start_timestamp": "date"}
    # Materialise the labels so they can be addressed by series position.
    labels = None if y is None else list(y)

    file = _write_header_tsf(
        path,
        problem_name=problem_name,
        attribute=attribute,
        equal_length=equal_length,
        frequency=frequency,
        horizon=horizon,
        missing=missing,
        comment=comment,
    )
    # The handle returned by the header writer is owned here: close it even
    # when a write below fails.
    try:
        # Move the index into column 0 so timestamps can be read by position;
        # the new default index makes labels and positions coincide.
        X = X.reset_index(drop=False, inplace=False)
        n_cases, n_channels = X.shape

        for j in range(1, n_channels):
            file.write(f"{str(X.columns[j])}:")

            # A column without any valid value is written from the first row
            # (all "?") instead of failing on range(None, ...).
            first_valid = X.iloc[:, j].first_valid_index()
            start = 0 if first_valid is None else first_valid
            start_timestamp = X.iloc[start, 0].strftime("%Y-%m-%d %H-%M-%S")
            file.write(f"{str(start_timestamp)}:")

            cells = [X.iloc[i, j] for i in range(start, n_cases)]
            file.write(",".join(str(c) if pd.notna(c) else "?" for c in cells))

            if labels is not None:
                # NOTE(review): the original indexed y with the leftover row
                # counter; one label per series is assumed here. The label is
                # still appended directly after the last value, as before;
                # confirm the intended layout.
                file.write(f"{labels[j - 1]}\n")
            else:
                file.write("\n")
    finally:
        file.close()


def _write_header_tsf(
    path,
    problem_name,
    attribute,
    equal_length=True,
    frequency=None,
    horizon=0,
    missing=False,
    comment=None,
):
    """Create a .tsf file, write its header and return the open file handle.

    Parameters
    ----------
    path : str
        Directory for the file; created (with parents) when it is missing.
    problem_name : str
        File name. ``.tsf`` is appended when the text after the last ``.``
        is not ``tsf``. The text before the first ``.`` becomes the
        lower-cased ``@relation`` name.
    attribute : dict or None
        Attribute name -> type pairs, written as ``@attribute`` lines.
    equal_length : bool, default=True
        Value for the ``@equallength`` tag.
    frequency : str, default=None
        Value for the ``@frequency`` tag.
    horizon : int, default=0
        Value for the ``@horizon`` tag.
    missing : bool, default=False
        Value for the ``@missing`` tag.
    comment : str, default=None
        Optional text written first, wrapped and prefixed with ``# ``.

    Returns
    -------
    file
        Text file opened for writing and positioned after ``@data``. The
        caller is responsible for closing it.
    """
    # exist_ok avoids the race between a separate existence check and creation.
    os.makedirs(path, exist_ok=True)
    # See if passed file name contains .tsf extension or not
    split = problem_name.split(".")
    if split[-1] != "tsf":
        problem_name = problem_name + ".tsf"
    load_path = f"{path}/{problem_name}"

    # Build the whole header first so a formatting error cannot leave an
    # empty file and a dangling handle behind.
    lines = []
    if comment is not None:
        lines.append("\n# ".join(textwrap.wrap("# " + comment)))
        lines.append("\n")
    lines.append(f"@relation {str(split[0]).lower()}\n")
    # Write attribute metadata for each column
    if attribute is not None:
        for name, kind in attribute.items():
            lines.append(f"@attribute {str(name)} {str(kind)}\n")
    lines.append(f"@frequency {str(frequency).lower()}\n")
    lines.append(f"@horizon {str(horizon).lower()}\n")
    lines.append(f"@missing {str(missing).lower()}\n")
    lines.append(f"@equallength {str(equal_length).lower()}\n")
    lines.append("@data\n")

    file = open(load_path, "w")
    try:
        file.write("".join(lines))
    except BaseException:
        # The handle only leaves this function on success.
        file.close()
        raise
    return file


def calculate_frequency(df):
    """Infer a coarse sampling frequency label from a DataFrame's index.

    The index is converted to datetimes and the median gap between
    consecutive entries is compared against fixed limits. The input
    DataFrame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Data whose index can be converted with ``pd.to_datetime``.

    Returns
    -------
    str
        ``"daily"`` for a median gap of at most 1 day, ``"weekly"`` for at
        most 1 week, ``"monthly"`` for at most 31 days, ``"yearly"`` for at
        most 366 days and ``"other"`` otherwise, including when there are
        fewer than two index entries.
    """
    # Work on a separate Series: adding and dropping a helper column on df
    # would destroy a caller column that happens to be called "Timestamp".
    timestamps = pd.Series(pd.to_datetime(df.index))
    median_diff = timestamps.diff().dropna().median()

    # Limits in ascending order, first match wins. 31 and 366 days cover the
    # longest calendar month and a leap year; month-start data has a median
    # gap of 31 days.
    limits = (
        (pd.Timedelta(days=1), "daily"),
        (pd.Timedelta(weeks=1), "weekly"),
        (pd.Timedelta(days=31), "monthly"),
        (pd.Timedelta(days=366), "yearly"),
    )
    for limit, label in limits:
        # A NaT median (fewer than two entries) compares False everywhere.
        if median_diff <= limit:
            return label
    return "other"  # You can define more granular frequencies as needed


def write_to_arff_file(
X,
y,
Expand Down
56 changes: 55 additions & 1 deletion aeon/datasets/tests/test_data_writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def test_write_data_to_tsfile_invalid():
"""Test function to check the handling of invalid inputs by write_to_tsfile."""
with pytest.raises(TypeError, match="Wrong input data type"):
write_to_tsfile("A string", "path")
with pytest.raises(TypeError, match="Data provided must be a ndarray or a list"):
with pytest.raises(ValueError, match="Data provided must be a ndarray or a list"):
_write_data_to_tsfile("AFC", "49", "undefeated")
X, _ = make_example_3d_numpy(n_cases=6, n_timepoints=10, n_channels=1)
y = np.ndarray([0, 1, 1, 0, 1])
Expand Down Expand Up @@ -141,6 +141,37 @@ def test_write_dataframe_to_ts(tsfile_writer):
np.testing.assert_array_almost_equal(newy.astype(int), y)


def test_write_inputs():
    """Check how the writers react to supported and unsupported input types.

    Every ``pytest.raises`` block pins the exception type and message
    fragment expected for one kind of bad input. The single unguarded call
    is expected to complete without raising.
    """
    # File name handed to the ts writers as problem_name.
    problem_name = "Testy.ts"
    with tempfile.TemporaryDirectory() as tmp:
        # Three example inputs; judging by the helper names these are a nested
        # DataFrame, a 3D numpy array and a list of numpy arrays.
        X, y = make_example_nested_dataframe(min_n_timepoints=12)
        X2, y2 = make_example_3d_numpy()
        X3, y3 = make_example_3d_numpy_list()
        # The array/list writer must reject the DataFrame input.
        with pytest.raises(ValueError, match="Data provided must be a ndarray"):
            _write_data_to_tsfile(
                X=X,
                path=tmp,
                y=y,
                problem_name=problem_name,
            )
        # The same writer called with X3 must not raise.
        _write_data_to_tsfile(X=X3, path=tmp, y=y3, problem_name=problem_name)
        # The DataFrame writer must reject the input built by
        # make_example_3d_numpy.
        with pytest.raises(ValueError, match="Data provided must be a DataFrame"):
            _write_dataframe_to_tsfile(
                X=X2,
                path=tmp,
                y=y2,
                problem_name=problem_name,
            )
        # The arff writer rejects the DataFrame input with a TypeError.
        with pytest.raises(TypeError, match="Wrong input data type"):
            write_to_arff_file(X, y, tmp)
        # The arff writer rejects input generated with n_channels=2 with a
        # shape error.
        X2, y2 = make_example_3d_numpy(n_cases=5, n_channels=2)
        with pytest.raises(ValueError, match="must be a 3D array with shape"):
            write_to_arff_file(X2, y2, tmp)


def test_write_header():
"""Test _write_header."""
with tempfile.TemporaryDirectory() as tmp:
Expand Down Expand Up @@ -186,6 +217,18 @@ def test_write_results_to_uea_format():
write_results_to_uea_format(
"HC", "Testy", y_pred=y_pred, y_true=y_true, output_path=tmp
)
with pytest.raises(ValueError, match="Unknown 'split' value"):
write_results_to_uea_format(
"HC",
"Testy",
y_pred=y_pred,
output_path=tmp,
full_path=False,
split="FOO",
timing_type="seconds",
first_line_comment="Hello",
)

y_true = np.array([0, 1, 1, 0])
write_results_to_uea_format(
"HC",
Expand All @@ -211,3 +254,14 @@ def test_write_results_to_uea_format():
first_line_comment="Hello",
predicted_probs=probs,
)
write_results_to_uea_format(
"HC",
"Testy2",
y_pred=y_pred,
output_path=tmp,
full_path=False,
split="TEST",
timing_type="seconds",
first_line_comment="Hello",
predicted_probs=probs,
)

0 comments on commit 5884f4c

Please sign in to comment.