revert legacy data_gen

TonyBagnall · TonyBagnall · commit 98baa26558db · 2024-09-07T18:31:55.000+01:00
diff --git a/aeon/testing/data_generation/_legacy/__init__.py b/aeon/testing/data_generation/_legacy/__init__.py
@@ -0,0 +1,25 @@
+"""Legacy data generators."""
+
+__all__ = [  # re-exported from _collection, _series and _test_examples
+    "make_example_long_table",
+    "_make_collection",
+    "_make_collection_X",
+    "_make_classification_y",
+    "make_series",
+    "make_forecasting_problem",
+    "_make_index",
+    "get_examples",
+]
+
+from aeon.testing.data_generation._legacy._collection import (
+    _make_classification_y,
+    _make_collection,
+    _make_collection_X,
+    make_example_long_table,
+)
+from aeon.testing.data_generation._legacy._series import (
+    _make_index,
+    make_forecasting_problem,
+    make_series,
+)
+from aeon.testing.data_generation._legacy._test_examples import get_examples
diff --git a/aeon/testing/data_generation/_legacy/_collection.py b/aeon/testing/data_generation/_legacy/_collection.py
@@ -0,0 +1,141 @@
+import numpy as np
+import pandas as pd
+from sklearn.utils import check_random_state
+
+from aeon.utils.conversion import convert_collection
+
+
+def make_example_long_table(
+    n_cases: int = 50, n_channels: int = 2, n_timepoints: int = 20
+) -> pd.DataFrame:
+    """Generate an example collection of series as a long-format table.
+
+    Rows are ordered by case, then channel, then time point. The ``value`` column
+    is drawn uniformly from [0, 1) using NumPy's global random state.
+
+    Parameters
+    ----------
+    n_cases : int, default = 50
+        Number of cases.
+    n_channels : int, default = 2
+        Number of channels (dimensions) per case.
+    n_timepoints : int, default = 20
+        Length of each series.
+
+    Returns
+    -------
+    pd.DataFrame
+        One row per (case, channel, time point) with integer columns
+        ``case_id``, ``dim_id`` and ``reading_id`` and a float ``value`` column.
+    """
+    rows_per_case = n_timepoints * n_channels
+    total_rows = n_cases * rows_per_case
+
+    # Vectorised ids: integer floor division / modulo on the flat row number give
+    # the same values as a per-row Python loop, without going through floats.
+    row = np.arange(total_rows, dtype=int)
+    within_case = row % rows_per_case
+
+    return pd.DataFrame(
+        {
+            "case_id": row // rows_per_case,
+            "dim_id": within_case // n_timepoints,
+            "reading_id": within_case % n_timepoints,
+            "value": np.random.rand(total_rows),
+        }
+    )
+
+
+def _make_collection(
+    n_cases=20,
+    n_channels=1,
+    n_timepoints=20,
+    y=None,
+    all_positive=False,
+    random_state=None,
+    return_type="numpy3D",
+):
+    """Generate a random collection of time series in an aeon data format.
+
+    Parameters
+    ----------
+    n_cases : int, optional, default=20
+        Number of series (cases) in the collection; ignored when y is passed.
+    n_channels : int, optional, default=1
+        Number of channels (variables) in each series.
+    n_timepoints : int, optional, default=20
+        Number of time points in each series.
+    y : None (default), or 1D np.ndarray or 1D array-like, shape (n_cases, )
+        If passed, one series is generated per entry and shifted by it.
+    all_positive : bool, optional, default=False
+        If True, every value is squared so that no value is negative.
+    random_state : None (default) or int
+        Passed to sklearn's check_random_state to seed the generation.
+    return_type : str, aeon collection type, default="numpy3D"
+        Collection type the generated 3D array is converted to.
+
+    Returns
+    -------
+    X : an aeon time series data container of type return_type
+        with n_cases instances, n_channels variables, n_timepoints time points.
+        Values are i.i.d. normal with std 0.5; if y is passed, the i-th series
+        is additively shifted by y[i] * 100 before any squaring.
+    """
+    targets = None if y is None else np.asarray(y)
+    if targets is not None:
+        # one series per target value, so the requested n_cases is overridden
+        n_cases = len(targets)
+
+    generator = check_random_state(random_state)
+    # start from a 3D numpy array; conversion to return_type happens last
+    shape = (n_cases, n_channels, n_timepoints)
+    data = generator.normal(scale=0.5, size=shape)
+
+    if targets is not None:
+        # broadcast each target over its series' channels and time points
+        data = data + (targets * 100).reshape(-1, 1, 1)
+
+    if all_positive:
+        data = data**2
+
+    return convert_collection(data, return_type)
+
+
+def _make_collection_X(
+    n_cases=20,
+    n_channels=1,
+    n_timepoints=20,
+    y=None,
+    all_positive=False,
+    return_numpy=False,
+    random_state=None,
+):
+    """Generate a collection as "numpy3D" or "nested_univ" via _make_collection.
+
+    ``return_numpy`` picks the type: "numpy3D" if truthy, else "nested_univ".
+    All other arguments are forwarded to ``_make_collection`` unchanged.
+    """
+    return _make_collection(
+        n_cases=n_cases,
+        n_channels=n_channels,
+        n_timepoints=n_timepoints,
+        y=y,
+        all_positive=all_positive,
+        random_state=random_state,
+        return_type="numpy3D" if return_numpy else "nested_univ",
+    )
+
+
+def _make_classification_y(
+    n_cases=20, n_classes=2, return_numpy=True, random_state=None
+):
+    """Return n_cases shuffled labels cycling 0..n_classes-1 (array or Series)."""
+    if n_classes < 1:  # n_classes=0 used to surface as a ZeroDivisionError
+        raise ValueError("n_classes must be at least 1")
+    if n_cases < n_classes:  # equal sizes are allowed: one case per class
+        raise ValueError("n_cases must be bigger than n_classes")
+    rng = check_random_state(random_state)
+    # Labels cycle 0, 1, ..., n_classes - 1 so class counts differ by at most one.
+    y = np.arange(n_cases) % n_classes
+    rng.shuffle(y)
+    return y if return_numpy else pd.Series(y)
diff --git a/aeon/testing/data_generation/_legacy/_series.py b/aeon/testing/data_generation/_legacy/_series.py
@@ -0,0 +1,137 @@
+"""Legacy series data generators."""
+
+import numpy as np
+import pandas as pd
+from sklearn.utils import check_random_state
+
+
+def make_series(
+    n_timepoints: int = 50,
+    n_columns: int = 1,
+    all_positive: bool = True,
+    index_type=None,
+    return_numpy: bool = False,
+    random_state=None,
+    add_nan: bool = False,
+):
+    """Generate univariate or multivariate time series.
+
+    Parameters
+    ----------
+    n_timepoints : int, default = 50
+        Number of time points in the series.
+    n_columns : int, default = 1
+        Number of columns (variables) of the series.
+    all_positive : bool, default = True
+        If True, shift each column so that its smallest non-nan value is 1.
+    index_type : {"period", "datetime", "range", "int"} or None, default = None
+        Kind of pandas index to build; None is treated like "datetime".
+    return_numpy : bool, default = False
+        If True, return the bare np.ndarray without a pandas index.
+    random_state : None, int or np.random.RandomState, default = None
+        Seed or generator passed to sklearn's check_random_state.
+    add_nan : bool, default = False
+        If True, set the first, middle and last time points to nan.
+
+    Returns
+    -------
+    np.ndarray, pd.Series, pd.DataFrame
+        np.ndarray if return_numpy is True (1D when n_columns == 1),
+        otherwise pd.Series if n_columns == 1, else pd.DataFrame.
+    """
+    rng = check_random_state(random_state)
+    data = rng.normal(size=(n_timepoints, n_columns))
+    if add_nan:
+        # blank out the middle, first and last rows (all columns)
+        data[len(data) // 2] = np.nan
+        data[0] = np.nan
+        data[-1] = np.nan
+    if all_positive:
+        # nanmin, not min: with add_nan a plain minimum is nan and the shift
+        # would turn every value of the series into nan.
+        data -= np.nanmin(data, axis=0) - 1
+    if return_numpy:
+        return data.ravel() if n_columns == 1 else data
+
+    index = _make_index(n_timepoints, index_type)
+    if n_columns == 1:
+        return pd.Series(data.ravel(), index)
+    return pd.DataFrame(data, index)
+
+
+def _make_index(n_timepoints, index_type=None):
+    """Build a pandas index with ``n_timepoints`` entries for unit tests.
+
+    ``index_type`` is "period" (monthly periods from 2000-01), "datetime" or None
+    (daily timestamps from 2000-01-01), "range" (RangeIndex starting at 3) or
+    "int" (integer Index starting at 3); any other value raises ValueError.
+    """
+    if index_type == "period":
+        return pd.period_range(start="2000-01", periods=n_timepoints, freq="M")
+
+    if index_type == "datetime" or index_type is None:
+        return pd.date_range(start="2000-01-01", periods=n_timepoints, freq="D")
+
+    # integer-based indices deliberately start away from zero
+    first = 3
+    if index_type == "range":
+        return pd.RangeIndex(start=first, stop=first + n_timepoints)
+
+    if index_type == "int":
+        return pd.Index(np.arange(first, first + n_timepoints), dtype=int)
+
+    raise ValueError(f"index_class: {index_type} is not supported")
+
+
+def make_forecasting_problem(
+    n_timepoints: int = 50,
+    all_positive: bool = True,
+    index_type=None,
+    make_X: bool = False,
+    n_columns: int = 1,
+    random_state=None,
+):
+    """Generate target (and optionally exogenous) series for forecasting tests.
+
+    Parameters
+    ----------
+    n_timepoints : int, default = 50
+        Number of time points in each generated series.
+    all_positive : bool, default = True
+        Passed to make_series: whether values are shifted to be positive.
+    index_type : str or None, default = None
+        Index kind passed to make_series, e.g. "period" or "datetime".
+    make_X : bool, default = False
+        If True, also return a two-column X generated with the same settings.
+    n_columns : int, default = 1
+        Number of columns of y.
+    random_state : None, int or np.random.RandomState, default = None
+        Seed or generator, passed unchanged to both make_series calls.
+
+    Returns
+    -------
+    y : pd.Series or pd.DataFrame
+        Generated target series (a DataFrame when n_columns != 1).
+    (y, X) : tuple
+        Returned instead of y when make_X is True; X is a two-column
+        pd.DataFrame sharing y's index.
+
+    Notes
+    -----
+    With an int random_state both series are drawn from the same seed, so X is
+    not independent of y.
+    """
+    # everything except the column count is identical for y and X
+    shared = dict(
+        n_timepoints=n_timepoints,
+        all_positive=all_positive,
+        index_type=index_type,
+        random_state=random_state,
+    )
+    y = make_series(n_columns=n_columns, **shared)
+    if not make_X:
+        return y
+
+    X = make_series(n_columns=2, **shared)
+    X.index = y.index
+    return y, X
diff --git a/aeon/testing/data_generation/_legacy/_test_examples.py b/aeon/testing/data_generation/_legacy/_test_examples.py
@@ -0,0 +1,90 @@
+"""Generate test examples."""
+
+import numpy as np
+import pandas as pd
+
+# pd.Series example: one float64 series named "a"
+s1 = pd.Series([1, 4, 0.5, -3], dtype=np.float64, name="a")
+series_examples = [s1]
+# pd.DataFrame examples: univariate (d1) and multivariate (d2)
+d1 = pd.DataFrame({"a": [1, 4, 0.5, -3]})
+d2 = pd.DataFrame({"a": [1, 4, 0.5, -3], "b": [3, 7, 2, -3 / 7]})
+dataframe_examples = [d1, d2]
+# pd-multiindex, multivariate (mi1): 3 instances x 3 timepoints, equally sampled
+cols = ["instances", "timepoints"] + ["var_0", "var_1"]
+mi1list = [
+    pd.DataFrame([[0, 0, 1, 4], [0, 1, 2, 5], [0, 2, 3, 6]], columns=cols),
+    pd.DataFrame([[1, 0, 1, 4], [1, 1, 2, 55], [1, 2, 3, 6]], columns=cols),
+    pd.DataFrame([[2, 0, 1, 42], [2, 1, 2, 5], [2, 2, 3, 6]], columns=cols),
+]
+mi1 = pd.concat(mi1list)
+mi1 = mi1.set_index(["instances", "timepoints"])
+
+cols = ["instances", "timepoints"] + ["var_0"]  # univariate variant (mi2)
+mi2list = [
+    pd.DataFrame([[0, 0, 4], [0, 1, 5], [0, 2, 6]], columns=cols),
+    pd.DataFrame([[1, 0, 4], [1, 1, 55], [1, 2, 6]], columns=cols),
+    pd.DataFrame([[2, 0, 42], [2, 1, 5], [2, 2, 6]], columns=cols),
+]
+mi2 = pd.concat(mi2list)
+mi2 = mi2.set_index(["instances", "timepoints"])
+multiindex_examples = [mi1, mi2]
+
+# pd_multiindex_hier, multivariate (mih1): index levels foo, bar, timepoints
+cols = ["foo", "bar", "timepoints"] + [f"var_{i}" for i in range(2)]
+mih1list = [
+    pd.DataFrame(
+        [["a", 0, 0, 1, 4], ["a", 0, 1, 2, 5], ["a", 0, 2, 3, 6]], columns=cols
+    ),
+    pd.DataFrame(
+        [["a", 1, 0, 1, 4], ["a", 1, 1, 2, 55], ["a", 1, 2, 3, 6]], columns=cols
+    ),
+    pd.DataFrame(
+        [["a", 2, 0, 1, 42], ["a", 2, 1, 2, 5], ["a", 2, 2, 3, 6]], columns=cols
+    ),
+    pd.DataFrame(
+        [["b", 0, 0, 1, 4], ["b", 0, 1, 2, 5], ["b", 0, 2, 3, 6]], columns=cols
+    ),
+    pd.DataFrame(
+        [["b", 1, 0, 1, 4], ["b", 1, 1, 2, 55], ["b", 1, 2, 3, 6]], columns=cols
+    ),
+    pd.DataFrame(
+        [["b", 2, 0, 1, 42], ["b", 2, 1, 2, 5], ["b", 2, 2, 3, 6]], columns=cols
+    ),
+]
+mih1 = pd.concat(mih1list)
+mih1 = mih1.set_index(["foo", "bar", "timepoints"])
+
+cols = ["foo", "bar", "timepoints"] + [f"var_{i}" for i in range(1)]  # univariate
+
+mih2list = [
+    pd.DataFrame([["a", 0, 0, 1], ["a", 0, 1, 2], ["a", 0, 2, 3]], columns=cols),
+    pd.DataFrame([["a", 1, 0, 1], ["a", 1, 1, 2], ["a", 1, 2, 3]], columns=cols),
+    pd.DataFrame([["a", 2, 0, 1], ["a", 2, 1, 2], ["a", 2, 2, 3]], columns=cols),
+    pd.DataFrame([["b", 0, 0, 1], ["b", 0, 1, 2], ["b", 0, 2, 3]], columns=cols),
+    pd.DataFrame([["b", 1, 0, 1], ["b", 1, 1, 2], ["b", 1, 2, 3]], columns=cols),
+    pd.DataFrame([["b", 2, 0, 1], ["b", 2, 1, 2], ["b", 2, 2, 3]], columns=cols),
+]
+mih2 = pd.concat(mih2list)
+mih2 = mih2.set_index(["foo", "bar", "timepoints"])
+mih_examples = [mih1, mih2]
+
+np1 = np.array([1, 2, 3, 4, 5])  # 1D, shape (5,)
+np2 = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])  # 2D, shape (2, 5)
+np_examples = [np1, np2]
+
+
+def get_examples(datatype: str):
+    """Return the predefined list of example objects for a datatype string."""
+    # name -> module-level example list; unknown names fall through to the error
+    known = (
+        ("pd.Series", series_examples),
+        ("pd.DataFrame", dataframe_examples),
+        ("pd-multiindex", multiindex_examples),
+        ("pd_multiindex_hier", mih_examples),
+        ("np.ndarray", np_examples),
+    )
+    for name, examples in known:
+        if datatype == name:
+            return examples
+    raise ValueError(f"Unknown datatype : {datatype} in get examples.")