Skip to content

Commit 98baa26

Browse files
committed
revert legacy data_gen
1 parent c535530 commit 98baa26

File tree

4 files changed

+393
-0
lines changed

4 files changed

+393
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Legacy data generators."""
2+
3+
__all__ = [
4+
"make_example_long_table",
5+
"_make_collection",
6+
"_make_collection_X",
7+
"_make_classification_y",
8+
"make_series",
9+
"make_forecasting_problem",
10+
"_make_index",
11+
"get_examples",
12+
]
13+
14+
from aeon.testing.data_generation._legacy._collection import (
15+
_make_classification_y,
16+
_make_collection,
17+
_make_collection_X,
18+
make_example_long_table,
19+
)
20+
from aeon.testing.data_generation._legacy._series import (
21+
_make_index,
22+
make_forecasting_problem,
23+
make_series,
24+
)
25+
from aeon.testing.data_generation._legacy._test_examples import get_examples
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
import numpy as np
2+
import pandas as pd
3+
from sklearn.utils import check_random_state
4+
5+
from aeon.utils.conversion import convert_collection
6+
7+
8+
def make_example_long_table(
    n_cases: int = 50, n_channels: int = 2, n_timepoints: int = 20
) -> pd.DataFrame:
    """Generate an example collection in long table format.

    Every row of the result is a single observation, identified by the case it
    belongs to, the channel within that case and the position in the series.

    Parameters
    ----------
    n_cases : int, default = 50
        Number of cases.
    n_channels : int, default = 2
        Number of channels per case.
    n_timepoints : int, default = 20
        Length of each series.

    Returns
    -------
    pd.DataFrame
        DataFrame with ``n_cases * n_channels * n_timepoints`` rows and the
        columns ``case_id``, ``dim_id``, ``reading_id`` and ``value``. Rows are
        ordered by case, then channel, then time point. ``value`` holds
        uniform random numbers in [0, 1) drawn with ``np.random.rand``.
    """
    rows_per_case = n_timepoints * n_channels
    total_rows = n_cases * rows_per_case

    # Uses the global numpy RNG, so np.random.seed controls the values.
    vals = np.random.rand(total_rows)

    # Build the id columns in one shot instead of looping over every row:
    # case ids change slowest, time point ids fastest.
    case_ids = np.repeat(np.arange(n_cases, dtype=int), rows_per_case)
    dims = np.tile(
        np.repeat(np.arange(n_channels, dtype=int), n_timepoints), n_cases
    )
    idxs = np.tile(np.arange(n_timepoints, dtype=int), n_cases * n_channels)

    return pd.DataFrame(
        {
            "case_id": case_ids,
            "dim_id": dims,
            "reading_id": idxs,
            "value": vals,
        }
    )
47+
48+
49+
def _make_collection(
    n_cases=20,
    n_channels=1,
    n_timepoints=20,
    y=None,
    all_positive=False,
    random_state=None,
    return_type="numpy3D",
):
    """Create a random collection of time series for testing.

    Parameters
    ----------
    n_cases : int, default=20
        Number of series in the collection. Overridden by ``len(y)`` when
        ``y`` is given.
    n_channels : int, default=1
        Number of channels in each series.
    n_timepoints : int, default=20
        Number of time points in each series.
    y : None or 1D array-like of shape (n_cases,), default=None
        Target values. When given, one series is generated per entry and the
        i-th series is shifted by ``y[i] * 100``.
    all_positive : bool, default=False
        If True, the generated values are squared before being returned.
    random_state : None or int, default=None
        Seed, forwarded to ``check_random_state``.
    return_type : str, default="numpy3D"
        Collection type the data is converted to with ``convert_collection``.

    Returns
    -------
    X : collection of type ``return_type``
        Data generated as an array of shape
        ``(n_cases, n_channels, n_timepoints)`` with i.i.d. normal values of
        standard deviation 0.5, then shifted / squared as described above.
    """
    has_target = y is not None
    if has_target:
        # One series per target value: the n_cases argument is ignored.
        y = np.asarray(y)
        n_cases = len(y)

    rng = check_random_state(random_state)
    shape = (n_cases, n_channels, n_timepoints)
    X = rng.normal(scale=0.5, size=shape)

    if has_target:
        # Broadcast one offset per case over its channels and time points.
        offsets = (y * 100).reshape(-1, 1, 1)
        X = X + offsets

    if all_positive:
        X = X**2

    return convert_collection(X, return_type)
102+
103+
104+
def _make_collection_X(
    n_cases=20,
    n_channels=1,
    n_timepoints=20,
    y=None,
    all_positive=False,
    return_numpy=False,
    random_state=None,
):
    """Generate a test collection as "numpy3D" or "nested_univ".

    Thin wrapper around ``_make_collection`` that picks the return type from
    a boolean flag; all other arguments are forwarded unchanged.

    Parameters
    ----------
    n_cases, n_channels, n_timepoints, y, all_positive, random_state
        Forwarded to ``_make_collection``.
    return_numpy : bool, default=False
        If True the collection is returned as "numpy3D", otherwise as
        "nested_univ".

    Returns
    -------
    X : collection of the selected type
    """
    return_type = "numpy3D" if return_numpy else "nested_univ"
    forwarded = {
        "n_cases": n_cases,
        "n_channels": n_channels,
        "n_timepoints": n_timepoints,
        "y": y,
        "all_positive": all_positive,
        "random_state": random_state,
        "return_type": return_type,
    }
    return _make_collection(**forwarded)
127+
128+
129+
def _make_classification_y(
    n_cases=20, n_classes=2, return_numpy=True, random_state=None
):
    """Generate shuffled integer class labels for testing.

    Labels ``0 .. n_classes - 1`` are repeated in order until ``n_cases``
    labels exist, then shuffled in place.

    Parameters
    ----------
    n_cases : int, default=20
        Number of labels to generate.
    n_classes : int, default=2
        Number of distinct class labels.
    return_numpy : bool, default=True
        If True return a np.ndarray, otherwise a pd.Series.
    random_state : None or int, default=None
        Seed, forwarded to ``check_random_state``.

    Returns
    -------
    y : np.ndarray or pd.Series of length n_cases

    Raises
    ------
    ValueError
        If ``n_cases`` is smaller than ``n_classes``.
    """
    if n_cases < n_classes:
        raise ValueError("n_cases must be bigger than n_classes")

    rng = check_random_state(random_state)
    # Tile whole blocks of labels, then cut down to the requested length.
    blocks_needed = int(np.ceil(n_cases / n_classes))
    labels = np.tile(np.arange(n_classes), blocks_needed)[:n_cases]
    rng.shuffle(labels)

    return labels if return_numpy else pd.Series(labels)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
"""Legacy series data generators."""
2+
3+
import numpy as np
4+
import pandas as pd
5+
from sklearn.utils import check_random_state
6+
7+
8+
def make_series(
    n_timepoints: int = 50,
    n_columns: int = 1,
    all_positive: bool = True,
    index_type=None,
    return_numpy: bool = False,
    random_state=None,
    add_nan: bool = False,
):
    """Generate univariate or multivariate time series.

    Parameters
    ----------
    n_timepoints : int, default = 50
        Num of timepoints in series.
    n_columns : int, default = 1
        Number of columns of y.
    all_positive : bool, default = True
        If True, each column is shifted so that the minimum of the generated
        values is 1.
    index_type : {"period", "datetime", "range", "int"} or None, default = None
        Kind of pandas index to attach, see ``_make_index``. Ignored when
        ``return_numpy`` is True.
    return_numpy : bool, default = False
        If True return the raw np.ndarray instead of a pandas object.
    random_state : None or int, default = None
        Seed, forwarded to ``check_random_state``.
    add_nan : bool, default = False
        If True, the first, middle and last time points are set to nan.

    Returns
    -------
    np.ndarray, pd.Series, pd.DataFrame
        np.ndarray if return_numpy is True (1D when n_columns == 1)
        pd.Series if n_columns == 1
        else pd.DataFrame
    """
    rng = check_random_state(random_state)
    data = rng.normal(size=(n_timepoints, n_columns))
    if all_positive:
        # Shift before any nan is inserted: np.min propagates nan, so doing
        # this afterwards would turn every value of the series into nan.
        data -= np.min(data, axis=0) - 1
    if add_nan:
        # add some nan values
        data[len(data) // 2] = np.nan
        data[0] = np.nan
        data[-1] = np.nan
    if return_numpy:
        return data.ravel() if n_columns == 1 else data

    index = _make_index(n_timepoints, index_type)
    if n_columns == 1:
        return pd.Series(data.ravel(), index)
    return pd.DataFrame(data, index)
60+
61+
62+
def _make_index(n_timepoints, index_type=None):
    """Build a pandas index with ``n_timepoints`` entries for unit testing.

    Parameters
    ----------
    n_timepoints : int
        Number of entries in the index.
    index_type : {"period", "datetime", "range", "int"} or None, default=None
        "period" gives monthly periods starting at 2000-01, "datetime" or
        None gives daily timestamps starting at 2000-01-01, "range" gives a
        RangeIndex starting at 3 and "int" an integer Index starting at 3.

    Returns
    -------
    pd.Index
        Index of the requested kind.

    Raises
    ------
    ValueError
        If ``index_type`` is none of the supported values.
    """
    if index_type == "period":
        return pd.period_range(start="2000-01", periods=n_timepoints, freq="M")

    if index_type == "datetime" or index_type is None:
        return pd.date_range(start="2000-01-01", periods=n_timepoints, freq="D")

    # Integer based indices start at 3 to check non-zero based indices.
    first = 3
    if index_type == "range":
        return pd.RangeIndex(start=first, stop=first + n_timepoints)

    if index_type == "int":
        return pd.Index(np.arange(first, first + n_timepoints), dtype=int)

    raise ValueError(f"index_class: {index_type} is not supported")
84+
85+
86+
def make_forecasting_problem(
    n_timepoints: int = 50,
    all_positive: bool = True,
    index_type=None,
    make_X: bool = False,
    n_columns: int = 1,
    random_state=None,
):
    """Return test data for forecasting tests.

    Parameters
    ----------
    n_timepoints : int, default = 50
        Num of timepoints in series.
    all_positive : bool, default = True
        Only positive values or not.
    index_type : {"period", "datetime", "range", "int"} or None, default = None
        Kind of pandas index to use, forwarded to ``make_series``.
    make_X : bool, default = False
        Should X data also be returned.
    n_columns : int, default = 1
        Number of columns of y.
    random_state : None or int, default = None
        Seed, forwarded to ``make_series`` for both y and X.

    Returns
    -------
    y : pd.Series or pd.DataFrame
        Generated series (a DataFrame when n_columns > 1); returned alone if
        not make_X.
    (y, X) : tuple
        If make_X, y together with a two-column pd.DataFrame X that shares
        y's index.
    """
    # Everything except the column count is identical for y and X.
    shared = {
        "n_timepoints": n_timepoints,
        "all_positive": all_positive,
        "index_type": index_type,
        "random_state": random_state,
    }

    y = make_series(n_columns=n_columns, **shared)
    if not make_X:
        return y

    X = make_series(n_columns=2, **shared)
    X.index = y.index
    return y, X
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
"""Generate test examples."""
2+
3+
import numpy as np
4+
import pandas as pd
5+
6+
# pd.Series: a single float series named "a"
s1 = pd.Series(data=[1, 4, 0.5, -3], name="a", dtype=np.float64)
series_examples = [s1]

# pd.DataFrame: one univariate and one multivariate frame
d1 = pd.DataFrame({"a": [1, 4, 0.5, -3]})
d2 = pd.DataFrame(
    {
        "a": [1, 4, 0.5, -3],
        "b": [3, 7, 2, -3 / 7],
    }
)
dataframe_examples = [d1, d2]

# pd-multiindex: three instances with three time points each, equally sampled.
# Every instance is built as its own frame, then stacked and re-indexed.
cols = ["instances", "timepoints"] + ["var_0", "var_1"]
mi1list = [
    pd.DataFrame(rows, columns=cols)
    for rows in (
        [[0, 0, 1, 4], [0, 1, 2, 5], [0, 2, 3, 6]],
        [[1, 0, 1, 4], [1, 1, 2, 55], [1, 2, 3, 6]],
        [[2, 0, 1, 42], [2, 1, 2, 5], [2, 2, 3, 6]],
    )
]
mi1 = pd.concat(mi1list).set_index(["instances", "timepoints"])

# univariate variant of the frame above
cols = ["instances", "timepoints"] + ["var_0"]
mi2list = [
    pd.DataFrame(rows, columns=cols)
    for rows in (
        [[0, 0, 4], [0, 1, 5], [0, 2, 6]],
        [[1, 0, 4], [1, 1, 55], [1, 2, 6]],
        [[2, 0, 42], [2, 1, 5], [2, 2, 6]],
    )
]
mi2 = pd.concat(mi2list).set_index(["instances", "timepoints"])
multiindex_examples = [mi1, mi2]

# pd_multiindex_hier: two index levels ("foo", "bar") above the time points
cols = ["foo", "bar", "timepoints"] + [f"var_{i}" for i in range(2)]
mih1list = [
    pd.DataFrame(rows, columns=cols)
    for rows in (
        [["a", 0, 0, 1, 4], ["a", 0, 1, 2, 5], ["a", 0, 2, 3, 6]],
        [["a", 1, 0, 1, 4], ["a", 1, 1, 2, 55], ["a", 1, 2, 3, 6]],
        [["a", 2, 0, 1, 42], ["a", 2, 1, 2, 5], ["a", 2, 2, 3, 6]],
        [["b", 0, 0, 1, 4], ["b", 0, 1, 2, 5], ["b", 0, 2, 3, 6]],
        [["b", 1, 0, 1, 4], ["b", 1, 1, 2, 55], ["b", 1, 2, 3, 6]],
        [["b", 2, 0, 1, 42], ["b", 2, 1, 2, 5], ["b", 2, 2, 3, 6]],
    )
]
mih1 = pd.concat(mih1list).set_index(["foo", "bar", "timepoints"])

# univariate variant of the hierarchical frame
cols = ["foo", "bar", "timepoints"] + [f"var_{i}" for i in range(1)]
mih2list = [
    pd.DataFrame(rows, columns=cols)
    for rows in (
        [["a", 0, 0, 1], ["a", 0, 1, 2], ["a", 0, 2, 3]],
        [["a", 1, 0, 1], ["a", 1, 1, 2], ["a", 1, 2, 3]],
        [["a", 2, 0, 1], ["a", 2, 1, 2], ["a", 2, 2, 3]],
        [["b", 0, 0, 1], ["b", 0, 1, 2], ["b", 0, 2, 3]],
        [["b", 1, 0, 1], ["b", 1, 1, 2], ["b", 1, 2, 3]],
        [["b", 2, 0, 1], ["b", 2, 1, 2], ["b", 2, 2, 3]],
    )
]
mih2 = pd.concat(mih2list).set_index(["foo", "bar", "timepoints"])
mih_examples = [mih1, mih2]

# np.ndarray: one 1D and one 2D array
np1 = np.array([1, 2, 3, 4, 5])
np2 = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
np_examples = [np1, np2]
75+
76+
77+
def get_examples(datatype: str):
    """Return the predefined example objects for a data type.

    Parameters
    ----------
    datatype : str
        One of "pd.Series", "pd.DataFrame", "pd-multiindex",
        "pd_multiindex_hier" or "np.ndarray".

    Returns
    -------
    list
        The module-level list of examples for ``datatype``. The list itself
        is returned, not a copy. "pd.Series" has one example, every other
        type has two.

    Raises
    ------
    ValueError
        If ``datatype`` is not one of the supported names.
    """
    examples_by_type = {
        "pd.Series": series_examples,
        "pd.DataFrame": dataframe_examples,
        "pd-multiindex": multiindex_examples,
        "pd_multiindex_hier": mih_examples,
        "np.ndarray": np_examples,
    }
    try:
        return examples_by_type[datatype]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which are unknown types too.
        raise ValueError(
            f"Unknown datatype : {datatype} in get examples."
        ) from None

0 commit comments

Comments
 (0)