From 5ec46931b1d4057cdb439cbd3705c449743a4952 Mon Sep 17 00:00:00 2001 From: Antoine Guillaume Date: Sat, 5 Oct 2024 22:48:54 +0200 Subject: [PATCH] [MNT] Fix RDST CI failure and slow down (#2121) * Remove distance argument for RDST * replace old to new random generator API, uniformize dtypes * fix visualization * put back regressors on ignore list --- aeon/classification/shapelet_based/_rdst.py | 6 - aeon/testing/testing_config.py | 2 + .../_dilated_shapelet_transform.py | 152 ++++++++---------- .../tests/test_dilated_shapelet_transform.py | 16 +- aeon/utils/numba/general.py | 26 +-- aeon/visualisation/estimator/_shapelets.py | 52 +++++- examples/classification/shapelet_based.ipynb | 5 +- 7 files changed, 136 insertions(+), 123 deletions(-) diff --git a/aeon/classification/shapelet_based/_rdst.py b/aeon/classification/shapelet_based/_rdst.py index 2e0b0ab6af..0c5c92bb5a 100644 --- a/aeon/classification/shapelet_based/_rdst.py +++ b/aeon/classification/shapelet_based/_rdst.py @@ -60,9 +60,6 @@ class RDSTClassifier(BaseClassifier): If True, restricts the value of the shapelet dilation parameter to be prime values. This can greatly speed-up the algorithm for long time series and/or short shapelet lengths, possibly at the cost of some accuracy. - distance: str="manhattan" - Name of the distance function to be used. By default this is the - manhattan distance. Other distances from the aeon distance modules can be used. estimator : BaseEstimator or None, default=None Base estimator for the ensemble, can be supplied a sklearn `BaseEstimator`. If `None` a default `RidgeClassifierCV` classifier is used with standard scaling. 
@@ -147,7 +144,6 @@ def __init__( threshold_percentiles=None, alpha_similarity: float = 0.5, use_prime_dilations: bool = False, - distance: str = "manhattan", estimator=None, save_transformed_data: bool = False, class_weight=None, @@ -160,7 +156,6 @@ def __init__( self.threshold_percentiles = threshold_percentiles self.alpha_similarity = alpha_similarity self.use_prime_dilations = use_prime_dilations - self.distance = distance self.estimator = estimator self.save_transformed_data = save_transformed_data self.class_weight = class_weight @@ -203,7 +198,6 @@ def _fit(self, X, y): use_prime_dilations=self.use_prime_dilations, n_jobs=self.n_jobs, random_state=self.random_state, - distance=self.distance, ) if self.estimator is None: self._estimator = make_pipeline( diff --git a/aeon/testing/testing_config.py b/aeon/testing/testing_config.py index 2a6ca32a5b..9d8f96abac 100644 --- a/aeon/testing/testing_config.py +++ b/aeon/testing/testing_config.py @@ -15,10 +15,12 @@ EXCLUDE_ESTIMATORS = [ "SeriesSearch", "QuerySearch", + "ClearSkyTransformer", # See #2071 "RandomDilatedShapeletTransform", "RDSTClassifier", "RDSTRegressor", + "RISTRegressor", ] diff --git a/aeon/transformations/collection/shapelet_based/_dilated_shapelet_transform.py b/aeon/transformations/collection/shapelet_based/_dilated_shapelet_transform.py index 714d1a826b..48af1d3bf4 100644 --- a/aeon/transformations/collection/shapelet_based/_dilated_shapelet_transform.py +++ b/aeon/transformations/collection/shapelet_based/_dilated_shapelet_transform.py @@ -14,11 +14,10 @@ import numpy as np from numba import njit, prange, set_num_threads -from numba.core.registry import CPUDispatcher from numba.typed import List +from numpy.random._generator import Generator from sklearn.preprocessing import LabelEncoder -from aeon.distances import get_distance_function from aeon.transformations.collection import BaseCollectionTransformer from aeon.utils.numba.general import ( AEON_NUMBA_STD_THRESHOLD, @@ -26,7 +25,6 @@ 
combinations_1d, get_subsequence, get_subsequence_with_mean_std, - set_numba_random_seed, sliding_mean_std_one_series, ) from aeon.utils.numba.stats import prime_up_to @@ -87,9 +85,6 @@ class RandomDilatedShapeletTransform(BaseCollectionTransformer): If True, restricts the value of the shapelet dilation parameter to be prime values. This can greatly speed up the algorithm for long time series and/or short shapelet lengths, possibly at the cost of some accuracy. - distance: str="manhattan" - Name of the distance function to be used. By default this is the - manhattan distance. Other distances from the aeon distance modules can be used. n_jobs : int, default=1 The number of threads used for both `fit` and `transform`. random_state : int or None, default=None @@ -161,7 +156,6 @@ def __init__( alpha_similarity: float = 0.5, use_prime_dilations: bool = False, random_state: Optional[int] = None, - distance: CPUDispatcher = "manhattan", n_jobs: int = 1, ): self.max_shapelets = max_shapelets @@ -171,7 +165,6 @@ def __init__( self.alpha_similarity = alpha_similarity self.use_prime_dilations = use_prime_dilations self.random_state = random_state - self.distance = distance self.n_jobs = n_jobs super().__init__() @@ -193,12 +186,15 @@ def _fit(self, X: np.ndarray, y: Optional[Union[np.ndarray, TypingList]] = None) self : RandomDilatedShapeletTransform This estimator. 
""" - self.distance_func = get_distance_function(self.distance) - if isinstance(self.random_state, int): - self._random_state = np.int32(self.random_state) + self._random_generator = np.random.default_rng(self.random_state) + elif self.random_state is None: + self._random_generator = np.random.default_rng() else: - self._random_state = np.int32(np.random.randint(0, 2**31)) + raise ValueError( + "Expected integer or None for random_state argument but got" + f"{type(self.random_state)}" + ) n_cases_ = len(X) self.min_n_timepoints_ = min([X[i].shape[1] for i in range(n_cases_)]) @@ -228,8 +224,7 @@ def _fit(self, X: np.ndarray, y: Optional[Union[np.ndarray, TypingList]] = None) self.threshold_percentiles_, self.alpha_similarity, self.use_prime_dilations, - self._random_state, - self.distance_func, + self._random_generator, ) if len(self.shapelets_[0]) == 0: raise RuntimeError( @@ -276,7 +271,6 @@ def _transform( X_new = dilated_shapelet_transform( X, self.shapelets_, - self.distance_func, ) if np.isinf(X_new).any() or np.isnan(X_new).any(): warnings.warn( @@ -310,7 +304,7 @@ def _check_input_params(self): "an array (got {}).".format(self.shapelet_lengths_) ) - self.shapelet_lengths_ = np.array(self.shapelet_lengths_, dtype=np.int32) + self.shapelet_lengths_ = np.array(self.shapelet_lengths_, dtype=np.int_) if not np.all(self.shapelet_lengths_ >= 2): warnings.warn( "Some values in 'shapelet_lengths' are inferior to 2." @@ -389,6 +383,7 @@ def _init_random_shapelet_params( use_prime_dilations: bool, n_channels: int, n_timepoints: int, + random_generator: Generator, ): """Randomly initialize the parameters of the shapelets. @@ -414,6 +409,8 @@ def _init_random_shapelet_params( Number of channels of the input time series. n_timepoints : int Size of the input time series. + random_generator : + The random generator used for random operations. 
Returns ------- @@ -438,46 +435,52 @@ def _init_random_shapelet_params( """ # Init startpoint array - startpoints = np.zeros(max_shapelets, dtype=np.int32) + startpoints = np.zeros(max_shapelets, dtype=np.int_) # Init class array - classes = np.zeros(max_shapelets, dtype=np.int32) + classes = np.zeros(max_shapelets, dtype=np.int_) # Lengths of the shapelets # test dtypes correctness - lengths = np.random.choice(shapelet_lengths, size=max_shapelets).astype(np.int32) + lengths = shapelet_lengths[ + random_generator.integers( + 0, high=len(shapelet_lengths), size=max_shapelets + ).astype(np.int_) + ] # Upper bound values for dilations - dilations = np.zeros(max_shapelets, dtype=np.int32) + dilations = np.zeros(max_shapelets, dtype=np.int_) upper_bounds = np.log2(np.floor_divide(n_timepoints - 1, lengths - 1)) if use_prime_dilations: - _primes = prime_up_to(np.int32(2 ** upper_bounds.max())) + _primes = prime_up_to(np.int_(2 ** upper_bounds.max())) # 1 is not prime, but it is still a valid dilation for the "prime" scheme - primes = np.zeros((_primes.shape[0] + 1), dtype=np.int32) + primes = np.zeros((_primes.shape[0] + 1), dtype=np.int_) primes[0] = 1 primes[1:] = _primes for i in prange(max_shapelets): - shp_primes = primes[primes <= np.int32(2 ** upper_bounds[i])] - dilations[i] = shp_primes[choice_log(shp_primes.shape[0], 1)[0]] + shp_primes = primes[primes <= np.int_(2 ** upper_bounds[i])] + dilations[i] = shp_primes[ + choice_log(shp_primes.shape[0], 1, random_generator)[0] + ] else: for i in prange(max_shapelets): - dilations[i] = np.int32(2 ** np.random.uniform(0, upper_bounds[i])) + dilations[i] = np.int_(2 ** random_generator.uniform(0, upper_bounds[i])) # Init threshold array - threshold = np.zeros(max_shapelets, dtype=np.float64) + threshold = np.zeros(max_shapelets, dtype=np.float_) # Init values array values = np.full( (max_shapelets, n_channels, max(shapelet_lengths)), np.inf, - dtype=np.float64, + dtype=np.float_, ) # Is shapelet using z-normalization ? 
- normalize = np.random.random(size=max_shapelets) + normalize = random_generator.uniform(0, 1, size=max_shapelets) normalize = normalize < proba_normalization - means = np.zeros((max_shapelets, n_channels), dtype=np.float64) - stds = np.zeros((max_shapelets, n_channels), dtype=np.float64) + means = np.zeros((max_shapelets, n_channels), dtype=np.float_) + stds = np.zeros((max_shapelets, n_channels), dtype=np.float_) return ( values, @@ -493,14 +496,14 @@ def _init_random_shapelet_params( @njit(cache=True) -def _get_admissible_sampling_point(current_mask): +def _get_admissible_sampling_point(current_mask, random_generator): n_cases = len(current_mask) # Count the number of admissible points per sample as cumsum n_admissible_points = 0 for i in range(n_cases): n_admissible_points += current_mask[i].shape[0] if n_admissible_points > 0: - idx_choice = np.random.choice(n_admissible_points) + idx_choice = random_generator.integers(0, high=n_admissible_points) for i in range(n_cases): _new_val = idx_choice - current_mask[i].shape[0] if _new_val < 0 and current_mask[i].shape[0] > 0: @@ -520,8 +523,7 @@ def random_dilated_shapelet_extraction( threshold_percentiles: np.ndarray, alpha_similarity: float, use_prime_dilations: bool, - seed: int, - distance: CPUDispatcher, + random_generator: Generator, ): """Randomly generate a set of shapelets given the input parameters. @@ -556,12 +558,8 @@ def random_dilated_shapelet_extraction( If True, restrict the value of the shapelet dilation parameter to be prime values. This can greatly speed up the algorithm for long time series and/or short shapelet length, possibly at the cost of some accuracy. - seed : int - Seed for random number generation. - distance: CPUDispatcher - A Numba function used to compute the distance between two multidimensional - time series of shape (n_channels, length). 
Used as distance function between - shapelets and candidate subsequences + random_generator : Generator + The random generator used for random operations. Returns ------- @@ -588,13 +586,11 @@ def random_dilated_shapelet_extraction( """ n_cases = len(X) n_channels = X[0].shape[0] - n_timepointss = np.zeros(n_cases, dtype=np.int64) + n_timepointss = np.zeros(n_cases, dtype=np.int_) for i in range(n_cases): n_timepointss[i] = X[i].shape[1] min_n_timepoints = n_timepointss.min() max_n_timepoints = n_timepointss.max() - # Fix the random seed - set_numba_random_seed(seed) # Initialize shapelets ( @@ -614,6 +610,7 @@ def random_dilated_shapelet_extraction( use_prime_dilations, n_channels, min_n_timepoints, + random_generator, ) # Get unique dilations to loop over unique_dil = np.unique(dilations) @@ -647,7 +644,9 @@ def random_dilated_shapelet_extraction( for _i in range(n_cases) ] ) - idx_sample, idx_timestamp = _get_admissible_sampling_point(current_mask) + idx_sample, idx_timestamp = _get_admissible_sampling_point( + current_mask, random_generator + ) if idx_sample >= 0: # Update the mask in two directions from the sampling point alpha_size = length - int(max(1, (1 - alpha_similarity) * min_len)) @@ -679,7 +678,9 @@ def random_dilated_shapelet_extraction( loc_others = np.where(y == y[idx_sample])[0] if loc_others.shape[0] > 1: loc_others = loc_others[loc_others != idx_sample] - id_test = np.random.choice(loc_others) + id_test = loc_others[ + random_generator.integers(0, high=loc_others.shape[0]) + ] else: id_test = idx_sample @@ -691,12 +692,12 @@ def random_dilated_shapelet_extraction( X[id_test], length, dilation ) X_subs = normalize_subsequences(X_subs, X_means, X_stds) - x_dist = compute_shapelet_dist_vector(X_subs, _val, length, distance) + x_dist = compute_shapelet_dist_vector(X_subs, _val) lower_bound = np.percentile(x_dist, threshold_percentiles[0]) upper_bound = np.percentile(x_dist, threshold_percentiles[1]) - threshold[i_shp] = 
np.random.uniform(lower_bound, upper_bound) + threshold[i_shp] = random_generator.uniform(lower_bound, upper_bound) values[i_shp, :, :length] = _val # Extract the starting point index of the shapelet @@ -741,7 +742,6 @@ def dilated_shapelet_transform( np.ndarray, np.ndarray, ], - distance: CPUDispatcher, ): """Perform the shapelet transform with a set of shapelets and a set of time series. @@ -769,9 +769,6 @@ def dilated_shapelet_transform( Standard deviation of the shapelets - classes : array, shape (max_shapelets) An initialized (empty) startpoint array for each shapelet - distance: CPUDispatcher - A Numba function used to compute the distance between two multidimensional - time series of shape (n_channels, length). Returns @@ -810,9 +807,7 @@ def dilated_shapelet_transform( idx_no_norm = id_shps[np.where(~normalize[id_shps])[0]] for i_shp in idx_no_norm: X_new[i_x, (n_ft * i_shp) : (n_ft * i_shp + n_ft)] = ( - compute_shapelet_features( - X_subs, values[i_shp], length, threshold[i_shp], distance - ) + compute_shapelet_features(X_subs, values[i_shp], threshold[i_shp]) ) idx_norm = id_shps[np.where(normalize[id_shps])[0]] @@ -822,7 +817,7 @@ def dilated_shapelet_transform( for i_shp in idx_norm: X_new[i_x, (n_ft * i_shp) : (n_ft * i_shp + n_ft)] = ( compute_shapelet_features( - X_subs, values[i_shp], length, threshold[i_shp], distance + X_subs, values[i_shp], threshold[i_shp] ) ) return X_new @@ -830,9 +825,9 @@ def dilated_shapelet_transform( @njit(fastmath=True, cache=True) def normalize_subsequences( - X_subs: np.ndarray[np.float64], - X_means: np.ndarray[np.float64], - X_stds: np.ndarray[np.float64], + X_subs: np.ndarray[np.float_], + X_means: np.ndarray[np.float_], + X_stds: np.ndarray[np.float_], ): """ Generate subsequences from a time series given the length and dilation parameters. 
@@ -866,7 +861,7 @@ def normalize_subsequences( @njit(fastmath=True, cache=True) def get_all_subsequences(X: np.ndarray, length: int, dilation: int) -> np.ndarray: """ - Generate subsequences from a time series given the length and dilation parameters. + Generate a view of subsequences from a time series given length and dilation values. Parameters ---------- @@ -880,27 +875,20 @@ def get_all_subsequences(X: np.ndarray, length: int, dilation: int) -> np.ndarra Returns ------- array, shape = (n_timestamps-(length-1)*dilation, n_channels, length) - Subsequences of the input time series. + The view of the subsequences of the input time series. """ - n_channels, n_timestamps = X.shape - n_subsequences = n_timestamps - (length - 1) * dilation - X_subs = np.zeros((n_subsequences, n_channels, length)) - for i_sub in prange(n_subsequences): - for i_channel in prange(n_channels): - for i_length in prange(length): - X_subs[i_sub, i_channel, i_length] = X[ - i_channel, i_sub + (i_length * dilation) - ] - return X_subs + n_features, n_timestamps = X.shape + s0, s1 = X.strides + out_shape = (n_timestamps - (length - 1) * dilation, n_features, np.int64(length)) + strides = (s1, s0, s1 * dilation) + return np.lib.stride_tricks.as_strided(X, shape=out_shape, strides=strides) @njit(fastmath=True, cache=True) def compute_shapelet_features( X_subs: np.ndarray, values: np.ndarray, - length: int, threshold: float, - distance: CPUDispatcher, ): """Extract the features from a shapelet distance vector. @@ -921,9 +909,6 @@ def compute_shapelet_features( Length of the shapelet threshold : float The threshold parameter of the shapelet - distance: CPUDispatcher - A Numba function used to compute the distance between two multidimensional - time series of shape (n_channels, length). 
Returns ------- @@ -934,25 +919,26 @@ def compute_shapelet_features( _argmin = np.inf _SO = 0 - n_subsequences = X_subs.shape[0] + n_subsequences, n_channels, length = X_subs.shape for i_sub in prange(n_subsequences): - _dist = distance(X_subs[i_sub], values[:, :length]) + _dist = 0 + for k in prange(n_channels): + for i_len in prange(length): + _dist += abs(X_subs[i_sub, k, i_len] - values[k, i_len]) if _dist < _min: _min = _dist _argmin = i_sub if _dist < threshold: _SO += 1 - return np.float64(_min), np.float64(_argmin), np.float64(_SO) + return np.float_(_min), np.float_(_argmin), np.float_(_SO) @njit(fastmath=True, cache=True) def compute_shapelet_dist_vector( X_subs: np.ndarray, values: np.ndarray, - length: int, - distance: CPUDispatcher, ): """Extract the features from a shapelet distance vector. @@ -980,8 +966,10 @@ def compute_shapelet_dist_vector( dist_vector : array, shape = (n_timestamps-(length-1)*dilation) The distance vector between the shapelets and candidate subsequences """ - n_subsequences = X_subs.shape[0] + n_subsequences, n_channels, length = X_subs.shape dist_vector = np.zeros(n_subsequences) for i_sub in prange(n_subsequences): - dist_vector[i_sub] = distance(X_subs[i_sub], values[:, :length]) + for k in prange(n_channels): + for i_len in prange(length): + dist_vector[i_sub] += abs(X_subs[i_sub, k, i_len] - values[k, i_len]) return dist_vector diff --git a/aeon/transformations/collection/shapelet_based/tests/test_dilated_shapelet_transform.py b/aeon/transformations/collection/shapelet_based/tests/test_dilated_shapelet_transform.py index 2391af5135..124cf07d02 100644 --- a/aeon/transformations/collection/shapelet_based/tests/test_dilated_shapelet_transform.py +++ b/aeon/transformations/collection/shapelet_based/tests/test_dilated_shapelet_transform.py @@ -144,9 +144,7 @@ def test_compute_shapelet_features(dtype): dilation = 1 threshold = 0.01 X_subs = get_all_subsequences(X, length, dilation) - _min, _argmin, SO = compute_shapelet_features( 
- X_subs, values, length, threshold, manhattan_distance - ) + _min, _argmin, SO = compute_shapelet_features(X_subs, values, threshold) # On some occasion, float32 precision with fasmath retruns things like # 2.1835059227370834e-07 instead of 0 @@ -157,9 +155,7 @@ def test_compute_shapelet_features(dtype): dilation = 2 threshold = 0.1 X_subs = get_all_subsequences(X, length, dilation) - _min, _argmin, SO = compute_shapelet_features( - X_subs, values, length, threshold, manhattan_distance - ) + _min, _argmin, SO = compute_shapelet_features(X_subs, values, threshold) assert_almost_equal(_min, 0.0, decimal=4) assert _argmin == 7.0 @@ -168,9 +164,7 @@ def test_compute_shapelet_features(dtype): dilation = 4 threshold = 2 X_subs = get_all_subsequences(X, length, dilation) - _min, _argmin, SO = compute_shapelet_features( - X_subs, values, length, threshold, manhattan_distance - ) + _min, _argmin, SO = compute_shapelet_features(X_subs, values, threshold) assert_almost_equal(_min, 0.0, decimal=4) assert _argmin == 3.0 @@ -185,9 +179,7 @@ def test_compute_shapelet_dist_vector(dtype): for dilation in [1, 3, 5]: values = np.random.rand(3, length).astype(dtype) X_subs = get_all_subsequences(X, length, dilation) - d_vect = compute_shapelet_dist_vector( - X_subs, values, length, manhattan_distance - ) + d_vect = compute_shapelet_dist_vector(X_subs, values) true_vect = np.zeros(X.shape[1] - (length - 1) * dilation) for i_sub in range(true_vect.shape[0]): _idx = [i_sub + j * dilation for j in range(length)] diff --git a/aeon/utils/numba/general.py b/aeon/utils/numba/general.py index 6765033f3b..e0ccaf9860 100644 --- a/aeon/utils/numba/general.py +++ b/aeon/utils/numba/general.py @@ -32,6 +32,7 @@ import numpy as np from numba import njit, prange from numba.core.registry import CPUDispatcher +from numpy.random._generator import Generator import aeon.utils.numba.stats as stats @@ -164,7 +165,7 @@ def unique_count(X: np.ndarray) -> tuple[np.ndarray, np.ndarray]: X = np.sort(X) unique = 
np.zeros(X.shape[0]) unique[0] = X[0] - counts = np.zeros(X.shape[0], dtype=np.int32) + counts = np.zeros(X.shape[0], dtype=np.int_) counts[0] = 1 uc = 0 @@ -176,7 +177,7 @@ def unique_count(X: np.ndarray) -> tuple[np.ndarray, np.ndarray]: else: counts[uc] += 1 return unique[: uc + 1], counts[: uc + 1] - return np.zeros(0), np.zeros(0, dtype=np.int32) + return np.zeros(0), np.zeros(0, dtype=np.int_) @njit(fastmath=True, cache=True) @@ -438,7 +439,7 @@ def set_numba_random_seed(seed: int) -> None: @njit(fastmath=True, cache=True) -def choice_log(n_choice: int, n_sample: int) -> np.ndarray: +def choice_log(n_choice: int, n_sample: int, random_generator: Generator) -> np.ndarray: """Random choice function with log probability rather than uniform. To seed the function the `np.random.seed` must be set in a numba function prior to @@ -451,6 +452,7 @@ def choice_log(n_choice: int, n_sample: int) -> np.ndarray: n_choice-1. n_sample : int Number of choice to sample. + random_generator : random_generator Returns ------- @@ -462,12 +464,12 @@ def choice_log(n_choice: int, n_sample: int) -> np.ndarray: P = np.array([1 / 2 ** np.log(i) for i in range(1, n_choice + 1)]) # Bring everything between 0 and 1 as a cumulative probability P = P.cumsum() / P.sum() - loc = np.zeros(n_sample, dtype=np.int32) + loc = np.zeros(n_sample, dtype=np.int_) for i in prange(n_sample): - loc[i] = np.where(P >= np.random.rand())[0][0] + loc[i] = np.where(P >= random_generator.random())[0][0] return loc else: - return np.zeros(n_sample, dtype=np.int32) + return np.zeros(n_sample, dtype=np.int_) @njit(fastmath=True, cache=True) @@ -529,9 +531,9 @@ def get_subsequence_with_mean_std( The std of each channel """ n_channels, _ = X.shape - values = np.zeros((n_channels, length), dtype=np.float64) - means = np.zeros(n_channels, dtype=np.float64) - stds = np.zeros(n_channels, dtype=np.float64) + values = np.zeros((n_channels, length), dtype=np.float_) + means = np.zeros(n_channels, dtype=np.float_) + stds = 
np.zeros(n_channels, dtype=np.float_) for i_channel in prange(n_channels): _sum = 0 _sum2 = 0 @@ -587,7 +589,7 @@ def sliding_mean_std_one_series( for i_mod_dil in prange(dilation): # Array mainting indexes of a dilated subsequence - _idx_sub = np.zeros(length, dtype=np.int32) + _idx_sub = np.zeros(length, dtype=np.int_) for i_length in prange(length): _idx_sub[i_length] = (i_length * dilation) + i_mod_dil @@ -690,7 +692,7 @@ def combinations_1d(x: np.ndarray, y: np.ndarray) -> np.ndarray: for i in range(x.shape[0]): u_mask[np.where(u_x == x[i])[0][0], np.where(u_y == y[i])[0][0]] = True - combinations = np.zeros((u_mask.sum(), 2), dtype=np.int32) + combinations = np.zeros((u_mask.sum(), 2), dtype=np.int_) i_comb = 0 for i in range(x.shape[0]): if u_mask[np.where(u_x == x[i])[0][0], np.where(u_y == y[i])[0][0]]: @@ -846,7 +848,7 @@ def generate_combinations(n, k): """ comb_array = np.arange(k) num_combinations = _comb(n, k) # Using our efficient comb function - combinations = np.empty((num_combinations, k), dtype=np.int32) + combinations = np.empty((num_combinations, k), dtype=np.int_) for idx in range(num_combinations): combinations[idx, :] = comb_array diff --git a/aeon/visualisation/estimator/_shapelets.py b/aeon/visualisation/estimator/_shapelets.py index 43e670d17f..2a07028893 100644 --- a/aeon/visualisation/estimator/_shapelets.py +++ b/aeon/visualisation/estimator/_shapelets.py @@ -8,6 +8,8 @@ import warnings import numpy as np +from numba import njit +from numba.core.registry import CPUDispatcher from sklearn.ensemble._forest import BaseForest from sklearn.linear_model._base import LinearClassifierMixin from sklearn.pipeline import Pipeline @@ -27,7 +29,6 @@ RandomShapeletTransform, ) from aeon.transformations.collection.shapelet_based._dilated_shapelet_transform import ( - compute_shapelet_dist_vector, get_all_subsequences, normalize_subsequences, ) @@ -35,6 +36,45 @@ from aeon.utils.validation._dependencies import _check_soft_dependencies 
+@njit(fastmath=True, cache=True) +def compute_shapelet_dist_vector( + X_subs: np.ndarray, + values: np.ndarray, + dist_func: CPUDispatcher, +): + """Extract the features from a shapelet distance vector. + + Given a shapelet and a time series, extract three features from the resulting + distance vector: + - min + - argmin + - Shapelet Occurrence : number of points in the distance vector inferior to the + threshold parameter + + Parameters + ---------- + X_subs : array, shape (n_timestamps-(length-1)*dilation, n_channels, length) + The subsequences of an input time series given the length and dilation parameter + values : array, shape (n_channels, length) + The value array of the shapelet + length : int + Length of the shapelet + dist_func : CPUDispatcher + A Numba function used to compute the distance between two multidimensional + time series of shape (n_channels, length). + + Returns + ------- + dist_vector : array, shape = (n_timestamps-(length-1)*dilation) + The distance vector between the shapelets and candidate subsequences + """ + n_subsequences, n_channels, length = X_subs.shape + dist_vector = np.zeros(n_subsequences) + for i_sub in range(n_subsequences): + dist_vector[i_sub] += dist_func(X_subs[i_sub], values) + return dist_vector + + class ShapeletVisualizer: """ A Shapelet object to use for ploting operations. 
@@ -249,9 +289,7 @@ def plot_on_X( _values = self.values # Compute distance vector - c = compute_shapelet_dist_vector( - X_subs, _values, self.length, self.distance_func - ) + c = compute_shapelet_dist_vector(X_subs, _values, self.distance_func) # Get best match index idx_best = c.argmin() @@ -361,9 +399,7 @@ def plot_distance_vector( ) else: _values = self.values - c = compute_shapelet_dist_vector( - X_subs, _values, self.length, self.distance_func - ) + c = compute_shapelet_dist_vector(X_subs, _values, self.distance_func) if ax is None: plt.style.use(matplotlib_style) @@ -406,7 +442,7 @@ def _get_shapelet(self, id_shapelet): dilation_ = self.estimator.shapelets_[3][id_shapelet] threshold_ = self.estimator.shapelets_[4][id_shapelet] normalize_ = self.estimator.shapelets_[5][id_shapelet] - distance = self.estimator.distance + distance = "manhattan" elif isinstance(self.estimator, (RSAST, SAST)): values_ = self.estimator._kernel_orig[id_shapelet] diff --git a/examples/classification/shapelet_based.ipynb b/examples/classification/shapelet_based.ipynb index 4c271f2763..d15b5f2beb 100644 --- a/examples/classification/shapelet_based.ipynb +++ b/examples/classification/shapelet_based.ipynb @@ -559,8 +559,7 @@ "\n", "\"drawing\"\n", "\n", - "\n", - "Note that here we use the euclidean distance, but we can replace it with any distance using the `distance` parameter of both `RandomDilatedShapeletTransform` and `RDSTClassifier` in aeon. For example : `RandomDilatedShapeletTransform(distance='dtw')`. " + "Note that `RDST` uses the manhattan distance instead of the euclidean distance." ] }, { @@ -771,7 +770,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.4" + "version": "3.10.13" } }, "nbformat": 4,