diff --git a/aeon/clustering/__init__.py b/aeon/clustering/__init__.py index 2eec5142cf..4b88d27585 100644 --- a/aeon/clustering/__init__.py +++ b/aeon/clustering/__init__.py @@ -8,6 +8,7 @@ "TimeSeriesKMeans", "TimeSeriesKShape", "TimeSeriesKernelKMeans", + "KASBA", "ElasticSOM", "KSpectralCentroid", "DummyClusterer", @@ -20,6 +21,7 @@ from aeon.clustering._k_medoids import TimeSeriesKMedoids from aeon.clustering._k_sc import KSpectralCentroid from aeon.clustering._k_shape import TimeSeriesKShape +from aeon.clustering._kasba import KASBA from aeon.clustering._kernel_k_means import TimeSeriesKernelKMeans from aeon.clustering.base import BaseClusterer from aeon.clustering.dummy import DummyClusterer diff --git a/aeon/clustering/_kasba.py b/aeon/clustering/_kasba.py new file mode 100644 index 0000000000..e6db7f58e0 --- /dev/null +++ b/aeon/clustering/_kasba.py @@ -0,0 +1,363 @@ +"""Time series kmeans.""" + +from typing import Optional + +__maintainer__ = ["chrisholder"] + +from typing import Callable, Union + +import numpy as np +from numpy.random import RandomState +from sklearn.utils import check_random_state + +from aeon.clustering._k_means import EmptyClusterError +from aeon.clustering.averaging import kasba_average +from aeon.clustering.base import BaseClusterer +from aeon.distances import distance as distance_func +from aeon.distances import pairwise_distance + + +class KASBA(BaseClusterer): + """KASBA clusterer [1]_. + + KASBA is a $k$-means clustering algorithm designed for use with the MSM distance + metric [2]_ however, it can be used with any elastic distance that is a metric. + KASBA finds initial clusters using an adapted form of kmeans++ to use + elastic distances, a fast assignment step that exploits the metric property + to avoid distance calculations in assignment, and an adapted elastic barycentre + average that uses a stochastic gradient descent to find the barycentre averages. + + Parameters + ---------- + n_clusters : int, default=8 + The number of clusters to form as well as the number of centroids to generate. + distance : str or callable, default='msm' + The distance metric to use. If a string, must be one of the following: + 'msm', 'twe'. The distance measure use MUST be a metric. + ba_subset_size : float, default=0.5 + The proportion of the data to use in the barycentre average step. For the first + iteration all the data will be used however, on subsequent iterations a subset + of the data will be used. This will be a % of the data passed (e.g. 0.5 = 50%). + If there are less than 10 data points, all the available data will be used + every iteration. + initial_step_size : float, default=0.05 + The initial step size for the stochastic gradient descent in the + barycentre average step. + max_iter : int, default=300 + Maximum number of iterations of the k-means algorithm before it is forcibly + stopped. + tol : float, default=1e-6 + Relative tolerance in regard to Frobenius norm of the difference + in the cluster centers of two consecutive iterations to declare + convergence. + distance_params : dict, default=None + Dictionary containing kwargs for the distance being used. For example if you + wanted to specify a cost for MSM you would pass + distance_params={"c": 0.2}. See documentation of aeon.distances for more + details. + decay_rate : float, default=0.1 + The decay rate for the step size in the barycentre average step. The + initial_step_size will be multiplied by np.exp(-decay_rate * i) every iteration + where i is the current iteration. + verbose : bool, default=False + Verbosity mode. + random_state : int, np.random.RandomState instance or None, default=None + Determines random number generation for centroid initialization. + If `int`, random_state is the seed used by the random number generator; + If `np.random.RandomState` instance, + random_state is the random number generator; + If `None`, the random number generator is the `RandomState` instance used + by `np.random`. + + Attributes + ---------- + cluster_centers_ : 3d np.ndarray + Array of shape (n_clusters, n_channels, n_timepoints)) + Time series that represent each of the cluster centers. + labels_ : 1d np.ndarray + 1d array of shape (n_case,) + Labels that is the index each time series belongs to. + inertia_ : float + Sum of squared distances of samples to their closest cluster center. + n_iter_ : int + Number of iterations run. + + References + ---------- + .. [1] Holder, Christopher & Bagnall, Anthony. (2024). + Rock the KASBA: Blazingly Fast and Accurate Time Series Clustering. + 10.48550/arXiv.2411.17838. + + .. [2] Stefan A., Athitsos V., Das G.: The Move-Split-Merge metric for time + series. IEEE Transactions on Knowledge and Data Engineering 25(6), 2013. + + Examples + -------- + >>> import numpy as np + >>> from aeon.clustering import KASBA + >>> X = np.random.random(size=(10,2,20)) + >>> clst= KASBA(distance="msm",n_clusters=2) + >>> clst.fit(X) + KASBA(n_clusters=2) + >>> preds = clst.predict(X) + """ + + _tags = { + "capability:multivariate": True, + "algorithm_type": "distance", + } + + def __init__( + self, + n_clusters: int = 8, + distance: Union[str, Callable] = "msm", + ba_subset_size: float = 0.5, + initial_step_size: float = 0.05, + max_iter: int = 300, + tol: float = 1e-6, + distance_params: Optional[dict] = None, + decay_rate: float = 0.1, + verbose: bool = False, + random_state: Optional[Union[int, RandomState]] = None, + ): + self.distance = distance + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.random_state = random_state + self.distance_params = distance_params + self.initial_step_size = initial_step_size + self.ba_subset_size = ba_subset_size + self.decay_rate = decay_rate + self.n_clusters = n_clusters + + self.cluster_centers_ = None + self.labels_ = None + self.inertia_ = None + self.n_iter_ = 0 + + self._random_state = None + self._distance_params = {} + + super().__init__() + + def _fit(self, X: np.ndarray, y=None): + self._check_params(X) + cluster_centres, distances_to_centres, labels = self._elastic_kmeans_plus_plus( + X, + ) + self.labels_, self.cluster_centers_, self.inertia_, self.n_iter_ = self._kasba( + X, + cluster_centres, + distances_to_centres, + labels, + ) + + return self + + def _predict(self, X: np.ndarray, y=None) -> np.ndarray: + if isinstance(self.distance, str): + pairwise_matrix = pairwise_distance( + X, self.cluster_centers_, method=self.distance, **self._distance_params + ) + else: + pairwise_matrix = pairwise_distance( + X, + self.cluster_centers_, + method=self.distance, + **self._distance_params, + ) + return pairwise_matrix.argmin(axis=1) + + def _kasba( + self, + X, + cluster_centres, + distances_to_centres, + labels, + ): + inertia = np.inf + prev_inertia = np.inf + prev_labels = None + prev_cluster_centres = None + for i in range(self.max_iter): + cluster_centres, distances_to_centres = self._recalculate_centroids( + X, + cluster_centres, + labels, + distances_to_centres, + ) + + labels, distances_to_centres, inertia = self._fast_assign( + X, + cluster_centres, + distances_to_centres, + labels, + i == 0, + ) + + labels, cluster_centres, distances_to_centres = self._handle_empty_cluster( + X, + cluster_centres, + distances_to_centres, + labels, + ) + + if np.array_equal(prev_labels, labels): + if self.verbose: + print( # noqa: T001, T201 + f"Converged at iteration {i}, " # noqa: T001, T201 + f"inertia {inertia:.5f}." # noqa: T001, T201 + ) # noqa: T001, T201 + break + + prev_inertia = inertia + prev_labels = labels.copy() + prev_cluster_centres = cluster_centres.copy() + + if self.verbose is True: + print(f"Iteration {i}, inertia {prev_inertia}.") # noqa: T001, T201 + + if inertia < prev_inertia: + return prev_labels, prev_cluster_centres, prev_inertia, i + 1 + return labels, cluster_centres, inertia, i + 1 + + def _fast_assign( + self, + X, + cluster_centres, + distances_to_centres, + labels, + is_first_iteration, + ): + distances_between_centres = pairwise_distance( + cluster_centres, + method=self.distance, + **self._distance_params, + ) + for i in range(X.shape[0]): + min_dist = distances_to_centres[i] + closest = labels[i] + for j in range(self.n_clusters): + if not is_first_iteration and j == closest: + continue + bound = distances_between_centres[j, closest] / 2.0 + if min_dist < bound: + continue + + dist = distance_func( + X[i], + cluster_centres[j], + method=self.distance, + **self._distance_params, + ) + if dist < min_dist: + min_dist = dist + closest = j + + labels[i] = closest + distances_to_centres[i] = min_dist + + inertia = np.sum(distances_to_centres**2) + if self.verbose: + print(f"{inertia:.5f}", end=" --> ") # noqa: T001, T201 + return labels, distances_to_centres, inertia + + def _recalculate_centroids( + self, + X, + cluster_centres, + labels, + distances_to_centres, + ): + for j in range(self.n_clusters): + current_cluster_indices = labels == j + + previous_distance_to_centre = distances_to_centres[current_cluster_indices] + previous_cost = np.sum(previous_distance_to_centre) + curr_centre, dist_to_centre = kasba_average( + X=X[current_cluster_indices], + init_barycenter=cluster_centres[j], + previous_cost=previous_cost, + previous_distance_to_centre=previous_distance_to_centre, + distance=self.distance, + max_iters=50, + tol=self.tol, + verbose=self.verbose, + random_state=self._random_state, + ba_subset_size=self.ba_subset_size, + initial_step_size=self.initial_step_size, + decay_rate=self.decay_rate, + **self._distance_params, + ) + + cluster_centres[j] = curr_centre + distances_to_centres[current_cluster_indices] = dist_to_centre + + return cluster_centres, distances_to_centres + + def _handle_empty_cluster( + self, + X: np.ndarray, + cluster_centres: np.ndarray, + distances_to_centres: np.ndarray, + labels: np.ndarray, + ): + empty_clusters = np.setdiff1d(np.arange(self.n_clusters), labels) + j = 0 + while empty_clusters.size > 0: + current_empty_cluster_index = empty_clusters[0] + index_furthest_from_centre = distances_to_centres.argmax() + cluster_centres[current_empty_cluster_index] = X[index_furthest_from_centre] + curr_pw = pairwise_distance( + X, cluster_centres, method=self.distance, **self._distance_params + ) + labels = curr_pw.argmin(axis=1) + distances_to_centres = curr_pw.min(axis=1) + empty_clusters = np.setdiff1d(np.arange(self.n_clusters), labels) + j += 1 + if j > self.n_clusters: + raise EmptyClusterError + + return labels, cluster_centres, distances_to_centres + + def _elastic_kmeans_plus_plus( + self, + X, + ): + initial_center_idx = self._random_state.randint(X.shape[0]) + indexes = [initial_center_idx] + + min_distances = pairwise_distance( + X, X[initial_center_idx], method=self.distance, **self._distance_params + ).flatten() + labels = np.zeros(X.shape[0], dtype=int) + + for i in range(1, self.n_clusters): + probabilities = min_distances / min_distances.sum() + next_center_idx = self._random_state.choice(X.shape[0], p=probabilities) + indexes.append(next_center_idx) + + new_distances = pairwise_distance( + X, X[next_center_idx], method=self.distance, **self._distance_params + ).flatten() + + closer_points = new_distances < min_distances + min_distances[closer_points] = new_distances[closer_points] + labels[closer_points] = i + + centers = X[indexes] + return centers, min_distances, labels + + def _check_params(self, X: np.ndarray) -> None: + self._random_state = check_random_state(self.random_state) + + if self.n_clusters > X.shape[0]: + raise ValueError( + f"n_clusters ({self.n_clusters}) cannot be larger than " + f"n_cases ({X.shape[0]})" + ) + + self._distance_params = { + **(self.distance_params or {}), + } diff --git a/aeon/clustering/averaging/__init__.py b/aeon/clustering/averaging/__init__.py index d5d12fb99d..59b10fda2d 100644 --- a/aeon/clustering/averaging/__init__.py +++ b/aeon/clustering/averaging/__init__.py @@ -7,6 +7,7 @@ "subgradient_barycenter_average", "VALID_BA_METRICS", "shift_invariant_average", + "kasba_average", ] from aeon.clustering.averaging._averaging import mean_average @@ -14,6 +15,7 @@ from aeon.clustering.averaging._ba_subgradient import subgradient_barycenter_average from aeon.clustering.averaging._ba_utils import VALID_BA_METRICS from aeon.clustering.averaging._barycenter_averaging import elastic_barycenter_average +from aeon.clustering.averaging._kasba_average import kasba_average from aeon.clustering.averaging._shift_scale_invariant_averaging import ( shift_invariant_average, ) diff --git a/aeon/clustering/averaging/_kasba_average.py b/aeon/clustering/averaging/_kasba_average.py new file mode 100644 index 0000000000..c4f5d28642 --- /dev/null +++ b/aeon/clustering/averaging/_kasba_average.py @@ -0,0 +1,177 @@ +from typing import Optional + +import numpy as np +from numba import njit +from sklearn.utils import check_random_state + +from aeon.distances import msm_alignment_path, pairwise_distance, twe_alignment_path + + +def kasba_average( + X: np.ndarray, + init_barycenter: np.ndarray, + previous_cost: float, + previous_distance_to_centre: np.ndarray, + distance: str = "msm", + max_iters: int = 50, + tol=1e-5, + ba_subset_size: float = 0.5, + initial_step_size: float = 0.05, + decay_rate: float = 0.1, + verbose: bool = False, + random_state: Optional[int] = None, + **kwargs, +) -> tuple[np.ndarray, np.ndarray]: + """KASBA average [1]_. + + The KASBA clusterer proposed an adapted version of the Stochastic Subgradient + Elastic Barycentre Average. The algorithm works by iterating randomly over X. + If it is the first iteration then all the values are used. However, if it is not + the first iteration then a subset is used. The subset size is determined by the + parameter ba_subset_size which is the percentage of the data to use. If there are + less than 10 data points, all the available data will be used every iteration. + + Parameters + ---------- + X: np.ndarray, of shape (n_cases, n_channels, n_timepoints) or + (n_cases, n_timepoints) + A collection of time series instances to take the average from. + init_barycenter: np.ndarray, of shape (n_channels, n_timepoints) + The initial barycentre to refine. + previous_cost: float + The summed total distance from all time series in X to the init_barycentre. + previous_distance_to_centre: np.ndarray, of shape (n_cases,) + The distance between each time series in X and the init_barycentre. + distance: str, default='msm' + String defining the distance to use for averaging. Distance to + compute similarity between time series. A list of valid strings for metrics + can be found in the documentation form + :func:`aeon.distances.get_distance_function`. + max_iters: int, default=30 + Maximum number iterations for dba to update over. + tol : float (default: 1e-5) + Tolerance to use for early stopping: if the decrease in cost is lower + than this value, the Expectation-Maximization procedure stops. + ba_subset_size : float, default=0.5 + The proportion of the data to use in the barycentre average step. For the first + iteration all the data will be used however, on subsequent iterations a subset + of the data will be used. This will be a % of the data passed (e.g. 0.5 = 50%). + If there are less than 10 data points, all the available data will be used + every iteration. + initial_step_size : float, default=0.05 + The initial step size for the gradient descent. + decay_rate : float, default=0.1 + The decay rate for the step size in the barycentre average step. The + initial_step_size will be multiplied by np.exp(-decay_rate * i) every iteration + where i is the current iteration. + verbose: bool, default=False + Boolean that controls the verbosity. + random_state: int or None, default=None + Random state to use for the barycenter averaging. + + Returns + ------- + np.ndarray of shape (n_channels, n_timepoints) + Time series that is the KASBA average of the collection of instances provided. + + References + ---------- + .. [1] Holder, Christopher & Bagnall, Anthony. (2024). + Rock the KASBA: Blazingly Fast and Accurate Time Series Clustering. + 10.48550/arXiv.2411.17838. + """ + if len(X) <= 1: + return X, np.zeros(X.shape[0]) + + if X.ndim == 3: + _X = X + elif X.ndim == 2: + _X = X.reshape((X.shape[0], 1, X.shape[1])) + else: + raise ValueError("X must be a 2D or 3D array") + + random_state = check_random_state(random_state) + X_size = _X.shape[0] + + barycenter = np.copy(init_barycenter) + prev_barycenter = np.copy(init_barycenter) + + distances_to_centre = np.zeros(X_size) + num_ts_to_use = min(X_size, max(10, int(ba_subset_size * X_size))) + for i in range(max_iters): + shuffled_indices = random_state.permutation(X_size) + if i > 0: + shuffled_indices = shuffled_indices[:num_ts_to_use] + + current_step_size = initial_step_size * np.exp(-decay_rate * i) + + barycenter = _kasba_refine_one_iter( + barycenter=barycenter, + X=_X, + shuffled_indices=shuffled_indices, + current_step_size=current_step_size, + distance=distance, + **kwargs, + ) + + pw_dist = pairwise_distance(_X, barycenter, method=distance, **kwargs) + cost = np.sum(pw_dist) + distances_to_centre = pw_dist.flatten() + + # Cost is the sum of distance to the centre + if abs(previous_cost - cost) < tol: + if previous_cost < cost: + barycenter = prev_barycenter + distances_to_centre = previous_distance_to_centre + break + elif previous_cost < cost: + barycenter = prev_barycenter + distances_to_centre = previous_distance_to_centre + break + + prev_barycenter = barycenter + previous_distance_to_centre = distances_to_centre.copy() + previous_cost = cost + + if verbose: + print(f"[Subset-SSG-BA] epoch {i}, cost {cost}") # noqa: T001, T201 + + return barycenter, distances_to_centre + + +@njit(cache=True, fastmath=True) +def _kasba_refine_one_iter( + barycenter: np.ndarray, + X: np.ndarray, + shuffled_indices: np.ndarray, + current_step_size, + distance: str = "msm", + nu: float = 0.001, + lmbda: float = 1.0, + independent: bool = True, + c: float = 1.0, +): + + X_size, X_dims, X_timepoints = X.shape + + barycenter_copy = np.copy(barycenter) + + for i in shuffled_indices: + curr_ts = X[i] + if distance == "twe": + curr_alignment, curr_cost = twe_alignment_path( + curr_ts, barycenter_copy, nu=nu, lmbda=lmbda + ) + elif distance == "msm": + curr_alignment, curr_cost = msm_alignment_path( + curr_ts, barycenter_copy, independent=independent, c=c + ) + else: + raise ValueError(f"Invalid distance metric: {distance}") + + new_ba = np.zeros((X_dims, X_timepoints)) + for j, k in curr_alignment: + new_ba[:, k] += barycenter_copy[:, k] - curr_ts[:, j] + + barycenter_copy -= (2.0 * current_step_size) * new_ba + return barycenter_copy diff --git a/aeon/clustering/tests/test_kasba.py b/aeon/clustering/tests/test_kasba.py new file mode 100644 index 0000000000..2d8389629e --- /dev/null +++ b/aeon/clustering/tests/test_kasba.py @@ -0,0 +1,29 @@ +"""Test KASBA.""" + +import numpy as np + +from aeon.benchmarking.metrics.clustering import clustering_accuracy_score +from aeon.clustering import KASBA +from aeon.testing.data_generation import make_example_3d_numpy + + +def test_univariate_kasba(): + """Test KASBA on univariate data.""" + X, y = make_example_3d_numpy(20, 1, 10, random_state=1, return_y=True) + + kasba = KASBA(n_clusters=len(np.unique(y)), random_state=1) + + kasba.fit(X) + score = clustering_accuracy_score(y, kasba.labels_) + assert score == 0.95 + + +def test_multivariate_kasba(): + """Test KASBA on multivariate data.""" + X, y = make_example_3d_numpy(20, 3, 10, random_state=1, return_y=True) + + kasba = KASBA(n_clusters=len(np.unique(y)), random_state=1) + + kasba.fit(X) + score = clustering_accuracy_score(y, kasba.labels_) + assert score == 0.55 diff --git a/docs/api_reference/clustering.rst b/docs/api_reference/clustering.rst index 20c8e779da..e66e54dfbc 100644 --- a/docs/api_reference/clustering.rst +++ b/docs/api_reference/clustering.rst @@ -18,6 +18,7 @@ Clustering Algorithms :toctree: auto_generated/ :template: class.rst + KASBA TimeSeriesKMeans TimeSeriesKMedoids TimeSeriesKShape @@ -76,6 +77,7 @@ Averaging :toctree: auto_generated/ :template: function.rst + kasba_average elastic_barycenter_average mean_average petitjean_barycenter_average diff --git a/docs/papers_using_aeon.md b/docs/papers_using_aeon.md index e6238906e3..30dba0fd44 100644 --- a/docs/papers_using_aeon.md +++ b/docs/papers_using_aeon.md @@ -37,6 +37,10 @@ the paper and a link to the code in your personal GitHub or other repository. with k-medoids based algorithms. In International Workshop on Advanced Analytics and Learning on Temporal Data (pp. 39-55). [Paper](https://link.springer.com/chapter/10.1007/978-3-031-49896-1_4) +- Holder, Christopher & Bagnall, Anthony. (2024). + Rock the KASBA: Blazingly Fast and Accurate Time Series Clustering. + 10.48550/arXiv.2411.17838. +[Paper](https://arxiv.org/abs/2411.17838) ## Regression