Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] KASBA clusterer #2428

Merged
merged 5 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions aeon/clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"TimeSeriesKMeans",
"TimeSeriesKShape",
"TimeSeriesKernelKMeans",
"KASBA",
"ElasticSOM",
"KSpectralCentroid",
"DummyClusterer",
Expand All @@ -20,6 +21,7 @@
from aeon.clustering._k_medoids import TimeSeriesKMedoids
from aeon.clustering._k_sc import KSpectralCentroid
from aeon.clustering._k_shape import TimeSeriesKShape
from aeon.clustering._kasba import KASBA
from aeon.clustering._kernel_k_means import TimeSeriesKernelKMeans
from aeon.clustering.base import BaseClusterer
from aeon.clustering.dummy import DummyClusterer
363 changes: 363 additions & 0 deletions aeon/clustering/_kasba.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,363 @@
"""Time series kmeans."""

from typing import Optional

__maintainer__ = []

from typing import Callable, Union

import numpy as np
from numpy.random import RandomState
from sklearn.utils import check_random_state

from aeon.clustering._k_means import EmptyClusterError
from aeon.clustering.averaging import kasba_average
from aeon.clustering.base import BaseClusterer
from aeon.distances import distance as distance_func
from aeon.distances import pairwise_distance


class KASBA(BaseClusterer):
    """KASBA clusterer [1]_.

    KASBA is a $k$-means clustering algorithm designed for use with the MSM
    distance metric [2]_; however, it can be used with any elastic distance that
    is a metric. KASBA finds initial clusters using an adapted form of kmeans++
    that uses elastic distances, a fast assignment step that exploits the metric
    property to avoid distance calculations in assignment, and an adapted elastic
    barycentre average that uses a stochastic gradient descent to find the
    barycentre averages.

    Parameters
    ----------
    n_clusters : int, default=8
        The number of clusters to form as well as the number of centroids to
        generate. Must be at least 1 and no larger than the number of cases.
    distance : str or callable, default='msm'
        The distance metric to use. If a string, must be one of the following:
        'msm', 'twe'. The distance measure used MUST be a metric.
    ba_subset_size : float, default=0.5
        The proportion of the data to use in the barycentre average step. For the
        first iteration all the data will be used; however, on subsequent
        iterations a subset of the data will be used. This will be a % of the data
        passed (e.g. 0.5 = 50%). If there are fewer than 10 data points, all the
        available data will be used every iteration.
    initial_step_size : float, default=0.05
        The initial step size for the stochastic gradient descent in the
        barycentre average step.
    max_iter : int, default=300
        Maximum number of iterations of the k-means algorithm before it is
        forcibly stopped. Must be at least 1.
    tol : float, default=1e-6
        Tolerance forwarded to the barycentre average step (``kasba_average``) as
        its ``tol`` argument. The outer k-means loop itself stops when the labels
        do not change between two consecutive iterations.
    distance_params : dict, default=None
        Dictionary containing kwargs for the distance being used. For example if
        you wanted to specify a cost for MSM you would pass
        distance_params={"c": 0.2}. See documentation of aeon.distances for more
        details.
    decay_rate : float, default=0.1
        The decay rate for the step size in the barycentre average step. The
        initial_step_size will be multiplied by np.exp(-decay_rate * i) every
        iteration where i is the current iteration.
    verbose : bool, default=False
        Verbosity mode.
    random_state : int, np.random.RandomState instance or None, default=None
        Determines random number generation for centroid initialization and for
        the barycentre average step.
        If `int`, random_state is the seed used by the random number generator;
        If `np.random.RandomState` instance,
        random_state is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.

    Attributes
    ----------
    cluster_centers_ : 3d np.ndarray
        Array of shape (n_clusters, n_channels, n_timepoints).
        Time series that represent each of the cluster centers.
    labels_ : 1d np.ndarray
        1d array of shape (n_cases,)
        Index of the cluster each time series belongs to.
    inertia_ : float
        Sum of squared distances of samples to their closest cluster center.
    n_iter_ : int
        Number of iterations run.

    References
    ----------
    .. [1] Holder, Christopher & Bagnall, Anthony. (2024).
       Rock the KASBA: Blazingly Fast and Accurate Time Series Clustering.
       10.48550/arXiv.2411.17838.

    .. [2] Stefan A., Athitsos V., Das G.: The Move-Split-Merge metric for time
       series. IEEE Transactions on Knowledge and Data Engineering 25(6), 2013.

    Examples
    --------
    >>> import numpy as np
    >>> from aeon.clustering import KASBA
    >>> X = np.random.random(size=(10,2,20))
    >>> clst= KASBA(distance="msm",n_clusters=2)
    >>> clst.fit(X)
    KASBA(n_clusters=2)
    >>> preds = clst.predict(X)
    """

    _tags = {
        "capability:multivariate": True,
        "algorithm_type": "distance",
    }

    def __init__(
        self,
        n_clusters: int = 8,
        distance: Union[str, Callable] = "msm",
        ba_subset_size: float = 0.5,
        initial_step_size: float = 0.05,
        max_iter: int = 300,
        tol: float = 1e-6,
        distance_params: Optional[dict] = None,
        decay_rate: float = 0.1,
        verbose: bool = False,
        random_state: Optional[Union[int, RandomState]] = None,
    ):
        # Constructor arguments are stored untouched; validated/derived copies
        # live in the underscore-prefixed attributes set by ``_check_params``.
        self.distance = distance
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state
        self.distance_params = distance_params
        self.initial_step_size = initial_step_size
        self.ba_subset_size = ba_subset_size
        self.decay_rate = decay_rate
        self.n_clusters = n_clusters

        # Fitted state, populated by ``_fit``.
        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0

        # Resolved at fit time by ``_check_params``.
        self._random_state = None
        self._distance_params = {}

        super().__init__()

    def _fit(self, X: np.ndarray, y=None):
        """Fit the clusterer: validate, seed with kmeans++, then run KASBA.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series to cluster, indexed by case on axis 0.
        y : ignored
            Present for API compatibility.

        Returns
        -------
        self
            The fitted estimator.
        """
        self._check_params(X)
        cluster_centres, distances_to_centres, labels = self._elastic_kmeans_plus_plus(
            X,
        )
        self.labels_, self.cluster_centers_, self.inertia_, self.n_iter_ = self._kasba(
            X,
            cluster_centres,
            distances_to_centres,
            labels,
        )

        return self

    def _predict(self, X: np.ndarray, y=None) -> np.ndarray:
        """Assign each series in X to its nearest fitted cluster centre.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series to label.
        y : ignored
            Present for API compatibility.

        Returns
        -------
        np.ndarray
            For each case, the index of the closest centre in
            ``cluster_centers_``.
        """
        # ``pairwise_distance`` takes the distance via ``method`` whether it is a
        # string or a callable, so no branching on its type is needed.
        pairwise_matrix = pairwise_distance(
            X, self.cluster_centers_, method=self.distance, **self._distance_params
        )
        return pairwise_matrix.argmin(axis=1)

    def _kasba(
        self,
        X,
        cluster_centres,
        distances_to_centres,
        labels,
    ):
        """Run the main KASBA loop until labels stabilise or ``max_iter`` is hit.

        Each iteration recomputes the centres, reassigns the series, and repairs
        any empty cluster. The loop stops early once an iteration produces the
        same labels as the previous one.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series being clustered.
        cluster_centres : np.ndarray
            Initial centres; updated in place by the averaging step.
        distances_to_centres : np.ndarray
            Distance from each series to its currently assigned centre.
        labels : np.ndarray
            Initial cluster index of each series.

        Returns
        -------
        labels : np.ndarray
        cluster_centres : np.ndarray
        inertia : float
            Sum of squared distances of each series to its assigned centre.
        n_iter : int
            Number of iterations executed.
        """
        inertia = np.inf
        prev_inertia = np.inf
        prev_labels = None
        prev_cluster_centres = None
        # Tracked explicitly so the return value is defined even if the loop
        # body never runs.
        n_iter = 0
        for i in range(self.max_iter):
            n_iter = i + 1
            cluster_centres, distances_to_centres = self._recalculate_centroids(
                X,
                cluster_centres,
                labels,
                distances_to_centres,
            )

            labels, distances_to_centres, inertia = self._fast_assign(
                X,
                cluster_centres,
                distances_to_centres,
                labels,
                i == 0,
            )

            labels, cluster_centres, distances_to_centres = self._handle_empty_cluster(
                X,
                cluster_centres,
                distances_to_centres,
                labels,
            )
            # Repairing an empty cluster reassigns series, so refresh the inertia
            # to keep it consistent with the labels/centres we may return. When
            # nothing was repaired this reproduces the value from _fast_assign.
            inertia = np.sum(distances_to_centres**2)

            if prev_labels is not None and np.array_equal(prev_labels, labels):
                if self.verbose:
                    print(  # noqa: T001, T201
                        f"Converged at iteration {i}, "  # noqa: T001, T201
                        f"inertia {inertia:.5f}."  # noqa: T001, T201
                    )  # noqa: T001, T201
                break

            prev_inertia = inertia
            prev_labels = labels.copy()
            # The centres array is mutated in place on the next iteration, so a
            # copy is required to preserve this iteration's solution.
            prev_cluster_centres = cluster_centres.copy()

            if self.verbose:
                print(f"Iteration {i}, inertia {prev_inertia}.")  # noqa: T001, T201

        # Keep whichever of the last two solutions has the lower inertia. The
        # previous solution only differs from the current one after an early
        # stop, where the final averaging step may have made the centres worse.
        if prev_inertia < inertia:
            return prev_labels, prev_cluster_centres, prev_inertia, n_iter
        return labels, cluster_centres, inertia, n_iter

    def _fast_assign(
        self,
        X,
        cluster_centres,
        distances_to_centres,
        labels,
        is_first_iteration,
    ):
        """Reassign every series to its nearest centre, pruning via the metric.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series being clustered.
        cluster_centres : np.ndarray
            Current cluster centres.
        distances_to_centres : np.ndarray
            Distance from each series to its currently assigned centre; updated
            in place.
        labels : np.ndarray
            Current cluster index of each series; updated in place.
        is_first_iteration : bool
            When True the distance to the currently assigned centre is also
            recomputed instead of being skipped.

        Returns
        -------
        labels : np.ndarray
        distances_to_centres : np.ndarray
        inertia : float
            Sum of squared ``distances_to_centres`` after reassignment.
        """
        distances_between_centres = pairwise_distance(
            cluster_centres,
            method=self.distance,
            **self._distance_params,
        )
        for i in range(X.shape[0]):
            min_dist = distances_to_centres[i]
            closest = labels[i]
            for j in range(self.n_clusters):
                if not is_first_iteration and j == closest:
                    continue
                # Metric pruning: if the series is closer to its current best
                # centre than half the gap between that centre and centre j,
                # centre j cannot be nearer, so the distance call is skipped.
                # ``closest`` is re-read each pass so the bound always refers to
                # the best centre found so far.
                bound = distances_between_centres[j, closest] / 2.0
                if min_dist < bound:
                    continue

                dist = distance_func(
                    X[i],
                    cluster_centres[j],
                    method=self.distance,
                    **self._distance_params,
                )
                if dist < min_dist:
                    min_dist = dist
                    closest = j

            labels[i] = closest
            distances_to_centres[i] = min_dist

        inertia = np.sum(distances_to_centres**2)
        if self.verbose:
            print(f"{inertia:.5f}", end=" --> ")  # noqa: T001, T201
        return labels, distances_to_centres, inertia

    def _recalculate_centroids(
        self,
        X,
        cluster_centres,
        labels,
        distances_to_centres,
    ):
        """Update every centre with ``kasba_average`` over its member series.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series being clustered.
        cluster_centres : np.ndarray
            Current centres; each is used to initialise its own averaging run and
            is overwritten in place with the result.
        labels : np.ndarray
            Cluster index of each series.
        distances_to_centres : np.ndarray
            Distance from each series to its assigned centre; overwritten in
            place with the distances returned by the averaging step.

        Returns
        -------
        cluster_centres : np.ndarray
        distances_to_centres : np.ndarray
        """
        for j in range(self.n_clusters):
            current_cluster_indices = labels == j

            previous_distance_to_centre = distances_to_centres[current_cluster_indices]
            previous_cost = np.sum(previous_distance_to_centre)
            curr_centre, dist_to_centre = kasba_average(
                X=X[current_cluster_indices],
                init_barycenter=cluster_centres[j],
                previous_cost=previous_cost,
                previous_distance_to_centre=previous_distance_to_centre,
                distance=self.distance,
                # Inner averaging iterations are capped independently of the
                # outer ``max_iter``.
                max_iters=50,
                tol=self.tol,
                verbose=self.verbose,
                random_state=self._random_state,
                ba_subset_size=self.ba_subset_size,
                initial_step_size=self.initial_step_size,
                decay_rate=self.decay_rate,
                **self._distance_params,
            )

            cluster_centres[j] = curr_centre
            distances_to_centres[current_cluster_indices] = dist_to_centre

        return cluster_centres, distances_to_centres

    def _handle_empty_cluster(
        self,
        X: np.ndarray,
        cluster_centres: np.ndarray,
        distances_to_centres: np.ndarray,
        labels: np.ndarray,
    ):
        """Re-seed empty clusters with the series furthest from its centre.

        While any cluster index is missing from ``labels``, the first empty
        cluster's centre is replaced by the series with the largest distance to
        its assigned centre, and all series are reassigned from a full pairwise
        distance matrix.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series being clustered.
        cluster_centres : np.ndarray
            Current centres; empty ones are overwritten in place.
        distances_to_centres : np.ndarray
            Distance from each series to its assigned centre.
        labels : np.ndarray
            Cluster index of each series.

        Returns
        -------
        labels : np.ndarray
        cluster_centres : np.ndarray
        distances_to_centres : np.ndarray

        Raises
        ------
        EmptyClusterError
            If empty clusters remain after more than ``n_clusters`` repair
            attempts.
        """
        empty_clusters = np.setdiff1d(np.arange(self.n_clusters), labels)
        j = 0
        while empty_clusters.size > 0:
            current_empty_cluster_index = empty_clusters[0]
            index_furthest_from_centre = distances_to_centres.argmax()
            cluster_centres[current_empty_cluster_index] = X[index_furthest_from_centre]
            curr_pw = pairwise_distance(
                X, cluster_centres, method=self.distance, **self._distance_params
            )
            labels = curr_pw.argmin(axis=1)
            distances_to_centres = curr_pw.min(axis=1)
            empty_clusters = np.setdiff1d(np.arange(self.n_clusters), labels)
            j += 1
            # Bound the number of repairs so degenerate data cannot loop forever.
            if j > self.n_clusters:
                raise EmptyClusterError

        return labels, cluster_centres, distances_to_centres

    def _elastic_kmeans_plus_plus(
        self,
        X,
    ):
        """Choose initial centres with kmeans++ seeding under the chosen distance.

        The first centre is drawn uniformly at random; each further centre is
        drawn with probability proportional to a series' distance to its nearest
        already-chosen centre.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series being clustered.

        Returns
        -------
        centers : np.ndarray
            Copies of the series selected as initial centres.
        min_distances : np.ndarray
            Distance from each series to its nearest selected centre.
        labels : np.ndarray
            Index (in selection order) of the nearest selected centre per series.

        Raises
        ------
        ValueError
            If every series has zero distance to an already selected centre
            before ``n_clusters`` centres have been chosen.
        """
        initial_center_idx = self._random_state.randint(X.shape[0])
        indexes = [initial_center_idx]

        min_distances = pairwise_distance(
            X, X[initial_center_idx], method=self.distance, **self._distance_params
        ).flatten()
        labels = np.zeros(X.shape[0], dtype=int)

        for i in range(1, self.n_clusters):
            total_distance = min_distances.sum()
            # A zero total would turn the probabilities into NaN and make
            # ``choice`` fail with an unhelpful message; report the cause.
            if total_distance <= 0:
                raise ValueError(
                    f"Unable to select {self.n_clusters} initial cluster centres: "
                    f"after choosing {i}, every series has zero distance to an "
                    "already selected centre."
                )
            probabilities = min_distances / total_distance
            next_center_idx = self._random_state.choice(X.shape[0], p=probabilities)
            indexes.append(next_center_idx)

            new_distances = pairwise_distance(
                X, X[next_center_idx], method=self.distance, **self._distance_params
            ).flatten()

            closer_points = new_distances < min_distances
            min_distances[closer_points] = new_distances[closer_points]
            labels[closer_points] = i

        # Indexing with a list copies the data, so later in-place updates of the
        # centres do not modify X.
        centers = X[indexes]
        return centers, min_distances, labels

    def _check_params(self, X: np.ndarray) -> None:
        """Validate hyper-parameters against X and resolve fit-time state.

        Parameters
        ----------
        X : np.ndarray
            Collection of time series being clustered.

        Raises
        ------
        ValueError
            If ``n_clusters`` is below 1 or above the number of cases, or if
            ``max_iter`` is below 1.
        """
        self._random_state = check_random_state(self.random_state)

        if self.n_clusters < 1:
            raise ValueError(f"n_clusters ({self.n_clusters}) must be at least 1")

        if self.max_iter < 1:
            raise ValueError(f"max_iter ({self.max_iter}) must be at least 1")

        if self.n_clusters > X.shape[0]:
            raise ValueError(
                f"n_clusters ({self.n_clusters}) cannot be larger than "
                f"n_cases ({X.shape[0]})"
            )

        # Work on a copy so the user-supplied dict is never mutated.
        self._distance_params = {
            **(self.distance_params or {}),
        }
2 changes: 2 additions & 0 deletions aeon/clustering/averaging/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
"subgradient_barycenter_average",
"VALID_BA_METRICS",
"shift_invariant_average",
"kasba_average",
]

from aeon.clustering.averaging._averaging import mean_average
from aeon.clustering.averaging._ba_petitjean import petitjean_barycenter_average
from aeon.clustering.averaging._ba_subgradient import subgradient_barycenter_average
from aeon.clustering.averaging._ba_utils import VALID_BA_METRICS
from aeon.clustering.averaging._barycenter_averaging import elastic_barycenter_average
from aeon.clustering.averaging._kasba_average import kasba_average
from aeon.clustering.averaging._shift_scale_invariant_averaging import (
shift_invariant_average,
)
Loading