[ENH] Added useful class attributes to SAST based transforms #2006

Merged: 16 commits, Oct 31, 2024
3 changes: 3 additions & 0 deletions aeon/classification/shapelet_based/_sast.py
@@ -180,6 +180,9 @@ def plot_most_important_feature_on_ts(self, ts, feature_importance, limit: int =
"""
import matplotlib.pyplot as plt

# get overall importance irrespective of class
feature_importance = [abs(x) for x in feature_importance]

features = zip(self._transformer._kernel_orig, feature_importance)
sorted_features = sorted(features, key=itemgetter(1), reverse=True)

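The absolute value is taken so that the ranking reflects overall influence irrespective of class: the coefficients of a linear classifier such as the default ridge classifier are signed, and sorting on raw values would bury features that strongly indicate the other class. A minimal sketch of the ranking step that follows, with hypothetical importance values and placeholder kernel names:

```python
from operator import itemgetter

# Hypothetical coefficients: negative values matter as much as positive ones.
feature_importance = [0.2, -0.9, 0.5]
feature_importance = [abs(x) for x in feature_importance]  # -> [0.2, 0.9, 0.5]

kernels = ["shp_a", "shp_b", "shp_c"]  # stand-ins for the stored subsequences
sorted_features = sorted(
    zip(kernels, feature_importance), key=itemgetter(1), reverse=True
)
# -> [("shp_b", 0.9), ("shp_c", 0.5), ("shp_a", 0.2)]
```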
29 changes: 23 additions & 6 deletions aeon/transformations/collection/shapelet_based/_rsast.py
@@ -56,7 +56,8 @@ class RSAST(BaseCollectionTransformer):

Parameters
----------
n_random_points: int default = 10 the number of initial random points to extract
n_random_points: int default = 10
the number of initial random points to extract
len_method: string default="both" the type of statistical tool used to get
the length of shapelets. "both"=ACF&PACF, "ACF"=ACF, "PACF"=PACF,
"None"=Extract randomly any length from the TS
Expand All @@ -65,8 +66,6 @@ class RSAST(BaseCollectionTransformer):
the number of reference time series to select per class
seed : int, default = None
the seed of the random generator
classifier : sklearn compatible classifier, default = None
if None, a RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)) is used.
n_jobs : int, default -1
Number of threads to use for the transform.

@@ -114,6 +113,9 @@ def __init__(
self._kernels = None # z-normalized subsequences
self._cand_length_list = {}
self._kernel_orig = []
self._start_points = []
self._classes = []
self._source_series = [] # To store the index of the original time series
self._kernels_generators = {} # Reference time series
super().__init__()

@@ -156,7 +158,12 @@ def _fit(self, X: np.ndarray, y: Union[np.ndarray, list]) -> "RSAST":
self.num_classes = classes.shape[0]
m_kernel = 0

# 1--calculate ANOVA per each time t throught the lenght of the TS
# Initialize lists to store start positions, classes, and source series
self._start_points = []
self._classes = []
self._source_series = []

# 1--calculate ANOVA per each time t throughout the length of the TS
for i in range(X_.shape[1]):
statistic_per_class = {}
for c in classes:
@@ -187,11 +194,15 @@ def _fit(self, X: np.ndarray, y: Union[np.ndarray, list]) -> "RSAST":

cnt = np.min([self.nb_inst_per_class, X_c.shape[0]]).astype(int)

choosen = self._random_state.permutation(X_c.shape[0])[:cnt]
# Store the original indices of the sampled time series
original_indices = np.where(y == c)[0]

chosen_indices = self._random_state.permutation(X_c.shape[0])[:cnt]

self._kernels_generators[c] = []

for rep, idx in enumerate(choosen):
for rep, idx in enumerate(chosen_indices):
original_idx = original_indices[idx] # Get the original index
# defining indices for length list
idx_len_list = c + "," + str(idx) + "," + str(rep)

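A toy illustration (hypothetical labels) of the index bookkeeping introduced above: `np.where(y == c)[0]` lists the rows of the full training set that belong to class `c`, so a within-class index drawn from the permutation can be translated back to a global row index.

```python
import numpy as np

y = np.array(["a", "b", "a", "b", "a"])
original_indices = np.where(y == "a")[0]  # rows of class "a" -> array([0, 2, 4])

idx = 1                                   # second sampled series of class "a"
original_idx = original_indices[idx]      # -> 2, its row in the training set
```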
Expand Down Expand Up @@ -292,6 +303,12 @@ def _fit(self, X: np.ndarray, y: Union[np.ndarray, list]) -> "RSAST":
self._kernel_orig.append(np.squeeze(kernel))
self._kernels_generators[c].extend(X_c[idx].reshape(1, -1))

# Store the start position,
# class, and the original index in the training set
self._start_points.append(i)
self._classes.append(c)
self._source_series.append(original_idx)

# 3--save the calculated subsequences
n_kernels = len(self._kernel_orig)

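For orientation, a minimal usage sketch (toy data, illustrative parameters) of the three new attributes on a fitted RSAST: the lists are parallel, so entry `k` of `_start_points`, `_classes`, and `_source_series` describes the `k`-th extracted subsequence.

```python
import numpy as np
from aeon.transformations.collection.shapelet_based import RSAST

rng = np.random.default_rng(42)
X = rng.normal(size=(8, 1, 40))      # 8 univariate series of length 40
y = np.array(["a"] * 4 + ["b"] * 4)

rsast = RSAST(n_random_points=5, seed=42).fit(X, y)

k = 0                                # inspect the first stored subsequence
print(rsast._start_points[k])        # start offset within its source series
print(rsast._classes[k])             # class of the series it was taken from
print(rsast._source_series[k])       # row index of that series in X
```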
30 changes: 25 additions & 5 deletions aeon/transformations/collection/shapelet_based/_sast.py
@@ -50,17 +50,18 @@ class SAST(BaseCollectionTransformer):
----------
lengths : int[], default = None
an array containing the lengths of the subsequences
to be generated. If None, will be infered during fit
to be generated. If None, will be inferred during fit
as np.arange(3, X.shape[1])
stride : int, default = 1
the stride used when generating subsquences
nb_inst_per_class : int default = 1
the stride used when generating subsequences
nb_inst_per_class : int, default = 1
the number of reference time series to select per class
seed : int, default = None
the seed of the random generator
n_jobs : int, default -1
Number of threads to use for the transform.
The available cpu count is used if this value is less than 1
The available CPU count is used if this value is less than 1


References
----------
@@ -104,6 +105,9 @@ def __init__(
self.nb_inst_per_class = nb_inst_per_class
self._kernels = None # z-normalized subsequences
self._kernel_orig = None # non z-normalized subsequences
self._start_points = [] # To store the start positions
self._classes = [] # To store the class of each shapelet
self._source_series = [] # To store the index of the original time series
self.kernels_generators_ = {} # Reference time series
self.n_jobs = n_jobs
self.seed = seed
@@ -137,8 +141,10 @@ def _fit(self, X: np.ndarray, y: Union[np.ndarray, list]) -> "SAST":

classes = np.unique(y)
self._num_classes = classes.shape[0]

class_values_of_candidates = []
candidates_ts = []
source_series_indices = [] # List to store original indices

for c in classes:
X_c = X_[y == c]

@@ -148,6 +154,10 @@ def _fit(self, X: np.ndarray, y: Union[np.ndarray, list]) -> "SAST":
choosen = self._random_state.permutation(X_c.shape[0])[:cnt]
candidates_ts.append(X_c[choosen])
self.kernels_generators_[c] = X_c[choosen]
class_values_of_candidates.extend([c] * cnt)
source_series_indices.extend(
np.where(y == c)[0][choosen]
) # Record the original indices

candidates_ts = np.concatenate(candidates_ts, axis=0)

@@ -163,6 +173,9 @@ def _fit(self, X: np.ndarray, y: Union[np.ndarray, list]) -> "SAST":
(n_kernels, max_shp_length), dtype=np.float32, fill_value=np.nan
)
self._kernel_orig = []
self._start_points = [] # Reset start positions
self._classes = [] # Reset class information
self._source_series = [] # Reset source series information

k = 0
for shp_length in self._length_list:
@@ -172,6 +185,13 @@ def _fit(self, X: np.ndarray, y: Union[np.ndarray, list]) -> "SAST":
can = np.squeeze(candidates_ts[i][j:end])
self._kernel_orig.append(can)
self._kernels[k, :shp_length] = z_normalise_series(can)
self._start_points.append(j) # Store the start position
self._classes.append(
class_values_of_candidates[i]
) # Store the class of the shapelet
self._source_series.append(
source_series_indices[i]
) # Store the original index of the time series
k += 1
return self

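The same bookkeeping applies to the SAST transformer. A hedged sketch with toy data: since `_classes` and `_source_series` run parallel to the generated kernels, the provenance of the whole candidate set can be summarised after `fit`.

```python
from collections import Counter

import numpy as np
from aeon.transformations.collection.shapelet_based import SAST

rng = np.random.default_rng(0)
X = rng.normal(size=(6, 1, 30))  # 6 univariate series of length 30
y = np.array([0, 0, 0, 1, 1, 1])

sast = SAST(seed=0).fit(X, y)

# One entry per generated kernel: how many subsequences each class contributed
print(Counter(sast._classes))
# ... and which training rows the reference series were drawn from.
print(sorted(set(sast._source_series)))
```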
160 changes: 160 additions & 0 deletions examples/classification/SastVsViz.ipynb

Large diffs are not rendered by default.