#!/usr/bin/env python # coding: utf-8 # # Preprocessing time series with aeon # # It is common to need to preprocess time series data before applying machine learning # algorithms. So algorithms can handle these characteristics, or `aeon` transformers can be used to preprocess collections of time # series into standard format. This notebook demonstrates three common use cases # # 1. [Rescaling time series](#Rescaling-time-series) # 2. [Resizing time series](#Resizing-time-series) # 3. [Dealing with missing values](#missing-values) # # ## Rescaling time series # # Different levels of scale and variance can mask discriminative patterns in time # series. This is particularly true for methods that are based on distances. It common # to rescale time series to have zero mean and unit variance. For example, the data in # the `UnitTest` dataset is a subset of the [Chinatown dataset] # (https://timeseriesclassification.com/description.php?Dataset=Chinatown. These are # counts of pedestrians in Chinatown, Melbourne. The time series are of different means # In[3]: import numpy as np from aeon.datasets import load_unit_test X, y = load_unit_test(split="Train") np.mean(X, axis=-1)[0:5] # In[4]: np.std(X, axis=-1)[0:5] # We can rescale the time series in three ways: # 1. Normalise: subtract the mean and divide by the standard deviation to make all # series have zero mean and unit variance. # In[5]: from aeon.transformations.collection import Normalizer normalizer = Normalizer() X2 = normalizer.fit_transform(X) np.round(np.mean(X2, axis=-1)[0:5], 6) # In[6]: np.round(np.std(X2, axis=-1)[0:5], 6) # 2. Re-center: Recentering involves subtracting the mean of each series # In[7]: from aeon.transformations.collection import Centerer c = Centerer() X3 = c.fit_transform(X) np.round(np.mean(X3, axis=-1)[0:5], 6) # 3. Min-Max: Scale the data to be between 0 and 1 # In[8]: from aeon.transformations.collection import MinMaxScaler minmax = MinMaxScaler() X4 = minmax.fit_transform(X) np.round(np.min(X4, axis=-1)[0:5], 6) # In[9]: np.round(np.max(X4, axis=-1)[0:5], 6) # There is no best way to do this, although for counts such as this it is more common # to MinMax scale, so that the data still has some interpretation as proportions. # ## Resizing time series # # Suppose we have a collections of time series with different lengths, i.e. different # number of time points. Currently, most of aeon's collection estimators # (classification, clustering or regression) require equal-length time # series. Those that can handle unequal length series are tagged with # "capability:unequal". # In[10]: from aeon.classification.convolution_based import RocketClassifier from aeon.datasets import load_basic_motions, load_japanese_vowels, load_plaid from aeon.utils.validation import has_missing, is_equal_length, is_univariate # If you want to use an estimator that cannot internally handle missing values, one # option is to convert unequal length series into equal length. This can be # done through padding, truncation or resizing through fitting a function and # resampling. # ### Unequal or equal length collections time series # # If a collection contains all equal length series, it will store the data in a 3D # numpy of shape `(n_cases, n_channels, n_timepoints)`. If it is unequal length, it is # stored in a list of 2D numpy arrays: # In[11]: # Equal length multivariate data bm_X, bm_y = load_basic_motions() X = bm_X print(f"{type(X)}, {X.shape}") print( f"univariate = {is_univariate(X)}, has missing ={has_missing(X)}, equal " f"length = {is_equal_length(X)}" ) # In[12]: # Unequal length univariate data plaid_X, plaid_y = load_plaid() X = plaid_X print(type(plaid_X), "\n", plaid_X[0].shape, "\n", plaid_X[10].shape) print( f"univariate = {is_univariate(X)}, has missing ={has_missing(X)}, equal " f"length = {is_equal_length(X)}" ) # In[13]: vowels_X, vowels_y = load_japanese_vowels(split="train") X = vowels_X print( f"univariate = {is_univariate(X)}, has missing ={has_missing(X)}, equal " f"length = {is_equal_length(X)}" ) # # # In[14]: series_lengths = [array.shape[1] for array in plaid_X] # Find the minimum and maximum of the second dimensions min_length = min(series_lengths) max_length = max(series_lengths) print(" Min length = ", min_length, " max length = ", max_length) # There are two basic strategies for unequal length problems # 1. Use an estimator that can internally handle missing values # 2. Transform the data to be equal length by, for example, truncating or padding series # # Estimators with the tag `"capability:unequal_length": True` have the capability to # handle unequal length series. For classification, regression and # clusterign, the # current list is # In[15]: from aeon.utils.discovery import all_estimators all_estimators( type_filter=["classifier", "regressor", "clusterer"], tag_filter={"capability:unequal_length": True}, ) # You can pass these estimators unequal length series and they will work as expected. # # In[16]: from aeon.classification.distance_based import KNeighborsTimeSeriesClassifier knn = KNeighborsTimeSeriesClassifier() model = knn.fit(plaid_X, plaid_y) # If time series are unequal length, collection estimators will raise an error if they # do not have the capability to handle this characteristic. If you want to use them, # you will need to preprocess the data to be equal length. # In[17]: rc = RocketClassifier() try: rc.fit(plaid_X, plaid_y) except ValueError as e: print(f"ValueError: {e}") # ### Padding, truncating or resizing. # # We can pad, truncate or resize. By default, pad adds zeros to make all series the # length of the longest, truncate removes all values beyond the length of the shortest # and resize stretches or shrinks the series. # In[18]: from aeon.transformations.collection import Padder, Resizer, Truncator pad = Padder() truncate = Truncator() resize = Resizer(length=600) X2 = pad.fit_transform(plaid_X) X3 = truncate.fit_transform(plaid_X) X4 = resize.fit_transform(plaid_X) print(X2.shape, "\n", X3.shape, "\n", X4.shape) # In[19]: import matplotlib.pyplot as plt plt.title("Before and after padding: PLAID first case (shifted up for unpadded)") plt.plot(plaid_X[0][0] + 10) plt.plot(X2[0][0]) # You can put these transformers in a pipeline to apply to both train/test split # # In[20]: from sklearn.metrics import accuracy_score # Unequal length univariate data from aeon.pipeline import make_pipeline train_X, train_y = load_plaid(split="Train") test_X, test_y = load_plaid(split="Test") steps = [truncate, rc] pipe = make_pipeline(steps) pipe.fit(train_X, train_y) preds = pipe.predict(test_X) accuracy_score(train_y, preds) # ## Missing Values # # Missing values are indicated by `NaN` in numpy array. You can test whether any `aeon` # data structure contains missing values using the utility function # In[21]: X = np.random.random(size=(10, 2, 200)) has_missing(X) # In[22]: X[5][0][55] = np.NAN has_missing(X) # There are a range of strategies for handling missing values. These include: # # 1. Use an estimator that internally handles missing values. It is fairly easy for # some algorithms (such as decision trees) to internally deal with missing values, # usually be using it as a distinct series value after discretisation. We do not yet # have many estimators with this capability. Estimators that are able to internally # handle missing values are tagged with `"capability:missing_values": True`. # In[23]: from aeon.utils.discovery import all_estimators all_estimators( tag_filter={"capability:missing_values": True}, ) # 2. Removing series with missing: this is often desirable if the train set size is # large, the number of series with missing is small and the proportion of missing # values for these series is high. # # We do not yet have a transformer for this, but it is easy to implement yourself. # # 3. Interpolating missing values from series: estimating the missing values from the # other values in a time series is commonly done. This is # often desirable if the train set size is small and the proportion of missing values # is low. You can do this with the transformer ``SimpleImputer``. This interpolates # each series and each channel independently. So for example a mean interpolation # of series with two channels `[[NaN,1.0,2.0,3.0],[-1.0,-2.0,-3.0,-4.0]]` would be # `[[2.0,1.0,2.0,3.0],[-1.0,-2.0,-3.0,-4.0]]`. # In[26]: from aeon.transformations.collection import SimpleImputer imput = SimpleImputer(strategy="mean") X2 = imput.fit_transform(X) has_missing(X2) # In[27]: imp2 = SimpleImputer(strategy="median") X3 = imp2.fit_transform(X) has_missing(X3) # In[28]: imp3 = SimpleImputer(strategy="constant", fill_value=0) X4 = imp3.fit_transform(X) has_missing(X4) # In[ ]: