#!/usr/bin/env python
# coding: utf-8

# # Downloading and loading benchmarking datasets
#
# It is common to use standard collections of data to compare different estimators for
# classification, clustering, regression and forecasting. Some of these datasets are
# shipped with aeon in the datasets/data directory. However, the archives are far too
# big to include in full, so aeon provides tools to download the data for use in
# benchmarking experiments. Classification and regression data are stored in .ts
# format. Forecasting data are stored in the equivalent .tsf format. See the
# [data loading notebook](data_loading.ipynb) for more info.
#
# Classification and regression data are loaded into 3D numpy arrays of shape
# `(n_cases, n_channels, n_timepoints)` if the series are equal length, or into a list
# of `[n_cases]` 2D numpy arrays if `n_timepoints` differs between cases. Forecasting
# data are loaded into a pd.DataFrame. For more information on aeon data types see the
# [data structures notebook](data_structures.ipynb).
#
# Note that this notebook depends on external websites, so it will not function if you
# are offline or the associated website is down. We use the following three functions.

# In[1]:


from aeon.datasets import load_classification, load_forecasting, load_regression


# ## Time Series Classification Archive
#
# The [UCR/TSML Time Series Classification Archive](https://timeseriesclassification.com)
# hosts the UCR univariate TSC archive [1], also available from
# [UCR](https://www.cs.ucr.edu/~eamonn/time_series_data_2018/), and the multivariate
# archive [2] (previously called the UEA archive, soon to change). We provide seven of
# these in the datasets/data directory: ACSF1, ArrowHead, BasicMotions, GunPoint,
# ItalyPowerDemand, JapaneseVowels and PLAID. The archive is much bigger: the last
# batch release contained 128 univariate [1] and 33 multivariate [2] problems. If you
# just want to download them all, please go to the
# [website](https://timeseriesclassification.com).

# In[2]:


from aeon.datasets.tsc_data_lists import multivariate, univariate

# This module also contains sub-lists by type, e.g. unequal length
print("Univariate length = ", len(univariate))
print("Multivariate length = ", len(multivariate))


# A default train and test split is provided for this data. The file structure for a
# problem such as Chinatown is
#
#         /Chinatown/Chinatown_TRAIN.ts
#         /Chinatown/Chinatown_TEST.ts
#
# You can load these problems directly from timeseriesclassification.com into memory.
# These functions can also return associated metadata in addition to the data. This
# usage combines the train and test splits and loads them into one `X` and one `y`
# array.

# In[3]:


X, y, meta = load_classification("Chinatown", return_metadata=True)
print("Shape of X = ", X.shape)
print("First case = ", X[0][0], " has label = ", y[0])
print("\nMeta data = ", meta)


# If you look in aeon/datasets you should see a directory called `local_data`
# containing the Chinatown datasets. All of the zips contain `.ts` files; some also
# contain `.arff` and `.txt` files. The file structure looks something like this:
#
# [image: time series classification]
#
# Within each folder the data are stored as text files in .ts format (see the
# [data loading notebook](data_loading.ipynb) for a description of the file format).
# They may also be available in .arff and .txt format.
#
# [image: time series classification]
#
# If you load again with the same extract path, the data will not be downloaded again
# if the files are already there. If you want to store data somewhere else, you can
# specify a file path.
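# For example, a minimal sketch of passing a custom location (the `./my_data/`
# directory name here is just an illustrative choice, not an aeon default): the data
# are downloaded to, and subsequently read from, that directory.

# In[ ]:


# hypothetical example path; any writable directory works
X, y = load_classification("Chinatown", extract_path="./my_data/")
print("Shape of X = ", X.shape)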
# Also, you can load the train and test splits separately. This code will download the
# data to Temp once, and load it into separate train/test splits. The split argument is
# not case sensitive. Once downloaded, `load_classification` is equivalent to a call to
# `load_from_tsfile`.

# In[4]:


X_train, y_train = load_classification(
    "BeetleFly", extract_path="./Temp/", split="TRAIN"
)
X_test, y_test = load_classification("BeetleFly", extract_path="./Temp/", split="test")
print("Train shape = ", X_train.shape)
print("Test shape = ", X_test.shape)

from aeon.datasets import load_from_tsfile

X_train, y_train = load_from_tsfile(
    full_file_path_and_name="./Temp/BeetleFly/BeetleFly_TRAIN.ts"
)
print("Loaded directly shape = ", X_train.shape)

X_test[0][0][:5]


# ## Time Series (Extrinsic) Regression
#
# [The Monash Time Series Extrinsic Regression Archive]() [3] repo (called extrinsic to
# differentiate it from sliding-window based regression) originally contained 19
# regression problems in .ts format and has recently been expanded to 63 problems. One
# of these, Covid3Month, is in `datasets/data`. The usage of `load_regression` is
# identical to that of `load_classification`.

# In[ ]:


from aeon.datasets.dataset_collections import get_available_tser_datasets

get_available_tser_datasets()


# In[8]:


X, y, meta = load_regression("FloodModeling1", return_metadata=True)
print("Shape of X = ", X.shape, " meta data = ", meta)


# ## Time Series Forecasting
#
# The [Monash time series forecasting](https://forecastingdata.org/) repo [4] contains
# a large number of forecasting datasets, including competition data such as M1, M3 and
# M4. Usage is the same as for the other problems, although there are no provided
# train/test splits.

# In[ ]:


from aeon.datasets.dataset_collections import get_available_tsf_datasets

get_available_tsf_datasets()


# In[9]:


X, metadata = load_forecasting("m4_yearly_dataset", return_metadata=True)
print(X.shape)
print(metadata)
data = X.head()
print(data)


# ## References
#
# [1] Dau et al., The UCR time series archive, IEEE/CAA Journal of Automatica Sinica,
# 2019
#
# [2] Ruiz et al., The great multivariate time series classification bake off: a review
# and experimental evaluation of recent algorithmic advances, Data Mining and Knowledge
# Discovery 35(2), 2021
#
# [3] Tan et al., Time Series Extrinsic Regression, Data Mining and Knowledge
# Discovery, 2021
#
# [4] Godahewa et al., Monash Time Series Forecasting Archive, Neural Information
# Processing Systems Track on Datasets and Benchmarks, 2021