#!/usr/bin/env python # coding: utf-8 # # Responsible AI dashboard for Time Series Forecasting # _**Orange Juice Sales Forecasting**_ # # Note: Time series forecasting is not yet supported by raiwidgets. This notebook uses an experimental setting to enable it. However, the API and functionality may change at any point in the future without warning. We suggest you hold off on using it until the official release. # # ## Contents # 1. [Introduction](#introduction) # 1. [Data](#data) # 1. [Train](#train) # 1. [Responsible AI Dashboard](#analyze) # ## Introduction # In this example, we use sktime to train and assess a time-series forecasting model for multiple time-series. # # The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area. # In[ ]: import pandas as pd from sktime.forecasting.arima import AutoARIMA from sktime.forecasting.base import ForecastingHorizon from sktime.forecasting.model_selection import temporal_train_test_split # ## Data # You are now ready to load the historical orange juice sales data. We will load the CSV file into a plain pandas DataFrame; the time column in the CSV is called _WeekStarting_, so it will be specially parsed into the datetime type. # Each row in the DataFrame holds a quantity of weekly sales for an OJ brand at a single store. The data also includes the sales price, a flag indicating if the OJ brand was advertised in the store that week, and some customer demographic information based on the store location. For historical reasons, the data also include the logarithm of the sales quantity. The Dominick's grocery data is commonly used to illustrate econometric modeling techniques where logarithms of quantities are generally preferred. # # The task is now to build a time-series model for the _Quantity_ column. 
# It is important to note that this dataset comprises many individual time-series - one for
# each unique combination of _Store_ and _Brand_. To distinguish the individual time-series,
# we define **time_series_id_features**: the columns whose values determine the boundaries
# between time-series.

# In[ ]:


time_column_name = "WeekStarting"
time_series_id_features = ["Store", "Brand"]
dataset_location = "https://raw.githubusercontent.com/Azure/azureml-examples/2fe81643865e1f4591e7734bd1a729093cafb826/v1/python-sdk/tutorials/automl-with-azureml/forecasting-orange-juice-sales/dominicks_OJ.csv"


def _regularize_weekly(group):
    """Return one time-series re-sampled onto a gap-free weekly index.

    ``group`` is the chunk that ``groupby(time_series_id_features).apply``
    passes in for a single series; ``group.name`` holds that series' key.
    Selecting the key with ``.loc`` leaves only the time level in the index,
    ``asfreq("W-THU")`` conforms it to a weekly, Thursday-anchored frequency,
    and ``interpolate()`` fills the values of any rows this introduces.
    """
    one_series = group.loc[group.name]
    weekly = one_series.asfreq("W-THU")
    return weekly.interpolate()


data = pd.read_csv(dataset_location, parse_dates=[time_column_name])

# Drop the 'logQuantity' column: it is the logarithm of the target and
# would therefore leak it.
data = data.drop(columns="logQuantity")

# Set up a multi-index made of the time series ID columns followed by the
# time column, then put every individual series on a regular weekly grid.
data = data.set_index([*time_series_id_features, time_column_name])
data = data.groupby(time_series_id_features).apply(_regularize_weekly)
data = data.sort_index(ascending=[True, True, True])
data.head(10)


# In[ ]:


series_groups = data.groupby(time_series_id_features)
nseries = series_groups.ngroups
print("Data contains {0} individual time-series.".format(nseries))


# For demonstration purposes, we extract sales time-series for just a few of the stores:

# In[ ]:


use_stores = [2, 5, 8]
use_brands = ['tropicana', 'dominicks', 'minute.maid']

# Select the chosen stores and brands across every week.
idx = pd.IndexSlice
data_subset = data.loc[idx[use_stores, use_brands, :], :]

nseries = data_subset.groupby(time_series_id_features).ngroups
print(f"Data subset contains {nseries} individual time-series.")


# ### Data Splitting
# We now split the data into a training and a testing set for later forecast evaluation.
# The test set will contain the final 20 weeks of observed sales for each time-series. The
# splits should be stratified by series; the split below is done with sktime's
# `temporal_train_test_split` on the data indexed by the time series identifier columns.
# In[ ]:


target_column_name = "Quantity"

# Number of trailing weekly observations held out for testing. It is used for
# both the forecasting horizon and the train/test split so that the two can
# never get out of sync.
test_size = 20

y = pd.DataFrame(data_subset[target_column_name])
X = data_subset.drop(columns=[target_column_name])

# Absolute forecasting horizon: the last `test_size` distinct dates found in
# the time level of the index. The level is looked up by name rather than by
# position so that changing `time_series_id_features` cannot silently point
# this at the wrong level.
# NOTE(review): this takes the last dates across *all* selected series, which
# matches the per-series split below only if every series ends on the same
# date - confirm for any new store/brand selection.
week_starts = y.index.get_level_values(time_column_name).unique().sort_values()
fh_dates = pd.DatetimeIndex(week_starts.to_list()[-test_size:], freq='W-THU')
fh = ForecastingHorizon(fh_dates, is_relative=False)

y_train, y_test, X_train, X_test = temporal_train_test_split(
    y=y, X=X, test_size=test_size)


# ## Train
# You can now train the forecaster. Depending on the data, this operation may take several
# minutes.
#
# NOTE(review): the original text of this section described per-iteration console output
# and a `show_output=True` setting. Nothing in this notebook uses such a setting, so that
# text presumably came from a different tutorial - confirm and remove if so.

# In[ ]:


# The time and time series ID features are carried in the MultiIndex of
# y_train / X_train rather than as columns, so nothing has to be dropped
# before the data is handed to sktime.
model = AutoARIMA(suppress_warnings=True, error_action="ignore")

model.fit(y=y_train, X=X_train, fh=fh)
model.predict(fh=fh, X=X_test).head()


# In[ ]:


# Quantile forecasts at 2.5% and 97.5% for the same horizon.
model.predict_quantiles(fh=fh, X=X_test, alpha=[0.025, 0.975]).head()


# # Responsible AI Dashboard

# In[ ]:


from raiwidgets import ResponsibleAIDashboard
from responsibleai import RAIInsights, FeatureMetadata


def _to_flat_frame(features, target):
    """Merge features, target and index levels into one flat DataFrame.

    The index levels of ``features`` (the time series ID features and the
    time feature) are added as ordinary columns, after which the index is
    replaced by a default integer index.
    """
    index_columns = features.index.to_frame(index=True)
    flat = features.join(target).join(index_columns)
    return flat.reset_index(drop=True)


# merge X, y, and the time and time series ID features into a single DataFrame
train = _to_flat_frame(X_train, y_train)
test = _to_flat_frame(X_test, y_test)

feature_metadata = FeatureMetadata(
    time_series_id_features=time_series_id_features,
    categorical_features=time_series_id_features,
    datetime_features=[time_column_name])

insights = RAIInsights(
    model=model,
    train=train,
    test=test,
    task_type="forecasting",
    target_column=target_column_name,
    feature_metadata=feature_metadata,
    forecasting_enabled=True)

ResponsibleAIDashboard(insights)


# In[ ]: