In [1]:
import sktime
import numpy as np
import pandas as pd

from sktime.forecasting.theta import ThetaForecaster
from sktime.forecasting.ets import AutoETS
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.structural import UnobservedComponents
from sktime.forecasting.exp_smoothing import ExponentialSmoothing
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.forecasting.statsforecast import StatsForecastAutoARIMA
from sktime.forecasting.fbprophet import Prophet

from sktime.datasets import load_airline
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error
from sktime.utils.plotting import plot_series, plot_correlations, plot_lags

# silence the noisy FutureWarnings emitted by sktime and its dependencies
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
print(f"using sktime version {sktime.__version__}")

using sktime version 0.11.4

Demo Time Series Plots in Python

Univariate

Model setup. We'll use sktime's load_airline() function to load the monthly counts of international air passengers from 1949-1960, the classic airline dataset from Box & Jenkins.

In [2]:
y = load_airline()
y_train, y_test = temporal_train_test_split(y, test_size=12)
fh = ForecastingHorizon(y_test.index, is_relative=False)
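
Before modeling, a quick visual check of the split is worthwhile; a minimal sketch using the plot_series utility imported above:

# visual sanity check of the train/test split
plot_series(y_train, y_test, labels=["y_train", "y_test"])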

You can plot simple diagnostics like the autocorrelation function (ACF) and partial autocorrelation function (PACF) with plot_correlations().

In [3]:
# ACF, PACF
plot_correlations(y)
Out[3]:
(<Figure size 864x576 with 3 Axes>,
 array([<AxesSubplot:ylabel='Number of airline passengers'>,
        <AxesSubplot:title={'center':'Autocorrelation'}>,
        <AxesSubplot:title={'center':'Partial Autocorrelation'}>],
       dtype=object))
In [4]:
# scatter plots of the series against its own lags
plot_lags(y, lags=[1,2,3])
Out[4]:
(<Figure size 576x432 with 3 Axes>,
 array([<AxesSubplot:xlabel='y(t)', ylabel='y(t + 1)'>,
        <AxesSubplot:xlabel='y(t)', ylabel='y(t + 2)'>,
        <AxesSubplot:xlabel='y(t)', ylabel='y(t + 3)'>], dtype=object))

Modeling

Theta Forecaster

In [5]:
forecaster = ThetaForecaster(sp=12)  # monthly seasonal periodicity
forecaster.fit(y_train)
y_pred = forecaster.predict(fh)
print("sMAPE: ", mean_absolute_percentage_error(y_test, y_pred, symmetric=True))
plot_series(y, y_pred, labels=["y", "y_pred"])
sMAPE:  0.05231934588593568
Out[5]:
(<Figure size 1152x288 with 1 Axes>,
 <AxesSubplot:ylabel='Number of airline passengers'>)

Pretty Plot with Prediction Intervals

In [6]:
# Set coverage and then call .predict_interval to return a dataframe with the upper and lower PI bounds
coverage = 0.9
y_pred_ints = forecaster.predict_interval(coverage=coverage)

fig, ax = plot_series(y, y_pred, labels=["y", "y_pred"])
ax.fill_between(
    ax.get_lines()[-1].get_xdata(),
    y_pred_ints["Coverage"][coverage]["lower"],
    y_pred_ints["Coverage"][coverage]["upper"],
    alpha=0.2,
    color=ax.get_lines()[-1].get_c(),
    label=f"{coverage}% prediction intervals",
)
ax.set_title("Theta Forecaster")
ax.legend();
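
If you prefer quantiles to a single coverage band, the same fitted forecaster also exposes predict_quantiles; a minimal sketch:

# quantile forecasts from the already-fitted Theta model; the returned DataFrame
# has a column MultiIndex keyed by the requested quantile levels
y_pred_quantiles = forecaster.predict_quantiles(alpha=[0.05, 0.5, 0.95])
y_pred_quantiles.head()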

Wrap up all the steps above in a function

In [7]:
def run_forecast(y_train, y_test, forecaster, fh, coverage=None, results=True, title=""):
    """Helper function to train a forecaster and plot the results"""
    forecaster.fit(y_train)
    y_pred = forecaster.predict(fh)
    fig, ax = plot_series(y, y_pred, labels=["y", "y_pred"])  # plots against the full (global) series y for context
    if coverage is not None:
        # Calculate and add prediction intervals to the plot
        y_pred_ints = forecaster.predict_interval(coverage=coverage)
        ax.fill_between(
            ax.get_lines()[-1].get_xdata(),
            y_pred_ints["Coverage"][coverage]["lower"],
            y_pred_ints["Coverage"][coverage]["upper"],
            alpha=0.2,
            color=ax.get_lines()[-1].get_c(),
            label=f"{coverage}% prediction intervals",
        )
        ax.legend();
    ax.set_title(title)
    print("sMAPE: ", mean_absolute_percentage_error(y_test, y_pred, symmetric=True))
    if results:
        return forecaster, y_pred

Fitting a linear trendline

In [8]:
forecaster = PolynomialTrendForecaster(degree=1)
run_forecast(y_train, y_test, forecaster, fh, results=False, 
             title="Linear Trend Forecaster")
sMAPE:  0.12074496965455546
In [9]:
forecaster = PolynomialTrendForecaster(degree=2)
run_forecast(y_train, y_test, forecaster, fh, coverage=None, results=False, 
             title="Polynomial Trend Forecaster (Degree=2)")
sMAPE:  0.11980400632932332

Naive Model

In [10]:
forecaster = NaiveForecaster(strategy='last')  # simply repeats the last observed value
run_forecast(y_train, y_test, forecaster, fh, results=False, title="Naive Model")
sMAPE:  0.16120844670628973
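
For seasonal data like this, a seasonal naive baseline (repeat the value from the same month a year earlier) is usually a fairer yardstick; a minimal sketch reusing the helper above:

# seasonal naive: repeat the last observed value from the same season (12 months back)
forecaster = NaiveForecaster(strategy='last', sp=12)
run_forecast(y_train, y_test, forecaster, fh, results=False, title="Seasonal Naive Model")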

Exponential Smoothing (Holt Winters)

In [11]:
forecaster = ExponentialSmoothing()
run_forecast(y_train, y_test, forecaster, fh, coverage=None, results=False, 
             title="Simple Exponential Smoothing")
sMAPE:  0.16155480390887256
In [12]:
forecaster = ExponentialSmoothing(trend='add')
run_forecast(y_train, y_test, forecaster, fh, coverage=None, results=False, 
             title="Exponential Smoothing with Additive Trend")
sMAPE:  0.14023178895227414
In [13]:
forecaster = ExponentialSmoothing(trend='add', seasonal='add', sp=12)
run_forecast(y_train, y_test, forecaster, fh, coverage=None, results=False,
            title="Exponential Smoothing with Additive Trend, Additive Seasonality")
sMAPE:  0.02795240869632776
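
Since the seasonal swings in the airline series grow with the level, a multiplicative seasonal component is also worth trying; a minimal sketch:

# Holt-Winters with additive trend and multiplicative seasonality
forecaster = ExponentialSmoothing(trend='add', seasonal='mul', sp=12)
run_forecast(y_train, y_test, forecaster, fh, results=False,
             title="Exponential Smoothing with Additive Trend, Multiplicative Seasonality")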

AutoETS

In [14]:
forecaster = AutoETS(error='add', trend='add', seasonal='add', sp=12)
run_forecast(y_train, y_test, forecaster, fh, coverage=0.9, results=False,
            title="AutoETS (Error, Trend, Seasonality) Model (A,A,A)")
sMAPE:  0.027163603869653663
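
Above, the error, trend, and seasonal components are fixed by hand; AutoETS can also search over them by information criterion when auto=True is passed; a minimal sketch:

# let AutoETS select the ETS specification by information criterion
forecaster = AutoETS(auto=True, sp=12)
run_forecast(y_train, y_test, forecaster, fh, coverage=0.9, results=False,
             title="AutoETS with automatic model selection")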

StatsForecast AutoARIMA (Box-Jenkins)

In [15]:
forecaster = StatsForecastAutoARIMA(sp=12, d=1, max_p=2, max_q=2)
run_forecast(y_train, y_test, forecaster, fh, coverage=0.9, results=False,
            title="AutoARIMA with 12 seasonal periods")
sMAPE:  0.040386441846221005
In [16]:
forecaster = UnobservedComponents(level="local linear trend", cycle=True, damped_cycle=True, stochastic_cycle=True)
forecaster, y_pred = run_forecast(y_train, y_test, forecaster, fh, coverage=0.9, results=True,
                                 title="Unobserved Components Model")
sMAPE:  0.1472775915337731
In [17]:
forecaster.plot_diagnostics(figsize=(10,9)) # Set the figsize to large for high resolution screens
In [18]:
forecaster.summary()
Out[18]:
                              Unobserved Components Results
=========================================================================================
Dep. Variable:     Number of airline passengers   No. Observations:                  132
Model:                       local linear trend   Log Likelihood                -629.951
                      + damped stochastic cycle   AIC                           1271.902
Date:                          Fri, 03 Jun 2022   BIC                           1289.014
Time:                                  16:42:19   HQIC                          1278.855
Sample:                              01-31-1949
                                   - 12-31-1959
Covariance Type:                            opg
====================================================================================
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
sigma2.irregular  1.241e-06     97.894   1.27e-08      1.000    -191.869     191.869
sigma2.level      1013.7659   8218.264      0.123      0.902   -1.51e+04    1.71e+04
sigma2.trend      1.129e-07     30.479    3.7e-09      1.000     -59.738      59.738
sigma2.cycle      2.056e-05   8187.919   2.51e-09      1.000    -1.6e+04     1.6e+04
frequency.cycle      0.0436      0.068      0.640      0.522      -0.090       0.177
damping.cycle        1.0000      0.088     11.348      0.000       0.827       1.173
===================================================================================
Ljung-Box (L1) (Q):                10.74   Jarque-Bera (JB):                 5.67
Prob(Q):                            0.00   Prob(JB):                         0.06
Heteroskedasticity (H):             7.91   Skew:                            -0.37
Prob(H) (two-sided):                0.00   Kurtosis:                         3.72
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).

With unobserved components models, it's helpful to plot the estimated components (level, trend, cycle), which doesn't appear to be possible through sktime's wrapper. Below we refit the same specification directly in statsmodels and plot the components:

In [19]:
import statsmodels.api as sm

# Model specifications

# Unrestricted model, using string specification
unrestricted_model = {
    'level': 'local linear trend', 'cycle': True, 'damped_cycle': True, 'stochastic_cycle': True
}

y_train2 = y_train.reset_index()["Number of airline passengers"]

# Output
output_mod = sm.tsa.UnobservedComponents(y_train2, **unrestricted_model)
output_res = output_mod.fit(method='powell', disp=False)

# print(output_res.summary())
fig = output_res.plot_components(legend_loc='lower right', figsize=(15, 15));

And we can calculate the MAPE quickly:

In [20]:
np.mean(np.abs((output_res.predict(start=132, end=132+11).values - y_test.values)/y_test.values))
Out[20]:
0.11036773716103883
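
Equivalently, the statsmodels forecast can be wrapped in a pandas Series aligned to y_test and scored with the same sktime metric used above; a minimal sketch:

# align the statsmodels forecast with y_test's index and reuse sktime's MAPE
sm_pred = pd.Series(output_res.predict(start=132, end=132 + 11).values, index=y_test.index)
print("MAPE: ", mean_absolute_percentage_error(y_test, sm_pred, symmetric=False))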

FB Prophet

In [22]:
# Prophet requires data with a pandas.DatetimeIndex
y = load_airline().to_timestamp(freq='M')
y_train2, y_test2 = temporal_train_test_split(y, test_size=12)
fh2 = ForecastingHorizon(y_test2.index, is_relative=False)

forecaster = Prophet(
    seasonality_mode='multiplicative',
    n_changepoints=int(len(y) / 12),
    add_country_holidays={'country_name': 'Germany'},
    yearly_seasonality=True)
forecaster, y_pred = run_forecast(y_train2, y_test2, forecaster, fh2, coverage=0.9, results=True,
                                  title="FB Prophet Model with Multiplicative Yearly Seasonality")
2022-06-03 16:42:36 prophet INFO: Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
2022-06-03 16:42:36 prophet INFO: Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
sMAPE:  0.044003264359127055
In [23]:
# restore y to its original PeriodIndex for the sections below
y = load_airline()

Comparing Against a Heuristic Baseline

Now let's try a 'linearity' model, which predicts the quarterly total from the monthly "percentage of full quarter" profile. This is a common heuristic in business settings for judging how end-of-quarter spending is shaping up.
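
To make the mechanics concrete before building it on the airline data: if two months into a quarter the cumulative total is 200 and the first two months have historically made up about 65% of the quarterly total, the heuristic scales the running total up by that share (illustrative numbers only):

# illustrative numbers only, not from the airline data
cum_to_date = 200          # booked so far this quarter
typical_pct_of_qtr = 0.65  # historical share of the quarter completed by this month
quarter_forecast = cum_to_date / typical_pct_of_qtr  # ~307.7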

In [24]:
# "Linearity" based on date
y_t = pd.DataFrame(y)
y_t["Q"] = y_t.index.quarter
y_t["YYYYQ"] = y_t.index.year.astype(str) + y_t.index.quarter.astype(str)

# Cumulative Percentage of the Quarter
y_t["cum_pct"] = y_t.groupby(by="YYYYQ")["Number of airline passengers"].transform(lambda x: np.cumsum(x)/sum(x))
y_t["cum_sum"] = y_t.groupby(by="YYYYQ")["Number of airline passengers"].transform(lambda x: np.cumsum(x))
y_t["qtr_total"] = y_t.groupby(by="YYYYQ")["Number of airline passengers"].transform(lambda x: sum(x))
In [25]:
# Map each calendar month to its position within the quarter (1, 2, or 3)
month_in_qtr = dict()
for j in np.arange(1,4):
    for i in range(j,13,3):
        month_in_qtr[i] = j
y_t["M"] = y_t.index.month
y_t["Month_in_Qtr"] = y_t["M"].map(month_in_qtr)
In [26]:
# Trailing 4-quarter average percent-of-quarter profile (the rolling window includes the current quarter)
y_t["P4Q_avg"] = y_t.groupby("Month_in_Qtr")['cum_pct'].transform(lambda x: x.rolling(4).mean())
In [27]:
y_t.query("Month_in_Qtr==1")["cum_pct"].plot()
Out[27]:
<AxesSubplot:xlabel='Period'>
In [28]:
# Make the Prediction 
y_t["Pred1"] = y_t["cum_sum"]/y_t["P4Q_avg"]
In [29]:
# Calculate accuracy scores and plot the forecasts
subset = y_t.query("Month_in_Qtr==1")
print("MAPE: ", mean_absolute_percentage_error(subset.qtr_total[4:], subset.Pred1[4:], symmetric=False))
print("sMAPE: ", mean_absolute_percentage_error(subset.qtr_total[4:], subset.Pred1[4:], symmetric=True))
# Quarterly Data
plot_series(subset.qtr_total, subset.Pred1, labels=["Actuals", "Pred"])
MAPE:  0.04624210089028735
sMAPE:  0.04644477882368189
Out[29]:
(<Figure size 1152x288 with 1 Axes>, <AxesSubplot:ylabel='qtr_total'>)
In [30]:
subset = y_t.query("Month_in_Qtr==2")
print("MAPE: ", mean_absolute_percentage_error(subset.qtr_total[4:], subset.Pred1[4:], symmetric=False))
print("sMAPE: ", mean_absolute_percentage_error(subset.qtr_total[4:], subset.Pred1[4:], symmetric=True))
# Quarterly Data
plot_series(subset.qtr_total, subset.Pred1, labels=["Actuals", "Pred"])
MAPE:  0.03226663000165359
sMAPE:  0.032058941340971044
Out[30]:
(<Figure size 1152x288 with 1 Axes>, <AxesSubplot:ylabel='qtr_total'>)