pmdarima
for auto ARIMA¶Playing around with the pmdarima
library for basic ARIMA and ARIMAX models in python - a nice alternative to statsmodels
. Time series modeling in Python in general seems very scattered compared to the experience in R
unfortunately.
From pmdarima
toy datasets: https://alkaline-ml.com/pmdarima/modules/datasets.html
import numpy as np
import pandas as pd
import pmdarima as pm
from pmdarima import model_selection
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
# Load a local helper file with the data cleaning and all that
from create_dataset import load_combined_dataset
Using the sample datasets from the pmdarima
library we can create a combined dataframe with data from 1980 to 1994 covering Australian beer production (in megaliters), residents, and wine sales (in bottles)
df = load_combined_dataset()
df.head()
timeperiod | Wine | Residents | Beer | |
---|---|---|---|---|
0 | 19801 | 51885.0 | 14515.7 | 513.0 |
1 | 19802 | 54954.0 | 14554.9 | 427.0 |
2 | 19803 | 67765.0 | 14602.5 | 473.0 |
3 | 19804 | 79117.0 | 14646.4 | 526.0 |
4 | 19811 | 53013.0 | 14695.4 | 548.0 |
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df.Wine, name="Wine Sales (bottles)"))
fig.add_trace(go.Scatter(x=df.index, y=df.Residents, name="Residents"))
fig.update_layout(title="Wine Sales and Residents Over Time",
xaxis_title="Quarters since Q1 1980", yaxis_title="Quarterly Total")
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df.Wine, name="Wine Sales"))
fig.add_trace(go.Scatter(x=df.index, y=df.Beer*100, name="Beer Sales (megaliter)"))
fig.update_layout(title="Wine Sales (bottles) and Beer Production (megaliters) Over Time",
xaxis_title="Quarters since Q1 1980", yaxis_title="Quarterly Total")
We can also plot some decompositions of the dataset too:
decomposed = pm.arima.decompose(df.Wine, 'additive', m=4)
# Use the built in helper function to plot the 4 components with matplotlib
figure_kwargs = {'figsize': (12, 6)}
axes = pm.utils.decomposed_plot(decomposed, figure_kwargs=figure_kwargs,
show=False)
axes[0].set_title("Australian Wine Sales Seasonal Decomposition")
Text(0.5, 1.0, 'Australian Wine Sales Seasonal Decomposition')
Let's start with the most basic example of an automated ARIMA model to see how the ARIMA model fits the quarterly wine sales data.
train, test = model_selection.train_test_split(df.Wine, train_size=.75)
# Fit a simple auto_arima model
arima = pm.auto_arima(train, error_action='ignore', trace=True,
suppress_warnings=True, maxiter=5,
seasonal=True, m=12)
Performing stepwise search to minimize aic ARIMA(2,1,2)(1,1,1)[12] : AIC=598.611, Time=0.08 sec ARIMA(0,1,0)(0,1,0)[12] : AIC=592.659, Time=0.01 sec ARIMA(1,1,0)(1,1,0)[12] : AIC=590.795, Time=0.04 sec ARIMA(0,1,1)(0,1,1)[12] : AIC=591.383, Time=0.03 sec ARIMA(1,1,0)(0,1,0)[12] : AIC=588.997, Time=0.01 sec ARIMA(1,1,0)(0,1,1)[12] : AIC=590.814, Time=0.03 sec ARIMA(1,1,0)(1,1,1)[12] : AIC=592.807, Time=0.04 sec ARIMA(2,1,0)(0,1,0)[12] : AIC=591.333, Time=0.02 sec ARIMA(1,1,1)(0,1,0)[12] : AIC=591.774, Time=0.02 sec ARIMA(0,1,1)(0,1,0)[12] : AIC=589.532, Time=0.01 sec ARIMA(2,1,1)(0,1,0)[12] : AIC=593.054, Time=0.02 sec ARIMA(1,1,0)(0,1,0)[12] intercept : AIC=590.362, Time=0.02 sec Best model: ARIMA(1,1,0)(0,1,0)[12] Total fit time: 0.328 seconds
Predict:
preds = arima.predict(n_periods=test.shape[0])
MAPE:
np.mean(np.abs(preds - test)/np.abs(test))
0.05395221164363103
# #############################################################################
# Plot actual test vs. forecasts:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df.Wine, name="Wine Sales"))
fig.add_trace(go.Scatter(x=test.index, y=preds, name="Prediction"))
fig.update_layout(title="Wine Sales (bottles) ARIMA Prediction on Test Set",
xaxis_title="Quarters since Q1 1980", yaxis_title="Quarterly Total",
template='simple_white')
You can get a confidence interval too:
preds, conf_int = arima.predict(n_periods=test.shape[0],
return_conf_int=True)
low = conf_int[:, 0]
high = conf_int[:, 1]
fig.add_trace(go.Scatter(x=test.index, y=low, name="Low", line_dash='dot', line_color='gray', marker_size=1))
fig.add_trace(go.Scatter(x=test.index, y=high, name="High", line_dash='dot', line_color='gray', marker_size=1))
We can also plot the below as if we were forecasting into the future
# #############################################################################
# Plot actual test vs. forecasts:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=train, name="Wine Sales"))
fig.add_trace(go.Scatter(x=test.index, y=arima.predict(n_periods=test.shape[0]), name="Prediction"))
fig.update_layout(title="Wine Sales (bottles) ARIMA Prediction",
xaxis_title="Quarters since Q1 1980", yaxis_title="Quarterly Total",
template='simple_white')
fig.add_trace(go.Scatter(x=test.index, y=low, name="Low", line_dash='dot', line_color='gray', marker_size=1))
fig.add_trace(go.Scatter(x=test.index, y=high, name="High", line_dash='dot', line_color='gray', marker_size=1))
We'd like to demonstrate ARIMAX in pmdarima
using the wine production, with the Resident
data as an exogenous variable.
We'll use the train_test_split
method but this tie we'll need to include our regressor.
train, test = model_selection.train_test_split(df[['Wine', 'Residents']])
# For exogenous variables, it's expecting a 2D array so if you only have a pandas series or a single array,
# you need to reshape it into a matrix.
X_train = train.Residents.values.reshape(-1, 1)
X_test = test.Residents.values.reshape(-1, 1)
# Fit a auto_arima model with 'Residents' as the exogenous variable and 'Wine' as endog
arima = pm.auto_arima(y=train.Wine, X=X_train,
error_action='ignore', trace=True,
suppress_warnings=True, maxiter=5,
seasonal=True, m=12)
Performing stepwise search to minimize aic ARIMA(2,1,2)(1,1,1)[12] : AIC=604.108, Time=0.09 sec ARIMA(0,1,0)(0,1,0)[12] : AIC=594.565, Time=0.03 sec ARIMA(1,1,0)(1,1,0)[12] : AIC=596.982, Time=0.04 sec ARIMA(0,1,1)(0,1,1)[12] : AIC=597.615, Time=0.04 sec ARIMA(0,1,0)(1,1,0)[12] : AIC=595.627, Time=0.04 sec ARIMA(0,1,0)(0,1,1)[12] : AIC=594.747, Time=0.02 sec ARIMA(0,1,0)(1,1,1)[12] : AIC=596.719, Time=0.04 sec ARIMA(1,1,0)(0,1,0)[12] : AIC=597.478, Time=0.01 sec ARIMA(0,1,1)(0,1,0)[12] : AIC=596.873, Time=0.01 sec ARIMA(1,1,1)(0,1,0)[12] : AIC=599.365, Time=0.02 sec ARIMA(0,1,0)(0,1,0)[12] intercept : AIC=596.367, Time=0.01 sec Best model: ARIMA(0,1,0)(0,1,0)[12] Total fit time: 0.372 seconds
preds, conf_int = arima.predict(n_periods=test.shape[0], X=X_test, return_conf_int=True)
MAPE:
np.mean(np.abs(preds - test.Wine)/np.abs(test.Wine))
0.06404290877400837
low = conf_int[:, 0]
high = conf_int[:, 1]
# #############################################################################
# Plot actual test vs. forecasts:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df.Wine, name="Wine Sales"))
fig.add_trace(go.Scatter(x=test.index, y=preds, name="Prediction"))
fig.update_layout(title="Wine Sales (bottles) ARIMAX Prediction on Test Set",
xaxis_title="Quarters since Q1 1980", yaxis_title="Quarterly Total",
template='simple_white')
fig.add_trace(go.Scatter(x=test.index, y=low, name="Low", line_dash='dot', line_color='gray', marker_size=1))
fig.add_trace(go.Scatter(x=test.index, y=high, name="High", line_dash='dot', line_color='gray', marker_size=1))