Predictive Analytics (QBUS2820)

Tutorial 13 (self-study): ARIMA

#
# # This tutorial we add the ARIMA model to our analysis of the Australian inflation data from a previous tutorial. # # Data: Australian CPI inflation
# ARIMA model identification
# Estimation
# Model diagnostics
# Model validation
# Forecast
# # This notebook relies on the following imports and settings. # In[1]: # Packages import warnings warnings.filterwarnings("ignore") import numpy as np from scipy import stats import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import statsmodels.api as sm # In[2]: # Plot settings sns.set_context('notebook') sns.set_style('ticks') red='#D62728' blue='#1F77B4' get_ipython().run_line_magic('matplotlib', 'inline') # ##Data: Australian CPI inflation # # In[3]: data=pd.read_csv('inflation.csv', index_col='Date', parse_dates=True, dayfirst=True) data.index=data.index.to_period(freq='Q') # converting the index to quarterly period instead of dates data=data['01-1980':] # filtering the use data from Jan/1980 onwards data.tail() # In[4]: y=data['Inflation'] fig, ax= plt.subplots(figsize=(8,5)) y.plot(color=red) ax.set_xlabel('') ax.set_ylabel('Inflation') ax.set_title('Australian Quarterly CPI Inflation') ax.set_xticks([], minor=True) # I prefer to remove the minor ticks for a cleaner plot sns.despine() plt.show() # ##ARIMA model identification # # The next cell plots the sample autocorrelations (ACF) and partial autocorrelations (PACF) for the entire series. The autocorrelations decay slowly, reflecting the non-stationarity of the series. # In[5]: fig, ax = plt.subplots(1,2, figsize=(12,5)) sm.graphics.tsa.plot_acf(y, lags=40, ax=ax[0]) sm.graphics.tsa.plot_pacf(y, lags=40, ax=ax[1]) sns.despine() fig.tight_layout() plt.show() # We then consider a first difference transform, which seems to lead to a stationary series for this data. The pandas [shift](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.shift.html) method is the tool for obtaining lagged values of a column. # # The ACF plot for the first difference series has a cut-off after lag one, while the PACF shows a gradual decrease, suggesting a MA(1) for the data. # In[6]: diff=y-y.shift(1) diff=diff.dropna() fig, ax= plt.subplots(figsize=(8,5)) diff.plot(color=red) ax.set_xlabel('') ax.set_title('First differenced series') ax.set_xticks([], minor=True) # I prefer to remove the minor ticks for a cleaner plot sns.despine() plt.show() # In[7]: fig, ax = plt.subplots(1,2, figsize=(12,5)) sm.graphics.tsa.plot_acf(diff, lags=40, ax=ax[0]) sm.graphics.tsa.plot_pacf(diff, lags=40, ax=ax[1]) sns.despine() fig.tight_layout() plt.show() # In the EDA, we saw that the series behaves differently in the post-80s period, so that we can also consider focusing on this part of the dataset only. The need for first differencing is less clear in this case, so that we will consider specifications with and without the transformation # In[8]: fig, ax = plt.subplots(1,2, figsize=(12,5)) sm.graphics.tsa.plot_acf(y['1991':], lags=40, ax=ax[0]) sm.graphics.tsa.plot_pacf(y['1991':], lags=40, ax=ax[1]) sns.despine() fig.tight_layout() plt.show() # With the first difference, the ACF and PACF are again consistent with an MA(1) process. # In[9]: fig, ax = plt.subplots(1,2, figsize=(12,5)) sm.graphics.tsa.plot_acf(diff['1991':], lags=40, ax=ax[0]) sm.graphics.tsa.plot_pacf(diff['1991':], lags=40, ax=ax[1]) sns.despine() fig.tight_layout() plt.show() # ###Estimation # # We now estimate the ARIMA(0,1,1) model for the full data using the statsmodels package. # In[10]: arima1 = sm.tsa.ARIMA(y, order=(0, 1, 1)).fit(trend='nc') # trend='nc' option estimates a model without a drift print(arima1.summary()) # Restricting attention to the data since 1991, we consider two specifications: ARIMA(1,0,1) and ARIMA(0,1,1). Note that for the ARIMA(1,0,1) estimation, I remove the first observation to make it comparable to the ARIMA(0,1,1) output (which loses the first observation due to first differencing). # # The AIC and BIC criteria select the ARIMA(0,1,1) specification. Using only the 1991-2016 period substantially changes the estimate of the MA(1) coefficient. # In[11]: arima2 = sm.tsa.ARIMA(y['1991':].iloc[1:], order=(1, 0, 1)) .fit(trend='nc') print(arima2.summary()) # In[12]: arima3 = sm.tsa.ARIMA(y['1991':], order=(0, 1, 1)).fit(trend='nc') print(arima3.summary()) # ###Model diagnostics # # To obtain the residual series: # In[13]: resid=arima3.resid # As diagnostics, we plot the residual series, the ACF and PACF plots, and the histogram of the residual data. We find no autocorrelation patterns in the residuals, confirming that the ARIMA(0,1,1) adequately captures the time series dependence in the series. The histogram shows that the residual series is non-Gaussian, so that we should not rely on this assumption for building forecast intervals. # In[14]: fig, ax= plt.subplots(figsize=(8,5)) resid.plot(color=blue) ax.set_xlabel('') ax.set_xticks([], minor=True) ax.set_title('Residual plot') sns.despine() plt.show() # In[15]: fig, ax = plt.subplots(1,2, figsize=(12,5)) sm.graphics.tsa.plot_acf(resid, lags=40, ax=ax[0]) sm.graphics.tsa.plot_pacf(resid, lags=40, ax=ax[1]) plt.tight_layout() plt.show() # In[16]: from forecast import histogram, qq_plot histogram(resid) plt.show() qq_plot(resid) plt.show() # ##Model validation # # We implement a real time forecasting exercise to select the best model for the final forecast. # In[17]: import forecast # Real time forecasting validation=y['2004Q1':].index # the validation period is Q1 2004 onwards start = y.index.get_loc('2004Q1') # numerical index corresponding to Q1 2004 recent = y.index.get_loc('2000Q1') results=pd.DataFrame(0.0, index=validation, columns=['RW', 'SES', 'Trend corrected', 'ARIMA', 'ARIMA (1991-)','Actual']) results['Actual'] = y.iloc[start:] for i in range(start, len(y)): j=i-start # random walk forecast results.iloc[j,0]=y.iloc[i-1] # simple exponential smoothing model = forecast.ses(y.iloc[:i]) model.fit() results.iloc[j,1]= model.forecast(1)[0] # trend corrrected model = forecast.holt(y.iloc[:i]) model.fit() results.iloc[j,2]=model.forecast(1)[0] # ARIMA model = sm.tsa.ARIMA(y.iloc[:i], order=(0, 1, 1)).fit() results.iloc[j,3]=model.forecast()[0][0] # ARIMA (recent sample) model = sm.tsa.ARIMA(y.iloc[recent:i], order=(0, 1, 1)).fit() results.iloc[j,4]= model.forecast()[0][0] # In[18]: from statlearning import rmse_jack table = pd.DataFrame(0.0, index=results.columns[:-1], columns=['RMSE','SE']) for i in range(5): table.iloc[i,0], table.iloc[i,1] = rmse_jack(results.iloc[:,i], results.iloc[:,-1]) table.round(3) # ##Forecast # # We compute the final forecast using the ARIMA(0,1,1) model estimated on the full sample. # In[20]: h=8 pred, stderr, intv1 = arima1.forecast(steps=h, alpha=0.2) pred, stderr, intv2 = arima1.forecast(steps=h, alpha=0.1) pred, stderr, intv3 = arima1.forecast(steps=h, alpha=0.01) test=pd.period_range(start=y.index[-1]+1, periods=h, freq='Q') pred=pd.Series(pred, index=test) intv1=pd.DataFrame(intv1, index=test) intv2=pd.DataFrame(intv2, index=test) intv3=pd.DataFrame(intv3, index=test) fig, ax = forecast.fanchart(y['2012':], pred, intv1, intv2, intv3) ax.set_xlabel('') ax.set_xticks([], minor=True) ax.set_ylabel('Quartely inflation') sns.despine() plt.title('ARIMA(0,1,1) CPI inflation forecast') plt.show()