import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# changing a column to datetime and setting it as the index
df['date'] = pd.to_datetime(df.date, format='%Y%m%d')
df.set_index('date', inplace=True)
df.index #ensures date is index
# Use pandas grouper to group values using annual frequency
df_year = df.groupby(pd.Grouper(freq='A'))  # the grouping frequency can be changed to 'M', 'D', etc.
#Create a new DataFrame and store yearly values in columns
new_df = pd.DataFrame()
for yr, data in df_year:
    new_df[yr.year] = data.values.ravel()
new_df.plot(figsize=(13,13), subplots=True, legend=True);  # overlap the years by setting subplots=False
Stationarity: A time series is said to be stationary if its statistical properties, such as its mean and variance, remain constant over time. A series that is not stationary typically exhibits one or more of the following trends:
No Trend: the series goes up and down with no clear direction, oscillating around a constant level (e.g. around 0).
Linear Trend: observations grow or decline steadily over time (upward or downward).
Exponential Trend: observations grow exponentially over time.
Periodic Trend: cyclical behavior with repeating highs and lows (also called seasonality).
Increasing Variance: a periodic pattern whose amplitude (the spread between the highs and lows) grows or shrinks over time.
A series can exhibit combinations of these trends; a simulation sketch follows below.
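A minimal simulation sketch of each trend type (added for illustration; the series names and parameters are made up):
import numpy as np
import pandas as pd
n = 200
t = np.arange(n)
rng = np.random.default_rng(42)
noise = rng.normal(0, 1, n)
trends = pd.DataFrame({
    'no trend': noise,                                              # oscillates around 0
    'linear trend': 0.1 * t + noise,                                # steady upward drift
    'exponential trend': np.exp(0.02 * t) + noise,                  # growth accelerates
    'periodic (seasonal)': 5 * np.sin(2 * np.pi * t / 12) + noise,  # repeating highs and lows
    'increasing variance': (1 + 0.02 * t) * np.sin(2 * np.pi * t / 12) + noise,
}, index=pd.date_range('2000-01-01', periods=n, freq='M'))
trends.plot(subplots=True, figsize=(12, 10), legend=True);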
#Determine rolling statistics
rollmean = df.rolling(window=8,center=False).mean()
rollstd = df.rolling(window=8, center=False).std()
#Plot rolling statistics
fig = plt.figure(figsize=(12,7))
orig = plt.plot(df, color='blue',label='Original')
mean = plt.plot(rollmean, color='red', label='Rolling Mean')
std = plt.plot(rollstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=False)
from statsmodels.tsa.stattools import adfuller
dftest = adfuller(df['#Passengers'])
# Extract and display test results in a user friendly manner
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key, value in dftest[4].items():
    dfoutput['Critical Value (%s)' % key] = value
print('Results of Dickey-Fuller Test:')
print(dfoutput)
# If the test statistic is below the critical value (equivalently, p-value < 0.05),
# reject the null hypothesis of a unit root: the series can be treated as stationary.
def stationarity_check(TS):
    # Import adfuller
    from statsmodels.tsa.stattools import adfuller
    # Calculate rolling statistics
    rolmean = TS.rolling(window=8, center=False).mean()
    rolstd = TS.rolling(window=8, center=False).std()
    # Perform the Dickey-Fuller test
    dftest = adfuller(TS['#Passengers'])  # change the column name as required
    # Plot rolling statistics
    fig = plt.figure(figsize=(12,6))
    orig = plt.plot(TS, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Print Dickey-Fuller test results
    print('Results of Dickey-Fuller Test:')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
    return None
stationarity_check(data)
# Log transform to make the time series more "uniform" over time and compare with original
df_log = np.log(df)
fig = plt.figure(figsize=(12,6))
plt.plot(df_log, color="blue")
plt.xlabel("month", fontsize=14)
plt.ylabel("log(data)", fontsize=14)
plt.show();
# Sqrt transform to make the time series more "uniform" over time and compare with original
df_sqrt = np.sqrt(df)
fig = plt.figure(figsize=(11,7))
plt.plot(df_sqrt, color="blue")
plt.xlabel("month", fontsize=14)
plt.ylabel("sqrt(data)", fontsize=14)
plt.show()
rolmean = df_log.rolling(window = 7).mean() #can change the window length, which will affect what your eventual time series will look like
fig = plt.figure(figsize=(11,7))
orig = plt.plot(df_log, color='blue',label='Original') #log transformed data
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
plt.legend(loc='best')
plt.title('Log-Transformed Series & Rolling Mean')
plt.show(block=False)
# Subtract the rolling mean from the log-transformed data and check the head for NaNs
data_minus_rolmean = df_log - rolmean
data_minus_rolmean.head(15)
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html
# Use pandas .ewm() to calculate an exponentially weighted moving average of the series
exp_rolmean = data.ewm(halflife=2).mean()  # experiment with the halflife parameter
# Plot the original data with exp weighted average
fig = plt.figure(figsize=(12,7))
orig = plt.plot(data, color='blue',label='Original')
mean = plt.plot(exp_rolmean, color='red', label='Exponentially Weighted Rolling Mean')
plt.legend(loc='best')
plt.title('Original Series & Exponentially Weighted Rolling Mean')
plt.show(block=False)
# Subtract the exponentially weighted mean from the original data and check the head for NaNs
data_minus_exp_rolmean = data - exp_rolmean
data_minus_exp_rolmean.head(15)
fig = plt.figure(figsize=(11,7))
plt.plot(data_minus_exp_rolmean, color='blue', label='Sales - exp. rolling mean')
plt.legend(loc='best')
plt.title('Sales with the exponentially weighted rolling mean subtracted')
plt.show(block=False)
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.diff.html
data_diff = data.diff(periods=1)
data_diff.head(10)
fig = plt.figure(figsize=(11,7))
plt.plot(data_diff, color='blue', label='Differenced series')
plt.legend(loc='best')
plt.title('Differenced sales series')
plt.show(block=False)
http://www.statsmodels.org/dev/generated/statsmodels.tsa.seasonal.seasonal_decompose.html
# import seasonal_decompose
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(np.log(ts))
# Gather the trend, seasonality and noise of decomposed object
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
# Plot gathered statistics
plt.figure(figsize=(12,8))
plt.subplot(411)
plt.plot(np.log(ts), label='Original', color="blue")
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend', color="blue")
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality', color="blue")
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals', color="blue")
plt.legend(loc='best')
plt.tight_layout()
The white noise model has three properties:
- A fixed and constant mean
- A fixed and constant variance
- No correlation over time (zero autocorrelation at every lag)
A special case of the white noise model is Gaussian White Noise, where the constant mean is equal to zero and the constant variance is equal to 1. A simulation sketch follows below.
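A quick simulation sketch of Gaussian white noise (added for illustration; seed and length are arbitrary):
import numpy as np
import pandas as pd
rng = np.random.default_rng(0)
white_noise = pd.Series(rng.normal(loc=0, scale=1, size=500))  # mean 0, variance 1
white_noise.plot(figsize=(12, 4), title='Gaussian White Noise')
print(white_noise.mean(), white_noise.var())  # both should be close to 0 and 1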
In contrast to the white noise model, the random walk model:
- Has no fixed mean or variance
- Has a strong dependence over time
Its changes over time (the first differences) are basically a white noise model. Mathematically, this can be written as:
Y_t = Y_(t-1) + ε_t
where ε_t is a mean-zero white noise process.
An extension of the random walk model is the so-called "Random Walk with a Drift", specified as follows:
Y_t = c + Y_(t-1) + ε_t
Here, the drift parameter c steers the series in a certain direction. A simulation sketch follows below.
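A minimal simulation sketch (added for illustration): a random walk is the cumulative sum of white noise, and adding a constant c to every step produces the drift.
import numpy as np
import pandas as pd
rng = np.random.default_rng(1)
eps = rng.normal(0, 1, 500)                     # mean-zero white noise
random_walk = pd.Series(np.cumsum(eps))         # Y_t = Y_(t-1) + ε_t
drift_walk = pd.Series(np.cumsum(0.1 + eps))    # Y_t = c + Y_(t-1) + ε_t, with c = 0.1
pd.DataFrame({'random walk': random_walk, 'with drift': drift_walk}).plot(figsize=(12, 4))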
gtrends_diff = gtrends.diff(periods=1)
gtrends_diff.plot(figsize=(18,6))
plt.xlabel('Year', fontsize=14);
gtrends_diff.corr()  # pairwise correlation between the differenced series
plt.figure(figsize=(12,6))
pd.plotting.autocorrelation_plot(diet_diff)
from statsmodels.graphics.tsaplots import plot_pacf
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 14, 5
plot_pacf(diet, lags = 100);
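The PACF is typically read off to choose the AR order p; the companion ACF plot (statsmodels ships plot_acf in the same module) is read the same way to choose the MA order q. A sketch, assuming the same diet series:
from statsmodels.graphics.tsaplots import plot_acf
# AR order p: lag where the PACF cuts off; MA order q: lag where the ACF cuts off
plot_acf(diet, lags=100);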
ARMA models combine autoregressive (AR) techniques, which assume that previous observations are good predictors of future values, with moving average (MA) techniques, which model the level of the series and update the forecast when changes are detected. Not to be used with small samples (<50 observations). http://www.statsmodels.org/dev/generated/statsmodels.tsa.arima_model.ARMA.html
# assuming your time series are stored in an object "series"
# Import the ARMA module from statsmodels
from statsmodels.tsa.arima_model import ARMA
import statsmodels.api as sm
# Fit an AR(1) model to the first simulated dataset (order=(1,0) means AR order 1, MA order 0)
mod_arma = ARMA(series, order=(1,0))
res_arma = mod_arma.fit()
# Print out summary information on the fit
print(res_arma.summary())
# Print out the estimates for the constant and for the AR coefficient phi
print(res_arma.params)
http://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import warnings
# evaluate an ARIMA model for a given order (p,d,q)
def evaluate_arima_model(X, arima_order):
    # prepare training dataset
    train_size = int(len(X) * 0.66)
    train, test = X[0:train_size], X[train_size:]
    history = [x for x in train]
    # make walk-forward predictions, refitting the model as each new observation arrives
    predictions = list()
    for t in range(len(test)):
        model = ARIMA(history, order=arima_order)
        model_fit = model.fit(disp=0)
        yhat = model_fit.forecast()[0]
        predictions.append(yhat)
        history.append(test[t])
    # calculate out-of-sample error
    error = mean_squared_error(test, predictions)
    return error
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    dataset = dataset.astype('float32')
    best_score, best_cfg = float("inf"), None
    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    mse = evaluate_arima_model(dataset, order)
                    if mse < best_score:
                        best_score, best_cfg = mse, order
                    print('ARIMA%s MSE=%.3f' % (order,mse))
                except:
                    continue
    print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))
# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
warnings.filterwarnings("ignore")
evaluate_models(series.values, p_values, d_values, q_values)
http://www.statsmodels.org/dev/generated/statsmodels.tsa.statespace.sarimax.SARIMAX.html
# Import necessary libraries
import warnings
warnings.filterwarnings('ignore')
import itertools
import pandas as pd
import numpy as np
import statsmodels.api as sm
# Define the candidate p values; d and q take any value between 0 and 2
p = [0, 1, 2, 4, 6, 8, 10]
d = range(0,3)
q = range(0,3)
# Generate all different combinations of p, d and q triplets
pdq = list(itertools.product(p,d,q))
# Generate all different combinations of seasonal p, d and q triplets (use 12 for the seasonal frequency)
pdqs = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
# Run a grid with pdq and seasonal pdq parameters calculated above and get the best AIC value
ans = []
for comb in pdq:
    for combs in pdqs:
        try:
            mod = sm.tsa.statespace.SARIMAX(CO2,
                                            order=comb,
                                            seasonal_order=combs,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            output = mod.fit()
            ans.append([comb, combs, output.aic])
            print('ARIMA {} x {}12 : AIC Calculated = {}'.format(comb, combs, output.aic))
        except:
            continue
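To pick the winner from the grid (a small sketch added here; the column names are made up), collect the results in a DataFrame and take the row with the lowest AIC:
ans_df = pd.DataFrame(ans, columns=['pdq', 'pdqs', 'aic'])
print(ans_df.loc[ans_df['aic'].idxmin()])  # lowest AIC = best-fitting parameter combination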
Facebook Prophet uses an elegant yet simple method for analyzing and predicting periodic data known as additive modeling. The idea is straightforward: represent a time series as a combination of patterns at different scales (daily, weekly, seasonal, yearly) along with an overall trend. Your energy use might rise in the summer and decrease in the winter, but have an overall decreasing trend as you increase the energy efficiency of your home. An additive model can capture both the patterns/trends and make predictions based on these observations. https://facebook.github.io/prophet/docs/quick_start.html
from fbprophet import Prophet as proph
# set the uncertainty interval to 95% (the Prophet default is 80%)
Model = proph(interval_width=0.95)
# Fit the time series to the model (Prophet expects a DataFrame with columns 'ds' and 'y')
Model.fit(ts)
# Use make_future_dataframe with a monthly frequency and periods = 36 for 3 years
future_dates = Model.make_future_dataframe(periods=36, freq='MS')
future_dates.tail()
#This future dates dataframe can now be used as input to the predict method of the fitted model.
# Predict the values for future dates and take the head of forecast
forecast = Model.predict(future_dates)
forecast.head()
We can see that Prophet returns a large table with many interesting columns, but we subset our output to the columns most relevant to forecasting, which are:
ds: the datestamp of the forecasted value
yhat: the forecasted value of our metric (in statistics, yhat is the notation traditionally used for the predicted value of a variable y)
yhat_lower: the lower bound of our forecasts
yhat_upper: the upper bound of our forecasts.
#Subset above mentioned columns and view the tail
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
# Use prophet's plot function to plot the predictions
Model.plot(forecast, uncertainty=True)
plt.show()
#Prophet plots the observed values of the time-series (the black dots), the forecasted values (blue line) and the uncertainty intervals of our forecasts (the blue shaded regions)
# Plot model components
Model.plot_components(forecast)
plt.show()
A good time series model should have the following characteristics: its residuals should be uncorrelated (white noise) and approximately normally distributed with zero mean. plot_diagnostics() below helps check this visually; see the Ljung-Box sketch after it.
results = model.fit()  # model: the SARIMAX model chosen above
print(results.summary().tables[1])
results.plot_diagnostics(figsize=(15, 18))
plt.show()
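Beyond the visual diagnostics, a Ljung-Box test (a sketch added here, not from the original notes) quantifies whether autocorrelation remains in the residuals:
from statsmodels.stats.diagnostic import acorr_ljungbox
# large p-values => no evidence of remaining autocorrelation in the residuals
print(acorr_ljungbox(results.resid, lags=[12]))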
# Get predictions starting from 01-01-1998 and calculate confidence intervals
pred = results.get_prediction(start=pd.to_datetime('1998-01-01'), dynamic=False)
pred_conf = pred.conf_int()
# Plot real vs predicted values along with confidence interval
rcParams['figure.figsize'] = 15, 6
#Plot observed values
ax = CO2['1990':].plot(label='observed')
#Plot predicted values
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.9)
#Plot the range for confidence intervals
ax.fill_between(pred_conf.index,
pred_conf.iloc[:, 0],
pred_conf.iloc[:, 1], color='g', alpha=.5)
#Set axes labels
ax.set_xlabel('Date')
ax.set_ylabel('CO2 Levels')
plt.legend()
plt.show()
# Get dynamic predictions with confidence intervals as above
# (dynamic=True bases each prediction on the model's earlier predictions, not on the observed values)
pred_dynamic = results.get_prediction(start=pd.to_datetime('1998-01-01'), dynamic=True)
pred_dynamic_conf = pred_dynamic.conf_int()
# Plot the dynamic forecast with confidence intervals as above
ax = CO2['1990':].plot(label='observed', figsize=(20, 15))
pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax)
ax.fill_between(pred_dynamic_conf.index,
pred_dynamic_conf.iloc[:, 0],
pred_dynamic_conf.iloc[:, 1], color='g', alpha=.3)
ax.fill_betweenx(ax.get_ylim(), pd.to_datetime('1998-01-01'), CO2.index[-1], alpha=.1, zorder=-1)  # shade the forecast region
ax.set_xlabel('Date')
ax.set_ylabel('CO2 Levels')
plt.legend()
plt.show()
# Get the real and predicted values
data_forecasted = pred.predicted_mean
data_truth = data['1998-01-01':]
# Compute the mean square error
mse = ((data_forecasted - data_truth) ** 2).mean()
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))
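The root mean squared error (a small sketch added here) is often easier to interpret because it is in the same units as the data:
rmse = np.sqrt(mse)
print('The Root Mean Squared Error of our forecasts is {}'.format(round(rmse, 2)))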
# Get forecast 500 steps ahead in future
prediction = results.get_forecast(steps=500)
# Get confidence intervals of forecasts
pred_conf = prediction.conf_int()
# Plot future predictions with confidence intervals
ax = data.plot(label='observed', figsize=(20, 15))
prediction.predicted_mean.plot(ax=ax, label='Forecast')
ax.fill_between(pred_conf.index,
pred_conf.iloc[:, 0],
pred_conf.iloc[:, 1], color='k', alpha=.25)
ax.set_xlabel('Date')
ax.set_ylabel('CO2 Levels')
plt.legend()
plt.show()