#!/usr/bin/env python
# coding: utf-8

# ### Data Indexing

# #### While working with time series data in Python, always make sure the dates are used as the index and are stored as Timestamp objects. Timestamp is the pandas equivalent of Python's Datetime and is interchangeable with it in most cases.

# import pandas as pd
#
# # assuming df is a DataFrame with a 'date' column, e.g. loaded via pd.read_csv
# # change the column to a datetime and set it as the index
# df['date'] = pd.to_datetime(df.date, format='%Y%m%d')
# df.set_index('date', inplace=True)
# df.index  # confirms the date is now the index

# ### Time Series Visuals

# #### Line Plot (allows a check for cyclical patterns)
# #### Dot Plot (allows dense plots to be seen as individual points)
# #### Grouping visuals (to view spans of time over a period)

# # Use pandas Grouper to group values using annual frequency
# df_year = df.groupby(pd.Grouper(freq='A'))  # 'A' = annual; the frequency can be changed to 'M', 'D', etc.
#
# # Create a new DataFrame and store the yearly values in columns
# new_df = pd.DataFrame()
# for yr, data in df_year:
#     new_df[yr.year] = data.values.ravel()
# new_df.plot(figsize=(13, 13), subplots=True, legend=True);  # overlap by setting subplots=False

# #### Histogram with a Density Plot (to check normality of data)
# #### Boxplots (check distribution of data and outliers) - can be grouped by time periods like above
# #### Heatmap (show visual trends - lower numbers/cooler colors, higher numbers/warmer colors)
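# The distribution and heatmap checks above have no snippet in these notes; here is a minimal sketch, assuming df is the series with a DatetimeIndex from Data Indexing and new_df is the year-per-column frame built above:

# import matplotlib.pyplot as plt
#
# # Histogram with a density (KDE) overlay - a rough normality check
# df.plot(kind='kde', figsize=(12, 5))
# df.hist(density=True)
# plt.show()
#
# # Boxplots grouped by year, reusing the year-per-column frame from the grouping block
# new_df.boxplot(figsize=(12, 5))
# plt.show()
#
# # Heatmap of the same frame: rows = position within the year, columns = years
# plt.matshow(new_df.values, aspect='auto', cmap='coolwarm')
# plt.colorbar()
# plt.show()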
# ### Trends
# - Stationarity: a time series is said to be stationary if its statistical properties, such as the mean and variance, remain constant over time. When these properties change over time, we say the series has a trend.
#
# - No Trend: goes up and down, but there is no clear direction, and over time the series oscillates around a constant level.
#
# - Linear Trend: observations grow bigger or decline steadily over time (upward or downward).
#
# - Exponential Trend: observations grow exponentially over time.
#
# - Periodic Trend: cyclical, with repeating highs and lows (also called seasonality).
#
# - Increasing Variance: a periodic trend where the size of the highs and lows grows or shrinks over time.
#
# - **these trends can occur in combination**

# ### Testing for Trends

# #### Rolling Statistics (mean, std) with a visual: plot the moving average or moving variance and see whether it varies with time.

# # Determine rolling statistics
# rollmean = df.rolling(window=8, center=False).mean()
# rollstd = df.rolling(window=8, center=False).std()
# # Plot rolling statistics
# fig = plt.figure(figsize=(12, 7))
# orig = plt.plot(df, color='blue', label='Original')
# mean = plt.plot(rollmean, color='red', label='Rolling Mean')
# std = plt.plot(rollstd, color='black', label='Rolling Std')
# plt.legend(loc='best')
# plt.title('Rolling Mean & Standard Deviation')
# plt.show(block=False)

# #### Dickey-Fuller Test: a statistical test for stationarity. The null hypothesis is that the time series is NOT stationary. So if the test statistic is less than the critical value, we reject the null hypothesis and say that the series is stationary.

# from statsmodels.tsa.stattools import adfuller
# dftest = adfuller(df['#Passengers'])
#
# # Extract and display the test results in a user-friendly manner
# dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
# for key, value in dftest[4].items():
#     dfoutput['Critical Value (%s)' % key] = value
# print('Results of Dickey-Fuller Test:')
# print(dfoutput)

# #### Function to perform a stationarity check with a visual
# def stationarity_check(TS):
#
#     # Import adfuller
#     from statsmodels.tsa.stattools import adfuller
#
#     # Calculate rolling statistics
#     rolmean = TS.rolling(window=8, center=False).mean()
#     rolstd = TS.rolling(window=8, center=False).std()
#
#     # Perform the Dickey-Fuller Test
#     dftest = adfuller(TS['#Passengers'])  # change the '#Passengers' column as required
#
#     # Plot rolling statistics:
#     fig = plt.figure(figsize=(12, 6))
#     orig = plt.plot(TS, color='blue', label='Original')
#     mean = plt.plot(rolmean, color='red', label='Rolling Mean')
#     std = plt.plot(rolstd, color='black', label='Rolling Std')
#     plt.legend(loc='best')
#     plt.title('Rolling Mean & Standard Deviation')
#     plt.show(block=False)
#
#     # Print Dickey-Fuller test results
#     print('Results of Dickey-Fuller Test:')
#     dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
#     for key, value in dftest[4].items():
#         dfoutput['Critical Value (%s)' % key] = value
#     print(dfoutput)
#
#     return None

# stationarity_check(data)

# ### Removing Trends

# #### Log and Square Root Transform (use either/or)

# # Log transform to make the time series more "uniform" over time and compare with the original
# df_log = np.log(df)
# fig = plt.figure(figsize=(12, 6))
# plt.plot(df_log, color='blue')
# plt.xlabel('month', fontsize=14)
# plt.ylabel('log(data)', fontsize=14)
# plt.show();
#
# # Sqrt transform to make the time series more "uniform" over time and compare with the original
# df_sqrt = np.sqrt(df)
# fig = plt.figure(figsize=(11, 7))
# plt.plot(df_sqrt, color='blue')
# plt.xlabel('month', fontsize=14)
# plt.ylabel('sqrt(data)', fontsize=14)
# plt.show()

# #### Subtract the Rolling Mean: calculate the rolling mean and subtract it from the time series to help make the series stationary.

# rolmean = df_log.rolling(window=7).mean()  # the window length can be changed, which affects what the eventual series looks like
# fig = plt.figure(figsize=(11, 7))
# orig = plt.plot(df_log, color='blue', label='Original')  # log-transformed data
# mean = plt.plot(rolmean, color='red', label='Rolling Mean')
# plt.legend(loc='best')
# plt.title('Rolling Mean & Standard Deviation')
# plt.show(block=False)
#
# # Subtract the moving average from the (log-transformed) data and check the head for NaNs
# data_minus_rolmean = df_log - rolmean
# data_minus_rolmean.head(15)
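# After removing a trend it is worth verifying the result; a minimal follow-up sketch, assuming the detrended frame keeps the same value column the helper expects: drop the NaNs introduced by the rolling window, then re-run the stationarity check defined above.

# data_minus_rolmean.dropna(inplace=True)  # the first window-1 rows are NaN
# stationarity_check(data_minus_rolmean)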
# #### Weighted Rolling Mean: more recent values are given a higher weight.
# ##### Exponentially weighted moving average: a popular weighted rolling mean, available in pandas via .ewm()
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.ewm.html

# # Use pandas .ewm() to calculate an exponentially weighted moving average of the series
# exp_rolmean = data.ewm(halflife=2).mean()  # check the parameters and experiment
#
# # Plot the original data with the exponentially weighted average
# fig = plt.figure(figsize=(12, 7))
# orig = plt.plot(data, color='blue', label='Original')
# mean = plt.plot(exp_rolmean, color='red', label='Exponentially Weighted Rolling Mean')
# plt.legend(loc='best')
# plt.title('Exponentially Weighted Rolling Mean & Standard Deviation')
# plt.show(block=False)
#
# # Subtract the moving average from the original data and check the head for NaNs
# data_minus_exp_rolmean = data - exp_rolmean
# data_minus_exp_rolmean.head(15)
#
# fig = plt.figure(figsize=(11, 7))
# plt.plot(data_minus_exp_rolmean, color='blue', label='Sales - rolling mean')
# plt.legend(loc='best')
# plt.title('Sales with the rolling mean subtracted')
# plt.show(block=False)

# #### Differencing: one of the most common methods for dealing with both trend and seasonality. Take the difference between an observation at a particular time instant and the observation at the previous instant (i.e. a so-called 1-period "lag"). First-order differencing can be done in pandas using the .diff() function. Set periods to the seasonality you want to remove (1 for the previous observation, 12 for monthly data with annual seasonality, 365 for daily data with annual seasonality).
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.diff.html

# data_diff = data.diff(periods=1)
# data_diff.head(10)
#
# fig = plt.figure(figsize=(11, 7))
# plt.plot(data_diff, color='blue', label='Differenced series')
# plt.legend(loc='best')
# plt.title('Differenced sales series')
# plt.show(block=False)

# #### Decomposition: time series decomposition is a mathematical procedure that transforms a time series into multiple time series. The original series is usually split into three component series: seasonal, trend, and residual (random). NOTE: when the seasonality is multiplicative, LOG-TRANSFORM THE SERIES BEFORE AN ADDITIVE DECOMPOSITION!
# ##### additive seasonality: the magnitude of the seasonal swing stays the same over time
# ##### multiplicative seasonality: the magnitude of the seasonal swing scales with the level of the series
# http://www.statsmodels.org/dev/generated/statsmodels.tsa.seasonal.seasonal_decompose.html

# # import seasonal_decompose
# from statsmodels.tsa.seasonal import seasonal_decompose
# decomposition = seasonal_decompose(np.log(ts))
#
# # Gather the trend, seasonality and noise of the decomposed object
# trend = decomposition.trend
# seasonal = decomposition.seasonal
# residual = decomposition.resid
#
# # Plot the gathered statistics
# plt.figure(figsize=(12, 8))
# plt.subplot(411)
# plt.plot(np.log(ts), label='Original', color='blue')
# plt.legend(loc='best')
# plt.subplot(412)
# plt.plot(trend, label='Trend', color='blue')
# plt.legend(loc='best')
# plt.subplot(413)
# plt.plot(seasonal, label='Seasonality', color='blue')
# plt.legend(loc='best')
# plt.subplot(414)
# plt.plot(residual, label='Residuals', color='blue')
# plt.legend(loc='best')
# plt.tight_layout()
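# Alternatively, seasonal_decompose can fit the multiplicative form directly, which avoids the log transform; a one-line sketch on the same ts as above:

# decomposition = seasonal_decompose(ts, model='multiplicative')  # the default is model='additive'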
# ### Basic Time Series Models

# #### White Noise
# The white noise model has three properties:
#
# - There is a fixed and constant mean
# - There is a fixed and constant variance
# - There is no correlation over time (we'll talk about correlation in time series later; essentially, this means the pattern looks truly "random")
#
# A special case of the white noise model is Gaussian white noise, where the constant mean is equal to zero and the constant variance is equal to 1.

# #### Random Walk Model
# Contrary to the white noise model, the random walk model:
# - Has no specified mean or variance
# - Has a strong dependence over time
#
# The changes over time are basically a white noise model. Mathematically, this can be written as:
#
# Y_t = Y_{t-1} + ε_t
#
# where ε_t is a mean-zero white noise process. Random walk processes are very common in finance.

# #### Random Walk with Drift
# An extension of the random walk model is the so-called "random walk with drift", specified as follows:
#
# Y_t = c + Y_{t-1} + ε_t
#
# Here, the drift parameter c steers the series in a certain direction.
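# A quick way to internalize these models is to simulate them; a minimal numpy sketch (the names and the drift value c=0.2 are illustrative):

# import numpy as np
# import matplotlib.pyplot as plt
#
# np.random.seed(42)
# eps = np.random.normal(loc=0, scale=1, size=500)  # Gaussian white noise
# random_walk = np.cumsum(eps)                      # Y_t = Y_{t-1} + eps_t
# drift_walk = np.cumsum(0.2 + eps)                 # Y_t = c + Y_{t-1} + eps_t, with c = 0.2
#
# fig = plt.figure(figsize=(12, 5))
# plt.plot(eps, label='white noise', alpha=0.5)
# plt.plot(random_walk, label='random walk')
# plt.plot(drift_walk, label='random walk with drift (c=0.2)')
# plt.legend(loc='best')
# plt.show()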
# ### Correlation in Time Series

# #### It is easier to find out whether time series are correlated if you detrend them first. Use differencing to detrend the series and then calculate the correlation again.

# gtrends_diff = gtrends.diff(periods=1)
# gtrends_diff.plot(figsize=(18, 6))
# plt.xlabel('Year', fontsize=14);
# gtrends_diff.corr()

# #### Autocorrelation: a very powerful tool for time series analysis. It helps us study how each time series observation is related to its recent (or not so recent) past.
# ##### Autocorrelation function (ACF) - don't forget to detrend first!

# plt.figure(figsize=(12, 6))
# pd.plotting.autocorrelation_plot(diet_diff)

# #### Partial Autocorrelation gives the partial correlation of a time series with its own lagged values, controlling for the values of the time series at all shorter lags. It contrasts with the autocorrelation function, which does not control for other lags.
# ##### Partial Autocorrelation function (PACF)

# from statsmodels.graphics.tsaplots import plot_pacf
# from matplotlib.pylab import rcParams
# rcParams['figure.figsize'] = 14, 5
# plot_pacf(diet, lags=100);

# ### ARMA (autoregressive moving average) Models
# ARMA models combine autoregression techniques (analyses that assume previous observations are good predictors of future values, and perform an autoregression to forecast those future values) and moving average techniques (models that measure the level of the constant time series and update the forecast model whenever changes are detected). Not to be used with small samples (<50 observations).
# http://www.statsmodels.org/dev/generated/statsmodels.tsa.arima_model.ARMA.html

# # assuming your time series is stored in an object "series"
# # Import the ARMA module from statsmodels
# from statsmodels.tsa.arima_model import ARMA
# import statsmodels.api as sm
#
# # Fit an AR(1) model to the first simulated data (order=(p, q), so (1,0) is AR(1))
# mod_arma = ARMA(series, order=(1, 0))
# res_arma = mod_arma.fit()
#
# # Print out summary information on the fit
# print(res_arma.summary())
#
# # Print out the estimates for the constant and the AR coefficient
# print(res_arma.params)

# ### ARIMA Model (autoregressive integrated moving average)
# http://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html

# #### This code performs a grid search over p, d, q values and evaluates the models with mean squared error (no seasonality)

# from statsmodels.tsa.arima_model import ARIMA
# from sklearn.metrics import mean_squared_error
# import warnings
#
# # evaluate an ARIMA model for a given order (p,d,q)
# def evaluate_arima_model(X, arima_order):
#     # prepare the training dataset
#     train_size = int(len(X) * 0.66)
#     train, test = X[0:train_size], X[train_size:]
#     history = [x for x in train]
#     # make one-step predictions, refitting on an expanding window
#     predictions = list()
#     for t in range(len(test)):
#         model = ARIMA(history, order=arima_order)
#         model_fit = model.fit(disp=0)
#         yhat = model_fit.forecast()[0]
#         predictions.append(yhat)
#         history.append(test[t])
#     # calculate the out-of-sample error
#     error = mean_squared_error(test, predictions)
#     return error
#
# # evaluate combinations of p, d and q values for an ARIMA model
# def evaluate_models(dataset, p_values, d_values, q_values):
#     dataset = dataset.astype('float32')
#     best_score, best_cfg = float('inf'), None
#     for p in p_values:
#         for d in d_values:
#             for q in q_values:
#                 order = (p, d, q)
#                 try:
#                     mse = evaluate_arima_model(dataset, order)
#                     if mse < best_score:
#                         best_score, best_cfg = mse, order
#                     print('ARIMA%s MSE=%.3f' % (order, mse))
#                 except:
#                     continue
#     print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))
#
# # evaluate parameters
# p_values = [0, 1, 2, 4, 6, 8, 10]
# d_values = range(0, 3)
# q_values = range(0, 3)
# warnings.filterwarnings('ignore')
# evaluate_models(series.values, p_values, d_values, q_values)
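# Note: statsmodels.tsa.arima_model.ARMA and .ARIMA were deprecated and later removed (statsmodels 0.13+). A minimal sketch of the equivalent fit with the replacement API, where an ARMA(p, q) is simply order=(p, 0, q):

# from statsmodels.tsa.arima.model import ARIMA
# mod = ARIMA(series, order=(1, 0, 0))  # AR(1); use (p, d, q)
# res = mod.fit()                       # the new fit() takes no disp argument
# print(res.summary())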
# ### SARIMAX (seasonal autoregressive integrated moving average with exogenous regressors)
# http://www.statsmodels.org/dev/generated/statsmodels.tsa.statespace.sarimax.SARIMAX.html

# #### This code grid-searches SARIMAX orders on seasonal datasets, using AIC as the evaluation metric

# # Import necessary libraries
# import warnings
# warnings.filterwarnings('ignore')
# import itertools
# import pandas as pd
# import numpy as np
# import statsmodels.api as sm
#
# # Define the candidate values for the p, d and q parameters
# p = [0, 1, 2, 4, 6, 8, 10]
# d = range(0, 3)
# q = range(0, 3)
# # Generate all different combinations of p, d and q triplets
# pdq = list(itertools.product(p, d, q))
# # Generate all different combinations of seasonal p, d and q triplets (use 12 for monthly frequency)
# pdqs = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
#
# # Run a grid with the pdq and seasonal pdq parameters calculated above and record the AIC values
# ans = []
# for comb in pdq:
#     for combs in pdqs:
#         try:
#             mod = sm.tsa.statespace.SARIMAX(CO2,
#                                             order=comb,
#                                             seasonal_order=combs,
#                                             enforce_stationarity=False,
#                                             enforce_invertibility=False)
#             output = mod.fit()
#             ans.append([comb, combs, output.aic])
#             print('ARIMA {} x {}12 : AIC Calculated = {}'.format(comb, combs, output.aic))
#         except:
#             continue
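# The loop above only prints each AIC; a small follow-up sketch (using the ans list it builds) to pull out the lowest-AIC combination:

# ans_df = pd.DataFrame(ans, columns=['pdq', 'pdqs', 'aic'])
# print(ans_df.loc[ans_df['aic'].idxmin()])  # the best (order, seasonal_order) pair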
# ### Facebook Prophet
# #### You need to !pip install pystan and !pip install fbprophet before importing the package (in newer releases the package is named prophet)
# Facebook Prophet uses an elegant yet simple method for analyzing and predicting periodic data, known as additive modeling. The idea is straightforward: represent a time series as a combination of patterns on different scales, such as daily, weekly, seasonal, and yearly, along with an overall trend. Your energy use might rise in the summer and decrease in the winter, but have an overall decreasing trend as you increase the energy efficiency of your home. An additive model can show us both patterns/trends and make predictions based on these observations.
# https://facebook.github.io/prophet/docs/quick_start.html

# from fbprophet import Prophet as proph

# #### Read in the data
# #### You must rename the datestamp column to 'ds' and the metric column to 'y'

# # set the uncertainty interval to 95% (the Prophet default is 80%)
# Model = proph(interval_width=0.95)
# # Fit the time series to Model
# Model.fit(ts)
# # Use make_future_dataframe with a monthly frequency and periods=36 for 3 years
# future_dates = Model.make_future_dataframe(periods=36, freq='MS')
# future_dates.tail()

# #### This future-dates dataframe can now be used as input to the predict method of the fitted model.

# # Predict the values for the future dates and take the head of the forecast
# forecast = Model.predict(future_dates)
# forecast.head()

# Prophet returns a large table with many interesting columns, but we subset our output to the columns most relevant to forecasting, which are:
#
# - ds: the datestamp of the forecasted value
# - yhat: the forecasted value of our metric (in statistics, yhat is the notation traditionally used for the predicted values of a variable y)
# - yhat_lower: the lower bound of our forecasts
# - yhat_upper: the upper bound of our forecasts

# # Subset the above-mentioned columns and view the tail
# forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
#
# # Use Prophet's plot function to plot the predictions
# Model.plot(forecast, uncertainty=True)
# plt.show()
# # Prophet plots the observed values of the time series (the black dots), the forecasted values (blue line) and the uncertainty intervals of the forecasts (the blue shaded regions)
#
# # Plot the model components
# Model.plot_components(forecast)
# plt.show()

# ### Evaluation of a Time Series Model
# A good time series model should have the following characteristics:
#
# - Residuals shouldn't show any trends over time.
# - The autocorrelation function (ACF) and partial autocorrelation function (PACF) of the residuals shouldn't have large values (beyond the significance level) at any lag. ACF indicates the correlation between the current value and all the previous values in a range. PACF is an extension of ACF that removes the correlation of the intermediate lags.
# - Errors shouldn't show any seasonality.
# - Errors should be normally distributed.
# - Error (MAE, MAPE, MSE etc.) should be low.
# - AIC and BIC should be relatively low compared to alternative models.

# #### ARIMA and SARIMAX summary

# results = model.fit()  # model is the chosen (S)ARIMA(X) model, e.g. from the grid search above
# print(results.summary().tables[1])

# #### ARIMA plot diagnostics (density, Q-Q plot, correlogram, standardized residuals)

# results.plot_diagnostics(figsize=(15, 18))
# plt.show()

# #### One-step-ahead forecasting

# # Get predictions starting from 01-01-1998 and calculate confidence intervals
# pred = results.get_prediction(start=pd.to_datetime('1998-01-01'), dynamic=False)
# pred_conf = pred.conf_int()
#
# # Plot real vs. predicted values along with the confidence interval
# rcParams['figure.figsize'] = 15, 6
#
# # Plot observed values
# ax = CO2['1990':].plot(label='observed')
#
# # Plot predicted values
# pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.9)
#
# # Plot the range of the confidence intervals
# ax.fill_between(pred_conf.index,
#                 pred_conf.iloc[:, 0],
#                 pred_conf.iloc[:, 1], color='g', alpha=.5)
#
# # Set the axis labels
# ax.set_xlabel('Date')
# ax.set_ylabel('CO2 Levels')
# plt.legend()
#
# plt.show()
# #### Dynamic forecasting: instead of using the observed value at each step, dynamic forecasting feeds earlier predicted values back into the forecasts for later points.

# # Get dynamic predictions with confidence intervals as above
# pred_dynamic = results.get_prediction(start=pd.to_datetime('1998-01-01'), dynamic=True)
# pred_dynamic_conf = pred_dynamic.conf_int()
#
# # Plot the dynamic forecast with confidence intervals as above
# ax = CO2['1990':].plot(label='observed', figsize=(20, 15))
# pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax)
#
# ax.fill_between(pred_dynamic_conf.index,
#                 pred_dynamic_conf.iloc[:, 0],
#                 pred_dynamic_conf.iloc[:, 1], color='g', alpha=.3)
#
# ax.fill_betweenx(ax.get_ylim(), pd.to_datetime('1998-01-01'), CO2.index[-1], alpha=.1, zorder=-1)
#
# ax.set_xlabel('Date')
# ax.set_ylabel('CO2 Levels')
#
# plt.legend()
# plt.show()

# #### Mean Squared Error

# # Get the real and predicted values
# data_forecasted = pred.predicted_mean
# data_truth = data['1998-01-01':]
#
# # Compute the mean squared error
# mse = ((data_forecasted - data_truth) ** 2).mean()
# print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))

# #### Producing and Visualizing Forecasts

# # Get a forecast 500 steps ahead into the future
# prediction = results.get_forecast(steps=500)
#
# # Get the confidence intervals of the forecasts
# pred_conf = prediction.conf_int()
#
# # Plot the future predictions with confidence intervals
# ax = data.plot(label='observed', figsize=(20, 15))
# prediction.predicted_mean.plot(ax=ax, label='Forecast')
# ax.fill_between(pred_conf.index,
#                 pred_conf.iloc[:, 0],
#                 pred_conf.iloc[:, 1], color='k', alpha=.25)
# ax.set_xlabel('Date')
# ax.set_ylabel('CO2 Levels')
#
# plt.legend()
# plt.show()
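# The evaluation checklist above also mentions MAE and MAPE; a small sketch computing them from the same forecast/truth pair used in the Mean Squared Error block:

# mae = (data_forecasted - data_truth).abs().mean()
# mape = ((data_forecasted - data_truth) / data_truth).abs().mean() * 100  # assumes the truth values are nonzero
# print('MAE: {}, MAPE: {:.2f}%'.format(round(mae, 2), mape))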