import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.metrics import mean_squared_error from google.colab import drive drive.mount("/content/drive") california = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/California.csv",parse_dates=['date'],index_col=['date']) new_york = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/New York.csv",parse_dates=['date'],index_col=['date']) ma = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/Massachusetts.csv",parse_dates=['date'],index_col=['date']) california.head() california.tail() # remove final 5 days because it does not make sense to have zero cases california = california.iloc[:-5, :] california.info() california.describe() fig,ax = plt.subplots(1,1, figsize = (12,6)) plt.plot(california.index.values, california['JHU_cases']) plt.xlabel('Time') plt.ylabel('Cases') plt.title('COVID-19 cases in California over time') plt.show() california['JHU_cases'] = california['JHU_cases'].fillna(0) california['JHU_cases'][california['JHU_cases'] <0] california['JHU_cases'][california['JHU_cases'] == -2019.0] = 2019.0 california['JHU_cases'][california['JHU_cases'] == -3935.0] = 3935.0 california['JHU_cases'][california['JHU_cases'] <0] # Applying rolling average window = 5 california['JHU_cases'] = california['JHU_cases'].transform(lambda x: x.rolling(window).mean()) california['JHU_cases'] = california['JHU_cases'].fillna(0) fig,ax = plt.subplots(1,1, figsize = (12,6)) plt.plot(california.index.values, california['JHU_cases']) plt.xlabel('Time') plt.ylabel('Cases') plt.title('COVID-19 cases in California over time') plt.show() # Applying log transformation fig,ax = plt.subplots(1,1, figsize = (12,6)) log_cases = np.log(california['JHU_cases']+1) plt.plot(california.index.values, log_cases) plt.title("log of COVID-19 cases in CA over time") plt.show() any(california.index.isna()) any(log_cases.isna()) # Define functions to calculate AIC and BIC from scipy.stats import norm def evaluate_AIC(k, residuals): """ Finds the AIC given the number of parameters estimated and the residuals of the model. Assumes residuals are distributed Gaussian with unknown variance. """ standard_deviation = np.std(residuals) log_likelihood = norm.logpdf(residuals, 0, scale=standard_deviation) return 2 * k - 2 * np.sum(log_likelihood) def evaluate_BIC(k, residuals): """ Finds the AIC given the number of parameters estimated and the residuals of the model. Assumes residuals are distributed Gaussian with unknown variance. """ standard_deviation = np.std(residuals) log_likelihood = norm.logpdf(residuals, 0, scale=standard_deviation) return k * np.log(len(residuals)) - 2 * np.sum(log_likelihood) from sklearn import linear_model clf = linear_model.LinearRegression() index_len = len(california.index.values) index = np.linspace(1,index_len,index_len).reshape(-1,1) # index = california.index.values.astype('datetime64[D]').reshape(-1,1) clf.fit(index, log_cases) print(clf.coef_) linear_prediction = clf.predict(index) fig, axs = plt.subplots(2,1, figsize = (12,12)) axs[0].plot(california.index.values, log_cases, label='log COVID-19 cases (original data)') axs[0].plot(california.index.values, linear_prediction, 'r', label='fitted line') axs[0].legend() axs[0].set_title("True Log Cases vs Fitted Linear Line") linear_residuals = log_cases - linear_prediction axs[1].plot(california.index.values, linear_residuals, 'o') axs[1].set_title("Residuals: True Log Cases - Fitted Linear Line") print("MSE with linear fit:", np.mean((linear_residuals)**2)) print("AIC:", evaluate_AIC(1, linear_residuals)) print("BIC:", evaluate_BIC(1, linear_residuals)) # After linear fit, it seems like a higher order model is needed from sklearn import linear_model clf = linear_model.LinearRegression() index_len = len(california.index.values) index = np.linspace(1,index_len,index_len).reshape(-1,1) new_x = np.hstack((index, index ** 2, index ** 3)) clf.fit(new_x, log_cases) print(clf.coef_) cubic_prediction = clf.predict(new_x) fig, axs = plt.subplots(2,1, figsize=(12,12)) axs[0].plot(california.index.values, log_cases, label='log COVID-19 cases (original data)') axs[0].plot(california.index.values, cubic_prediction, 'r', label='fitted line') axs[0].legend() axs[0].set_title("True Log Cases vs Fitted Cubic Line") cubic_residuals = log_cases - cubic_prediction axs[1].plot(california.index.values, cubic_residuals, 'o') axs[1].set_title("Residuals: True Log Cases - Fitted Cubic Line") print("MSE with cubic fit:", np.mean((cubic_residuals)**2)) print("AIC:", evaluate_AIC(2, cubic_residuals)) print("BIC:", evaluate_BIC(2, cubic_residuals)) import statsmodels.api as sm fig, axs = plt.subplots(2,1, figsize = (12,10)) sm.graphics.tsa.plot_acf(cubic_residuals, lags=30, ax = axs[0]) sm.graphics.tsa.plot_pacf(cubic_residuals, lags=30, ax = axs[1]) plt.show() from statsmodels.tsa.arima.model import ARIMA def grid_search_ARIMA(data, AR_range, MA_range, verbose=False): min_aic = np.inf min_bic = np.inf min_aic_index = None min_bic_index = None aic_matrix = np.zeros((len(AR_range), len(MA_range))) bic_matrix = np.zeros((len(AR_range), len(MA_range))) for AR_order in AR_range: for MA_order in MA_range: arma = ARIMA(data, order=(AR_order, 0, MA_order)).fit() aic_matrix[AR_order, MA_order] = arma.aic bic_matrix[AR_order, MA_order] = arma.bic if arma.aic < min_aic: min_aic = arma.aic min_aic_index = (AR_order, 0, MA_order) if arma.bic < min_bic: min_bic = arma.bic min_bic_index = (AR_order, 0, MA_order) if verbose: print("Minimizing AIC order: ", min_aic_index) print("Minimizing BIC order: ", min_bic_index ) print("matrix of AIC", aic_matrix) print("Matrix of BIC", bic_matrix) return min_aic_index, min_bic_index, aic_matrix, bic_matrix min_aic_index, min_bic_index, _, _ = grid_search_ARIMA(cubic_residuals, range(4), range(4), verbose=True) if min_aic_index == min_bic_index: arma = ARIMA(log_cases, order=min_bic_index).fit() print(arma.summary()) arma_predictions = arma.predict() arma_residuals = log_cases - arma_predictions arma_residuals = arma_residuals[1:] # Fitting AR 1 model means removing one observation fig, axs = plt.subplots(2,1, figsize=(12,12)) axs[0].plot(log_cases[1:], label='AR time series') axs[0].plot(arma_predictions[1:], 'r', label='fitted line') axs[0].set_title("True Log Cases vs Fitted ARIMA Line") axs[0].legend() axs[1].plot(arma_residuals, 'o') axs[1].set_title("ARIMA Residuals: True Log Cases - Fitted ARIMA Line") plt.show() print("Automatic selection finds model with AR {0}, MA {2}".format(*min_aic_index)) print("MSE with selected model:", np.mean(arma_residuals**2)) else: print("AIC, BIC do not agree.") train_test_split = int(len(log_cases) * 0.8) train_price, test_price = cubic_residuals[:train_test_split], cubic_residuals[train_test_split:] train_date, test_date = california.index.values[:train_test_split], california.index.values[train_test_split:] assert(len(train_date) + len(test_date) == len(california.index.values)) ## First, let's see how this does with the AIC selected values. arma = ARIMA(train_price, order=min_aic_index).fit() print(arma.summary()) fig, ax = plt.subplots(figsize=(15, 5)) # Construct the forecasts fcast = arma.get_forecast(len(test_price)).summary_frame() arma_predictions = arma.predict() ax.plot(california.index.values, cubic_residuals, label='log cases, converted to stationary time series') predicted_values = arma_predictions ax.plot(train_date, predicted_values, 'r', label='fitted line') forecast_means = fcast['mean'].values.reshape(-1,1) test_set_mse = np.mean((forecast_means.reshape(test_price.shape) - test_price)**2) ax.plot(test_date, forecast_means, 'k--', label='mean forecast') ax.fill_between(test_date.flatten(), fcast['mean_ci_lower'], fcast['mean_ci_upper'], color='k', alpha=0.1); plt.legend() print("Test set mean squared error: ", test_set_mse) ## Then, let's see how this does with the AR(1) and MA(3). arma = ARIMA(train_price, order=(1,0,3)).fit() print(arma.summary()) fig, ax = plt.subplots(figsize=(15, 5)) # Construct the forecasts fcast = arma.get_forecast(len(test_price)).summary_frame() arma_predictions = arma.predict() ax.plot(california.index.values, cubic_residuals, label='log cases, converted to stationary time series') predicted_values = arma_predictions ax.plot(train_date, predicted_values, 'r', label='fitted line') forecast_means = fcast['mean'].values.reshape(-1,1) test_set_mse = np.mean((forecast_means.reshape(test_price.shape) - test_price)**2) ax.plot(test_date, forecast_means, 'k--', label='mean forecast') ax.fill_between(test_date.flatten(), fcast['mean_ci_lower'], fcast['mean_ci_upper'], color='k', alpha=0.1); plt.legend() print("Test set mean squared error: ", test_set_mse) # predicted_values - week_cubic_residuals # forecast_means - week_test predicted_labels = np.concatenate((predicted_values.reset_index().predicted_mean, forecast_means.flatten()), axis=None) true_labels = cubic_residuals.reset_index().JHU_cases week_arima_mse = mean_squared_error(true_labels, predicted_labels) print(f"MSE of ARIMA model for weekly prediction: {week_arima_mse:.4f}") collapsed = log_cases.groupby(pd.Grouper(freq ='W')).mean() week_date = collapsed.reset_index().date.dt.date.values.reshape(-1,1) week_log_case = collapsed.reset_index().JHU_cases.values.reshape(-1,1) fig, ax = plt.subplots(1,1, figsize = (12,6)) plt.plot(week_date, week_log_case) plt.title("log of COVID-19 cases over time") plt.show() clf = linear_model.LinearRegression() index = collapsed.reset_index().index.values.reshape(-1,1) new_x = np.hstack((index, index **2, index **3)) clf.fit(new_x, week_log_case) print(clf.coef_) # To print the coefficient estimate of the series. month_quad_prediction = clf.predict(new_x) fig, axs = plt.subplots(2,1, figsize=(12,12)) axs[0].plot(week_date, week_log_case, label='original data') axs[0].plot(week_date, month_quad_prediction, 'r', label='fitted line') axs[0].legend() axs[0].set_title("True Log Cases vs Fitting Cubic Line") week_cubic_residuals = week_log_case - month_quad_prediction axs[1].plot(week_date, week_cubic_residuals, 'o') axs[0].set_title("Residuals: True Log Cases - Fitting Cubic Line") plt.show(); print("MSE with cubic fit:", np.mean((week_cubic_residuals)**2)) fig, axs = plt.subplots(2,1, figsize = (12,12)) sm.graphics.tsa.plot_acf(week_cubic_residuals, lags=14, ax = axs[0]) sm.graphics.tsa.plot_pacf(week_cubic_residuals, lags=14, ax = axs[1]) plt.show() week_train_test = int(0.8 * len(week_date)) week_train, week_test = week_cubic_residuals[:week_train_test], week_cubic_residuals[week_train_test:] week_date_train, week_date_test = week_date[:week_train_test], week_date[week_train_test:] min_aic_index, min_bic_index, *other = grid_search_ARIMA(week_cubic_residuals, range(4), range(4), verbose=True) if min_aic_index == min_bic_index: arma = ARIMA(week_cubic_residuals, order=min_bic_index).fit() print(arma.summary()) arma_predictions = arma.predict() arma_residuals = week_cubic_residuals.reshape(arma_predictions.shape) - arma_predictions arma_residuals = arma_residuals # Fitting AR 1 model means removing one observation fig, axs = plt.subplots(2,1, figsize = (12,12)) axs[0].plot(week_cubic_residuals, label='Residuals from fitted cubic line') axs[0].plot(arma_predictions, 'r', label='fitted ARMA process') axs[0].legend() axs[1].plot(arma_residuals, 'o') plt.show() print("Automatic selection finds model with AR {0}, MA {2}".format(*min_aic_index)) print("MSE with selected model:", np.mean(arma_residuals**2)) else: print("AIC, BIC do not agree.") arma = ARIMA(week_train, order=min_bic_index).fit() fig, ax = plt.subplots(figsize=(15, 5)) # Construct the forecasts fcast = arma.get_forecast(len(week_test)).summary_frame() arma_predictions = arma.predict() ax.plot(week_date, week_cubic_residuals, label='original data') predicted_values = arma_predictions.reshape(-1,1) ax.plot(week_date_train, predicted_values, 'r', label='fitted line') forecast_means = fcast['mean'].values.reshape(-1,1) ax.plot(week_date_test, forecast_means, 'k--', label='mean forecast') ax.fill_between(week_date_test.flatten(), fcast['mean_ci_lower'], fcast['mean_ci_upper'], color='k', alpha=0.1); plt.legend(); test_set_mse = np.mean((forecast_means.reshape(week_test.shape) - week_test)**2) print("Test set mean squared error: ", test_set_mse) # predicted_values - week_cubic_residuals # forecast_means - week_test predicted_labels = np.concatenate((predicted_values.flatten(), forecast_means.flatten()), axis=None) true_labels = week_cubic_residuals.flatten() week_arima_mse = mean_squared_error(true_labels, predicted_labels) print(f"MSE of ARIMA model for weekly prediction: {week_arima_mse:.4f}")