import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


from google.colab import drive
drive.mount("/content/drive")

california = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/California.csv",parse_dates=['date'],index_col=['date'])
new_york = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/New York.csv",parse_dates=['date'],index_col=['date'])
ma = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/Massachusetts.csv",parse_dates=['date'],index_col=['date'])

california.head()

california.tail()

# remove final 5 days because it does not make sense to have zero cases
california = california.iloc[:-5, :]

california.info()

california.describe()

fig,ax = plt.subplots(1,1, figsize = (12,6))
plt.plot(california.index.values, california['JHU_cases'])
plt.xlabel('Time')
plt.ylabel('Cases')
plt.title('COVID-19 cases in California over time')
plt.show()

california['JHU_cases'] = california['JHU_cases'].fillna(0)

california['JHU_cases'][california['JHU_cases'] <0]

california['JHU_cases'][california['JHU_cases'] == -2019.0] = 2019.0
california['JHU_cases'][california['JHU_cases'] == -3935.0] = 3935.0

california['JHU_cases'][california['JHU_cases'] <0]

# Applying rolling average
window = 5
california['JHU_cases'] = california['JHU_cases'].transform(lambda x: x.rolling(window).mean())

california['JHU_cases'] = california['JHU_cases'].fillna(0)

fig,ax = plt.subplots(1,1, figsize = (12,6))
plt.plot(california.index.values, california['JHU_cases'])
plt.xlabel('Time')
plt.ylabel('Cases')
plt.title('COVID-19 cases in California over time')
plt.show()

# Applying log transformation
fig,ax = plt.subplots(1,1, figsize = (12,6))
log_cases = np.log(california['JHU_cases']+1)
plt.plot(california.index.values, log_cases)
plt.title("log of COVID-19 cases in CA over time")
plt.show()

any(california.index.isna())

any(log_cases.isna())

# Define functions to calculate AIC and BIC
from scipy.stats import norm 

def evaluate_AIC(k, residuals):
  """
  Finds the AIC given the number of parameters estimated and 
  the residuals of the model. Assumes residuals are distributed 
  Gaussian with unknown variance. 
  """
  standard_deviation = np.std(residuals)
  log_likelihood = norm.logpdf(residuals, 0, scale=standard_deviation)
  return 2 * k - 2 * np.sum(log_likelihood)

def evaluate_BIC(k, residuals):
  """
  Finds the AIC given the number of parameters estimated and 
  the residuals of the model. Assumes residuals are distributed 
  Gaussian with unknown variance. 
  """
  standard_deviation = np.std(residuals)
  log_likelihood = norm.logpdf(residuals, 0, scale=standard_deviation)
  return k * np.log(len(residuals)) - 2 * np.sum(log_likelihood)

from sklearn import linear_model
clf = linear_model.LinearRegression()
index_len = len(california.index.values)
index = np.linspace(1,index_len,index_len).reshape(-1,1)
# index = california.index.values.astype('datetime64[D]').reshape(-1,1)

clf.fit(index, log_cases)
print(clf.coef_) 
linear_prediction = clf.predict(index)

fig, axs = plt.subplots(2,1, figsize = (12,12))
axs[0].plot(california.index.values, log_cases, label='log COVID-19 cases (original data)')
axs[0].plot(california.index.values, linear_prediction, 'r', label='fitted line')
axs[0].legend()
axs[0].set_title("True Log Cases vs Fitted Linear Line")
linear_residuals = log_cases - linear_prediction
axs[1].plot(california.index.values, linear_residuals, 'o')
axs[1].set_title("Residuals: True Log Cases - Fitted Linear Line")

print("MSE with linear fit:", np.mean((linear_residuals)**2))
print("AIC:", evaluate_AIC(1, linear_residuals))
print("BIC:", evaluate_BIC(1, linear_residuals))

# After linear fit, it seems like a higher order model is needed
from sklearn import linear_model
clf = linear_model.LinearRegression()
index_len = len(california.index.values)
index = np.linspace(1,index_len,index_len).reshape(-1,1)

new_x = np.hstack((index, index ** 2, index ** 3))
clf.fit(new_x, log_cases)
print(clf.coef_) 
cubic_prediction = clf.predict(new_x)

fig, axs = plt.subplots(2,1, figsize=(12,12))
axs[0].plot(california.index.values, log_cases, label='log COVID-19 cases (original data)')
axs[0].plot(california.index.values, cubic_prediction, 'r', label='fitted line')
axs[0].legend()
axs[0].set_title("True Log Cases vs Fitted Cubic Line")
cubic_residuals = log_cases - cubic_prediction
axs[1].plot(california.index.values, cubic_residuals, 'o')
axs[1].set_title("Residuals: True Log Cases - Fitted Cubic Line")
print("MSE with cubic fit:", np.mean((cubic_residuals)**2))
print("AIC:", evaluate_AIC(2, cubic_residuals))
print("BIC:", evaluate_BIC(2, cubic_residuals))

import statsmodels.api as sm
fig, axs = plt.subplots(2,1, figsize = (12,10))
sm.graphics.tsa.plot_acf(cubic_residuals, lags=30, ax = axs[0])
sm.graphics.tsa.plot_pacf(cubic_residuals, lags=30, ax = axs[1])
plt.show()

from statsmodels.tsa.arima.model import ARIMA

def grid_search_ARIMA(data, AR_range, MA_range, verbose=False):
  min_aic = np.inf 
  min_bic = np.inf
  min_aic_index = None
  min_bic_index = None 
  aic_matrix = np.zeros((len(AR_range), len(MA_range)))
  bic_matrix = np.zeros((len(AR_range), len(MA_range)))
  for AR_order in AR_range:
    for MA_order in MA_range:
      arma = ARIMA(data, order=(AR_order, 0, MA_order)).fit()
      aic_matrix[AR_order, MA_order] = arma.aic
      bic_matrix[AR_order, MA_order] = arma.bic
      if arma.aic < min_aic:
        min_aic = arma.aic
        min_aic_index = (AR_order, 0, MA_order)
      if arma.bic < min_bic:
        min_bic = arma.bic
        min_bic_index = (AR_order, 0, MA_order)
  if verbose:
    print("Minimizing AIC order: ", min_aic_index)
    print("Minimizing BIC order: ", min_bic_index )
    print("matrix of AIC", aic_matrix)
    print("Matrix of BIC", bic_matrix)
  return min_aic_index, min_bic_index, aic_matrix, bic_matrix

min_aic_index, min_bic_index, _, _ = grid_search_ARIMA(cubic_residuals, range(4), range(4), verbose=True)
if min_aic_index == min_bic_index:
  arma = ARIMA(log_cases, order=min_bic_index).fit()
  print(arma.summary())
  arma_predictions = arma.predict()
  arma_residuals = log_cases - arma_predictions
  arma_residuals = arma_residuals[1:] # Fitting AR 1 model means removing one observation
  
  fig, axs = plt.subplots(2,1, figsize=(12,12))
  axs[0].plot(log_cases[1:], label='AR time series')
  axs[0].plot(arma_predictions[1:], 'r', label='fitted line')
  axs[0].set_title("True Log Cases vs Fitted ARIMA Line")
  axs[0].legend()
  axs[1].plot(arma_residuals, 'o')
  axs[1].set_title("ARIMA Residuals: True Log Cases - Fitted ARIMA Line")
  plt.show()
  print("Automatic selection finds model with AR {0}, MA {2}".format(*min_aic_index))
  print("MSE with selected model:", np.mean(arma_residuals**2))
else:
  print("AIC, BIC do not agree.")

train_test_split = int(len(log_cases) * 0.8)
train_price, test_price = cubic_residuals[:train_test_split], cubic_residuals[train_test_split:]
train_date, test_date = california.index.values[:train_test_split], california.index.values[train_test_split:]
assert(len(train_date) + len(test_date) == len(california.index.values))

## First, let's see how this does with the AIC selected values. 
arma = ARIMA(train_price, order=min_aic_index).fit()
print(arma.summary())
fig, ax = plt.subplots(figsize=(15, 5))

# Construct the forecasts
fcast = arma.get_forecast(len(test_price)).summary_frame()

arma_predictions = arma.predict()
ax.plot(california.index.values, cubic_residuals, label='log cases, converted to stationary time series')
predicted_values = arma_predictions
ax.plot(train_date, predicted_values, 'r', label='fitted line')
forecast_means = fcast['mean'].values.reshape(-1,1)
test_set_mse = np.mean((forecast_means.reshape(test_price.shape) - test_price)**2)
ax.plot(test_date, forecast_means, 'k--', label='mean forecast')
ax.fill_between(test_date.flatten(), fcast['mean_ci_lower'], fcast['mean_ci_upper'], color='k', alpha=0.1);
plt.legend()
print("Test set mean squared error: ", test_set_mse)

## Then, let's see how this does with the AR(1) and MA(3). 
arma = ARIMA(train_price, order=(1,0,3)).fit()
print(arma.summary())
fig, ax = plt.subplots(figsize=(15, 5))

# Construct the forecasts
fcast = arma.get_forecast(len(test_price)).summary_frame()

arma_predictions = arma.predict()
ax.plot(california.index.values, cubic_residuals, label='log cases, converted to stationary time series')
predicted_values = arma_predictions
ax.plot(train_date, predicted_values, 'r', label='fitted line')
forecast_means = fcast['mean'].values.reshape(-1,1)
test_set_mse = np.mean((forecast_means.reshape(test_price.shape) - test_price)**2)
ax.plot(test_date, forecast_means, 'k--', label='mean forecast')
ax.fill_between(test_date.flatten(), fcast['mean_ci_lower'], fcast['mean_ci_upper'], color='k', alpha=0.1);
plt.legend()
print("Test set mean squared error: ", test_set_mse)

# predicted_values - week_cubic_residuals
# forecast_means - week_test

predicted_labels = np.concatenate((predicted_values.reset_index().predicted_mean, forecast_means.flatten()), axis=None)
true_labels = cubic_residuals.reset_index().JHU_cases

week_arima_mse = mean_squared_error(true_labels, predicted_labels)
print(f"MSE of ARIMA model for weekly prediction: {week_arima_mse:.4f}")

collapsed = log_cases.groupby(pd.Grouper(freq ='W')).mean()
week_date = collapsed.reset_index().date.dt.date.values.reshape(-1,1)
week_log_case = collapsed.reset_index().JHU_cases.values.reshape(-1,1)
fig, ax = plt.subplots(1,1, figsize = (12,6))
plt.plot(week_date, week_log_case)
plt.title("log of COVID-19 cases over time")
plt.show()

clf = linear_model.LinearRegression()
index = collapsed.reset_index().index.values.reshape(-1,1)
new_x = np.hstack((index, index **2, index **3))
clf.fit(new_x, week_log_case)
print(clf.coef_) # To print the coefficient estimate of the series. 
month_quad_prediction = clf.predict(new_x)

fig, axs = plt.subplots(2,1, figsize=(12,12))
axs[0].plot(week_date, week_log_case, label='original data')
axs[0].plot(week_date, month_quad_prediction, 'r', label='fitted line')
axs[0].legend()
axs[0].set_title("True Log Cases vs Fitting Cubic Line")

week_cubic_residuals = week_log_case - month_quad_prediction
axs[1].plot(week_date, week_cubic_residuals, 'o')
axs[0].set_title("Residuals: True Log Cases - Fitting Cubic Line")
plt.show();
print("MSE with cubic fit:", np.mean((week_cubic_residuals)**2))

fig, axs = plt.subplots(2,1, figsize = (12,12))
sm.graphics.tsa.plot_acf(week_cubic_residuals, lags=14, ax = axs[0])
sm.graphics.tsa.plot_pacf(week_cubic_residuals, lags=14, ax = axs[1])
plt.show()

week_train_test = int(0.8 * len(week_date))
week_train, week_test = week_cubic_residuals[:week_train_test], week_cubic_residuals[week_train_test:]
week_date_train, week_date_test = week_date[:week_train_test], week_date[week_train_test:]

min_aic_index, min_bic_index, *other = grid_search_ARIMA(week_cubic_residuals, range(4), range(4), verbose=True)
if min_aic_index == min_bic_index:
  arma = ARIMA(week_cubic_residuals, order=min_bic_index).fit()
  print(arma.summary())
  arma_predictions = arma.predict()
  arma_residuals = week_cubic_residuals.reshape(arma_predictions.shape) - arma_predictions
  arma_residuals = arma_residuals # Fitting AR 1 model means removing one observation
  
  fig, axs = plt.subplots(2,1, figsize = (12,12))
  axs[0].plot(week_cubic_residuals, label='Residuals from fitted cubic line')
  axs[0].plot(arma_predictions, 'r', label='fitted ARMA process')
  axs[0].legend()
  axs[1].plot(arma_residuals, 'o')
  plt.show()
  print("Automatic selection finds model with AR {0}, MA {2}".format(*min_aic_index))
  print("MSE with selected model:", np.mean(arma_residuals**2))
else:
  print("AIC, BIC do not agree.")

arma = ARIMA(week_train, order=min_bic_index).fit()
fig, ax = plt.subplots(figsize=(15, 5))

# Construct the forecasts
fcast = arma.get_forecast(len(week_test)).summary_frame()

arma_predictions = arma.predict()
ax.plot(week_date, week_cubic_residuals, label='original data')
predicted_values = arma_predictions.reshape(-1,1)
ax.plot(week_date_train, predicted_values, 'r', label='fitted line')
forecast_means = fcast['mean'].values.reshape(-1,1)
ax.plot(week_date_test, forecast_means, 'k--', label='mean forecast')
ax.fill_between(week_date_test.flatten(), fcast['mean_ci_lower'], fcast['mean_ci_upper'], color='k', alpha=0.1);
plt.legend();

test_set_mse = np.mean((forecast_means.reshape(week_test.shape) - week_test)**2)
print("Test set mean squared error: ", test_set_mse)

# predicted_values - week_cubic_residuals
# forecast_means - week_test
predicted_labels = np.concatenate((predicted_values.flatten(), forecast_means.flatten()), axis=None)
true_labels = week_cubic_residuals.flatten()

week_arima_mse = mean_squared_error(true_labels, predicted_labels)
print(f"MSE of ARIMA model for weekly prediction: {week_arima_mse:.4f}")