# Imports
import os
import random as rn

import tensorflow as tf
import pandas as pd
import numpy as np
import missingno as meso
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.impute import KNNImputer
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, GridSearchCV

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (SimpleRNN, LSTM, GRU, Dense, Flatten, Dropout,
                                     Bidirectional, TimeDistributed, Conv1D, MaxPool1D,
                                     BatchNormalization)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, LambdaCallback

# Import class for reservoir computing
# from pyESN import ESN  # Written by user cknd: https://github.com/cknd/pyESN

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount("/content/drive")

# Fix seeds for reproducibility
os.environ['PYTHONHASHSEED'] = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = ''
tf.random.set_seed(109)
np.random.seed(109)
rn.seed(109)

# Load state-level data
california = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/California.csv",
                         parse_dates=['date'], index_col=['date'])
new_york = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/New York.csv",
                       parse_dates=['date'], index_col=['date'])
ma = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/Massachusetts.csv",
                 parse_dates=['date'], index_col=['date'])

print(california.tail(10))  # the final 5 days are NA, so remove them below
print(new_york.tail(10))
print(ma.tail(10))

# Remove the final 5 days since they are NA
california = california.iloc[:-5, :]
new_york = new_york.iloc[:-5, :]
ma = ma.iloc[:-5, :]

california.describe()
new_york.describe()
ma.describe()

california.shape, new_york.shape, ma.shape

# Plot cases over time for each state
fig, axs = plt.subplots(3, 1, figsize=(12, 15))
axs[0].plot(california.index.values, california['JHU_cases'])
axs[1].plot(new_york.index.values, new_york['JHU_cases'])
axs[2].plot(ma.index.values, ma['JHU_cases'])
for i in range(3):
    axs[i].set_xlabel('Time')
    axs[i].set_ylabel('Cases')
axs[0].set_title('COVID-19 cases over time in California')
axs[1].set_title('COVID-19 cases over time in New York')
axs[2].set_title('COVID-19 cases over time in Massachusetts')

# Visualize missingness
meso.matrix(california)
meso.matrix(new_york.iloc[:, :200])
meso.matrix(ma.iloc[:, :200])

# Fill missing case counts with 0 and inspect negative values
for state in [california, new_york, ma]:
    state['JHU_cases'] = state['JHU_cases'].fillna(0)
    print(state['JHU_cases'][state['JHU_cases'] < 0])

# Flip the sign of the misrecorded negative case counts;
# use .loc to avoid chained-indexing assignment
california.loc[california['JHU_cases'] == -2019.0, 'JHU_cases'] = 2019.0
california.loc[california['JHU_cases'] == -3935.0, 'JHU_cases'] = 3935.0
ma.loc[ma['JHU_cases'] == -280.0, 'JHU_cases'] = 280.0

california.shape, new_york.shape, ma.shape

# Plot all three states together
fig, ax = plt.subplots(1, 1, figsize=(18, 8))
ax.plot(california.index.values, california['JHU_cases'], label="California")
ax.plot(new_york.index.values, new_york['JHU_cases'], label="New York")
ax.plot(ma.index.values, ma['JHU_cases'], label="Massachusetts")
ax.set_xlabel('Time')
ax.set_ylabel('Cases')
ax.set_title('COVID-19 cases over time')
ax.legend()
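# The three sign flips above were found by manual inspection of the printed
# negative values. A hedged sketch of the same idea in general form -- flip any
# negative daily count on the assumption it was misrecorded. The helper
# `flip_negative_cases` is our own illustration, not part of the original pipeline.
def flip_negative_cases(df, col='JHU_cases'):
    """Return a copy of df with negative values in `col` replaced by their absolute value."""
    out = df.copy()
    neg_mask = out[col] < 0
    if neg_mask.any():
        print(f"Flipping {neg_mask.sum()} negative values in {col}:")
        print(out.loc[neg_mask, col])
    out.loc[neg_mask, col] = out.loc[neg_mask, col].abs()
    return out

# Example usage (equivalent to the manual fixes above):
# california = flip_negative_cases(california)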
# Separate feature and outcome tables
cali_features = california.iloc[:, 2:]
cali_outcomes = california.iloc[:, 0]
ny_features = new_york.iloc[:, 2:]
ny_outcomes = new_york.iloc[:, 0]
ma_features = ma.iloc[:, 2:]
ma_outcomes = ma.iloc[:, 0]

# Extract only the Google Trends features (gt_ / gt2_ columns)
def get_gt_columns(features):
    return [col for col in features.columns if "gt_" in col or "gt2_" in col]

cali_gt = cali_features[get_gt_columns(cali_features)]
ny_gt = ny_features[get_gt_columns(ny_features)]
ma_gt = ma_features[get_gt_columns(ma_features)]

# Smooth the outcomes with a rolling mean
window = 5  # number of days in the rolling window
cali_outcomes = cali_outcomes.transform(lambda x: x.rolling(window).mean())
ny_outcomes = ny_outcomes.transform(lambda x: x.rolling(window).mean())
ma_outcomes = ma_outcomes.transform(lambda x: x.rolling(window).mean())

# The first (window - 1) days of the rolling mean are NaN; fill with 0
cali_outcomes = cali_outcomes.fillna(0)
ny_outcomes = ny_outcomes.fillna(0)
ma_outcomes = ma_outcomes.fillna(0)

# Optional per-100k scaling by state population (left disabled)
# cali_outcomes = cali_outcomes.apply(lambda x: x * 100000 / 39499738)
# ny_outcomes = ny_outcomes.apply(lambda x: x * 100000 / 20154933)
# ma_outcomes = ma_outcomes.apply(lambda x: x * 100000 / 7022220)

# Plot the smoothed outcomes
fig, ax = plt.subplots(1, 1, figsize=(18, 8))
ax.plot(california.index.values, cali_outcomes, label="California")
ax.plot(new_york.index.values, ny_outcomes, label="New York")
ax.plot(ma.index.values, ma_outcomes, label="Massachusetts")
ax.set_xlabel('Time')
ax.set_ylabel('Cases')
ax.set_title('COVID-19 cases over time')
ax.legend()

# Log-transform the outcomes (adding 1 to handle zeros)
log_cali_outcomes = np.log(cali_outcomes + 1)
log_ny_outcomes = np.log(ny_outcomes + 1)
log_ma_outcomes = np.log(ma_outcomes + 1)

fig, ax = plt.subplots(1, 1, figsize=(18, 8))
ax.plot(california.index.values, log_cali_outcomes, label="California")
ax.plot(new_york.index.values, log_ny_outcomes, label="New York")
ax.plot(ma.index.values, log_ma_outcomes, label="Massachusetts")
ax.set_xlabel('Time')
ax.set_ylabel('Log Cases')
ax.set_title('Log COVID-19 cases over time')
ax.legend()

# Imputation of missing Google Trends values using KNN
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

imputer.fit(cali_gt)
imputed_cali_gt = pd.DataFrame(imputer.transform(cali_gt), columns=cali_gt.columns)

imputer.fit(ny_gt)
imputed_ny_gt = pd.DataFrame(imputer.transform(ny_gt), columns=ny_gt.columns)

imputer.fit(ma_gt)
imputed_ma_gt = pd.DataFrame(imputer.transform(ma_gt), columns=ma_gt.columns)

# Drop highly correlated features common to all three datasets
corr_cols = ['gt2_Acute bronchitis', 'gt2_Cough', "gt2_Bell's palsy"]
imputed_cali_gt = imputed_cali_gt.drop(corr_cols, axis=1)
imputed_ny_gt = imputed_ny_gt.drop(corr_cols, axis=1)
imputed_ma_gt = imputed_ma_gt.drop(corr_cols, axis=1)

imputed_cali_gt.shape, imputed_ny_gt.shape, imputed_ma_gt.shape
log_cali_outcomes.shape, log_ny_outcomes.shape, log_ma_outcomes.shape
log_cali_outcomes
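# The three dropped columns above were identified as highly correlated with
# other features. A hedged sketch of how such pairs can be found using the
# spearmanr import above (the 0.9 threshold and the helper itself are our
# assumptions, not taken from the original analysis):
def highly_correlated_pairs(df, threshold=0.9):
    """Return feature pairs whose absolute Spearman correlation exceeds threshold."""
    corr, _ = spearmanr(df)  # correlation matrix over the columns of df
    cols = df.columns
    pairs = []
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            if abs(corr[i, j]) > threshold:
                pairs.append((cols[i], cols[j], corr[i, j]))
    return pairs

# Example usage:
# for a, b, r in highly_correlated_pairs(imputed_cali_gt):
#     print(f"{a} ~ {b}: rho = {r:.2f}")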
# """ # standard_deviation = np.std(residuals) # log_likelihood = norm.logpdf(residuals, 0, scale=standard_deviation) # return 2 * k - 2 * np.sum(log_likelihood) # def evaluate_BIC(k, residuals): # """ # Finds the AIC given the number of parameters estimated and # the residuals of the model. Assumes residuals are distributed # Gaussian with unknown variance. # """ # standard_deviation = np.std(residuals) # log_likelihood = norm.logpdf(residuals, 0, scale=standard_deviation) # return k * np.log(len(residuals)) - 2 * np.sum(log_likelihood) # # After linear fit, it seems like a higher order model is needed # from sklearn import linear_model # clf = linear_model.LinearRegression() # index_len = len(log_cali_outcomes.index.values) # index = np.linspace(1,index_len,index_len).reshape(-1,1) # new_x = np.hstack((index, index ** 2, index ** 3)) # clf.fit(new_x, log_cali_outcomes) # print(clf.coef_) # cubic_prediction = clf.predict(new_x) # fig, axs = plt.subplots(2,1, figsize=(12,12)) # axs[0].plot(california.index.values, log_cali_outcomes, label='log COVID-19 cases (original data)') # axs[0].plot(california.index.values, cubic_prediction, 'r', label='fitted line') # axs[0].legend() # axs[0].set_title("True Log Cases vs Fitted Cubic Line (CA)") # cali_cubic_residuals = log_cali_outcomes - cubic_prediction # axs[1].plot(california.index.values, cali_cubic_residuals, 'o') # axs[1].set_title("Residuals: True Log Cases - Fitted Cubic Line") # print("MSE with cubic fit:", np.mean((cali_cubic_residuals)**2)) # print("AIC:", evaluate_AIC(2, cali_cubic_residuals)) # print("BIC:", evaluate_BIC(2, cali_cubic_residuals)) # # After linear fit, it seems like a higher order model is needed # from sklearn import linear_model # clf = linear_model.LinearRegression() # index_len = len(log_ny_outcomes.index.values) # index = np.linspace(1,index_len,index_len).reshape(-1,1) # new_x = np.hstack((index, index ** 2, index ** 3)) # clf.fit(new_x, log_ny_outcomes) # print(clf.coef_) # cubic_prediction = clf.predict(new_x) # fig, axs = plt.subplots(2,1, figsize=(12,12)) # axs[0].plot(new_york.index.values, log_ny_outcomes, label='log COVID-19 cases (original data)') # axs[0].plot(new_york.index.values, cubic_prediction, 'r', label='fitted line') # axs[0].legend() # axs[0].set_title("True Log Cases vs Fitted Cubic Line (NY)") # ny_cubic_residuals = log_ny_outcomes - cubic_prediction # axs[1].plot(new_york.index.values, ny_cubic_residuals, 'o') # axs[1].set_title("Residuals: True Log Cases - Fitted Cubic Line") # print("MSE with cubic fit:", np.mean((ny_cubic_residuals)**2)) # print("AIC:", evaluate_AIC(2, ny_cubic_residuals)) # print("BIC:", evaluate_BIC(2, ny_cubic_residuals)) # # After linear fit, it seems like a higher order model is needed # from sklearn import linear_model # clf = linear_model.LinearRegression() # index_len = len(log_ny_outcomes.index.values) # index = np.linspace(1,index_len,index_len).reshape(-1,1) # new_x = np.hstack((index, index ** 2, index ** 3)) # clf.fit(new_x, log_ny_outcomes) # print(clf.coef_) # cubic_prediction = clf.predict(new_x) # fig, axs = plt.subplots(2,1, figsize=(12,12)) # axs[0].plot(new_york.index.values, log_ny_outcomes, label='log COVID-19 cases (original data)') # axs[0].plot(new_york.index.values, cubic_prediction, 'r', label='fitted line') # axs[0].legend() # axs[0].set_title("True Log Cases vs Fitted Cubic Line (NY)") # ma_cubic_residuals = log_ny_outcomes - cubic_prediction # axs[1].plot(new_york.index.values, ma_cubic_residuals, 'o') # axs[1].set_title("Residuals: True Log 
# print("MSE with cubic fit:", np.mean((ma_cubic_residuals) ** 2))
# print("AIC:", evaluate_AIC(2, ma_cubic_residuals))
# print("BIC:", evaluate_BIC(2, ma_cubic_residuals))

# Standardize the features and join them with the outcome
def data_process(features, outcome):
    outcome = outcome.reset_index(drop=True)
    features = features.reset_index(drop=True)
    cols = features.columns
    standardizer = StandardScaler()
    standardizer.fit(features)
    scaled_features = pd.DataFrame(standardizer.transform(features), columns=cols)
    combo = pd.concat([scaled_features, outcome], axis=1)
    combo["JHU_cases"] = combo["JHU_cases"].fillna(0)
    return combo

# Split a time-series dataframe into windowed (X, y) arrays for supervised learning
def time_series_split(time_series_df, response: str, days_train, days_pred, print_shape=False):
    # Initialize x, y arrays as zeroes; the response variable is held out of x
    x = np.zeros((0, days_train, time_series_df.shape[-1] - 1))
    y = np.zeros((0, days_pred))
    # For every row in the dataframe
    for i in range(len(time_series_df)):
        # Define the windows of training and prediction
        idx_in = i + days_train
        idx_out = idx_in + days_pred
        # If the prediction window would run past the end of the data, stop
        if idx_out > len(time_series_df):
            break
        # Create windowed training sequence for all non-response columns
        seq_in = np.array(time_series_df.iloc[i:idx_in, time_series_df.columns != response])
        # Create windowed prediction sequence for the response column
        response_idx = time_series_df.columns.get_loc(response)
        seq_out = np.array(time_series_df.iloc[idx_in:idx_out, response_idx])
        # Add the sequences to the respective arrays, shaped for supervised learning
        x = np.concatenate((x, seq_in.reshape((1,) + seq_in.shape)), axis=0)
        y = np.concatenate((y, seq_out.reshape((1,) + seq_out.shape)), axis=0)
    # Optionally print the output shapes for clarity
    if print_shape:
        print("x_train shape: ", x.shape, "y_train shape: ", y.shape)
    return np.array(x), np.array(y)

ca_combo = data_process(imputed_cali_gt, log_cali_outcomes)
ny_combo = data_process(imputed_ny_gt, log_ny_outcomes)
ma_combo = data_process(imputed_ma_gt, log_ma_outcomes)

days_train = 14
days_pred = 7
X_ca, y_ca = time_series_split(ca_combo, "JHU_cases", days_train, days_pred)
X_ny, y_ny = time_series_split(ny_combo, "JHU_cases", days_train, days_pred)
X_ma, y_ma = time_series_split(ma_combo, "JHU_cases", days_train, days_pred)

features_ca = X_ca.shape[-1]
features_ny = X_ny.shape[-1]
features_ma = X_ma.shape[-1]
X_ca.shape, y_ca.shape

# Define RNN model
def model_rnn(days_train, days_pred, features, summary=True):
    rnn = Sequential()
    # rnn.add(SimpleRNN(100, input_shape=(days_train, features), return_sequences=True))
    rnn.add(SimpleRNN(100, input_shape=(days_train, features), return_sequences=False))
    rnn.add(Dense(days_pred, activation="linear"))
    rnn.compile(optimizer=Adam(), loss='mse')
    if summary:
        rnn.summary()
    return rnn

# Define LSTM model
def model_lstm(days_train, days_pred, features, summary=True):
    lstm = Sequential()
    lstm.add(LSTM(200, activation='relu', return_sequences=False,
                  input_shape=(days_train, features)))
    lstm.add(Dense(days_pred, activation="linear"))
    lstm.compile(optimizer=Adam(), loss='mse')
    if summary:
        lstm.summary()
    return lstm

# Define bidirectional LSTM model
def model_lstm_bd(days_train, days_pred, features, summary=True):
    lstm_bd = Sequential()
    lstm_bd.add(Bidirectional(LSTM(200, activation='relu', return_sequences=False),
                              input_shape=(days_train, features)))
    lstm_bd.add(Dense(days_pred, activation="linear"))
    lstm_bd.compile(optimizer=Adam(), loss='mse')
    if summary:
        lstm_bd.summary()
    return lstm_bd
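# A quick sanity check of the windowing above (our own addition, not part of
# the original notebook): with 4 rows, days_train=2, and days_pred=1, we expect
# 2 samples of feature windows shaped (2, 1) and targets shaped (1,).
toy = pd.DataFrame({
    "feat": [0.1, 0.2, 0.3, 0.4],
    "JHU_cases": [1.0, 2.0, 3.0, 4.0],
})
X_toy, y_toy = time_series_split(toy, "JHU_cases", days_train=2, days_pred=1, print_shape=True)
# Expected: x_train shape: (2, 2, 1), y_train shape: (2, 1)
# X_toy[0] holds "feat" for days 0-1 and y_toy[0] the response for day 2.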
# Define function to train a given model
def fit_model(model, x_train, y_train, val_split=0.0, epochs=10, verbose=1, batch_size=1, cb=[]):
    history = model.fit(x_train, y_train,
                        # validation_split=val_split,
                        epochs=epochs,
                        verbose=verbose,
                        batch_size=batch_size,
                        callbacks=cb)
    return history

# Define function to plot a single-day prediction
def plot_prediction(prediction, y_train, scale_max, ax=None):
    # Rescale data
    prediction_rescaled = prediction * scale_max
    y_train_rescaled = y_train * scale_max
    # Plot figure
    if ax is not None:
        plot_loc = ax
    else:
        plot_loc = plt
        plt.figure(figsize=(20, 10))
    plot_loc.plot(y_train_rescaled, label="Actual")
    plot_loc.plot(prediction_rescaled, ls='--', label="Prediction")
    if ax is None:
        plt.legend(fontsize=15, loc=2)
        # plt.title(f"Plotting {response} over time", size=20)
        plt.xlabel("Days since first case", size=15)

# Define function to plot windowed predictions when predicting
# multiple days into the future
def plot_multiday_predictions(y_train, predictions, scale_max, ax=None, skip=1):
    # Rescale data
    predictions_rescaled = predictions * scale_max
    y_train_rescaled = y_train * scale_max
    # Collect the observed series (first day of each window)
    cas_obs = []
    for i in range(len(y_train_rescaled)):
        cas_obs.append(y_train_rescaled[i, 0])
    if ax is not None:
        plot_loc = ax
    else:
        plot_loc = plt
        plt.figure(figsize=(20, 10))
    plot_loc.plot(np.arange(0, len(cas_obs)), cas_obs, label='Observed')
    # Plot every `skip`-th multi-day prediction window
    for i in range(len(predictions_rescaled)):
        if i % skip != 0:
            continue
        plot_loc.plot(np.arange(i, i + predictions_rescaled.shape[-1]),
                      predictions_rescaled[i], c='orange')
    if ax is None:
        plt.legend(['Observed', 'Predicted'], fontsize=15, loc=2)

# Define function to predict
def predict_model(model, x_test):
    prediction = model.predict(x_test)
    return prediction

# es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=0)
lcall = LambdaCallback(on_epoch_end=lambda epoch, logs:
                       print("epoch " + str(epoch) + ", loss " + str(logs['loss']))
                       if epoch % 20 == 0 else None)

# --- California ---
# Simple RNN
rnn = model_rnn(days_train, days_pred, features_ca)
history_rnn = fit_model(rnn, X_ca, y_ca, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
rnn_pred_ca = predict_model(rnn, X_ca)
plot_multiday_predictions(y_ca, rnn_pred_ca, max(ca_combo['JHU_cases']), skip=5)
rescale_rnn_pred_ca = rnn_pred_ca * max(ca_combo['JHU_cases'])
rescale_y_ca = y_ca * max(ca_combo['JHU_cases'])
rnn_ca_mse = mean_squared_error(rescale_rnn_pred_ca.flatten(), rescale_y_ca.flatten())
print(f"MSE of RNN: {rnn_ca_mse:.4f}")

# LSTM
lstm = model_lstm(days_train, days_pred, features_ca)
history_lstm = fit_model(lstm, X_ca, y_ca, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
lstm_pred_ca = predict_model(lstm, X_ca)
plot_multiday_predictions(y_ca, lstm_pred_ca, max(ca_combo['JHU_cases']), skip=5)
rescale_lstm_pred_ca = lstm_pred_ca * max(ca_combo['JHU_cases'])
rescale_y_ca = y_ca * max(ca_combo['JHU_cases'])
lstm_ca_mse = mean_squared_error(rescale_lstm_pred_ca.flatten(), rescale_y_ca.flatten())
print(f"MSE of LSTM: {lstm_ca_mse:.4f}")

# Bidirectional LSTM
bd_lstm = model_lstm_bd(days_train, days_pred, features_ca)
history_bd_lstm = fit_model(bd_lstm, X_ca, y_ca, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
bd_lstm_pred_ca = predict_model(bd_lstm, X_ca)
plot_multiday_predictions(y_ca, bd_lstm_pred_ca, max(ca_combo['JHU_cases']), skip=5)
rescale_bd_lstm_pred_ca = bd_lstm_pred_ca * max(ca_combo['JHU_cases'])
rescale_y_ca = y_ca * max(ca_combo['JHU_cases'])
bd_lstm_ca_mse = mean_squared_error(rescale_bd_lstm_pred_ca.flatten(), rescale_y_ca.flatten())
print(f"MSE of BiLSTM: {bd_lstm_ca_mse:.4f}")
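# Note that the MSEs above are computed on the same windows the models were
# trained on, so they measure in-sample fit (validation_split is commented out
# in fit_model). A hedged sketch of a chronological holdout -- the 80/20 split
# is our assumption, not part of the original notebook:
split = int(0.8 * len(X_ca))              # first 80% of windows for training
X_tr, X_te = X_ca[:split], X_ca[split:]
y_tr, y_te = y_ca[:split], y_ca[split:]
rnn_holdout = model_rnn(days_train, days_pred, features_ca, summary=False)
fit_model(rnn_holdout, X_tr, y_tr, epochs=150, batch_size=16, cb=[lcall], verbose=0)
holdout_mse = mean_squared_error(predict_model(rnn_holdout, X_te).flatten(), y_te.flatten())
print(f"Out-of-sample MSE of RNN (log scale): {holdout_mse:.4f}")
# (Adjacent windows overlap, so leaving a gap of days_train + days_pred rows
# around the split would make the holdout even cleaner.)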
# --- New York ---
# Simple RNN
rnn = model_rnn(days_train, days_pred, features_ny)
history_rnn = fit_model(rnn, X_ny, y_ny, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
rnn_pred_ny = predict_model(rnn, X_ny)
plot_multiday_predictions(y_ny, rnn_pred_ny, max(ny_combo['JHU_cases']), skip=5)
rescale_rnn_pred_ny = rnn_pred_ny * max(ny_combo['JHU_cases'])
rescale_y_ny = y_ny * max(ny_combo['JHU_cases'])
rnn_ny_mse = mean_squared_error(rescale_rnn_pred_ny.flatten(), rescale_y_ny.flatten())
print(f"MSE of RNN: {rnn_ny_mse:.4f}")

# LSTM
lstm = model_lstm(days_train, days_pred, features_ny)
history_lstm = fit_model(lstm, X_ny, y_ny, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
lstm_pred_ny = predict_model(lstm, X_ny)
plot_multiday_predictions(y_ny, lstm_pred_ny, max(ny_combo['JHU_cases']), skip=5)
rescale_lstm_pred_ny = lstm_pred_ny * max(ny_combo['JHU_cases'])
rescale_y_ny = y_ny * max(ny_combo['JHU_cases'])
lstm_ny_mse = mean_squared_error(rescale_lstm_pred_ny.flatten(), rescale_y_ny.flatten())
print(f"MSE of LSTM: {lstm_ny_mse:.4f}")

# Bidirectional LSTM
bd_lstm = model_lstm_bd(days_train, days_pred, features_ny)
history_bd_lstm = fit_model(bd_lstm, X_ny, y_ny, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
bd_lstm_pred_ny = predict_model(bd_lstm, X_ny)
plot_multiday_predictions(y_ny, bd_lstm_pred_ny, max(ny_combo['JHU_cases']), skip=5)
rescale_bd_lstm_pred_ny = bd_lstm_pred_ny * max(ny_combo['JHU_cases'])
rescale_y_ny = y_ny * max(ny_combo['JHU_cases'])
bd_lstm_ny_mse = mean_squared_error(rescale_bd_lstm_pred_ny.flatten(), rescale_y_ny.flatten())
print(f"MSE of BiLSTM: {bd_lstm_ny_mse:.4f}")

# --- Massachusetts ---
# Simple RNN
rnn = model_rnn(days_train, days_pred, features_ma)
history_rnn = fit_model(rnn, X_ma, y_ma, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
rnn_pred_ma = predict_model(rnn, X_ma)
plot_multiday_predictions(y_ma, rnn_pred_ma, max(ma_combo['JHU_cases']), skip=5)
rescale_rnn_pred_ma = rnn_pred_ma * max(ma_combo['JHU_cases'])
rescale_y_ma = y_ma * max(ma_combo['JHU_cases'])
rnn_ma_mse = mean_squared_error(rescale_rnn_pred_ma.flatten(), rescale_y_ma.flatten())
print(f"MSE of RNN: {rnn_ma_mse:.4f}")

# LSTM
lstm = model_lstm(days_train, days_pred, features_ma)
history_lstm = fit_model(lstm, X_ma, y_ma, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
lstm_pred_ma = predict_model(lstm, X_ma)
plot_multiday_predictions(y_ma, lstm_pred_ma, max(ma_combo['JHU_cases']), skip=5)
rescale_lstm_pred_ma = lstm_pred_ma * max(ma_combo['JHU_cases'])
rescale_y_ma = y_ma * max(ma_combo['JHU_cases'])
lstm_ma_mse = mean_squared_error(rescale_lstm_pred_ma.flatten(), rescale_y_ma.flatten())
print(f"MSE of LSTM: {lstm_ma_mse:.4f}")

# Bidirectional LSTM
bd_lstm = model_lstm_bd(days_train, days_pred, features_ma)
history_bd_lstm = fit_model(bd_lstm, X_ma, y_ma, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
bd_lstm_pred_ma = predict_model(bd_lstm, X_ma)
plot_multiday_predictions(y_ma, bd_lstm_pred_ma, max(ma_combo['JHU_cases']), skip=5)
rescale_bd_lstm_pred_ma = bd_lstm_pred_ma * max(ma_combo['JHU_cases'])
rescale_y_ma = y_ma * max(ma_combo['JHU_cases'])
bd_lstm_ma_mse = mean_squared_error(rescale_bd_lstm_pred_ma.flatten(), rescale_y_ma.flatten())
print(f"MSE of BiLSTM: {bd_lstm_ma_mse:.4f}")
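# Collect the nine in-sample MSEs into one table for side-by-side comparison
# (our own summary; the original notebook only prints them individually).
mse_summary = pd.DataFrame(
    {
        "RNN": [rnn_ca_mse, rnn_ny_mse, rnn_ma_mse],
        "LSTM": [lstm_ca_mse, lstm_ny_mse, lstm_ma_mse],
        "BiLSTM": [bd_lstm_ca_mse, bd_lstm_ny_mse, bd_lstm_ma_mse],
    },
    index=["California", "New York", "Massachusetts"],
)
print(mse_summary)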
# Permutation feature importance: shuffle one feature at a time across samples
# and measure how much the prediction MSE increases
def feature_importance(model, X, y, COLS, top_num, model_name, results=None):
    if not results:
        results = []
        print(' Computing feature importance...')
        # Compute baseline MSE (no shuffling)
        base_preds = predict_model(model, X)
        baseline_mse = mean_squared_error(base_preds, y)
        results.append({'feature': 'BASELINE', 'mse': baseline_mse})
        for k in range(len(COLS)):
            # Shuffle feature k (saving a copy so it can be restored)
            save_col = X[:, :, k].copy()
            np.random.shuffle(X[:, :, k])
            # Compute prediction MSE with feature k shuffled
            shuf_preds = predict_model(model, X)
            # mae = np.mean(np.abs(shuf_preds - y))
            mse = mean_squared_error(shuf_preds, y)
            results.append({'feature': COLS[k], 'mse': mse})
            X[:, :, k] = save_col
    else:
        # Results were passed in; only recompute the baseline for the plot
        base_preds = predict_model(model, X)
        baseline_mse = mean_squared_error(base_preds, y)

    # Plot the top_num features whose shuffling hurts the model most
    df = pd.DataFrame(results)
    df = df.sort_values('mse')
    df = df.reset_index(drop=True)[-top_num:]
    plt.figure(figsize=(5, 8))
    plt.barh(np.arange(len(df)), df.mse)
    plt.yticks(np.arange(len(df)), df.feature.values)
    plt.title(model_name + ' Feature Importance', size=16)
    plt.ylim((-1, len(df)))
    plt.plot([baseline_mse, baseline_mse], [-1, len(df)], '--', color='orange',
             label=f'Baseline \nMSE={baseline_mse:.3f}')
    plt.ylabel('Feature', size=14)
    plt.legend()
    plt.show()
    return results

# Note: this assumes `bd_lstm` still holds the California-trained model,
# i.e., that this cell is run directly after the California training cell.
re_bd_lstm = feature_importance(bd_lstm, X_ca, y_ca, list(imputed_cali_gt.columns),
                                10, "bd_LSTM", None)
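# A single shuffle per feature is noisy; averaging over several shuffles gives
# more stable importance estimates. A hedged sketch of that variant (the helper,
# the number of repeats, and the seed are our own choices, not from the
# original notebook):
def feature_importance_repeated(model, X, y, COLS, repeats=5, seed=109):
    """Return mean MSE increase over `repeats` shuffles of each feature."""
    rng = np.random.default_rng(seed)
    base_mse = mean_squared_error(predict_model(model, X), y)
    importances = {}
    for k, col in enumerate(COLS):
        mses = []
        save_col = X[:, :, k].copy()
        for _ in range(repeats):
            rng.shuffle(X[:, :, k])           # shuffle feature k along the sample axis
            mses.append(mean_squared_error(predict_model(model, X), y))
            X[:, :, k] = save_col             # restore before the next repeat
        importances[col] = np.mean(mses) - base_mse
    return pd.Series(importances).sort_values(ascending=False)

# Example usage:
# feature_importance_repeated(bd_lstm, X_ca, y_ca, list(imputed_cali_gt.columns)).head(10)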