# Imports
import os
import random as rn

import tensorflow as tf
import pandas as pd
import numpy as np
import missingno as meso
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.impute import KNNImputer
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate, GridSearchCV

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (SimpleRNN, LSTM, GRU, Dense, Flatten, Dropout,
                                     Bidirectional, TimeDistributed, Conv1D, MaxPool1D,
                                     BatchNormalization)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, LambdaCallback

# Import class for reservoir computing
# from pyESN import ESN  # Written by user cknd: https://github.com/cknd/pyESN

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount("/content/drive")

# Fix seeds for reproducibility
os.environ['PYTHONHASHSEED'] = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = ''
tf.random.set_seed(109)
np.random.seed(109)
rn.seed(109)

# Load state-level data
california = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/California.csv",
                         parse_dates=['date'], index_col=['date'])
new_york = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/New York.csv",
                       parse_dates=['date'], index_col=['date'])
ma = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/Massachusetts.csv",
                 parse_dates=['date'], index_col=['date'])

print(california.tail(10))  # the final 5 days are NA, so remove them below
print(new_york.tail(10))
print(ma.tail(10))

# Remove the final 5 days since they are NA
california = california.iloc[:-5, :]
new_york = new_york.iloc[:-5, :]
ma = ma.iloc[:-5, :]

california.describe()
new_york.describe()
ma.describe()

california.shape, new_york.shape, ma.shape

# Plot cases over time for each state
fig, axs = plt.subplots(3, 1, figsize=(12, 15))
axs[0].plot(california.index.values, california['JHU_cases'])
axs[1].plot(new_york.index.values, new_york['JHU_cases'])
axs[2].plot(ma.index.values, ma['JHU_cases'])
for i in range(3):
    axs[i].set_xlabel('Time')
    axs[i].set_ylabel('Cases')
axs[0].set_title('COVID-19 cases over time in California')
axs[1].set_title('COVID-19 cases over time in New York')
axs[2].set_title('COVID-19 cases over time in Massachusetts')

# Visualize missingness
meso.matrix(california)
meso.matrix(new_york.iloc[:, :200])
meso.matrix(ma.iloc[:, :200])

# Fill missing case counts with 0 and inspect negative values
for state in [california, new_york, ma]:
    state['JHU_cases'] = state['JHU_cases'].fillna(0)
    print(state['JHU_cases'][state['JHU_cases'] < 0])

# Flip the sign of the misrecorded negative case counts;
# use .loc to avoid chained-indexing assignment
california.loc[california['JHU_cases'] == -2019.0, 'JHU_cases'] = 2019.0
california.loc[california['JHU_cases'] == -3935.0, 'JHU_cases'] = 3935.0
ma.loc[ma['JHU_cases'] == -280.0, 'JHU_cases'] = 280.0

california.shape, new_york.shape, ma.shape

# Plot all three states together
fig, ax = plt.subplots(1, 1, figsize=(18, 8))
ax.plot(california.index.values, california['JHU_cases'], label="California")
ax.plot(new_york.index.values, new_york['JHU_cases'], label="New York")
ax.plot(ma.index.values, ma['JHU_cases'], label="Massachusetts")
ax.set_xlabel('Time')
ax.set_ylabel('Cases')
ax.set_title('COVID-19 cases over time')
ax.legend()
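# The three sign flips above were found by manual inspection of the printed
# negative values. A hedged sketch of the same idea in general form -- flip any
# negative daily count on the assumption it was misrecorded. The helper
# `flip_negative_cases` is our own illustration, not part of the original pipeline.
def flip_negative_cases(df, col='JHU_cases'):
    """Return a copy of df with negative values in `col` replaced by their absolute value."""
    out = df.copy()
    neg_mask = out[col] < 0
    if neg_mask.any():
        print(f"Flipping {neg_mask.sum()} negative values in {col}:")
        print(out.loc[neg_mask, col])
    out.loc[neg_mask, col] = out.loc[neg_mask, col].abs()
    return out

# Example usage (equivalent to the manual fixes above):
# california = flip_negative_cases(california)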
# Separate feature and outcome tables
cali_features = california.iloc[:, 2:]
cali_outcomes = california.iloc[:, 0]
ny_features = new_york.iloc[:, 2:]
ny_outcomes = new_york.iloc[:, 0]
ma_features = ma.iloc[:, 2:]
ma_outcomes = ma.iloc[:, 0]

# Extract only the Google Trends features (gt_ / gt2_ columns)
def get_gt_columns(features):
    return [col for col in features.columns if "gt_" in col or "gt2_" in col]

cali_gt = cali_features[get_gt_columns(cali_features)]
ny_gt = ny_features[get_gt_columns(ny_features)]
ma_gt = ma_features[get_gt_columns(ma_features)]

# Smooth the outcomes with a rolling mean
window = 5  # number of days in the rolling window
cali_outcomes = cali_outcomes.transform(lambda x: x.rolling(window).mean())
ny_outcomes = ny_outcomes.transform(lambda x: x.rolling(window).mean())
ma_outcomes = ma_outcomes.transform(lambda x: x.rolling(window).mean())

# The first (window - 1) days of the rolling mean are NaN; fill with 0
cali_outcomes = cali_outcomes.fillna(0)
ny_outcomes = ny_outcomes.fillna(0)
ma_outcomes = ma_outcomes.fillna(0)

# Optional per-100k scaling by state population (left disabled)
# cali_outcomes = cali_outcomes.apply(lambda x: x * 100000 / 39499738)
# ny_outcomes = ny_outcomes.apply(lambda x: x * 100000 / 20154933)
# ma_outcomes = ma_outcomes.apply(lambda x: x * 100000 / 7022220)

# Plot the smoothed outcomes
fig, ax = plt.subplots(1, 1, figsize=(18, 8))
ax.plot(california.index.values, cali_outcomes, label="California")
ax.plot(new_york.index.values, ny_outcomes, label="New York")
ax.plot(ma.index.values, ma_outcomes, label="Massachusetts")
ax.set_xlabel('Time')
ax.set_ylabel('Cases')
ax.set_title('COVID-19 cases over time')
ax.legend()

# Log-transform the outcomes (adding 1 to handle zeros)
log_cali_outcomes = np.log(cali_outcomes + 1)
log_ny_outcomes = np.log(ny_outcomes + 1)
log_ma_outcomes = np.log(ma_outcomes + 1)

fig, ax = plt.subplots(1, 1, figsize=(18, 8))
ax.plot(california.index.values, log_cali_outcomes, label="California")
ax.plot(new_york.index.values, log_ny_outcomes, label="New York")
ax.plot(ma.index.values, log_ma_outcomes, label="Massachusetts")
ax.set_xlabel('Time')
ax.set_ylabel('Log Cases')
ax.set_title('Log COVID-19 cases over time')
ax.legend()

# Imputation of missing Google Trends values using KNN
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

imputer.fit(cali_gt)
imputed_cali_gt = pd.DataFrame(imputer.transform(cali_gt), columns=cali_gt.columns)

imputer.fit(ny_gt)
imputed_ny_gt = pd.DataFrame(imputer.transform(ny_gt), columns=ny_gt.columns)

imputer.fit(ma_gt)
imputed_ma_gt = pd.DataFrame(imputer.transform(ma_gt), columns=ma_gt.columns)

# Drop highly correlated features common to all three datasets
corr_cols = ['gt2_Acute bronchitis', 'gt2_Cough', "gt2_Bell's palsy"]
imputed_cali_gt = imputed_cali_gt.drop(corr_cols, axis=1)
imputed_ny_gt = imputed_ny_gt.drop(corr_cols, axis=1)
imputed_ma_gt = imputed_ma_gt.drop(corr_cols, axis=1)

imputed_cali_gt.shape, imputed_ny_gt.shape, imputed_ma_gt.shape
log_cali_outcomes.shape, log_ny_outcomes.shape, log_ma_outcomes.shape
log_cali_outcomes
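# The three dropped columns above were identified as highly correlated with
# other features. A hedged sketch of how such pairs can be found using the
# spearmanr import above (the 0.9 threshold and the helper itself are our
# assumptions, not taken from the original analysis):
def highly_correlated_pairs(df, threshold=0.9):
    """Return feature pairs whose absolute Spearman correlation exceeds threshold."""
    corr, _ = spearmanr(df)  # correlation matrix over the columns of df
    cols = df.columns
    pairs = []
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            if abs(corr[i, j]) > threshold:
                pairs.append((cols[i], cols[j], corr[i, j]))
    return pairs

# Example usage:
# for a, b, r in highly_correlated_pairs(imputed_cali_gt):
#     print(f"{a} ~ {b}: rho = {r:.2f}")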
# """ # standard_deviation = np.std(residuals) # log_likelihood = norm.logpdf(residuals, 0, scale=standard_deviation) # return 2 * k - 2 * np.sum(log_likelihood) # def evaluate_BIC(k, residuals): # """ # Finds the AIC given the number of parameters estimated and # the residuals of the model. Assumes residuals are distributed # Gaussian with unknown variance. # """ # standard_deviation = np.std(residuals) # log_likelihood = norm.logpdf(residuals, 0, scale=standard_deviation) # return k * np.log(len(residuals)) - 2 * np.sum(log_likelihood) # # After linear fit, it seems like a higher order model is needed # from sklearn import linear_model # clf = linear_model.LinearRegression() # index_len = len(log_cali_outcomes.index.values) # index = np.linspace(1,index_len,index_len).reshape(-1,1) # new_x = np.hstack((index, index ** 2, index ** 3)) # clf.fit(new_x, log_cali_outcomes) # print(clf.coef_) # cubic_prediction = clf.predict(new_x) # fig, axs = plt.subplots(2,1, figsize=(12,12)) # axs[0].plot(california.index.values, log_cali_outcomes, label='log COVID-19 cases (original data)') # axs[0].plot(california.index.values, cubic_prediction, 'r', label='fitted line') # axs[0].legend() # axs[0].set_title("True Log Cases vs Fitted Cubic Line (CA)") # cali_cubic_residuals = log_cali_outcomes - cubic_prediction # axs[1].plot(california.index.values, cali_cubic_residuals, 'o') # axs[1].set_title("Residuals: True Log Cases - Fitted Cubic Line") # print("MSE with cubic fit:", np.mean((cali_cubic_residuals)**2)) # print("AIC:", evaluate_AIC(2, cali_cubic_residuals)) # print("BIC:", evaluate_BIC(2, cali_cubic_residuals)) # # After linear fit, it seems like a higher order model is needed # from sklearn import linear_model # clf = linear_model.LinearRegression() # index_len = len(log_ny_outcomes.index.values) # index = np.linspace(1,index_len,index_len).reshape(-1,1) # new_x = np.hstack((index, index ** 2, index ** 3)) # clf.fit(new_x, log_ny_outcomes) # print(clf.coef_) # cubic_prediction = clf.predict(new_x) # fig, axs = plt.subplots(2,1, figsize=(12,12)) # axs[0].plot(new_york.index.values, log_ny_outcomes, label='log COVID-19 cases (original data)') # axs[0].plot(new_york.index.values, cubic_prediction, 'r', label='fitted line') # axs[0].legend() # axs[0].set_title("True Log Cases vs Fitted Cubic Line (NY)") # ny_cubic_residuals = log_ny_outcomes - cubic_prediction # axs[1].plot(new_york.index.values, ny_cubic_residuals, 'o') # axs[1].set_title("Residuals: True Log Cases - Fitted Cubic Line") # print("MSE with cubic fit:", np.mean((ny_cubic_residuals)**2)) # print("AIC:", evaluate_AIC(2, ny_cubic_residuals)) # print("BIC:", evaluate_BIC(2, ny_cubic_residuals)) # # After linear fit, it seems like a higher order model is needed # from sklearn import linear_model # clf = linear_model.LinearRegression() # index_len = len(log_ny_outcomes.index.values) # index = np.linspace(1,index_len,index_len).reshape(-1,1) # new_x = np.hstack((index, index ** 2, index ** 3)) # clf.fit(new_x, log_ny_outcomes) # print(clf.coef_) # cubic_prediction = clf.predict(new_x) # fig, axs = plt.subplots(2,1, figsize=(12,12)) # axs[0].plot(new_york.index.values, log_ny_outcomes, label='log COVID-19 cases (original data)') # axs[0].plot(new_york.index.values, cubic_prediction, 'r', label='fitted line') # axs[0].legend() # axs[0].set_title("True Log Cases vs Fitted Cubic Line (NY)") # ma_cubic_residuals = log_ny_outcomes - cubic_prediction # axs[1].plot(new_york.index.values, ma_cubic_residuals, 'o') # axs[1].set_title("Residuals: True Log 
# print("MSE with cubic fit:", np.mean((ma_cubic_residuals) ** 2))
# print("AIC:", evaluate_AIC(2, ma_cubic_residuals))
# print("BIC:", evaluate_BIC(2, ma_cubic_residuals))

# Standardize the features and join them with the outcome
def data_process(features, outcome):
    outcome = outcome.reset_index(drop=True)
    features = features.reset_index(drop=True)
    cols = features.columns
    standardizer = StandardScaler()
    standardizer.fit(features)
    scaled_features = pd.DataFrame(standardizer.transform(features), columns=cols)
    combo = pd.concat([scaled_features, outcome], axis=1)
    combo["JHU_cases"] = combo["JHU_cases"].fillna(0)
    return combo

# Split a time-series dataframe into windowed (X, y) arrays for supervised learning
def time_series_split(time_series_df, response: str, days_train, days_pred, print_shape=False):
    # Initialize x, y arrays as zeroes; the response variable is held out of x
    x = np.zeros((0, days_train, time_series_df.shape[-1] - 1))
    y = np.zeros((0, days_pred))
    # For every row in the dataframe
    for i in range(len(time_series_df)):
        # Define the windows of training and prediction
        idx_in = i + days_train
        idx_out = idx_in + days_pred
        # If the prediction window would run past the end of the data, stop
        if idx_out > len(time_series_df):
            break
        # Create windowed training sequence for all non-response columns
        seq_in = np.array(time_series_df.iloc[i:idx_in, time_series_df.columns != response])
        # Create windowed prediction sequence for the response column
        response_idx = time_series_df.columns.get_loc(response)
        seq_out = np.array(time_series_df.iloc[idx_in:idx_out, response_idx])
        # Add the sequences to the respective arrays, shaped for supervised learning
        x = np.concatenate((x, seq_in.reshape((1,) + seq_in.shape)), axis=0)
        y = np.concatenate((y, seq_out.reshape((1,) + seq_out.shape)), axis=0)
    # Optionally print the output shapes for clarity
    if print_shape:
        print("x_train shape: ", x.shape, "y_train shape: ", y.shape)
    return np.array(x), np.array(y)

ca_combo = data_process(imputed_cali_gt, log_cali_outcomes)
ny_combo = data_process(imputed_ny_gt, log_ny_outcomes)
ma_combo = data_process(imputed_ma_gt, log_ma_outcomes)

days_train = 14
days_pred = 7
X_ca, y_ca = time_series_split(ca_combo, "JHU_cases", days_train, days_pred)
X_ny, y_ny = time_series_split(ny_combo, "JHU_cases", days_train, days_pred)
X_ma, y_ma = time_series_split(ma_combo, "JHU_cases", days_train, days_pred)

features_ca = X_ca.shape[-1]
features_ny = X_ny.shape[-1]
features_ma = X_ma.shape[-1]
X_ca.shape, y_ca.shape

# Define RNN model
def model_rnn(days_train, days_pred, features, summary=True):
    rnn = Sequential()
    # rnn.add(SimpleRNN(100, input_shape=(days_train, features), return_sequences=True))
    rnn.add(SimpleRNN(100, input_shape=(days_train, features), return_sequences=False))
    rnn.add(Dense(days_pred, activation="linear"))
    rnn.compile(optimizer=Adam(), loss='mse')
    if summary:
        rnn.summary()
    return rnn

# Define LSTM model
def model_lstm(days_train, days_pred, features, summary=True):
    lstm = Sequential()
    lstm.add(LSTM(200, activation='relu', return_sequences=False,
                  input_shape=(days_train, features)))
    lstm.add(Dense(days_pred, activation="linear"))
    lstm.compile(optimizer=Adam(), loss='mse')
    if summary:
        lstm.summary()
    return lstm

# Define bidirectional LSTM model
def model_lstm_bd(days_train, days_pred, features, summary=True):
    lstm_bd = Sequential()
    lstm_bd.add(Bidirectional(LSTM(200, activation='relu', return_sequences=False),
                              input_shape=(days_train, features)))
    lstm_bd.add(Dense(days_pred, activation="linear"))
    lstm_bd.compile(optimizer=Adam(), loss='mse')
    if summary:
        lstm_bd.summary()
    return lstm_bd
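# A quick sanity check of the windowing above (our own addition, not part of
# the original notebook): with 4 rows, days_train=2, and days_pred=1, we expect
# 2 samples of feature windows shaped (2, 1) and targets shaped (1,).
toy = pd.DataFrame({
    "feat": [0.1, 0.2, 0.3, 0.4],
    "JHU_cases": [1.0, 2.0, 3.0, 4.0],
})
X_toy, y_toy = time_series_split(toy, "JHU_cases", days_train=2, days_pred=1, print_shape=True)
# Expected: x_train shape: (2, 2, 1), y_train shape: (2, 1)
# X_toy[0] holds "feat" for days 0-1 and y_toy[0] the response for day 2.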
# Define function to train a given model
def fit_model(model, x_train, y_train, val_split=0.0, epochs=10, verbose=1, batch_size=1, cb=[]):
    history = model.fit(x_train, y_train,
                        # validation_split=val_split,
                        epochs=epochs,
                        verbose=verbose,
                        batch_size=batch_size,
                        callbacks=cb)
    return history

# Define function to plot a single-day prediction
def plot_prediction(prediction, y_train, scale_max, ax=None):
    # Rescale data
    prediction_rescaled = prediction * scale_max
    y_train_rescaled = y_train * scale_max
    # Plot figure
    if ax is not None:
        plot_loc = ax
    else:
        plot_loc = plt
        plt.figure(figsize=(20, 10))
    plot_loc.plot(y_train_rescaled, label="Actual")
    plot_loc.plot(prediction_rescaled, ls='--', label="Prediction")
    if ax is None:
        plt.legend(fontsize=15, loc=2)
        # plt.title(f"Plotting {response} over time", size=20)
        plt.xlabel("Days since first case", size=15)

# Define function to plot windowed predictions when predicting
# multiple days into the future
def plot_multiday_predictions(y_train, predictions, scale_max, ax=None, skip=1):
    # Rescale data
    predictions_rescaled = predictions * scale_max
    y_train_rescaled = y_train * scale_max
    # Collect the observed series (first day of each window)
    cas_obs = []
    for i in range(len(y_train_rescaled)):
        cas_obs.append(y_train_rescaled[i, 0])
    if ax is not None:
        plot_loc = ax
    else:
        plot_loc = plt
        plt.figure(figsize=(20, 10))
    plot_loc.plot(np.arange(0, len(cas_obs)), cas_obs, label='Observed')
    # Plot every `skip`-th multi-day prediction window
    for i in range(len(predictions_rescaled)):
        if i % skip != 0:
            continue
        plot_loc.plot(np.arange(i, i + predictions_rescaled.shape[-1]),
                      predictions_rescaled[i], c='orange')
    if ax is None:
        plt.legend(['Observed', 'Predicted'], fontsize=15, loc=2)

# Define function to predict
def predict_model(model, x_test):
    prediction = model.predict(x_test)
    return prediction

# es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=0)
lcall = LambdaCallback(on_epoch_end=lambda epoch, logs:
                       print("epoch " + str(epoch) + ", loss " + str(logs['loss']))
                       if epoch % 20 == 0 else None)

# --- California ---
# Simple RNN
rnn = model_rnn(days_train, days_pred, features_ca)
history_rnn = fit_model(rnn, X_ca, y_ca, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
rnn_pred_ca = predict_model(rnn, X_ca)
plot_multiday_predictions(y_ca, rnn_pred_ca, max(ca_combo['JHU_cases']), skip=5)
rescale_rnn_pred_ca = rnn_pred_ca * max(ca_combo['JHU_cases'])
rescale_y_ca = y_ca * max(ca_combo['JHU_cases'])
rnn_ca_mse = mean_squared_error(rescale_rnn_pred_ca.flatten(), rescale_y_ca.flatten())
print(f"MSE of RNN: {rnn_ca_mse:.4f}")

# LSTM
lstm = model_lstm(days_train, days_pred, features_ca)
history_lstm = fit_model(lstm, X_ca, y_ca, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
lstm_pred_ca = predict_model(lstm, X_ca)
plot_multiday_predictions(y_ca, lstm_pred_ca, max(ca_combo['JHU_cases']), skip=5)
rescale_lstm_pred_ca = lstm_pred_ca * max(ca_combo['JHU_cases'])
rescale_y_ca = y_ca * max(ca_combo['JHU_cases'])
lstm_ca_mse = mean_squared_error(rescale_lstm_pred_ca.flatten(), rescale_y_ca.flatten())
print(f"MSE of LSTM: {lstm_ca_mse:.4f}")

# Bidirectional LSTM
bd_lstm = model_lstm_bd(days_train, days_pred, features_ca)
history_bd_lstm = fit_model(bd_lstm, X_ca, y_ca, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
bd_lstm_pred_ca = predict_model(bd_lstm, X_ca)
plot_multiday_predictions(y_ca, bd_lstm_pred_ca, max(ca_combo['JHU_cases']), skip=5)
rescale_bd_lstm_pred_ca = bd_lstm_pred_ca * max(ca_combo['JHU_cases'])
rescale_y_ca = y_ca * max(ca_combo['JHU_cases'])
bd_lstm_ca_mse = mean_squared_error(rescale_bd_lstm_pred_ca.flatten(), rescale_y_ca.flatten())
print(f"MSE of BiLSTM: {bd_lstm_ca_mse:.4f}")
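# Note that the MSEs above are computed on the same windows the models were
# trained on, so they measure in-sample fit (validation_split is commented out
# in fit_model). A hedged sketch of a chronological holdout -- the 80/20 split
# is our assumption, not part of the original notebook:
split = int(0.8 * len(X_ca))              # first 80% of windows for training
X_tr, X_te = X_ca[:split], X_ca[split:]
y_tr, y_te = y_ca[:split], y_ca[split:]
rnn_holdout = model_rnn(days_train, days_pred, features_ca, summary=False)
fit_model(rnn_holdout, X_tr, y_tr, epochs=150, batch_size=16, cb=[lcall], verbose=0)
holdout_mse = mean_squared_error(predict_model(rnn_holdout, X_te).flatten(), y_te.flatten())
print(f"Out-of-sample MSE of RNN (log scale): {holdout_mse:.4f}")
# (Adjacent windows overlap, so leaving a gap of days_train + days_pred rows
# around the split would make the holdout even cleaner.)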
# --- New York ---
# Simple RNN
rnn = model_rnn(days_train, days_pred, features_ny)
history_rnn = fit_model(rnn, X_ny, y_ny, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
rnn_pred_ny = predict_model(rnn, X_ny)
plot_multiday_predictions(y_ny, rnn_pred_ny, max(ny_combo['JHU_cases']), skip=5)
rescale_rnn_pred_ny = rnn_pred_ny * max(ny_combo['JHU_cases'])
rescale_y_ny = y_ny * max(ny_combo['JHU_cases'])
rnn_ny_mse = mean_squared_error(rescale_rnn_pred_ny.flatten(), rescale_y_ny.flatten())
print(f"MSE of RNN: {rnn_ny_mse:.4f}")

# LSTM
lstm = model_lstm(days_train, days_pred, features_ny)
history_lstm = fit_model(lstm, X_ny, y_ny, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
lstm_pred_ny = predict_model(lstm, X_ny)
plot_multiday_predictions(y_ny, lstm_pred_ny, max(ny_combo['JHU_cases']), skip=5)
rescale_lstm_pred_ny = lstm_pred_ny * max(ny_combo['JHU_cases'])
rescale_y_ny = y_ny * max(ny_combo['JHU_cases'])
lstm_ny_mse = mean_squared_error(rescale_lstm_pred_ny.flatten(), rescale_y_ny.flatten())
print(f"MSE of LSTM: {lstm_ny_mse:.4f}")

# Bidirectional LSTM
bd_lstm = model_lstm_bd(days_train, days_pred, features_ny)
history_bd_lstm = fit_model(bd_lstm, X_ny, y_ny, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
bd_lstm_pred_ny = predict_model(bd_lstm, X_ny)
plot_multiday_predictions(y_ny, bd_lstm_pred_ny, max(ny_combo['JHU_cases']), skip=5)
rescale_bd_lstm_pred_ny = bd_lstm_pred_ny * max(ny_combo['JHU_cases'])
rescale_y_ny = y_ny * max(ny_combo['JHU_cases'])
bd_lstm_ny_mse = mean_squared_error(rescale_bd_lstm_pred_ny.flatten(), rescale_y_ny.flatten())
print(f"MSE of BiLSTM: {bd_lstm_ny_mse:.4f}")

# --- Massachusetts ---
# Simple RNN
rnn = model_rnn(days_train, days_pred, features_ma)
history_rnn = fit_model(rnn, X_ma, y_ma, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
rnn_pred_ma = predict_model(rnn, X_ma)
plot_multiday_predictions(y_ma, rnn_pred_ma, max(ma_combo['JHU_cases']), skip=5)
rescale_rnn_pred_ma = rnn_pred_ma * max(ma_combo['JHU_cases'])
rescale_y_ma = y_ma * max(ma_combo['JHU_cases'])
rnn_ma_mse = mean_squared_error(rescale_rnn_pred_ma.flatten(), rescale_y_ma.flatten())
print(f"MSE of RNN: {rnn_ma_mse:.4f}")

# LSTM
lstm = model_lstm(days_train, days_pred, features_ma)
history_lstm = fit_model(lstm, X_ma, y_ma, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
lstm_pred_ma = predict_model(lstm, X_ma)
plot_multiday_predictions(y_ma, lstm_pred_ma, max(ma_combo['JHU_cases']), skip=5)
rescale_lstm_pred_ma = lstm_pred_ma * max(ma_combo['JHU_cases'])
rescale_y_ma = y_ma * max(ma_combo['JHU_cases'])
lstm_ma_mse = mean_squared_error(rescale_lstm_pred_ma.flatten(), rescale_y_ma.flatten())
print(f"MSE of LSTM: {lstm_ma_mse:.4f}")

# Bidirectional LSTM
bd_lstm = model_lstm_bd(days_train, days_pred, features_ma)
history_bd_lstm = fit_model(bd_lstm, X_ma, y_ma, val_split=0.2, epochs=150, batch_size=16, cb=[lcall], verbose=0)
bd_lstm_pred_ma = predict_model(bd_lstm, X_ma)
plot_multiday_predictions(y_ma, bd_lstm_pred_ma, max(ma_combo['JHU_cases']), skip=5)
rescale_bd_lstm_pred_ma = bd_lstm_pred_ma * max(ma_combo['JHU_cases'])
rescale_y_ma = y_ma * max(ma_combo['JHU_cases'])
bd_lstm_ma_mse = mean_squared_error(rescale_bd_lstm_pred_ma.flatten(), rescale_y_ma.flatten())
print(f"MSE of BiLSTM: {bd_lstm_ma_mse:.4f}")
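# Collect the nine in-sample MSEs into one table for side-by-side comparison
# (our own summary; the original notebook only prints them individually).
mse_summary = pd.DataFrame(
    {
        "RNN": [rnn_ca_mse, rnn_ny_mse, rnn_ma_mse],
        "LSTM": [lstm_ca_mse, lstm_ny_mse, lstm_ma_mse],
        "BiLSTM": [bd_lstm_ca_mse, bd_lstm_ny_mse, bd_lstm_ma_mse],
    },
    index=["California", "New York", "Massachusetts"],
)
print(mse_summary)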
# Permutation feature importance: shuffle one feature at a time across samples
# and measure how much the prediction MSE increases
def feature_importance(model, X, y, COLS, top_num, model_name, results=None):
    if not results:
        results = []
        print(' Computing feature importance...')
        # Compute baseline MSE (no shuffling)
        base_preds = predict_model(model, X)
        baseline_mse = mean_squared_error(base_preds, y)
        results.append({'feature': 'BASELINE', 'mse': baseline_mse})
        for k in range(len(COLS)):
            # Shuffle feature k (saving a copy so it can be restored)
            save_col = X[:, :, k].copy()
            np.random.shuffle(X[:, :, k])
            # Compute prediction MSE with feature k shuffled
            shuf_preds = predict_model(model, X)
            # mae = np.mean(np.abs(shuf_preds - y))
            mse = mean_squared_error(shuf_preds, y)
            results.append({'feature': COLS[k], 'mse': mse})
            X[:, :, k] = save_col
    else:
        # Results were passed in; only recompute the baseline for the plot
        base_preds = predict_model(model, X)
        baseline_mse = mean_squared_error(base_preds, y)

    # Plot the top_num features whose shuffling hurts the model most
    df = pd.DataFrame(results)
    df = df.sort_values('mse')
    df = df.reset_index(drop=True)[-top_num:]
    plt.figure(figsize=(5, 8))
    plt.barh(np.arange(len(df)), df.mse)
    plt.yticks(np.arange(len(df)), df.feature.values)
    plt.title(model_name + ' Feature Importance', size=16)
    plt.ylim((-1, len(df)))
    plt.plot([baseline_mse, baseline_mse], [-1, len(df)], '--', color='orange',
             label=f'Baseline \nMSE={baseline_mse:.3f}')
    plt.ylabel('Feature', size=14)
    plt.legend()
    plt.show()
    return results

# Note: this assumes `bd_lstm` still holds the California-trained model,
# i.e., that this cell is run directly after the California training cell.
re_bd_lstm = feature_importance(bd_lstm, X_ca, y_ca, list(imputed_cali_gt.columns),
                                10, "bd_LSTM", None)
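# A single shuffle per feature is noisy; averaging over several shuffles gives
# more stable importance estimates. A hedged sketch of that variant (the helper,
# the number of repeats, and the seed are our own choices, not from the
# original notebook):
def feature_importance_repeated(model, X, y, COLS, repeats=5, seed=109):
    """Return mean MSE increase over `repeats` shuffles of each feature."""
    rng = np.random.default_rng(seed)
    base_mse = mean_squared_error(predict_model(model, X), y)
    importances = {}
    for k, col in enumerate(COLS):
        mses = []
        save_col = X[:, :, k].copy()
        for _ in range(repeats):
            rng.shuffle(X[:, :, k])           # shuffle feature k along the sample axis
            mses.append(mean_squared_error(predict_model(model, X), y))
            X[:, :, k] = save_col             # restore before the next repeat
        importances[col] = np.mean(mses) - base_mse
    return pd.Series(importances).sort_values(ascending=False)

# Example usage:
# feature_importance_repeated(bd_lstm, X_ca, y_ca, list(imputed_cali_gt.columns)).head(10)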