from google.colab import drive
drive.mount("/content/drive")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

california = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/California.csv", parse_dates=['date'], index_col=['date'])
new_york = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/New York.csv", parse_dates=['date'], index_col=['date'])
ma = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/Massachusetts.csv", parse_dates=['date'], index_col=['date'])

# The final 5 days are NA in every state, so drop them
print(california.tail(10))
print(new_york.tail(10))
print(ma.tail(10))

california = california.iloc[:-5, :]
new_york = new_york.iloc[:-5, :]
ma = ma.iloc[:-5, :]

# Fill missing case counts with 0 and print any negative (mis-signed) entries
for state in [california, new_york, ma]:
    state['JHU_cases'] = state['JHU_cases'].fillna(0)
    print(state['JHU_cases'][state['JHU_cases'] < 0])

# Flip the sign of the few negative case counts (use .loc to avoid chained assignment)
california.loc[california['JHU_cases'] == -2019.0, 'JHU_cases'] = 2019.0
california.loc[california['JHU_cases'] == -3935.0, 'JHU_cases'] = 3935.0
ma.loc[ma['JHU_cases'] == -280.0, 'JHU_cases'] = 280.0

fig, axs = plt.subplots(3, 1, figsize=(12, 15))
axs[0].plot(california.index.values, california['JHU_cases'])
axs[1].plot(new_york.index.values, new_york['JHU_cases'])
axs[2].plot(ma.index.values, ma['JHU_cases'])
for i in range(3):
    axs[i].set_xlabel('Time')
    axs[i].set_ylabel('Cases')
axs[0].set_title('COVID-19 cases over time in California')
axs[1].set_title('COVID-19 cases over time in New York')
axs[2].set_title('COVID-19 cases over time in Massachusetts')

# Separate feature and outcome tables
cali_features = california.iloc[:, 3:]
cali_outcomes = california.iloc[:, 0]
ny_features = new_york.iloc[:, 3:]
ny_outcomes = new_york.iloc[:, 0]
ma_features = ma.iloc[:, 3:]
ma_outcomes = ma.iloc[:, 0]
cali_features.shape, cali_outcomes.shape

# window = number of days in the rolling window over which the outcome is averaged
window = 5
cali_outcomes = cali_outcomes.transform(lambda x: x.rolling(window).mean())
ny_outcomes = ny_outcomes.transform(lambda x: x.rolling(window).mean())
ma_outcomes = ma_outcomes.transform(lambda x: x.rolling(window).mean())
cali_features.shape, cali_outcomes.shape

# The rolling mean is NaN for the first window - 1 days; fill those with 0
cali_outcomes = cali_outcomes.fillna(0)
ny_outcomes = ny_outcomes.fillna(0)
ma_outcomes = ma_outcomes.fillna(0)

fig, axs = plt.subplots(3, 1, figsize=(12, 15))
axs[0].plot(california.index.values, cali_outcomes)
axs[1].plot(new_york.index.values, ny_outcomes)
axs[2].plot(ma.index.values, ma_outcomes)
for i in range(3):
    axs[i].set_xlabel('Time')
    axs[i].set_ylabel('Cases')
axs[0].set_title('Smoothed COVID-19 cases over time in California')
axs[1].set_title('Smoothed COVID-19 cases over time in New York')
axs[2].set_title('Smoothed COVID-19 cases over time in Massachusetts')

# Keep only features with less than 20% missing values
missing_ratio = cali_features.isna().sum(axis=0) / cali_features.shape[0]
print(missing_ratio.reset_index().sort_values(by=0, ascending=False))
cali_features_selected = cali_features[cali_features.columns[missing_ratio < 0.2]]
cali_features_selected.shape

missing_ratio = ny_features.isna().sum(axis=0) / ny_features.shape[0]
print(missing_ratio.reset_index().sort_values(by=0, ascending=False))
ny_features_selected = ny_features[ny_features.columns[missing_ratio < 0.2]]
ny_features_selected.shape
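# Aside: a minimal, self-contained sketch of the missingness filter used above and applied
# to Massachusetts just below. The column names and values here are hypothetical, purely to
# illustrate which columns survive the 20% cutoff.
_toy = pd.DataFrame({
    'gt_cough':   [1.0, np.nan, 3.0, 4.0, 5.0],        # 1/5 = 20% missing -> dropped (not < 0.2)
    'gt_fever':   [1.0, 2.0, 3.0, 4.0, 5.0],           # 0% missing -> kept
    'sparse_col': [np.nan, np.nan, np.nan, 4.0, 5.0],  # 60% missing -> dropped
})
_toy_ratio = _toy.isna().sum(axis=0) / _toy.shape[0]
print(_toy[_toy.columns[_toy_ratio < 0.2]].columns.tolist())  # expected: ['gt_fever']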
missing_ratio = ma_features.isna().sum(axis=0) / ma_features.shape[0]
print(missing_ratio.reset_index().sort_values(by=0, ascending=False))
ma_features_selected = ma_features[ma_features.columns[missing_ratio < 0.2]]
ma_features_selected.shape

# Keep only the Google Trends features ('gt'/'gt2' prefixes) and drop the neighboring-state versions
# (every 'gt2' column name also contains 'gt', so a single substring check suffices)
gt_features = [col for col in cali_features_selected.columns if 'gt' in col and 'neighbor' not in col]
cali_features_selected = cali_features_selected[gt_features]
gt_features = [col for col in ny_features_selected.columns if 'gt' in col and 'neighbor' not in col]
ny_features_selected = ny_features_selected[gt_features]
gt_features = [col for col in ma_features_selected.columns if 'gt' in col and 'neighbor' not in col]
ma_features_selected = ma_features_selected[gt_features]

# Impute the remaining missing feature values with KNN
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

col_names = cali_features_selected.columns
imputer.fit(cali_features_selected)
imputed_data = imputer.transform(cali_features_selected)
imputed_cali_gt = pd.DataFrame(imputed_data, columns=col_names)

col_names = ny_features_selected.columns
imputer.fit(ny_features_selected)
imputed_data = imputer.transform(ny_features_selected)
imputed_ny_gt = pd.DataFrame(imputed_data, columns=col_names)

col_names = ma_features_selected.columns
imputer.fit(ma_features_selected)
imputed_data = imputer.transform(ma_features_selected)
imputed_ma_gt = pd.DataFrame(imputed_data, columns=col_names)

# Pearson correlation between each feature and the smoothed case counts
# (strictly, normality matters for the p-value reported by pearsonr, not for the coefficient itself)
from scipy.stats import pearsonr
from scipy.ndimage import shift

corr_value = []
p_value = []
for i in range(len(imputed_cali_gt.columns)):
    res = pearsonr(imputed_cali_gt.iloc[:, i], cali_outcomes)
    corr_value.append(res[0])
    p_value.append(res[1])

res_tab = pd.DataFrame({"features": imputed_cali_gt.columns,
                        "corr": corr_value,
                        "abs_corr": [abs(x) for x in corr_value],
                        "p_value": p_value})
res_tab['relation'] = np.where(res_tab['corr'] > 0, 'positive', 'negative')
res_tab = res_tab.sort_values(by='abs_corr', ascending=False)
res_tab.head()


def pcorrcoef(feature, outcome, shift_by):
    """Correlate each feature column with the outcome, optionally shifted by `shift_by` days."""
    corr_value = []
    p_value = []
    if shift_by != 0:
        outcome = shift(outcome.reset_index()['JHU_cases'].values, shift_by, cval=0)
    for i in range(len(feature.columns)):
        res = pearsonr(feature.iloc[:, i], outcome)
        corr_value.append(res[0])
        p_value.append(res[1])
    res_tab = pd.DataFrame({f"features_{shift_by}": feature.columns,
                            f"corr_{shift_by}": corr_value,
                            f"abs_corr_{shift_by}": [abs(x) for x in corr_value],
                            f"p_value_{shift_by}": p_value})
    res_tab[f"relation_{shift_by}"] = np.where(res_tab[f"corr_{shift_by}"] > 0, 'positive', 'negative')
    return res_tab


res_tab_0 = pcorrcoef(feature=imputed_cali_gt, outcome=cali_outcomes, shift_by=0)
res_tab_0 = res_tab_0.sort_values(by="abs_corr_0", ascending=False)
res_tab_0

# Features with correlation coefficient > 0.5 are treated as strongly correlated with cases
strong_cor = res_tab_0[res_tab_0['corr_0'] > 0.5]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='corr_0', data=strong_cor)
plt.title("Correlation coefficients of strongly correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")

# Features with correlation coefficient between 0.3 and 0.5 are treated as moderately correlated with cases
moderate_cor = res_tab_0[(res_tab_0['corr_0'] >= 0.3) & (res_tab_0['corr_0'] < 0.5)]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='corr_0', data=moderate_cor)
plt.title("Correlation coefficients of moderately correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")
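# Aside: how scipy.ndimage.shift lags a 1-D series, since pcorrcoef above relies on it and the
# lag sweep below ranges over -14..+14 days. Toy values, purely illustrative: a positive shift
# moves values toward later indices and pads the start with cval; a negative shift moves them
# earlier and pads the end. With a negative shift, today's feature values are paired with future
# case counts, which is the pairing relevant for a leading indicator.
_toy_cases = np.array([10., 20., 30., 40., 50.])
print(shift(_toy_cases, 2, cval=0))    # expected: [ 0.  0. 10. 20. 30.]
print(shift(_toy_cases, -2, cval=0))   # expected: [30. 40. 50.  0.  0.]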
# Repeat the correlation analysis with the outcome shifted by -14 to +14 days
shift_sample = np.arange(-14, 15)
for i in shift_sample:
    if i == shift_sample[0]:
        final_res_tab = pcorrcoef(feature=imputed_cali_gt, outcome=cali_outcomes, shift_by=i)
    else:
        tmp_res_tab = pcorrcoef(feature=imputed_cali_gt, outcome=cali_outcomes, shift_by=i)
        final_res_tab = pd.concat((final_res_tab, tmp_res_tab), axis=1)
final_res_tab

# For each feature, how much does the absolute correlation vary across lags?
select_abs_corr = [col for col in final_res_tab.columns if 'abs_corr' in col]
select_tab = final_res_tab[select_abs_corr].copy()
select_tab['minmax_diff'] = select_tab[select_abs_corr].apply(lambda x: max(x) - min(x), axis=1)
select_tab[['minmax_diff']].describe()

# Rank features by their best absolute correlation over all lags
final_res_tab['max_abs_corr'] = select_tab[select_abs_corr].apply(lambda x: max(x), axis=1)
final_res_tab = final_res_tab.sort_values(by='max_abs_corr', ascending=False)

# Features with max absolute correlation > 0.5 are treated as strongly correlated with cases
strong_cor = final_res_tab[final_res_tab['max_abs_corr'] > 0.5]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='max_abs_corr', data=strong_cor)
plt.title("Correlation coefficients of strongly correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")

# Features with max absolute correlation between 0.3 and 0.5 are treated as moderately correlated
moderate_cor = final_res_tab[(final_res_tab['max_abs_corr'] < 0.5) & (final_res_tab['max_abs_corr'] >= 0.3)]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='max_abs_corr', data=moderate_cor)
plt.title("Correlation coefficients of moderately correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")

# All features with max absolute correlation >= 0.3
strong_moderate_cor = final_res_tab[final_res_tab['max_abs_corr'] >= 0.3]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='max_abs_corr', data=strong_moderate_cor)
plt.title("Correlation coefficients of strongly and moderately correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")

moderate_features = moderate_cor['features_0'].values
print(moderate_features)
strong_features = strong_cor['features_0'].values
print(strong_features)
full_selected_features = list(strong_features) + list(moderate_features)
print(full_selected_features)

# Check for multicollinearity among the selected features
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
corr_res = imputed_cali_gt[full_selected_features].corr()
sns.heatmap(corr_res)

# Drop one feature that the heatmap shows is highly correlated with the others
full_selected_features.remove('gt2_Facial nerve paralysis')
full_selected_features
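# Aside: a minimal sketch (not part of the analysis above) of how the heatmap inspection could be
# made programmatic: flag any pair of selected features whose pairwise absolute correlation exceeds
# a threshold. The 0.9 cutoff is an assumption for illustration only; the single removal above
# ('gt2_Facial nerve paralysis') follows the visual heatmap inspection.
redundant_pairs = [
    (a, b, corr_res.loc[a, b])
    for i, a in enumerate(corr_res.columns)
    for b in corr_res.columns[i + 1:]
    if abs(corr_res.loc[a, b]) > 0.9
]
print(redundant_pairs)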