from google.colab import drive
drive.mount("/content/drive")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

california = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/California.csv", parse_dates=['date'], index_col=['date'])
new_york = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/New York.csv", parse_dates=['date'], index_col=['date'])
ma = pd.read_csv("./drive/MyDrive/CS109 Project/state_level_data/Massachusetts.csv", parse_dates=['date'], index_col=['date'])

# The final 5 days are NA in every state, so drop them
print(california.tail(10))
print(new_york.tail(10))
print(ma.tail(10))

california = california.iloc[:-5, :]
new_york = new_york.iloc[:-5, :]
ma = ma.iloc[:-5, :]

# Fill missing case counts with 0 and print any negative (mis-signed) entries
for state in [california, new_york, ma]:
    state['JHU_cases'] = state['JHU_cases'].fillna(0)
    print(state['JHU_cases'][state['JHU_cases'] < 0])

# Flip the sign of the few negative case counts (use .loc to avoid chained assignment)
california.loc[california['JHU_cases'] == -2019.0, 'JHU_cases'] = 2019.0
california.loc[california['JHU_cases'] == -3935.0, 'JHU_cases'] = 3935.0
ma.loc[ma['JHU_cases'] == -280.0, 'JHU_cases'] = 280.0

fig, axs = plt.subplots(3, 1, figsize=(12, 15))
axs[0].plot(california.index.values, california['JHU_cases'])
axs[1].plot(new_york.index.values, new_york['JHU_cases'])
axs[2].plot(ma.index.values, ma['JHU_cases'])
for i in range(3):
    axs[i].set_xlabel('Time')
    axs[i].set_ylabel('Cases')
axs[0].set_title('COVID-19 cases over time in California')
axs[1].set_title('COVID-19 cases over time in New York')
axs[2].set_title('COVID-19 cases over time in Massachusetts')

# Separate feature and outcome tables
cali_features = california.iloc[:, 3:]
cali_outcomes = california.iloc[:, 0]
ny_features = new_york.iloc[:, 3:]
ny_outcomes = new_york.iloc[:, 0]
ma_features = ma.iloc[:, 3:]
ma_outcomes = ma.iloc[:, 0]
cali_features.shape, cali_outcomes.shape

# window = number of days in the rolling window over which the outcome is averaged
window = 5
cali_outcomes = cali_outcomes.transform(lambda x: x.rolling(window).mean())
ny_outcomes = ny_outcomes.transform(lambda x: x.rolling(window).mean())
ma_outcomes = ma_outcomes.transform(lambda x: x.rolling(window).mean())
cali_features.shape, cali_outcomes.shape

# The rolling mean is NaN for the first window - 1 days; fill those with 0
cali_outcomes = cali_outcomes.fillna(0)
ny_outcomes = ny_outcomes.fillna(0)
ma_outcomes = ma_outcomes.fillna(0)

fig, axs = plt.subplots(3, 1, figsize=(12, 15))
axs[0].plot(california.index.values, cali_outcomes)
axs[1].plot(new_york.index.values, ny_outcomes)
axs[2].plot(ma.index.values, ma_outcomes)
for i in range(3):
    axs[i].set_xlabel('Time')
    axs[i].set_ylabel('Cases')
axs[0].set_title('Smoothed COVID-19 cases over time in California')
axs[1].set_title('Smoothed COVID-19 cases over time in New York')
axs[2].set_title('Smoothed COVID-19 cases over time in Massachusetts')

# Keep only features with less than 20% missing values
missing_ratio = cali_features.isna().sum(axis=0) / cali_features.shape[0]
print(missing_ratio.reset_index().sort_values(by=0, ascending=False))
cali_features_selected = cali_features[cali_features.columns[missing_ratio < 0.2]]
cali_features_selected.shape

missing_ratio = ny_features.isna().sum(axis=0) / ny_features.shape[0]
print(missing_ratio.reset_index().sort_values(by=0, ascending=False))
ny_features_selected = ny_features[ny_features.columns[missing_ratio < 0.2]]
ny_features_selected.shape
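# Aside: a minimal, self-contained sketch of the missingness filter used above and applied
# to Massachusetts just below. The column names and values here are hypothetical, purely to
# illustrate which columns survive the 20% cutoff.
_toy = pd.DataFrame({
    'gt_cough':   [1.0, np.nan, 3.0, 4.0, 5.0],        # 1/5 = 20% missing -> dropped (not < 0.2)
    'gt_fever':   [1.0, 2.0, 3.0, 4.0, 5.0],           # 0% missing -> kept
    'sparse_col': [np.nan, np.nan, np.nan, 4.0, 5.0],  # 60% missing -> dropped
})
_toy_ratio = _toy.isna().sum(axis=0) / _toy.shape[0]
print(_toy[_toy.columns[_toy_ratio < 0.2]].columns.tolist())  # expected: ['gt_fever']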
missing_ratio = ma_features.isna().sum(axis=0) / ma_features.shape[0]
print(missing_ratio.reset_index().sort_values(by=0, ascending=False))
ma_features_selected = ma_features[ma_features.columns[missing_ratio < 0.2]]
ma_features_selected.shape

# Keep only the Google Trends features ('gt'/'gt2' prefixes) and drop the neighboring-state versions
# (every 'gt2' column name also contains 'gt', so a single substring check suffices)
gt_features = [col for col in cali_features_selected.columns if 'gt' in col and 'neighbor' not in col]
cali_features_selected = cali_features_selected[gt_features]
gt_features = [col for col in ny_features_selected.columns if 'gt' in col and 'neighbor' not in col]
ny_features_selected = ny_features_selected[gt_features]
gt_features = [col for col in ma_features_selected.columns if 'gt' in col and 'neighbor' not in col]
ma_features_selected = ma_features_selected[gt_features]

# Impute the remaining missing feature values with KNN
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

col_names = cali_features_selected.columns
imputer.fit(cali_features_selected)
imputed_data = imputer.transform(cali_features_selected)
imputed_cali_gt = pd.DataFrame(imputed_data, columns=col_names)

col_names = ny_features_selected.columns
imputer.fit(ny_features_selected)
imputed_data = imputer.transform(ny_features_selected)
imputed_ny_gt = pd.DataFrame(imputed_data, columns=col_names)

col_names = ma_features_selected.columns
imputer.fit(ma_features_selected)
imputed_data = imputer.transform(ma_features_selected)
imputed_ma_gt = pd.DataFrame(imputed_data, columns=col_names)

# Pearson correlation between each feature and the smoothed case counts
# (strictly, normality matters for the p-value reported by pearsonr, not for the coefficient itself)
from scipy.stats import pearsonr
from scipy.ndimage import shift

corr_value = []
p_value = []
for i in range(len(imputed_cali_gt.columns)):
    res = pearsonr(imputed_cali_gt.iloc[:, i], cali_outcomes)
    corr_value.append(res[0])
    p_value.append(res[1])

res_tab = pd.DataFrame({"features": imputed_cali_gt.columns,
                        "corr": corr_value,
                        "abs_corr": [abs(x) for x in corr_value],
                        "p_value": p_value})
res_tab['relation'] = np.where(res_tab['corr'] > 0, 'positive', 'negative')
res_tab = res_tab.sort_values(by='abs_corr', ascending=False)
res_tab.head()


def pcorrcoef(feature, outcome, shift_by):
    """Correlate each feature column with the outcome, optionally shifted by `shift_by` days."""
    corr_value = []
    p_value = []
    if shift_by != 0:
        outcome = shift(outcome.reset_index()['JHU_cases'].values, shift_by, cval=0)
    for i in range(len(feature.columns)):
        res = pearsonr(feature.iloc[:, i], outcome)
        corr_value.append(res[0])
        p_value.append(res[1])
    res_tab = pd.DataFrame({f"features_{shift_by}": feature.columns,
                            f"corr_{shift_by}": corr_value,
                            f"abs_corr_{shift_by}": [abs(x) for x in corr_value],
                            f"p_value_{shift_by}": p_value})
    res_tab[f"relation_{shift_by}"] = np.where(res_tab[f"corr_{shift_by}"] > 0, 'positive', 'negative')
    return res_tab


res_tab_0 = pcorrcoef(feature=imputed_cali_gt, outcome=cali_outcomes, shift_by=0)
res_tab_0 = res_tab_0.sort_values(by="abs_corr_0", ascending=False)
res_tab_0

# Features with correlation coefficient > 0.5 are treated as strongly correlated with cases
strong_cor = res_tab_0[res_tab_0['corr_0'] > 0.5]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='corr_0', data=strong_cor)
plt.title("Correlation coefficients of strongly correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")

# Features with correlation coefficient between 0.3 and 0.5 are treated as moderately correlated with cases
moderate_cor = res_tab_0[(res_tab_0['corr_0'] >= 0.3) & (res_tab_0['corr_0'] < 0.5)]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='corr_0', data=moderate_cor)
plt.title("Correlation coefficients of moderately correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")
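# Aside: how scipy.ndimage.shift lags a 1-D series, since pcorrcoef above relies on it and the
# lag sweep below ranges over -14..+14 days. Toy values, purely illustrative: a positive shift
# moves values toward later indices and pads the start with cval; a negative shift moves them
# earlier and pads the end. With a negative shift, today's feature values are paired with future
# case counts, which is the pairing relevant for a leading indicator.
_toy_cases = np.array([10., 20., 30., 40., 50.])
print(shift(_toy_cases, 2, cval=0))    # expected: [ 0.  0. 10. 20. 30.]
print(shift(_toy_cases, -2, cval=0))   # expected: [30. 40. 50.  0.  0.]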
# Repeat the correlation analysis with the outcome shifted by -14 to +14 days
shift_sample = np.arange(-14, 15)
for i in shift_sample:
    if i == shift_sample[0]:
        final_res_tab = pcorrcoef(feature=imputed_cali_gt, outcome=cali_outcomes, shift_by=i)
    else:
        tmp_res_tab = pcorrcoef(feature=imputed_cali_gt, outcome=cali_outcomes, shift_by=i)
        final_res_tab = pd.concat((final_res_tab, tmp_res_tab), axis=1)
final_res_tab

# For each feature, how much does the absolute correlation vary across lags?
select_abs_corr = [col for col in final_res_tab.columns if 'abs_corr' in col]
select_tab = final_res_tab[select_abs_corr].copy()
select_tab['minmax_diff'] = select_tab[select_abs_corr].apply(lambda x: max(x) - min(x), axis=1)
select_tab[['minmax_diff']].describe()

# Rank features by their best absolute correlation over all lags
final_res_tab['max_abs_corr'] = select_tab[select_abs_corr].apply(lambda x: max(x), axis=1)
final_res_tab = final_res_tab.sort_values(by='max_abs_corr', ascending=False)

# Features with max absolute correlation > 0.5 are treated as strongly correlated with cases
strong_cor = final_res_tab[final_res_tab['max_abs_corr'] > 0.5]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='max_abs_corr', data=strong_cor)
plt.title("Correlation coefficients of strongly correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")

# Features with max absolute correlation between 0.3 and 0.5 are treated as moderately correlated
moderate_cor = final_res_tab[(final_res_tab['max_abs_corr'] < 0.5) & (final_res_tab['max_abs_corr'] >= 0.3)]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='max_abs_corr', data=moderate_cor)
plt.title("Correlation coefficients of moderately correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")

# All features with max absolute correlation >= 0.3
strong_moderate_cor = final_res_tab[final_res_tab['max_abs_corr'] >= 0.3]
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.barplot(y='features_0', x='max_abs_corr', data=strong_moderate_cor)
plt.title("Correlation coefficients of strongly and moderately correlated features with COVID-19 cases")
plt.xlabel("correlation coefficients")
plt.ylabel("features")

moderate_features = moderate_cor['features_0'].values
print(moderate_features)
strong_features = strong_cor['features_0'].values
print(strong_features)
full_selected_features = list(strong_features) + list(moderate_features)
print(full_selected_features)

# Check for multicollinearity among the selected features
fig, ax = plt.subplots(1, 1, figsize=(15, 8))
corr_res = imputed_cali_gt[full_selected_features].corr()
sns.heatmap(corr_res)

# Drop one feature that the heatmap shows is highly correlated with the others
full_selected_features.remove('gt2_Facial nerve paralysis')
full_selected_features
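# Aside: a minimal sketch (not part of the analysis above) of how the heatmap inspection could be
# made programmatic: flag any pair of selected features whose pairwise absolute correlation exceeds
# a threshold. The 0.9 cutoff is an assumption for illustration only; the single removal above
# ('gt2_Facial nerve paralysis') follows the visual heatmap inspection.
redundant_pairs = [
    (a, b, corr_res.loc[a, b])
    for i, a in enumerate(corr_res.columns)
    for b in corr_res.columns[i + 1:]
    if abs(corr_res.loc[a, b]) > 0.9
]
print(redundant_pairs)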