Loaded Pandas, mounted csv file via drive. printed head
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv'
import pandas as pd
data_1 = pd.read_csv(file_path)
print(data_1.head())
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True). Data As Of Start Date End Date Group Year Month State \ 0 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States 1 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States 2 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States 3 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States 4 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States Condition Group Condition ICD10_codes Age Group \ 0 Respiratory diseases Influenza and pneumonia J09-J18 0-24 1 Respiratory diseases Influenza and pneumonia J09-J18 25-34 2 Respiratory diseases Influenza and pneumonia J09-J18 35-44 3 Respiratory diseases Influenza and pneumonia J09-J18 45-54 4 Respiratory diseases Influenza and pneumonia J09-J18 55-64 COVID-19 Deaths Number of Mentions Flag 0 1569.0 1647.0 NaN 1 5804.0 6029.0 NaN 2 15080.0 15699.0 NaN 3 37414.0 38878.0 NaN 4 82668.0 85708.0 NaN
from google.colab import drive
drive.mount('/content/drive')
Gathered basic statistical and descriptive information
data_1_shape = data_1.shape
# Descriptive statistics for all columns
data_1_describe = data_1.describe(include='all')
# Display the last few rows of the DataFrame
data_1_tail = data_1.tail()
# Display the data types of each column
data_1_dtypes = data_1.dtypes
data_1_shape, data_1_describe, data_1_tail, data_1_dtypes
((621000, 14), Data As Of Start Date End Date Group Year \ count 621000 621000 621000 621000 608580.000000 unique 1 45 45 3 NaN top 09/24/2023 01/01/2020 09/23/2023 By Month NaN freq 621000 37260 37260 558900 NaN mean NaN NaN NaN NaN 2021.408163 std NaN NaN NaN NaN 1.086436 min NaN NaN NaN NaN 2020.000000 25% NaN NaN NaN NaN 2020.000000 50% NaN NaN NaN NaN 2021.000000 75% NaN NaN NaN NaN 2022.000000 max NaN NaN NaN NaN 2023.000000 Month State Condition Group \ count 558900.000000 621000 621000 unique NaN 54 12 top NaN United States Circulatory diseases freq NaN 11500 189000 mean 6.200000 NaN NaN std 3.350625 NaN NaN min 1.000000 NaN NaN 25% 3.000000 NaN NaN 50% 6.000000 NaN NaN 75% 9.000000 NaN NaN max 12.000000 NaN NaN Condition ICD10_codes Age Group COVID-19 Deaths \ count 621000 621000 621000 4.375510e+05 unique 23 23 10 NaN top Influenza and pneumonia J09-J18 0-24 NaN freq 27000 27000 62100 NaN mean NaN NaN NaN 1.201179e+02 std NaN NaN NaN 2.980201e+03 min NaN NaN NaN 0.000000e+00 25% NaN NaN NaN 0.000000e+00 50% NaN NaN NaN 0.000000e+00 75% NaN NaN NaN 1.800000e+01 max NaN NaN NaN 1.146242e+06 Number of Mentions Flag count 4.434230e+05 183449 unique NaN 1 top NaN One or more data cells have counts between 1-9... freq NaN 183449 mean 1.293348e+02 NaN std 3.203936e+03 NaN min 0.000000e+00 NaN 25% 0.000000e+00 NaN 50% 0.000000e+00 NaN 75% 1.900000e+01 NaN max 1.146242e+06 NaN , Data As Of Start Date End Date Group Year Month \ 620995 09/24/2023 05/01/2023 05/31/2023 By Month 2023.0 5.0 620996 09/24/2023 06/01/2023 06/30/2023 By Month 2023.0 6.0 620997 09/24/2023 07/01/2023 07/31/2023 By Month 2023.0 7.0 620998 09/24/2023 08/01/2023 08/31/2023 By Month 2023.0 8.0 620999 09/24/2023 09/01/2023 09/23/2023 By Month 2023.0 9.0 State Condition Group Condition ICD10_codes Age Group \ 620995 Puerto Rico COVID-19 COVID-19 U071 All Ages 620996 Puerto Rico COVID-19 COVID-19 U071 All Ages 620997 Puerto Rico COVID-19 COVID-19 U071 All Ages 620998 Puerto Rico COVID-19 COVID-19 U071 All Ages 620999 Puerto Rico COVID-19 COVID-19 U071 All Ages COVID-19 Deaths Number of Mentions Flag 620995 67.0 67.0 NaN 620996 122.0 122.0 NaN 620997 114.0 114.0 NaN 620998 78.0 78.0 NaN 620999 36.0 36.0 NaN , Data As Of object Start Date object End Date object Group object Year float64 Month float64 State object Condition Group object Condition object ICD10_codes object Age Group object COVID-19 Deaths float64 Number of Mentions float64 Flag object dtype: object)
data_1 = pd.DataFrame(data_1)
data_1['Data As Of'] = pd.to_datetime(data_1['Data As Of'])
data_1['Start Date'] = pd.to_datetime(data_1['Start Date'])
data_1 ['End Date'] = pd.to_datetime(data_1['End Date'])
print(data_1)
Data As Of Start Date End Date Group Year Month \ 0 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN 1 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN 2 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN 3 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN 4 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN ... ... ... ... ... ... ... 620995 2023-09-24 2023-05-01 2023-05-31 By Month 2023.0 5.0 620996 2023-09-24 2023-06-01 2023-06-30 By Month 2023.0 6.0 620997 2023-09-24 2023-07-01 2023-07-31 By Month 2023.0 7.0 620998 2023-09-24 2023-08-01 2023-08-31 By Month 2023.0 8.0 620999 2023-09-24 2023-09-01 2023-09-23 By Month 2023.0 9.0 State Condition Group Condition \ 0 United States Respiratory diseases Influenza and pneumonia 1 United States Respiratory diseases Influenza and pneumonia 2 United States Respiratory diseases Influenza and pneumonia 3 United States Respiratory diseases Influenza and pneumonia 4 United States Respiratory diseases Influenza and pneumonia ... ... ... ... 620995 Puerto Rico COVID-19 COVID-19 620996 Puerto Rico COVID-19 COVID-19 620997 Puerto Rico COVID-19 COVID-19 620998 Puerto Rico COVID-19 COVID-19 620999 Puerto Rico COVID-19 COVID-19 ICD10_codes Age Group COVID-19 Deaths Number of Mentions Flag 0 J09-J18 0-24 1569.0 1647.0 NaN 1 J09-J18 25-34 5804.0 6029.0 NaN 2 J09-J18 35-44 15080.0 15699.0 NaN 3 J09-J18 45-54 37414.0 38878.0 NaN 4 J09-J18 55-64 82668.0 85708.0 NaN ... ... ... ... ... ... 620995 U071 All Ages 67.0 67.0 NaN 620996 U071 All Ages 122.0 122.0 NaN 620997 U071 All Ages 114.0 114.0 NaN 620998 U071 All Ages 78.0 78.0 NaN 620999 U071 All Ages 36.0 36.0 NaN [621000 rows x 14 columns]
Some Charting Below as Part of EDA
import matplotlib.pyplot as plt
import seaborn as sns
age_group_counts_data_1 = data_1['Age Group'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(age_group_counts_data_1.index, age_group_counts_data_1.values, color='skyblue')
plt.title('Frequency Distribution of Age Groups in Data 1')
plt.xlabel('Age Group')
plt.ylabel('Frequency')
plt.xticks(rotation=45) # Rotate x-axis labels to show clearly
plt.show()
plt.figure(figsize=(10, 6))
sns.boxplot(x='Age Group', y='COVID-19 Deaths', data=data_1)
plt.title('COVID-19 Deaths by Age Group in Data 1')
plt.xlabel('Age Group')
plt.ylabel('COVID-19 Deaths')
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-00a1b1d3599e> in <cell line: 4>() 2 import seaborn as sns 3 ----> 4 age_group_counts_data_1 = data_1['Age Group'].value_counts() 5 6 plt.figure(figsize=(10, 6)) NameError: name 'data_1' is not defined
Covid19 Deaths by Condition and Age Group
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 6))
barplot1 = sns.barplot(x='Condition Group', y='COVID-19 Deaths', hue='Age Group', data=data_1)
plt.title('COVID-19 Deaths by Condition Group and Age Group')
plt.xlabel('Condition Group')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot1.set_xticklabels(barplot1.get_xticklabels(), rotation=90) # Rotate x-axis labels
plt.show()
unique_states = data_1['State'].unique()
data_1_no_us = data_1[data_1['State'] != 'United States']
plt.figure(figsize=(12, 6))
barplot_no_us = sns.barplot(x='State', y='COVID-19 Deaths', hue='Age Group', data=data_1_no_us)
plt.title('COVID-19 Deaths by State and Age Group (excluding United States)')
plt.xlabel('State')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot_no_us.set_xticklabels(barplot_no_us.get_xticklabels(), rotation=90)
plt.tight_layout()
plt.show()
Checked for N/a as part pof preprocessing for ML models
missing_values_count = data_1_no_us.isnull().sum()
print(missing_values_count)
Data As Of 0 Start Date 0 End Date 0 Group 0 Year 12190 Month 60950 State 0 Condition Group 0 Condition 0 ICD10_codes 0 Age Group 0 COVID-19 Deaths 183449 Number of Mentions 177577 Flag 426051 dtype: int64
There were quite aq few N/a and bad data, which I deleted.
data_1_cleaned = data_1.dropna(subset=['COVID-19 Deaths', 'Number of Mentions'])
Checked to see how much of the data was removed above
original_row_count = data_1.shape[0]
cleaned_row_count = data_1_cleaned.shape[0]
rows_dropped = original_row_count - cleaned_row_count
print(f"Original number of rows: {original_row_count}")
print(f"Number of rows after cleaning: {cleaned_row_count}")
print(f"Number of rows dropped: {rows_dropped}")
Original number of rows: 621000 Number of rows after cleaning: 437551 Number of rows dropped: 183449
While running the ML models below, I had issues with categorical columns (this data is mostly categorical) below, I hot-encoded it to make it numerical.
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
date_cols = ['Data As Of', 'Start Date', 'End Date', 'Year', 'Month']
data_1_cleaned = data_1_cleaned.drop(columns=date_cols)
categorical_cols = data_1_cleaned.select_dtypes(include=['object', 'category']).columns
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
data_1_encoded = pd.DataFrame(encoder.fit_transform(data_1_cleaned[categorical_cols]))
data_1_encoded.columns = encoder.get_feature_names_out(categorical_cols)
data_1_encoded.index = data_1_cleaned.index
num_data_1_cleaned = data_1_cleaned.drop(categorical_cols, axis=1)
data_1_preprocessed = pd.concat([num_data_1_cleaned, data_1_encoded], axis=1)
/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_encoders.py:868: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value. warnings.warn(
from sklearn.preprocessing import OneHotEncoder
date_cols = ['Data As Of', 'Start Date', 'End Date', 'Year', 'Month']
data_1_cleaned = data_1_cleaned.drop(columns=date_cols, errors='ignore')
categorical_cols = data_1_cleaned.select_dtypes(include=['object', 'category']).columns
# Applying One-Hot Encoding
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
data_1_encoded = pd.DataFrame(encoder.fit_transform(data_1_cleaned[categorical_cols]))
data_1_encoded.index = data_1_cleaned.index
num_data_1_cleaned = data_1_cleaned.drop(categorical_cols, axis=1)
data_1_preprocessed = pd.concat([num_data_1_cleaned, data_1_encoded], axis=1)
/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_encoders.py:868: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value. warnings.warn(
The first Model - SKLearn Train, Test, Split
from sklearn.model_selection import train_test_split
# 'COVID-19 Deaths' is the target variable
y = data_1_preprocessed['COVID-19 Deaths']
X = data_1_preprocessed.drop('COVID-19 Deaths', axis=1)
# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")
Training set size: 350040 rows Testing set size: 87511 rows
Just viewing my pre-processed data to see what it looks like
data_1_preprocessed.head()
COVID-19 Deaths | Number of Mentions | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | ... | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1569.0 | 1647.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 5804.0 | 6029.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
2 | 15080.0 | 15699.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 37414.0 | 38878.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 82668.0 | 85708.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
5 rows × 128 columns
checking dtypes for later ML models
print(X_train.dtypes)
Number of Mentions float64 0 float64 1 float64 2 float64 3 float64 ... 121 float64 122 float64 123 float64 124 float64 125 float64 Length: 127, dtype: object
Train/Test Split View
print(X_train.columns)
print(X_test.columns)
Index(['Number of Mentions', 0, 1, 2, 3, 4, 5, 6, 7, 8, ... 116, 117, 118, 119, 120, 121, 122, 123, 124, 125], dtype='object', length=127) Index(['Number of Mentions', 0, 1, 2, 3, 4, 5, 6, 7, 8, ... 116, 117, 118, 119, 120, 121, 122, 123, 124, 125], dtype='object', length=127)
print(X_train.dtypes)
Number of Mentions float64 0 float64 1 float64 2 float64 3 float64 ... 121 float64 122 float64 123 float64 124 float64 125 float64 Length: 127, dtype: object
# Check for missing values
print(X_train.isnull().sum())
# Check data types
print(X_train.dtypes)
Number of Mentions 0 0 0 1 0 2 0 3 0 .. 121 0 122 0 123 0 124 0 125 0 Length: 127, dtype: int64 Number of Mentions float64 0 float64 1 float64 2 float64 3 float64 ... 121 float64 122 float64 123 float64 124 float64 125 float64 Length: 127, dtype: object
Converted all types to str for ML modelling below
# Convert all column names to strings
X.columns = X.columns.astype(str)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
Training Linear Regression Model on the dataset
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
Checking Linear Regression Model Metrics
y_pred = model.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
Mean Squared Error: 119573.1576029768 R^2 Score: 0.9530692937008057
coefficients = model.coef_
Training Columns and Feature Importance
feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])
Graphing Feature Importance
import matplotlib.pyplot as plt
feature_importance.sort_values(by='Coefficient', ascending=False).plot(kind='bar', figsize=(12,6))
plt.title('Feature Importance in Linear Regression Model')
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.show()
Running Standard Scaler to fix the data before fruther training
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
Training and gathering metrics on Linear Regression Model. Below is the same as above but with the scaled data so it is easy to visualise
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred_scaled = model.predict(X_test_scaled)
from sklearn.metrics import mean_squared_error, r2_score
y_pred_scaled = model.predict(X_test_scaled)
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
print(f"Mean Squared Error: {mse_scaled}")
r2_scaled = r2_score(y_test, y_pred_scaled)
print(f"R^2 Score: {r2_scaled}")
Mean Squared Error: 119530.27704932197 R^2 Score: 0.9530861236876511
coefficients = model.coef_
feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])
feature_importance.sort_values(by='Coefficient', ascending=False).plot(kind='bar', figsize=(12,6))
plt.title('Feature Importance in Linear Regression Model')
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.show()
coefficients = model.coef_
feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])
sorted_features = feature_importance.sort_values(by='Coefficient', ascending=False)
sorted_features.plot(kind='bar', figsize=(12,6))
plt.title('Feature Importance in Linear Regression Model')
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.show()
Viewing Feature Importance Data
coefficients = model.coef_
feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])
print(feature_importance)
Coefficient Number of Mentions 3.193643e+03 0 -3.228707e+14 1 -1.584457e+14 2 -2.892308e+14 3 -2.441330e+14 ... ... 121 -6.542653e+14 122 -6.586453e+14 123 -7.226933e+14 124 -8.236515e+14 125 0.000000e+00 [127 rows x 1 columns]
# Number of rows to display at a time
chunk_size = 10
# Iterate over the DataFrame in chunks
for start in range(0, len(feature_importance), chunk_size):
end = start + chunk_size
print(feature_importance.iloc[start:end])
print("\n") # Print a newline for better separation between chunks
Coefficient Number of Mentions 3.193643e+03 0 -3.228707e+14 1 -1.584457e+14 2 -2.892308e+14 3 -2.441330e+14 4 -2.605345e+14 5 -2.471668e+14 6 -2.453000e+14 7 -2.624680e+14 8 -2.473037e+14 Coefficient 9 -2.503349e+14 10 -2.508170e+14 11 -2.583988e+14 12 -2.614571e+14 13 -2.499485e+14 14 -2.540121e+14 15 -2.525443e+14 16 -2.523530e+14 17 -2.497357e+14 18 -2.481822e+14 Coefficient 19 -2.465004e+14 20 -2.544298e+14 21 -2.459108e+14 22 -2.563755e+14 23 -2.485521e+14 24 -2.499291e+14 25 -2.539170e+14 26 -2.504892e+14 27 -2.471276e+14 28 -2.482990e+14 Coefficient 29 -2.551684e+14 30 -2.484353e+14 31 -2.449841e+14 32 -2.551495e+14 33 -2.527545e+14 34 -2.426008e+14 35 -2.530982e+14 36 -2.476750e+14 37 -2.518932e+14 38 -2.543539e+14 Coefficient 39 -2.578759e+14 40 -2.469122e+14 41 -2.459305e+14 42 -2.565072e+14 43 -2.445688e+14 44 -2.552629e+14 45 -2.481432e+14 46 -2.518932e+14 47 -2.523339e+14 48 -2.624313e+14 Coefficient 49 -3.005830e+14 50 -2.419610e+14 51 -2.645681e+14 52 -2.498711e+14 53 -2.474406e+14 54 -2.529073e+14 55 -2.492513e+14 56 -2.615676e+14 57 3.310441e+14 58 1.508162e+14 Coefficient 59 1.371026e+14 60 2.154090e+14 61 -3.856187e+14 62 -5.358547e+14 63 -4.984364e+14 64 -5.795386e+14 65 -2.881679e+12 66 1.210663e+14 67 6.967718e+14 68 3.919754e+14 Coefficient 69 -4.097677e+13 70 -3.093303e+14 71 -4.052706e+14 72 -6.728743e+14 73 -2.441636e+14 74 -3.476412e+14 75 9.645447e+13 76 -7.996895e+14 77 2.970814e+14 78 2.487471e+14 Coefficient 79 1.988275e+14 80 -2.272504e+14 81 1.396508e+14 82 -1.633230e+14 83 1.031200e+14 84 -1.893600e+14 85 -1.537223e+14 86 -9.015754e+13 87 -3.362053e+14 88 2.154295e+13 Coefficient 89 -1.625299e+14 90 -8.376282e+14 91 -3.045707e+14 92 -3.263975e+14 93 -1.573358e+14 94 9.804019e+13 95 -2.107130e+14 96 4.728852e+14 97 -4.112718e+14 98 -6.801835e+13 Coefficient 99 -2.302176e+14 100 -6.023744e+14 101 -2.371293e+14 102 -5.434325e+13 103 -1.477647e+14 104 -6.502466e+14 105 -4.921786e+14 106 -2.558928e+14 107 -1.370602e+14 108 4.421373e+14 Coefficient 109 -3.167915e+14 110 -2.010348e+14 111 3.930446e+13 112 -3.888962e+14 113 1.032215e+14 114 2.224110e+14 115 -7.720316e+14 116 -7.419122e+14 117 -7.079868e+14 118 -6.761242e+14 Coefficient 119 -6.560650e+14 120 -6.505726e+14 121 -6.542653e+14 122 -6.586453e+14 123 -7.226933e+14 124 -8.236515e+14 125 0.000000e+00
Running Lasso ML Model and Running Metrics
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)
Lasso()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Lasso()
from sklearn.metrics import mean_squared_error, r2_score
y_pred = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
Mean Squared Error: 119629.68971020046 R^2 Score: 0.9530471056799006
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
Mean Squared Error: 119629.68971020046 R^2 Score: 0.9530471056799006
Calculating Metrics
from sklearn import metrics
import numpy as np
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
mae = metrics.mean_absolute_error(y_test, y_pred)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
Root Mean Squared Error (RMSE): 345.8752516590345 Mean Absolute Error (MAE): 18.54945519148683
Running Residual Plot
import matplotlib.pyplot as plt
residuals = y_test - y_pred
plt.figure(figsize=(10,6))
plt.scatter(y_pred, residuals)
plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), colors='red')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
Running Cross Val Score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)
print("Cross-validated RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())
Cross-validated RMSE scores: [1094.62584254 41.30816971 36.25026322 36.84615 41.14088855] Mean RMSE: 250.03426280424463
Running KFold ML Model
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(kf.split(X))
train_indices, test_indices = folds[0]
X_fold1, y_fold1 = X.iloc[test_indices], y.iloc[test_indices]
print(X_fold1.describe())
print(y_fold1.describe())
print(X.describe())
print(y.describe())
Number of Mentions 0 1 2 \ count 87511.000000 87511.000000 87511.000000 87511.000000 mean 115.520952 0.888700 0.023951 0.087349 std 1855.304203 0.314505 0.152898 0.282347 min 0.000000 0.000000 0.000000 0.000000 25% 0.000000 1.000000 0.000000 0.000000 50% 0.000000 1.000000 0.000000 0.000000 75% 19.000000 1.000000 0.000000 0.000000 max 298663.000000 1.000000 1.000000 1.000000 3 4 5 6 7 \ count 87511.000000 87511.000000 87511.000000 87511.000000 87511.000000 mean 0.017643 0.020249 0.017609 0.018386 0.020455 std 0.131653 0.140851 0.131527 0.134344 0.141550 min 0.000000 0.000000 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 0.000000 0.000000 50% 0.000000 0.000000 0.000000 0.000000 0.000000 75% 0.000000 0.000000 0.000000 0.000000 0.000000 max 1.000000 1.000000 1.000000 1.000000 1.000000 8 ... 116 117 118 \ count 87511.000000 ... 87511.000000 87511.000000 87511.000000 mean 0.017723 ... 0.111449 0.099325 0.089372 std 0.131945 ... 0.314689 0.299099 0.285281 min 0.000000 ... 0.000000 0.000000 0.000000 25% 0.000000 ... 0.000000 0.000000 0.000000 50% 0.000000 ... 0.000000 0.000000 0.000000 75% 0.000000 ... 0.000000 0.000000 0.000000 max 1.000000 ... 1.000000 1.000000 1.000000 119 120 121 122 123 \ count 87511.000000 87511.000000 87511.000000 87511.000000 87511.000000 mean 0.082024 0.081795 0.083270 0.084389 0.104890 std 0.274403 0.274054 0.276291 0.277972 0.306413 min 0.000000 0.000000 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 0.000000 0.000000 50% 0.000000 0.000000 0.000000 0.000000 0.000000 75% 0.000000 0.000000 0.000000 0.000000 0.000000 max 1.000000 1.000000 1.000000 1.000000 1.000000 124 125 count 87511.000000 87511.0 mean 0.142851 1.0 std 0.349922 0.0 min 0.000000 1.0 25% 0.000000 1.0 50% 0.000000 1.0 75% 0.000000 1.0 max 1.000000 1.0 [8 rows x 127 columns] count 87511.000000 mean 105.524231 std 1596.212786 min 0.000000 25% 0.000000 50% 0.000000 75% 18.000000 max 194736.000000 Name: COVID-19 Deaths, dtype: float64 Number of Mentions 0 1 2 \ count 4.375510e+05 437551.000000 437551.000000 437551.000000 mean 1.309027e+02 0.887896 0.024482 0.087622 std 3.225334e+03 0.315494 0.154539 0.282744 min 0.000000e+00 0.000000 0.000000 0.000000 25% 0.000000e+00 1.000000 0.000000 0.000000 50% 0.000000e+00 1.000000 0.000000 0.000000 75% 1.900000e+01 1.000000 0.000000 0.000000 max 1.146242e+06 1.000000 1.000000 1.000000 3 4 5 6 \ count 437551.000000 437551.000000 437551.000000 437551.000000 mean 0.017349 0.019829 0.017694 0.017632 std 0.130568 0.139411 0.131837 0.131611 min 0.000000 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 0.000000 50% 0.000000 0.000000 0.000000 0.000000 75% 0.000000 0.000000 0.000000 0.000000 max 1.000000 1.000000 1.000000 1.000000 7 8 ... 116 117 \ count 437551.000000 437551.000000 ... 437551.000000 437551.000000 mean 0.020110 0.017733 ... 0.110684 0.099367 std 0.140376 0.131979 ... 0.313741 0.299154 min 0.000000 0.000000 ... 0.000000 0.000000 25% 0.000000 0.000000 ... 0.000000 0.000000 50% 0.000000 0.000000 ... 0.000000 0.000000 75% 0.000000 0.000000 ... 0.000000 0.000000 max 1.000000 1.000000 ... 1.000000 1.000000 118 119 120 121 \ count 437551.000000 437551.000000 437551.000000 437551.000000 mean 0.089608 0.083519 0.082244 0.083364 std 0.285619 0.276666 0.274737 0.276432 min 0.000000 0.000000 0.000000 0.000000 25% 0.000000 0.000000 0.000000 0.000000 50% 0.000000 0.000000 0.000000 0.000000 75% 0.000000 0.000000 0.000000 0.000000 max 1.000000 1.000000 1.000000 1.000000 122 123 124 125 count 437551.000000 437551.000000 437551.000000 437551.0 mean 0.084575 0.104253 0.141387 1.0 std 0.278249 0.305589 0.348421 0.0 min 0.000000 0.000000 0.000000 1.0 25% 0.000000 0.000000 0.000000 1.0 50% 0.000000 0.000000 0.000000 1.0 75% 0.000000 0.000000 0.000000 1.0 max 1.000000 1.000000 1.000000 1.0 [8 rows x 127 columns] count 4.375510e+05 mean 1.201179e+02 std 2.980201e+03 min 0.000000e+00 25% 0.000000e+00 50% 0.000000e+00 75% 1.800000e+01 max 1.146242e+06 Name: COVID-19 Deaths, dtype: float64
Random Forest
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
print(f"Random Forest - Mean Squared Error: {rf_mse}")
print(f"Random Forest - R^2 Score: {rf_r2}")
Random Forest - Mean Squared Error: 230110.6785765412 Random Forest - R^2 Score: 0.9096849419295162
Gradient Boosting
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
print(f"XGBoost - Mean Squared Error: {xgb_mse}")
print(f"XGBoost - R^2 Score: {xgb_r2}")
XGBoost - Mean Squared Error: 1435663.2412159576 XGBoost - R^2 Score: 0.4365232860892677
Ridge Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
data_1_preprocessed.columns = data_1_preprocessed.columns.astype(str)
# Sample 3% of the data
sampled_data = data_1_preprocessed.sample(frac=0.03, random_state=42)
# Split the Sample
X_sample = sampled_data.drop('COVID-19 Deaths', axis=1)
y_sample = sampled_data['COVID-19 Deaths']
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)
ridge_model = Ridge(random_state=42)
ridge_model.fit(X_train_sample, y_train_sample)
ridge_predictions = ridge_model.predict(X_test_sample)
ridge_mse = mean_squared_error(y_test_sample, ridge_predictions)
ridge_r2 = r2_score(y_test_sample, ridge_predictions)
print(f"Ridge Regression - Mean Squared Error: {ridge_mse}")
print(f"Ridge Regression - R^2 Score: {ridge_r2}")
Ridge Regression - Mean Squared Error: 69549.19693577621 Ridge Regression - R^2 Score: 0.8998919383066677
SVR Model (on 3% of the data as 100% was taking hours to process
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
sampled_data = data_1_preprocessed.sample(frac=0.03, random_state=42)
X_sample = sampled_data.drop('COVID-19 Deaths', axis=1)
y_sample = sampled_data['COVID-19 Deaths']
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)
svr_model = SVR()
svr_model.fit(X_train_sample, y_train_sample)
y_pred_sample = svr_model.predict(X_test_sample)
mse = mean_squared_error(y_test_sample, y_pred_sample)
r2 = r2_score(y_test_sample, y_pred_sample)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
Mean Squared Error: 684977.500927637 R^2 Score: 0.014053749826476558
Running Ridge Regression Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
# Sample 3% of the data
sampled_data = data_1_preprocessed.sample(frac=0.03, random_state=42)
# Split the Sample
X_sample = sampled_data.drop('COVID-19 Deaths', axis=1)
y_sample = sampled_data['COVID-19 Deaths']
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)
ridge_model = Ridge(random_state=42)
ridge_model.fit(X_train_sample, y_train_sample)
ridge_predictions = ridge_model.predict(X_test_sample)
ridge_mse = mean_squared_error(y_test_sample, ridge_predictions)
ridge_r2 = r2_score(y_test_sample, ridge_predictions)
print(f"Ridge Regression - Mean Squared Error: {ridge_mse}")
print(f"Ridge Regression - R^2 Score: {ridge_r2}")
Ridge Regression - Mean Squared Error: 69549.19693577621 Ridge Regression - R^2 Score: 0.8998919383066677