Loaded Pandas, mounted csv file via drive. printed head

New Section¶

In [ ]:

import pandas as pd

from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/My Drive/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv'

import pandas as pd
data_1 = pd.read_csv(file_path)

print(data_1.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Data As Of  Start Date    End Date     Group  Year  Month          State  \
0  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   
1  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   
2  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   
3  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   
4  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   

        Condition Group                Condition ICD10_codes Age Group  \
0  Respiratory diseases  Influenza and pneumonia     J09-J18      0-24   
1  Respiratory diseases  Influenza and pneumonia     J09-J18     25-34   
2  Respiratory diseases  Influenza and pneumonia     J09-J18     35-44   
3  Respiratory diseases  Influenza and pneumonia     J09-J18     45-54   
4  Respiratory diseases  Influenza and pneumonia     J09-J18     55-64   

   COVID-19 Deaths  Number of Mentions Flag  
0           1569.0              1647.0  NaN  
1           5804.0              6029.0  NaN  
2          15080.0             15699.0  NaN  
3          37414.0             38878.0  NaN  
4          82668.0             85708.0  NaN

In [ ]:

from google.colab import drive
drive.mount('/content/drive')

Gathered basic statistical and descriptive information

In [ ]:

data_1_shape = data_1.shape

# Descriptive statistics for all columns
data_1_describe = data_1.describe(include='all')

# Display the last few rows of the DataFrame
data_1_tail = data_1.tail()

# Display the data types of each column
data_1_dtypes = data_1.dtypes

data_1_shape, data_1_describe, data_1_tail, data_1_dtypes

Out[ ]:

((621000, 14),
         Data As Of  Start Date    End Date     Group           Year  \
 count       621000      621000      621000    621000  608580.000000   
 unique           1          45          45         3            NaN   
 top     09/24/2023  01/01/2020  09/23/2023  By Month            NaN   
 freq        621000       37260       37260    558900            NaN   
 mean           NaN         NaN         NaN       NaN    2021.408163   
 std            NaN         NaN         NaN       NaN       1.086436   
 min            NaN         NaN         NaN       NaN    2020.000000   
 25%            NaN         NaN         NaN       NaN    2020.000000   
 50%            NaN         NaN         NaN       NaN    2021.000000   
 75%            NaN         NaN         NaN       NaN    2022.000000   
 max            NaN         NaN         NaN       NaN    2023.000000   
 
                 Month          State       Condition Group  \
 count   558900.000000         621000                621000   
 unique            NaN             54                    12   
 top               NaN  United States  Circulatory diseases   
 freq              NaN          11500                189000   
 mean         6.200000            NaN                   NaN   
 std          3.350625            NaN                   NaN   
 min          1.000000            NaN                   NaN   
 25%          3.000000            NaN                   NaN   
 50%          6.000000            NaN                   NaN   
 75%          9.000000            NaN                   NaN   
 max         12.000000            NaN                   NaN   
 
                       Condition ICD10_codes Age Group  COVID-19 Deaths  \
 count                    621000      621000    621000     4.375510e+05   
 unique                       23          23        10              NaN   
 top     Influenza and pneumonia     J09-J18      0-24              NaN   
 freq                      27000       27000     62100              NaN   
 mean                        NaN         NaN       NaN     1.201179e+02   
 std                         NaN         NaN       NaN     2.980201e+03   
 min                         NaN         NaN       NaN     0.000000e+00   
 25%                         NaN         NaN       NaN     0.000000e+00   
 50%                         NaN         NaN       NaN     0.000000e+00   
 75%                         NaN         NaN       NaN     1.800000e+01   
 max                         NaN         NaN       NaN     1.146242e+06   
 
         Number of Mentions                                               Flag  
 count         4.434230e+05                                             183449  
 unique                 NaN                                                  1  
 top                    NaN  One or more data cells have counts between 1-9...  
 freq                   NaN                                             183449  
 mean          1.293348e+02                                                NaN  
 std           3.203936e+03                                                NaN  
 min           0.000000e+00                                                NaN  
 25%           0.000000e+00                                                NaN  
 50%           0.000000e+00                                                NaN  
 75%           1.900000e+01                                                NaN  
 max           1.146242e+06                                                NaN  ,
         Data As Of  Start Date    End Date     Group    Year  Month  \
 620995  09/24/2023  05/01/2023  05/31/2023  By Month  2023.0    5.0   
 620996  09/24/2023  06/01/2023  06/30/2023  By Month  2023.0    6.0   
 620997  09/24/2023  07/01/2023  07/31/2023  By Month  2023.0    7.0   
 620998  09/24/2023  08/01/2023  08/31/2023  By Month  2023.0    8.0   
 620999  09/24/2023  09/01/2023  09/23/2023  By Month  2023.0    9.0   
 
               State Condition Group Condition ICD10_codes Age Group  \
 620995  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 620996  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 620997  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 620998  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 620999  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 
         COVID-19 Deaths  Number of Mentions Flag  
 620995             67.0                67.0  NaN  
 620996            122.0               122.0  NaN  
 620997            114.0               114.0  NaN  
 620998             78.0                78.0  NaN  
 620999             36.0                36.0  NaN  ,
 Data As Of             object
 Start Date             object
 End Date               object
 Group                  object
 Year                  float64
 Month                 float64
 State                  object
 Condition Group        object
 Condition              object
 ICD10_codes            object
 Age Group              object
 COVID-19 Deaths       float64
 Number of Mentions    float64
 Flag                   object
 dtype: object)

In [ ]:

data_1 = pd.DataFrame(data_1)

data_1['Data As Of'] = pd.to_datetime(data_1['Data As Of'])
data_1['Start Date'] = pd.to_datetime(data_1['Start Date'])
data_1 ['End Date'] = pd.to_datetime(data_1['End Date'])

print(data_1)

       Data As Of Start Date   End Date     Group    Year  Month  \
0      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
1      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
2      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
3      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
4      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
...           ...        ...        ...       ...     ...    ...   
620995 2023-09-24 2023-05-01 2023-05-31  By Month  2023.0    5.0   
620996 2023-09-24 2023-06-01 2023-06-30  By Month  2023.0    6.0   
620997 2023-09-24 2023-07-01 2023-07-31  By Month  2023.0    7.0   
620998 2023-09-24 2023-08-01 2023-08-31  By Month  2023.0    8.0   
620999 2023-09-24 2023-09-01 2023-09-23  By Month  2023.0    9.0   

                State       Condition Group                Condition  \
0       United States  Respiratory diseases  Influenza and pneumonia   
1       United States  Respiratory diseases  Influenza and pneumonia   
2       United States  Respiratory diseases  Influenza and pneumonia   
3       United States  Respiratory diseases  Influenza and pneumonia   
4       United States  Respiratory diseases  Influenza and pneumonia   
...               ...                   ...                      ...   
620995    Puerto Rico              COVID-19                 COVID-19   
620996    Puerto Rico              COVID-19                 COVID-19   
620997    Puerto Rico              COVID-19                 COVID-19   
620998    Puerto Rico              COVID-19                 COVID-19   
620999    Puerto Rico              COVID-19                 COVID-19   

       ICD10_codes Age Group  COVID-19 Deaths  Number of Mentions Flag  
0          J09-J18      0-24           1569.0              1647.0  NaN  
1          J09-J18     25-34           5804.0              6029.0  NaN  
2          J09-J18     35-44          15080.0             15699.0  NaN  
3          J09-J18     45-54          37414.0             38878.0  NaN  
4          J09-J18     55-64          82668.0             85708.0  NaN  
...            ...       ...              ...                 ...  ...  
620995        U071  All Ages             67.0                67.0  NaN  
620996        U071  All Ages            122.0               122.0  NaN  
620997        U071  All Ages            114.0               114.0  NaN  
620998        U071  All Ages             78.0                78.0  NaN  
620999        U071  All Ages             36.0                36.0  NaN  

[621000 rows x 14 columns]

Some Charting Below as Part of EDA

In [ ]:

import matplotlib.pyplot as plt
import seaborn as sns

age_group_counts_data_1 = data_1['Age Group'].value_counts()

plt.figure(figsize=(10, 6))
plt.bar(age_group_counts_data_1.index, age_group_counts_data_1.values, color='skyblue')
plt.title('Frequency Distribution of Age Groups in Data 1')
plt.xlabel('Age Group')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels to show clearly
plt.show()

plt.figure(figsize=(10, 6))
sns.boxplot(x='Age Group', y='COVID-19 Deaths', data=data_1)
plt.title('COVID-19 Deaths by Age Group in Data 1')
plt.xlabel('Age Group')
plt.ylabel('COVID-19 Deaths')
plt.show()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-00a1b1d3599e> in <cell line: 4>()
      2 import seaborn as sns
      3 
----> 4 age_group_counts_data_1 = data_1['Age Group'].value_counts()
      5 
      6 plt.figure(figsize=(10, 6))

NameError: name 'data_1' is not defined

Covid19 Deaths by Condition and Age Group

In [ ]:

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12, 6))
barplot1 = sns.barplot(x='Condition Group', y='COVID-19 Deaths', hue='Age Group', data=data_1)
plt.title('COVID-19 Deaths by Condition Group and Age Group')
plt.xlabel('Condition Group')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot1.set_xticklabels(barplot1.get_xticklabels(), rotation=90)  # Rotate x-axis labels
plt.show()

MAde a unique States data set and removed 'United States' as it was being included in the group of states and was badly skewing the data. Fixed graph is below.¶

In [ ]:

unique_states = data_1['State'].unique()


data_1_no_us = data_1[data_1['State'] != 'United States']

plt.figure(figsize=(12, 6))
barplot_no_us = sns.barplot(x='State', y='COVID-19 Deaths', hue='Age Group', data=data_1_no_us)
plt.title('COVID-19 Deaths by State and Age Group (excluding United States)')
plt.xlabel('State')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot_no_us.set_xticklabels(barplot_no_us.get_xticklabels(), rotation=90)
plt.tight_layout()
plt.show()

Checked for N/a as part pof preprocessing for ML models

In [ ]:

missing_values_count = data_1_no_us.isnull().sum()
print(missing_values_count)

Data As Of                 0
Start Date                 0
End Date                   0
Group                      0
Year                   12190
Month                  60950
State                      0
Condition Group            0
Condition                  0
ICD10_codes                0
Age Group                  0
COVID-19 Deaths       183449
Number of Mentions    177577
Flag                  426051
dtype: int64

There were quite aq few N/a and bad data, which I deleted.

In [ ]:

data_1_cleaned = data_1.dropna(subset=['COVID-19 Deaths', 'Number of Mentions'])

Checked to see how much of the data was removed above

In [ ]:

original_row_count = data_1.shape[0]
cleaned_row_count = data_1_cleaned.shape[0]
rows_dropped = original_row_count - cleaned_row_count

print(f"Original number of rows: {original_row_count}")
print(f"Number of rows after cleaning: {cleaned_row_count}")
print(f"Number of rows dropped: {rows_dropped}")

Original number of rows: 621000
Number of rows after cleaning: 437551
Number of rows dropped: 183449

While running the ML models below, I had issues with categorical columns (this data is mostly categorical) below, I hot-encoded it to make it numerical.

In [ ]:

from sklearn.preprocessing import OneHotEncoder
import pandas as pd


date_cols = ['Data As Of', 'Start Date', 'End Date', 'Year', 'Month']
data_1_cleaned = data_1_cleaned.drop(columns=date_cols)

categorical_cols = data_1_cleaned.select_dtypes(include=['object', 'category']).columns

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
data_1_encoded = pd.DataFrame(encoder.fit_transform(data_1_cleaned[categorical_cols]))

data_1_encoded.columns = encoder.get_feature_names_out(categorical_cols)

data_1_encoded.index = data_1_cleaned.index

num_data_1_cleaned = data_1_cleaned.drop(categorical_cols, axis=1)
data_1_preprocessed = pd.concat([num_data_1_cleaned, data_1_encoded], axis=1)

/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_encoders.py:868: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
  warnings.warn(

In [ ]:

from sklearn.preprocessing import OneHotEncoder

date_cols = ['Data As Of', 'Start Date', 'End Date', 'Year', 'Month']
data_1_cleaned = data_1_cleaned.drop(columns=date_cols, errors='ignore')


categorical_cols = data_1_cleaned.select_dtypes(include=['object', 'category']).columns

# Applying One-Hot Encoding
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
data_1_encoded = pd.DataFrame(encoder.fit_transform(data_1_cleaned[categorical_cols]))

data_1_encoded.index = data_1_cleaned.index

num_data_1_cleaned = data_1_cleaned.drop(categorical_cols, axis=1)
data_1_preprocessed = pd.concat([num_data_1_cleaned, data_1_encoded], axis=1)

/usr/local/lib/python3.10/dist-packages/sklearn/preprocessing/_encoders.py:868: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
  warnings.warn(

The first Model - SKLearn Train, Test, Split

In [ ]:

from sklearn.model_selection import train_test_split

# 'COVID-19 Deaths' is the target variable
y = data_1_preprocessed['COVID-19 Deaths']
X = data_1_preprocessed.drop('COVID-19 Deaths', axis=1)

# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")

Training set size: 350040 rows
Testing set size: 87511 rows

Just viewing my pre-processed data to see what it looks like

In [ ]:

data_1_preprocessed.head()

Out[ ]:

	COVID-19 Deaths	Number of Mentions	1	...	116	117	118	119	125
0	1569.0	1647.0	1.0	...	0.0	0.0	0.0	0.0	1.0
1	5804.0	6029.0	1.0	...	1.0	0.0	0.0	0.0	1.0
2	15080.0	15699.0	1.0	...	0.0	1.0	0.0	0.0	1.0
3	37414.0	38878.0	1.0	...	0.0	0.0	1.0	0.0	1.0
4	82668.0	85708.0	1.0	...	0.0	0.0	0.0	1.0	1.0

5 rows × 128 columns

checking dtypes for later ML models

In [ ]:

print(X_train.dtypes)

Number of Mentions    float64
0                     float64
1                     float64
2                     float64
3                     float64
                       ...   
121                   float64
122                   float64
123                   float64
124                   float64
125                   float64
Length: 127, dtype: object

Train/Test Split View

In [ ]:

print(X_train.columns)
print(X_test.columns)

Index(['Number of Mentions',                    0,                    1,
                          2,                    3,                    4,
                          5,                    6,                    7,
                          8,
       ...
                        116,                  117,                  118,
                        119,                  120,                  121,
                        122,                  123,                  124,
                        125],
      dtype='object', length=127)
Index(['Number of Mentions',                    0,                    1,
                          2,                    3,                    4,
                          5,                    6,                    7,
                          8,
       ...
                        116,                  117,                  118,
                        119,                  120,                  121,
                        122,                  123,                  124,
                        125],
      dtype='object', length=127)

In [ ]:

print(X_train.dtypes)

Number of Mentions    float64
0                     float64
1                     float64
2                     float64
3                     float64
                       ...   
121                   float64
122                   float64
123                   float64
124                   float64
125                   float64
Length: 127, dtype: object

In [ ]:

# Check for missing values
print(X_train.isnull().sum())

# Check data types
print(X_train.dtypes)

Number of Mentions    0
0                     0
1                     0
2                     0
3                     0
                     ..
121                   0
122                   0
123                   0
124                   0
125                   0
Length: 127, dtype: int64
Number of Mentions    float64
0                     float64
1                     float64
2                     float64
3                     float64
                       ...   
121                   float64
122                   float64
123                   float64
124                   float64
125                   float64
Length: 127, dtype: object

Converted all types to str for ML modelling below

In [ ]:

# Convert all column names to strings
X.columns = X.columns.astype(str)

In [ ]:

X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

Training Linear Regression Model on the dataset

In [ ]:

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

Out[ ]:

LinearRegression()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Checking Linear Regression Model Metrics

In [ ]:

y_pred = model.predict(X_test)

In [ ]:

from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 119573.1576029768
R^2 Score: 0.9530692937008057

In [ ]:

coefficients = model.coef_

Training Columns and Feature Importance

In [ ]:

feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])

Graphing Feature Importance

In [ ]:

import matplotlib.pyplot as plt

feature_importance.sort_values(by='Coefficient', ascending=False).plot(kind='bar', figsize=(12,6))
plt.title('Feature Importance in Linear Regression Model')
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.show()

Running Standard Scaler to fix the data before fruther training

In [ ]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Training and gathering metrics on Linear Regression Model. Below is the same as above but with the scaled data so it is easy to visualise

In [ ]:

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred_scaled = model.predict(X_test_scaled)

In [ ]:

from sklearn.metrics import mean_squared_error, r2_score

y_pred_scaled = model.predict(X_test_scaled)

mse_scaled = mean_squared_error(y_test, y_pred_scaled)
print(f"Mean Squared Error: {mse_scaled}")

r2_scaled = r2_score(y_test, y_pred_scaled)
print(f"R^2 Score: {r2_scaled}")

Mean Squared Error: 119530.27704932197
R^2 Score: 0.9530861236876511

In [ ]:

coefficients = model.coef_

In [ ]:

feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])

In [ ]:

feature_importance.sort_values(by='Coefficient', ascending=False).plot(kind='bar', figsize=(12,6))
plt.title('Feature Importance in Linear Regression Model')
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.show()

In [ ]:

coefficients = model.coef_

feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])

In [ ]:

sorted_features = feature_importance.sort_values(by='Coefficient', ascending=False)

sorted_features.plot(kind='bar', figsize=(12,6))
plt.title('Feature Importance in Linear Regression Model')
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.show()

Viewing Feature Importance Data

In [ ]:

coefficients = model.coef_

feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])

print(feature_importance)

                     Coefficient
Number of Mentions  3.193643e+03
0                  -3.228707e+14
1                  -1.584457e+14
2                  -2.892308e+14
3                  -2.441330e+14
...                          ...
121                -6.542653e+14
122                -6.586453e+14
123                -7.226933e+14
124                -8.236515e+14
125                 0.000000e+00

[127 rows x 1 columns]

In [ ]:

# Number of rows to display at a time
chunk_size = 10

# Iterate over the DataFrame in chunks
for start in range(0, len(feature_importance), chunk_size):
    end = start + chunk_size
    print(feature_importance.iloc[start:end])
    print("\n")  # Print a newline for better separation between chunks

                     Coefficient
Number of Mentions  3.193643e+03
0                  -3.228707e+14
1                  -1.584457e+14
2                  -2.892308e+14
3                  -2.441330e+14
4                  -2.605345e+14
5                  -2.471668e+14
6                  -2.453000e+14
7                  -2.624680e+14
8                  -2.473037e+14


     Coefficient
9  -2.503349e+14
10 -2.508170e+14
11 -2.583988e+14
12 -2.614571e+14
13 -2.499485e+14
14 -2.540121e+14
15 -2.525443e+14
16 -2.523530e+14
17 -2.497357e+14
18 -2.481822e+14


     Coefficient
19 -2.465004e+14
20 -2.544298e+14
21 -2.459108e+14
22 -2.563755e+14
23 -2.485521e+14
24 -2.499291e+14
25 -2.539170e+14
26 -2.504892e+14
27 -2.471276e+14
28 -2.482990e+14


     Coefficient
29 -2.551684e+14
30 -2.484353e+14
31 -2.449841e+14
32 -2.551495e+14
33 -2.527545e+14
34 -2.426008e+14
35 -2.530982e+14
36 -2.476750e+14
37 -2.518932e+14
38 -2.543539e+14


     Coefficient
39 -2.578759e+14
40 -2.469122e+14
41 -2.459305e+14
42 -2.565072e+14
43 -2.445688e+14
44 -2.552629e+14
45 -2.481432e+14
46 -2.518932e+14
47 -2.523339e+14
48 -2.624313e+14


     Coefficient
49 -3.005830e+14
50 -2.419610e+14
51 -2.645681e+14
52 -2.498711e+14
53 -2.474406e+14
54 -2.529073e+14
55 -2.492513e+14
56 -2.615676e+14
57  3.310441e+14
58  1.508162e+14


     Coefficient
59  1.371026e+14
60  2.154090e+14
61 -3.856187e+14
62 -5.358547e+14
63 -4.984364e+14
64 -5.795386e+14
65 -2.881679e+12
66  1.210663e+14
67  6.967718e+14
68  3.919754e+14


     Coefficient
69 -4.097677e+13
70 -3.093303e+14
71 -4.052706e+14
72 -6.728743e+14
73 -2.441636e+14
74 -3.476412e+14
75  9.645447e+13
76 -7.996895e+14
77  2.970814e+14
78  2.487471e+14


     Coefficient
79  1.988275e+14
80 -2.272504e+14
81  1.396508e+14
82 -1.633230e+14
83  1.031200e+14
84 -1.893600e+14
85 -1.537223e+14
86 -9.015754e+13
87 -3.362053e+14
88  2.154295e+13


     Coefficient
89 -1.625299e+14
90 -8.376282e+14
91 -3.045707e+14
92 -3.263975e+14
93 -1.573358e+14
94  9.804019e+13
95 -2.107130e+14
96  4.728852e+14
97 -4.112718e+14
98 -6.801835e+13


      Coefficient
99  -2.302176e+14
100 -6.023744e+14
101 -2.371293e+14
102 -5.434325e+13
103 -1.477647e+14
104 -6.502466e+14
105 -4.921786e+14
106 -2.558928e+14
107 -1.370602e+14
108  4.421373e+14


      Coefficient
109 -3.167915e+14
110 -2.010348e+14
111  3.930446e+13
112 -3.888962e+14
113  1.032215e+14
114  2.224110e+14
115 -7.720316e+14
116 -7.419122e+14
117 -7.079868e+14
118 -6.761242e+14


      Coefficient
119 -6.560650e+14
120 -6.505726e+14
121 -6.542653e+14
122 -6.586453e+14
123 -7.226933e+14
124 -8.236515e+14
125  0.000000e+00

Running Lasso ML Model and Running Metrics

In [ ]:

from sklearn.linear_model import Lasso

In [ ]:

lasso_model = Lasso(alpha=1.0)

In [ ]:

lasso_model.fit(X_train, y_train)

Out[ ]:

Lasso()

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [ ]:

from sklearn.metrics import mean_squared_error, r2_score
y_pred = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 119629.68971020046
R^2 Score: 0.9530471056799006

In [ ]:

from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=1.0)

lasso_model.fit(X_train, y_train)

y_pred = lasso_model.predict(X_test)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 119629.68971020046
R^2 Score: 0.9530471056799006

Calculating Metrics

In [ ]:

from sklearn import metrics
import numpy as np


rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
mae = metrics.mean_absolute_error(y_test, y_pred)

print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)

Root Mean Squared Error (RMSE): 345.8752516590345
Mean Absolute Error (MAE): 18.54945519148683

Running Residual Plot

In [ ]:

import matplotlib.pyplot as plt

residuals = y_test - y_pred

plt.figure(figsize=(10,6))
plt.scatter(y_pred, residuals)
plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), colors='red')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

In [ ]:

Running Cross Val Score

In [ ]:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

rmse_scores = np.sqrt(-scores)

print("Cross-validated RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())

Cross-validated RMSE scores: [1094.62584254   41.30816971   36.25026322   36.84615      41.14088855]
Mean RMSE: 250.03426280424463

Running KFold ML Model

In [ ]:

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

folds = list(kf.split(X))

train_indices, test_indices = folds[0]

In [ ]:

X_fold1, y_fold1 = X.iloc[test_indices], y.iloc[test_indices]


print(X_fold1.describe())
print(y_fold1.describe())

print(X.describe())
print(y.describe())

       Number of Mentions             0             1             2  \
count        87511.000000  87511.000000  87511.000000  87511.000000   
mean           115.520952      0.888700      0.023951      0.087349   
std           1855.304203      0.314505      0.152898      0.282347   
min              0.000000      0.000000      0.000000      0.000000   
25%              0.000000      1.000000      0.000000      0.000000   
50%              0.000000      1.000000      0.000000      0.000000   
75%             19.000000      1.000000      0.000000      0.000000   
max         298663.000000      1.000000      1.000000      1.000000   

                  3             4             5             6             7  \
count  87511.000000  87511.000000  87511.000000  87511.000000  87511.000000   
mean       0.017643      0.020249      0.017609      0.018386      0.020455   
std        0.131653      0.140851      0.131527      0.134344      0.141550   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.000000      0.000000      0.000000      0.000000      0.000000   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

                  8  ...           116           117           118  \
count  87511.000000  ...  87511.000000  87511.000000  87511.000000   
mean       0.017723  ...      0.111449      0.099325      0.089372   
std        0.131945  ...      0.314689      0.299099      0.285281   
min        0.000000  ...      0.000000      0.000000      0.000000   
25%        0.000000  ...      0.000000      0.000000      0.000000   
50%        0.000000  ...      0.000000      0.000000      0.000000   
75%        0.000000  ...      0.000000      0.000000      0.000000   
max        1.000000  ...      1.000000      1.000000      1.000000   

                119           120           121           122           123  \
count  87511.000000  87511.000000  87511.000000  87511.000000  87511.000000   
mean       0.082024      0.081795      0.083270      0.084389      0.104890   
std        0.274403      0.274054      0.276291      0.277972      0.306413   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.000000      0.000000      0.000000      0.000000      0.000000   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

                124      125  
count  87511.000000  87511.0  
mean       0.142851      1.0  
std        0.349922      0.0  
min        0.000000      1.0  
25%        0.000000      1.0  
50%        0.000000      1.0  
75%        0.000000      1.0  
max        1.000000      1.0  

[8 rows x 127 columns]
count     87511.000000
mean        105.524231
std        1596.212786
min           0.000000
25%           0.000000
50%           0.000000
75%          18.000000
max      194736.000000
Name: COVID-19 Deaths, dtype: float64
       Number of Mentions              0              1              2  \
count        4.375510e+05  437551.000000  437551.000000  437551.000000   
mean         1.309027e+02       0.887896       0.024482       0.087622   
std          3.225334e+03       0.315494       0.154539       0.282744   
min          0.000000e+00       0.000000       0.000000       0.000000   
25%          0.000000e+00       1.000000       0.000000       0.000000   
50%          0.000000e+00       1.000000       0.000000       0.000000   
75%          1.900000e+01       1.000000       0.000000       0.000000   
max          1.146242e+06       1.000000       1.000000       1.000000   

                   3              4              5              6  \
count  437551.000000  437551.000000  437551.000000  437551.000000   
mean        0.017349       0.019829       0.017694       0.017632   
std         0.130568       0.139411       0.131837       0.131611   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         1.000000       1.000000       1.000000       1.000000   

                   7              8  ...            116            117  \
count  437551.000000  437551.000000  ...  437551.000000  437551.000000   
mean        0.020110       0.017733  ...       0.110684       0.099367   
std         0.140376       0.131979  ...       0.313741       0.299154   
min         0.000000       0.000000  ...       0.000000       0.000000   
25%         0.000000       0.000000  ...       0.000000       0.000000   
50%         0.000000       0.000000  ...       0.000000       0.000000   
75%         0.000000       0.000000  ...       0.000000       0.000000   
max         1.000000       1.000000  ...       1.000000       1.000000   

                 118            119            120            121  \
count  437551.000000  437551.000000  437551.000000  437551.000000   
mean        0.089608       0.083519       0.082244       0.083364   
std         0.285619       0.276666       0.274737       0.276432   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000       0.000000   
50%         0.000000       0.000000       0.000000       0.000000   
75%         0.000000       0.000000       0.000000       0.000000   
max         1.000000       1.000000       1.000000       1.000000   

                 122            123            124       125  
count  437551.000000  437551.000000  437551.000000  437551.0  
mean        0.084575       0.104253       0.141387       1.0  
std         0.278249       0.305589       0.348421       0.0  
min         0.000000       0.000000       0.000000       1.0  
25%         0.000000       0.000000       0.000000       1.0  
50%         0.000000       0.000000       0.000000       1.0  
75%         0.000000       0.000000       0.000000       1.0  
max         1.000000       1.000000       1.000000       1.0  

[8 rows x 127 columns]
count    4.375510e+05
mean     1.201179e+02
std      2.980201e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.800000e+01
max      1.146242e+06
Name: COVID-19 Deaths, dtype: float64

Random Forest

In [ ]:

from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=42)

rf_model.fit(X_train, y_train)

rf_predictions = rf_model.predict(X_test)

rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print(f"Random Forest - Mean Squared Error: {rf_mse}")
print(f"Random Forest - R^2 Score: {rf_r2}")

Random Forest - Mean Squared Error: 230110.6785765412
Random Forest - R^2 Score: 0.9096849419295162

Gradient Boosting

In [ ]:

from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

xgb_model = XGBRegressor(random_state=42)

xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)

xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)

print(f"XGBoost - Mean Squared Error: {xgb_mse}")
print(f"XGBoost - R^2 Score: {xgb_r2}")

XGBoost - Mean Squared Error: 1435663.2412159576
XGBoost - R^2 Score: 0.4365232860892677

Ridge Regression

In [ ]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score


data_1_preprocessed.columns = data_1_preprocessed.columns.astype(str)

# Sample 3% of the data
sampled_data = data_1_preprocessed.sample(frac=0.03, random_state=42)

# Split the Sample
X_sample = sampled_data.drop('COVID-19 Deaths', axis=1)
y_sample = sampled_data['COVID-19 Deaths']
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)


ridge_model = Ridge(random_state=42)
ridge_model.fit(X_train_sample, y_train_sample)


ridge_predictions = ridge_model.predict(X_test_sample)
ridge_mse = mean_squared_error(y_test_sample, ridge_predictions)
ridge_r2 = r2_score(y_test_sample, ridge_predictions)

print(f"Ridge Regression - Mean Squared Error: {ridge_mse}")
print(f"Ridge Regression - R^2 Score: {ridge_r2}")

Ridge Regression - Mean Squared Error: 69549.19693577621
Ridge Regression - R^2 Score: 0.8998919383066677

SVR Model (on 3% of the data as 100% was taking hours to process

In [ ]:

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

sampled_data = data_1_preprocessed.sample(frac=0.03, random_state=42)

X_sample = sampled_data.drop('COVID-19 Deaths', axis=1)
y_sample = sampled_data['COVID-19 Deaths']
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

svr_model = SVR()
svr_model.fit(X_train_sample, y_train_sample)

y_pred_sample = svr_model.predict(X_test_sample)
mse = mean_squared_error(y_test_sample, y_pred_sample)
r2 = r2_score(y_test_sample, y_pred_sample)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 684977.500927637
R^2 Score: 0.014053749826476558

Running Ridge Regression Model

In [ ]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Sample 3% of the data
sampled_data = data_1_preprocessed.sample(frac=0.03, random_state=42)

# Split the Sample
X_sample = sampled_data.drop('COVID-19 Deaths', axis=1)
y_sample = sampled_data['COVID-19 Deaths']
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

ridge_model = Ridge(random_state=42)
ridge_model.fit(X_train_sample, y_train_sample)


ridge_predictions = ridge_model.predict(X_test_sample)
ridge_mse = mean_squared_error(y_test_sample, ridge_predictions)
ridge_r2 = r2_score(y_test_sample, ridge_predictions)

print(f"Ridge Regression - Mean Squared Error: {ridge_mse}")
print(f"Ridge Regression - R^2 Score: {ridge_r2}")

Ridge Regression - Mean Squared Error: 69549.19693577621
Ridge Regression - R^2 Score: 0.8998919383066677

In [ ]: