import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# Mount Google Drive and load the provisional CDC dataset
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv'
data_1 = pd.read_csv(file_path)
print(data_1.head())

# Basic inspection: shape, descriptive statistics for all columns, last rows, and dtypes
data_1_shape = data_1.shape
data_1_describe = data_1.describe(include='all')
data_1_tail = data_1.tail()
data_1_dtypes = data_1.dtypes
data_1_shape, data_1_describe, data_1_tail, data_1_dtypes

# Convert the date columns to datetime
data_1['Data As Of'] = pd.to_datetime(data_1['Data As Of'])
data_1['Start Date'] = pd.to_datetime(data_1['Start Date'])
data_1['End Date'] = pd.to_datetime(data_1['End Date'])
print(data_1)

# Frequency distribution of age groups
age_group_counts_data_1 = data_1['Age Group'].value_counts()
plt.figure(figsize=(10, 6))
plt.bar(age_group_counts_data_1.index, age_group_counts_data_1.values, color='skyblue')
plt.title('Frequency Distribution of Age Groups in Data 1')
plt.xlabel('Age Group')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels for readability
plt.show()

# Distribution of COVID-19 deaths within each age group
plt.figure(figsize=(10, 6))
sns.boxplot(x='Age Group', y='COVID-19 Deaths', data=data_1)
plt.title('COVID-19 Deaths by Age Group in Data 1')
plt.xlabel('Age Group')
plt.ylabel('COVID-19 Deaths')
plt.show()

# Deaths by condition group, split by age group
plt.figure(figsize=(12, 6))
barplot1 = sns.barplot(x='Condition Group', y='COVID-19 Deaths', hue='Age Group', data=data_1)
plt.title('COVID-19 Deaths by Condition Group and Age Group')
plt.xlabel('Condition Group')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot1.set_xticklabels(barplot1.get_xticklabels(), rotation=90)  # Rotate x-axis labels
plt.show()

# Deaths by state and age group, excluding the national aggregate row
unique_states = data_1['State'].unique()
data_1_no_us = data_1[data_1['State'] != 'United States']
plt.figure(figsize=(12, 6))
barplot_no_us = sns.barplot(x='State', y='COVID-19 Deaths', hue='Age Group', data=data_1_no_us)
plt.title('COVID-19 Deaths by State and Age Group (excluding United States)')
plt.xlabel('State')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot_no_us.set_xticklabels(barplot_no_us.get_xticklabels(), rotation=90)
plt.tight_layout()
plt.show()

# Missing values per column
missing_values_count = data_1_no_us.isnull().sum()
print(missing_values_count)

# Drop rows with a missing target or missing mention counts
data_1_cleaned = data_1.dropna(subset=['COVID-19 Deaths', 'Number of Mentions'])
original_row_count = data_1.shape[0]
cleaned_row_count = data_1_cleaned.shape[0]
rows_dropped = original_row_count - cleaned_row_count
print(f"Original number of rows: {original_row_count}")
print(f"Number of rows after cleaning: {cleaned_row_count}")
print(f"Number of rows dropped: {rows_dropped}")

# Drop the date/period columns, then one-hot encode the remaining categorical columns
from sklearn.preprocessing import OneHotEncoder

date_cols = ['Data As Of', 'Start Date', 'End Date', 'Year', 'Month']
data_1_cleaned = data_1_cleaned.drop(columns=date_cols, errors='ignore')
categorical_cols = data_1_cleaned.select_dtypes(include=['object', 'category']).columns

# Applying One-Hot Encoding (use sparse=False instead of sparse_output on scikit-learn < 1.2)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
data_1_encoded = pd.DataFrame(encoder.fit_transform(data_1_cleaned[categorical_cols]))
data_1_encoded.columns = encoder.get_feature_names_out(categorical_cols)
data_1_encoded.index = data_1_cleaned.index
num_data_1_cleaned = data_1_cleaned.drop(categorical_cols, axis=1)
data_1_preprocessed = pd.concat([num_data_1_cleaned, data_1_encoded], axis=1)
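# Optional alternative, shown only as a sketch (it assumes data_1_cleaned and
# categorical_cols from above are still in memory): pandas can produce the same
# one-hot encoding in a single call, which keeps readable column names without
# managing a separate encoder object.
data_1_dummies = pd.get_dummies(data_1_cleaned, columns=list(categorical_cols))
print(data_1_dummies.shape)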
# Train/test split with 'COVID-19 Deaths' as the target variable
from sklearn.model_selection import train_test_split

y = data_1_preprocessed['COVID-19 Deaths']
X = data_1_preprocessed.drop('COVID-19 Deaths', axis=1)

# Splitting the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]} rows")
print(f"Testing set size: {X_test.shape[0]} rows")
data_1_preprocessed.head()

# Sanity checks on the split: columns, dtypes, and missing values
print(X_train.columns)
print(X_test.columns)
print(X_train.dtypes)
print(X_train.isnull().sum())

# scikit-learn expects string column names, so cast them explicitly
X.columns = X.columns.astype(str)
X_train.columns = X_train.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

# Baseline linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Coefficients as a rough measure of feature importance
coefficients = model.coef_
feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])
feature_importance.sort_values(by='Coefficient', ascending=False).plot(kind='bar', figsize=(12, 6))
plt.title('Feature Importance in Linear Regression Model')
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.show()

# Refit on standardised features so the coefficients are on a comparable scale
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred_scaled = model.predict(X_test_scaled)
mse_scaled = mean_squared_error(y_test, y_pred_scaled)
r2_scaled = r2_score(y_test, y_pred_scaled)
print(f"Mean Squared Error: {mse_scaled}")
print(f"R^2 Score: {r2_scaled}")

# Feature importance for the scaled model
coefficients = model.coef_
feature_names = X_train.columns
feature_importance = pd.DataFrame(coefficients, index=feature_names, columns=['Coefficient'])
sorted_features = feature_importance.sort_values(by='Coefficient', ascending=False)
sorted_features.plot(kind='bar', figsize=(12, 6))
plt.title('Feature Importance in Linear Regression Model')
plt.ylabel('Coefficient Value')
plt.xlabel('Features')
plt.show()
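# A minimal sketch (assuming X_train, y_train, X_test, y_test from above): wrapping
# the scaler and the regression in a Pipeline keeps the scaling step inside any
# later cross-validation, so test folds are never used to fit the scaler.
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X_train, y_train)
print(f"Pipeline R^2 on the test set: {pipe.score(X_test, y_test)}")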
# Print the full coefficient table, then again in chunks of 10 rows for readability
print(feature_importance)
chunk_size = 10  # Number of rows to display at a time
for start in range(0, len(feature_importance), chunk_size):
    end = start + chunk_size
    print(feature_importance.iloc[start:end])
    print("\n")  # Blank line between chunks

# Lasso regression
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=1.0)
lasso_model.fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# RMSE and MAE for the Lasso predictions
from sklearn import metrics
import numpy as np

rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
mae = metrics.mean_absolute_error(y_test, y_pred)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)

# Residual plot
residuals = y_test - y_pred
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals)
plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), colors='red')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

# Running cross-validated scoring for the linear model
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)
print("Cross-validated RMSE scores:", rmse_scores)
print("Mean RMSE:", rmse_scores.mean())

# Inspect the first K-fold split against the full data
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = list(kf.split(X))
train_indices, test_indices = folds[0]
X_fold1, y_fold1 = X.iloc[test_indices], y.iloc[test_indices]
print(X_fold1.describe())
print(y_fold1.describe())
print(X.describe())
print(y.describe())

# Random forest regressor
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
print(f"Random Forest - Mean Squared Error: {rf_mse}")
print(f"Random Forest - R^2 Score: {rf_r2}")

# XGBoost regressor
from xgboost import XGBRegressor

xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
print(f"XGBoost - Mean Squared Error: {xgb_mse}")
print(f"XGBoost - R^2 Score: {xgb_r2}")
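# A short sketch (assuming rf_model fitted above): tree ensembles expose
# impurity-based importances directly, giving a second view of feature importance
# alongside the linear-regression coefficients plotted earlier.
rf_importance = pd.Series(rf_model.feature_importances_, index=X_train.columns)
print(rf_importance.sort_values(ascending=False).head(10))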
# Ridge regression on a 3% sample of the preprocessed data
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

data_1_preprocessed.columns = data_1_preprocessed.columns.astype(str)

# Sample 3% of the data
sampled_data = data_1_preprocessed.sample(frac=0.03, random_state=42)

# Split the sample
X_sample = sampled_data.drop('COVID-19 Deaths', axis=1)
y_sample = sampled_data['COVID-19 Deaths']
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42)

ridge_model = Ridge(random_state=42)
ridge_model.fit(X_train_sample, y_train_sample)
ridge_predictions = ridge_model.predict(X_test_sample)
ridge_mse = mean_squared_error(y_test_sample, ridge_predictions)
ridge_r2 = r2_score(y_test_sample, ridge_predictions)
print(f"Ridge Regression - Mean Squared Error: {ridge_mse}")
print(f"Ridge Regression - R^2 Score: {ridge_r2}")

# Support vector regression on the same 3% sample
from sklearn.svm import SVR

svr_model = SVR()
svr_model.fit(X_train_sample, y_train_sample)
y_pred_sample = svr_model.predict(X_test_sample)
mse = mean_squared_error(y_test_sample, y_pred_sample)
r2 = r2_score(y_test_sample, y_pred_sample)
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)
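# A hedged follow-up sketch (assuming the sampled split from above): SVR is
# sensitive to feature scale, so standardising inside a pipeline is a common
# next step that may improve on the unscaled SVR fit reported above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svr_scaled = make_pipeline(StandardScaler(), SVR())
svr_scaled.fit(X_train_sample, y_train_sample)
print("Scaled SVR R^2 Score:", r2_score(y_test_sample, svr_scaled.predict(X_test_sample)))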