#!/usr/bin/env python
# coding: utf-8
#
# Dataset 1: Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv
# In[1]:
import pandas as pd
# Path of the provisional 2020-2023 "conditions contributing to COVID-19 deaths"
# CSV; being a bare file name, it is resolved against the current working directory.
file_path_1 = 'Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv'
# Read the whole file into a DataFrame, then print a text preview of its first rows.
data_1 = pd.read_csv(filepath_or_buffer=file_path_1)
print(data_1.head(n=5))
# In[2]:
# Quick structural overview of data_1. Each piece is stored under its own name
# so it can be inspected again from a later cell.
data_1_dtypes = data_1.dtypes                      # dtype of every column
data_1_tail = data_1.tail()                        # last few rows
data_1_describe = data_1.describe(include='all')   # summary statistics, all columns
data_1_shape = data_1.shape                        # (number of rows, number of columns)
# The trailing tuple is the cell's displayed output.
(data_1_shape, data_1_describe, data_1_tail, data_1_dtypes)
# In[ ]:
# In[3]:
# Re-wrap the loaded data in a DataFrame before touching its columns.
data_1 = pd.DataFrame(data=data_1)
# Parse each of the three date-like columns into datetime values, in place of
# the original column contents.
for date_column in ('Data As Of', 'Start Date', 'End Date'):
    data_1[date_column] = pd.to_datetime(data_1[date_column])
# Print the converted frame as plain text.
print(data_1)
# In[4]:
# Exploratory plots built on the 'Age Group' column of data_1.
import matplotlib.pyplot as plt
import seaborn as sns

# Number of rows recorded for each age group.
age_group_counts_data_1 = data_1['Age Group'].value_counts()

# Figure 1: bar chart of the per-age-group row counts.
age_fig, age_ax = plt.subplots(figsize=(10, 6))
age_ax.bar(age_group_counts_data_1.index, age_group_counts_data_1.values, color='skyblue')
age_ax.set(
    title='Frequency Distribution of Age Groups in Data 1',
    xlabel='Age Group',
    ylabel='Frequency',
)
plt.setp(age_ax.get_xticklabels(), rotation=45)  # tilt the labels so they stay readable
plt.show()

# Figure 2: distribution of 'COVID-19 Deaths' within each age group.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data_1, x='Age Group', y='COVID-19 Deaths', ax=box_ax)
box_ax.set(
    title='COVID-19 Deaths by Age Group in Data 1',
    xlabel='Age Group',
    ylabel='COVID-19 Deaths',
)
plt.show()
# In[5]:
import matplotlib.pyplot as plt
import seaborn as sns


def _plot_deaths_by_age_group(frame, category_column, title):
    """Draw a grouped bar chart of 'COVID-19 Deaths' per category, split by age group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot. Must contain `category_column`, 'COVID-19 Deaths' and
        'Age Group'.
    category_column : str
        Column placed on the x-axis; also used verbatim as the x-axis label.
    title : str
        Title of the figure.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the bars were drawn on.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    # The hue is 'Age Group'. The previous value, 'Age Group Numeric', is not a
    # column any cell of this notebook creates, so the plot failed on a fresh
    # kernel. Bar heights are seaborn's default aggregate of the y column.
    sns.barplot(x=category_column, y='COVID-19 Deaths', hue='Age Group', data=frame, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(category_column)
    ax.set_ylabel('COVID-19 Deaths')
    ax.legend(title='Age Group')
    # Rotate via tick_params instead of re-setting the tick label texts.
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()
    return ax


# Deaths per condition group, one bar per age group.
barplot1 = _plot_deaths_by_age_group(
    data_1, 'Condition Group', 'COVID-19 Deaths by Condition Group and Age Group'
)
# Deaths per state, one bar per age group.
barplot2 = _plot_deaths_by_age_group(
    data_1, 'State', 'COVID-19 Deaths by State and Age Group'
)
# In[ ]:
# Recreate the per-state bar plot without the 'United States' rows
# (presumably the national aggregate - confirm against the dataset's documentation).

# Distinct values of 'State', kept for inspection. If 'United States' were the
# only value, the filtered frame below would be empty and so would the plot.
unique_states = data_1['State'].unique()

# Keep every row whose 'State' is something other than 'United States'.
data_1_no_us = data_1[data_1['State'] != 'United States']

plt.figure(figsize=(12, 6))
# The hue is 'Age Group'. The previous value, 'Age Group Numeric', is not a
# column any cell of this notebook creates, so the plot failed on a fresh kernel.
barplot_no_us = sns.barplot(x='State', y='COVID-19 Deaths', hue='Age Group', data=data_1_no_us)
plt.title('COVID-19 Deaths by State and Age Group (excluding United States)')
plt.xlabel('State')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
# Rotate via tick_params instead of re-setting the tick label texts.
barplot_no_us.tick_params(axis='x', labelrotation=90)
plt.tight_layout()  # keep the rotated labels and the legend inside the figure
plt.show()
# In[ ]:
pip install ydata-profiling
# In[ ]:
from ydata_profiling import ProfileReport
# Build the profiling report for data_1. Leaving the report object as the
# cell's final expression is what makes the notebook display it.
data_1_profile = ProfileReport(data_1)
data_1_profile