import pandas as pd
# Location of the "Conditions Contributing to COVID-19 Deaths, by State and
# Age, Provisional 2020-2023" CSV export (relative to the working directory).
file_path_1 = 'Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv'

# Load the whole file into a DataFrame; later cells refer to it as data_1.
data_1 = pd.read_csv(filepath_or_buffer=file_path_1)

# Quick sanity check: show the first five rows.
print(data_1.head(n=5))
Data As Of Start Date End Date Group Year Month State \ 0 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States 1 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States 2 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States 3 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States 4 09/24/2023 01/01/2020 09/23/2023 By Total NaN NaN United States Condition Group Condition ICD10_codes Age Group \ 0 Respiratory diseases Influenza and pneumonia J09-J18 0-24 1 Respiratory diseases Influenza and pneumonia J09-J18 25-34 2 Respiratory diseases Influenza and pneumonia J09-J18 35-44 3 Respiratory diseases Influenza and pneumonia J09-J18 45-54 4 Respiratory diseases Influenza and pneumonia J09-J18 55-64 COVID-19 Deaths Number of Mentions Flag 0 1569.0 1647.0 NaN 1 5804.0 6029.0 NaN 2 15080.0 15699.0 NaN 3 37414.0 38878.0 NaN 4 82668.0 85708.0 NaN
# Gather four quick-look summaries of data_1. The assignments are independent
# of one another; the tuple on the last line is what the notebook displays.

# Column name -> dtype mapping
data_1_dtypes = data_1.dtypes
# Last five rows of the frame
data_1_tail = data_1.tail(n=5)
# Descriptive statistics for every column (numeric and non-numeric alike)
data_1_describe = data_1.describe(include='all')
# (row count, column count)
data_1_shape = data_1.shape

(data_1_shape, data_1_describe, data_1_tail, data_1_dtypes)
((621000, 14), Data As Of Start Date End Date Group Year \ count 621000 621000 621000 621000 608580.000000 unique 1 45 45 3 NaN top 09/24/2023 01/01/2020 09/23/2023 By Month NaN freq 621000 37260 37260 558900 NaN mean NaN NaN NaN NaN 2021.408163 std NaN NaN NaN NaN 1.086436 min NaN NaN NaN NaN 2020.000000 25% NaN NaN NaN NaN 2020.000000 50% NaN NaN NaN NaN 2021.000000 75% NaN NaN NaN NaN 2022.000000 max NaN NaN NaN NaN 2023.000000 Month State Condition Group \ count 558900.000000 621000 621000 unique NaN 54 12 top NaN United States Circulatory diseases freq NaN 11500 189000 mean 6.200000 NaN NaN std 3.350625 NaN NaN min 1.000000 NaN NaN 25% 3.000000 NaN NaN 50% 6.000000 NaN NaN 75% 9.000000 NaN NaN max 12.000000 NaN NaN Condition ICD10_codes Age Group COVID-19 Deaths \ count 621000 621000 621000 4.375510e+05 unique 23 23 10 NaN top Influenza and pneumonia J09-J18 0-24 NaN freq 27000 27000 62100 NaN mean NaN NaN NaN 1.201179e+02 std NaN NaN NaN 2.980201e+03 min NaN NaN NaN 0.000000e+00 25% NaN NaN NaN 0.000000e+00 50% NaN NaN NaN 0.000000e+00 75% NaN NaN NaN 1.800000e+01 max NaN NaN NaN 1.146242e+06 Number of Mentions Flag count 4.434230e+05 183449 unique NaN 1 top NaN One or more data cells have counts between 1-9... 
freq NaN 183449 mean 1.293348e+02 NaN std 3.203936e+03 NaN min 0.000000e+00 NaN 25% 0.000000e+00 NaN 50% 0.000000e+00 NaN 75% 1.900000e+01 NaN max 1.146242e+06 NaN , Data As Of Start Date End Date Group Year Month \ 620995 09/24/2023 05/01/2023 05/31/2023 By Month 2023.0 5.0 620996 09/24/2023 06/01/2023 06/30/2023 By Month 2023.0 6.0 620997 09/24/2023 07/01/2023 07/31/2023 By Month 2023.0 7.0 620998 09/24/2023 08/01/2023 08/31/2023 By Month 2023.0 8.0 620999 09/24/2023 09/01/2023 09/23/2023 By Month 2023.0 9.0 State Condition Group Condition ICD10_codes Age Group \ 620995 Puerto Rico COVID-19 COVID-19 U071 All Ages 620996 Puerto Rico COVID-19 COVID-19 U071 All Ages 620997 Puerto Rico COVID-19 COVID-19 U071 All Ages 620998 Puerto Rico COVID-19 COVID-19 U071 All Ages 620999 Puerto Rico COVID-19 COVID-19 U071 All Ages COVID-19 Deaths Number of Mentions Flag 620995 67.0 67.0 NaN 620996 122.0 122.0 NaN 620997 114.0 114.0 NaN 620998 78.0 78.0 NaN 620999 36.0 36.0 NaN , Data As Of object Start Date object End Date object Group object Year float64 Month float64 State object Condition Group object Condition object ICD10_codes object Age Group object COVID-19 Deaths float64 Number of Mentions float64 Flag object dtype: object)
# Convert the three date columns from strings to datetime64 values.
#
# The CSV stores these dates as MM/DD/YYYY text (e.g. '09/24/2023'), so the
# format is spelled out instead of being inferred: month-first parsing is
# guaranteed, and a value that does not match the pattern raises rather than
# being silently misinterpreted.
#
# data_1 is already a DataFrame (it is the result of pd.read_csv), so it does
# not need to be re-wrapped in pd.DataFrame before the columns are assigned.
for date_column in ('Data As Of', 'Start Date', 'End Date'):
    data_1[date_column] = pd.to_datetime(data_1[date_column], format='%m/%d/%Y')

# Display the DataFrame
print(data_1)
Data As Of Start Date End Date Group Year Month \ 0 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN 1 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN 2 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN 3 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN 4 2023-09-24 2020-01-01 2023-09-23 By Total NaN NaN ... ... ... ... ... ... ... 620995 2023-09-24 2023-05-01 2023-05-31 By Month 2023.0 5.0 620996 2023-09-24 2023-06-01 2023-06-30 By Month 2023.0 6.0 620997 2023-09-24 2023-07-01 2023-07-31 By Month 2023.0 7.0 620998 2023-09-24 2023-08-01 2023-08-31 By Month 2023.0 8.0 620999 2023-09-24 2023-09-01 2023-09-23 By Month 2023.0 9.0 State Condition Group Condition \ 0 United States Respiratory diseases Influenza and pneumonia 1 United States Respiratory diseases Influenza and pneumonia 2 United States Respiratory diseases Influenza and pneumonia 3 United States Respiratory diseases Influenza and pneumonia 4 United States Respiratory diseases Influenza and pneumonia ... ... ... ... 620995 Puerto Rico COVID-19 COVID-19 620996 Puerto Rico COVID-19 COVID-19 620997 Puerto Rico COVID-19 COVID-19 620998 Puerto Rico COVID-19 COVID-19 620999 Puerto Rico COVID-19 COVID-19 ICD10_codes Age Group COVID-19 Deaths Number of Mentions Flag 0 J09-J18 0-24 1569.0 1647.0 NaN 1 J09-J18 25-34 5804.0 6029.0 NaN 2 J09-J18 35-44 15080.0 15699.0 NaN 3 J09-J18 45-54 37414.0 38878.0 NaN 4 J09-J18 55-64 82668.0 85708.0 NaN ... ... ... ... ... ... 620995 U071 All Ages 67.0 67.0 NaN 620996 U071 All Ages 122.0 122.0 NaN 620997 U071 All Ages 114.0 114.0 NaN 620998 U071 All Ages 78.0 78.0 NaN 620999 U071 All Ages 36.0 36.0 NaN [621000 rows x 14 columns]
# Exploratory plots of the 'Age Group' column in data_1.
import matplotlib.pyplot as plt
import seaborn as sns
# Number of rows recorded for each distinct 'Age Group' value
age_group_counts_data_1 = data_1['Age Group'].value_counts()

# Figure 1: bar chart of the per-age-group row counts
freq_fig, freq_ax = plt.subplots(figsize=(10, 6))
freq_ax.bar(
    age_group_counts_data_1.index,
    age_group_counts_data_1.values,
    color='skyblue',
)
freq_ax.set_title('Frequency Distribution of Age Groups in Data 1')
freq_ax.set_xlabel('Age Group')
freq_ax.set_ylabel('Frequency')
plt.xticks(rotation=45)  # slant the category labels so they do not overlap
plt.show()

# Figure 2: distribution of 'COVID-19 Deaths' within each age group
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data_1, x='Age Group', y='COVID-19 Deaths', ax=box_ax)
box_ax.set_title('COVID-19 Deaths by Age Group in Data 1')
box_ax.set_xlabel('Age Group')
box_ax.set_ylabel('COVID-19 Deaths')
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Grouped bar charts of 'COVID-19 Deaths', split by age group.
#
# The hue column is 'Age Group'. data_1 has no 'Age Group Numeric' column, and
# passing that name makes seaborn raise
# "ValueError: Could not interpret input 'Age Group Numeric'" before anything
# is drawn.

# Bar chart for 'Condition Group' with vertical x-axis labels
plt.figure(figsize=(12, 6))
barplot1 = sns.barplot(x='Condition Group', y='COVID-19 Deaths', hue='Age Group', data=data_1)
plt.title('COVID-19 Deaths by Condition Group and Age Group')
plt.xlabel('Condition Group')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
plt.xticks(rotation=90)  # Rotate x-axis labels
plt.show()

# Bar chart for 'State' with vertical x-axis labels
plt.figure(figsize=(12, 6))
barplot2 = sns.barplot(x='State', y='COVID-19 Deaths', hue='Age Group', data=data_1)
plt.title('COVID-19 Deaths by State and Age Group')
plt.xlabel('State')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
plt.xticks(rotation=90)  # Rotate x-axis labels
plt.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[5], line 6 4 # Bar chart for 'Condition Group' with vertical x-axis labels 5 plt.figure(figsize=(12, 6)) ----> 6 barplot1 = sns.barplot(x='Condition Group', y='COVID-19 Deaths', hue='Age Group Numeric', data=data_1) 7 plt.title('COVID-19 Deaths by Condition Group and Age Group') 8 plt.xlabel('Condition Group') File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:2755, in barplot(data, x, y, hue, order, hue_order, estimator, errorbar, n_boot, units, seed, orient, color, palette, saturation, width, errcolor, errwidth, capsize, dodge, ci, ax, **kwargs) 2752 if estimator is len: 2753 estimator = "size" -> 2755 plotter = _BarPlotter(x, y, hue, data, order, hue_order, 2756 estimator, errorbar, n_boot, units, seed, 2757 orient, color, palette, saturation, 2758 width, errcolor, errwidth, capsize, dodge) 2760 if ax is None: 2761 ax = plt.gca() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:1530, in _BarPlotter.__init__(self, x, y, hue, data, order, hue_order, estimator, errorbar, n_boot, units, seed, orient, color, palette, saturation, width, errcolor, errwidth, capsize, dodge) 1525 def __init__(self, x, y, hue, data, order, hue_order, 1526 estimator, errorbar, n_boot, units, seed, 1527 orient, color, palette, saturation, width, 1528 errcolor, errwidth, capsize, dodge): 1529 """Initialize the plotter.""" -> 1530 self.establish_variables(x, y, hue, data, orient, 1531 order, hue_order, units) 1532 self.establish_colors(color, palette, saturation) 1533 self.estimate_statistic(estimator, errorbar, n_boot, seed) File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:541, in _CategoricalPlotter.establish_variables(self, x, y, hue, data, orient, order, hue_order, units) 539 if isinstance(var, str): 540 err = f"Could not interpret input '{var}'" --> 541 raise ValueError(err) 543 # Figure out the plotting orientation 544 orient = 
infer_orient( 545 x, y, orient, require_numeric=self.require_numeric 546 ) ValueError: Could not interpret input 'Age Group Numeric'
<Figure size 1200x600 with 0 Axes>
# Re-draw the per-state bar chart without the 'United States' rows, so the
# x-axis no longer includes the national aggregate alongside the other
# 'State' values.

# Distinct values of the 'State' column, kept for inspection
unique_states = data_1['State'].unique()

# Drop every row whose 'State' is the national aggregate
data_1_no_us = data_1[data_1['State'] != 'United States']

# Bar plot without 'United States'. The hue column is 'Age Group': data_1 has
# no 'Age Group Numeric' column, and seaborn raises
# "ValueError: Could not interpret input 'Age Group Numeric'" for that name.
plt.figure(figsize=(12, 6))
barplot_no_us = sns.barplot(x='State', y='COVID-19 Deaths', hue='Age Group', data=data_1_no_us)
plt.title('COVID-19 Deaths by State and Age Group (excluding United States)')
plt.xlabel('State')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
plt.xticks(rotation=90)  # Rotate x-axis labels
plt.tight_layout()  # Keep the rotated labels and the legend inside the figure
plt.show()
# Automated profiling report for data_1.
#
# ydata-profiling must be installed first. A bare `pip install ...` line is a
# shell command, not Python, so it is recorded here as a comment:
#     shell:          pip install ydata-profiling
#     notebook cell:  %pip install ydata-profiling
from ydata_profiling import ProfileReport

# As the last expression of a notebook cell, the report object is rendered inline.
ProfileReport(data_1)