Notebook

Dataset 1: Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv

In [1]:

import pandas as pd
file_path_1 = 'Conditions_Contributing_to_COVID-19_Deaths__by_State_and_Age__Provisional_2020-2023.csv'
data_1 = pd.read_csv(file_path_1)
print(data_1.head())

   Data As Of  Start Date    End Date     Group  Year  Month          State  \
0  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   
1  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   
2  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   
3  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   
4  09/24/2023  01/01/2020  09/23/2023  By Total   NaN    NaN  United States   

        Condition Group                Condition ICD10_codes Age Group  \
0  Respiratory diseases  Influenza and pneumonia     J09-J18      0-24   
1  Respiratory diseases  Influenza and pneumonia     J09-J18     25-34   
2  Respiratory diseases  Influenza and pneumonia     J09-J18     35-44   
3  Respiratory diseases  Influenza and pneumonia     J09-J18     45-54   
4  Respiratory diseases  Influenza and pneumonia     J09-J18     55-64   

   COVID-19 Deaths  Number of Mentions Flag  
0           1569.0              1647.0  NaN  
1           5804.0              6029.0  NaN  
2          15080.0             15699.0  NaN  
3          37414.0             38878.0  NaN  
4          82668.0             85708.0  NaN

In [2]:

data_1_shape = data_1.shape

# Descriptive statistics for all columns
data_1_describe = data_1.describe(include='all')

# Display the last few rows of the DataFrame
data_1_tail = data_1.tail()

# Display the data types of each column
data_1_dtypes = data_1.dtypes

data_1_shape, data_1_describe, data_1_tail, data_1_dtypes

Out[2]:

((621000, 14),
         Data As Of  Start Date    End Date     Group           Year  \
 count       621000      621000      621000    621000  608580.000000   
 unique           1          45          45         3            NaN   
 top     09/24/2023  01/01/2020  09/23/2023  By Month            NaN   
 freq        621000       37260       37260    558900            NaN   
 mean           NaN         NaN         NaN       NaN    2021.408163   
 std            NaN         NaN         NaN       NaN       1.086436   
 min            NaN         NaN         NaN       NaN    2020.000000   
 25%            NaN         NaN         NaN       NaN    2020.000000   
 50%            NaN         NaN         NaN       NaN    2021.000000   
 75%            NaN         NaN         NaN       NaN    2022.000000   
 max            NaN         NaN         NaN       NaN    2023.000000   
 
                 Month          State       Condition Group  \
 count   558900.000000         621000                621000   
 unique            NaN             54                    12   
 top               NaN  United States  Circulatory diseases   
 freq              NaN          11500                189000   
 mean         6.200000            NaN                   NaN   
 std          3.350625            NaN                   NaN   
 min          1.000000            NaN                   NaN   
 25%          3.000000            NaN                   NaN   
 50%          6.000000            NaN                   NaN   
 75%          9.000000            NaN                   NaN   
 max         12.000000            NaN                   NaN   
 
                       Condition ICD10_codes Age Group  COVID-19 Deaths  \
 count                    621000      621000    621000     4.375510e+05   
 unique                       23          23        10              NaN   
 top     Influenza and pneumonia     J09-J18      0-24              NaN   
 freq                      27000       27000     62100              NaN   
 mean                        NaN         NaN       NaN     1.201179e+02   
 std                         NaN         NaN       NaN     2.980201e+03   
 min                         NaN         NaN       NaN     0.000000e+00   
 25%                         NaN         NaN       NaN     0.000000e+00   
 50%                         NaN         NaN       NaN     0.000000e+00   
 75%                         NaN         NaN       NaN     1.800000e+01   
 max                         NaN         NaN       NaN     1.146242e+06   
 
         Number of Mentions                                               Flag  
 count         4.434230e+05                                             183449  
 unique                 NaN                                                  1  
 top                    NaN  One or more data cells have counts between 1-9...  
 freq                   NaN                                             183449  
 mean          1.293348e+02                                                NaN  
 std           3.203936e+03                                                NaN  
 min           0.000000e+00                                                NaN  
 25%           0.000000e+00                                                NaN  
 50%           0.000000e+00                                                NaN  
 75%           1.900000e+01                                                NaN  
 max           1.146242e+06                                                NaN  ,
         Data As Of  Start Date    End Date     Group    Year  Month  \
 620995  09/24/2023  05/01/2023  05/31/2023  By Month  2023.0    5.0   
 620996  09/24/2023  06/01/2023  06/30/2023  By Month  2023.0    6.0   
 620997  09/24/2023  07/01/2023  07/31/2023  By Month  2023.0    7.0   
 620998  09/24/2023  08/01/2023  08/31/2023  By Month  2023.0    8.0   
 620999  09/24/2023  09/01/2023  09/23/2023  By Month  2023.0    9.0   
 
               State Condition Group Condition ICD10_codes Age Group  \
 620995  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 620996  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 620997  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 620998  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 620999  Puerto Rico        COVID-19  COVID-19        U071  All Ages   
 
         COVID-19 Deaths  Number of Mentions Flag  
 620995             67.0                67.0  NaN  
 620996            122.0               122.0  NaN  
 620997            114.0               114.0  NaN  
 620998             78.0                78.0  NaN  
 620999             36.0                36.0  NaN  ,
 Data As Of             object
 Start Date             object
 End Date               object
 Group                  object
 Year                  float64
 Month                 float64
 State                  object
 Condition Group        object
 Condition              object
 ICD10_codes            object
 Age Group              object
 COVID-19 Deaths       float64
 Number of Mentions    float64
 Flag                   object
 dtype: object)

In [ ]:

In [3]:

data_1 = pd.DataFrame(data_1)

# Convert dates to datetime
data_1['Data As Of'] = pd.to_datetime(data_1['Data As Of'])
data_1['Start Date'] = pd.to_datetime(data_1['Start Date'])
data_1 ['End Date'] = pd.to_datetime(data_1['End Date'])

# Display the DataFrame
print(data_1)

       Data As Of Start Date   End Date     Group    Year  Month  \
0      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
1      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
2      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
3      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
4      2023-09-24 2020-01-01 2023-09-23  By Total     NaN    NaN   
...           ...        ...        ...       ...     ...    ...   
620995 2023-09-24 2023-05-01 2023-05-31  By Month  2023.0    5.0   
620996 2023-09-24 2023-06-01 2023-06-30  By Month  2023.0    6.0   
620997 2023-09-24 2023-07-01 2023-07-31  By Month  2023.0    7.0   
620998 2023-09-24 2023-08-01 2023-08-31  By Month  2023.0    8.0   
620999 2023-09-24 2023-09-01 2023-09-23  By Month  2023.0    9.0   

                State       Condition Group                Condition  \
0       United States  Respiratory diseases  Influenza and pneumonia   
1       United States  Respiratory diseases  Influenza and pneumonia   
2       United States  Respiratory diseases  Influenza and pneumonia   
3       United States  Respiratory diseases  Influenza and pneumonia   
4       United States  Respiratory diseases  Influenza and pneumonia   
...               ...                   ...                      ...   
620995    Puerto Rico              COVID-19                 COVID-19   
620996    Puerto Rico              COVID-19                 COVID-19   
620997    Puerto Rico              COVID-19                 COVID-19   
620998    Puerto Rico              COVID-19                 COVID-19   
620999    Puerto Rico              COVID-19                 COVID-19   

       ICD10_codes Age Group  COVID-19 Deaths  Number of Mentions Flag  
0          J09-J18      0-24           1569.0              1647.0  NaN  
1          J09-J18     25-34           5804.0              6029.0  NaN  
2          J09-J18     35-44          15080.0             15699.0  NaN  
3          J09-J18     45-54          37414.0             38878.0  NaN  
4          J09-J18     55-64          82668.0             85708.0  NaN  
...            ...       ...              ...                 ...  ...  
620995        U071  All Ages             67.0                67.0  NaN  
620996        U071  All Ages            122.0               122.0  NaN  
620997        U071  All Ages            114.0               114.0  NaN  
620998        U071  All Ages             78.0                78.0  NaN  
620999        U071  All Ages             36.0                36.0  NaN  

[621000 rows x 14 columns]

In [4]:

# Since the dataframe is named data_1, let's perform the analysis using that correct name

import matplotlib.pyplot as plt
import seaborn as sns

# Frequency distribution of Age Group
age_group_counts_data_1 = data_1['Age Group'].value_counts()

# Bar chart of Age Group counts in 'data_1'
plt.figure(figsize=(10, 6))
plt.bar(age_group_counts_data_1.index, age_group_counts_data_1.values, color='skyblue')
plt.title('Frequency Distribution of Age Groups in Data 1')
plt.xlabel('Age Group')
plt.ylabel('Frequency')
plt.xticks(rotation=45)  # Rotate x-axis labels to show clearly
plt.show()

# Boxplot of COVID-19 Deaths by Age Group in 'data_1'
plt.figure(figsize=(10, 6))
sns.boxplot(x='Age Group', y='COVID-19 Deaths', data=data_1)
plt.title('COVID-19 Deaths by Age Group in Data 1')
plt.xlabel('Age Group')
plt.ylabel('COVID-19 Deaths')
plt.show()

In [5]:

import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart for 'Condition Group' with vertical x-axis labels
plt.figure(figsize=(12, 6))
barplot1 = sns.barplot(x='Condition Group', y='COVID-19 Deaths', hue='Age Group Numeric', data=data_1)
plt.title('COVID-19 Deaths by Condition Group and Age Group')
plt.xlabel('Condition Group')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot1.set_xticklabels(barplot1.get_xticklabels(), rotation=90)  # Rotate x-axis labels
plt.show()

plt.figure(figsize=(12, 6))
barplot2 = sns.barplot(x='State', y='COVID-19 Deaths', hue='Age Group Numeric', data=data_1)
plt.title('COVID-19 Deaths by State and Age Group')
plt.xlabel('State')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot2.set_xticklabels(barplot2.get_xticklabels(), rotation=90)  # Rotate x-axis labels
plt.show()

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 6
      4 # Bar chart for 'Condition Group' with vertical x-axis labels
      5 plt.figure(figsize=(12, 6))
----> 6 barplot1 = sns.barplot(x='Condition Group', y='COVID-19 Deaths', hue='Age Group Numeric', data=data_1)
      7 plt.title('COVID-19 Deaths by Condition Group and Age Group')
      8 plt.xlabel('Condition Group')

File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:2755, in barplot(data, x, y, hue, order, hue_order, estimator, errorbar, n_boot, units, seed, orient, color, palette, saturation, width, errcolor, errwidth, capsize, dodge, ci, ax, **kwargs)
   2752 if estimator is len:
   2753     estimator = "size"
-> 2755 plotter = _BarPlotter(x, y, hue, data, order, hue_order,
   2756                       estimator, errorbar, n_boot, units, seed,
   2757                       orient, color, palette, saturation,
   2758                       width, errcolor, errwidth, capsize, dodge)
   2760 if ax is None:
   2761     ax = plt.gca()

File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:1530, in _BarPlotter.__init__(self, x, y, hue, data, order, hue_order, estimator, errorbar, n_boot, units, seed, orient, color, palette, saturation, width, errcolor, errwidth, capsize, dodge)
   1525 def __init__(self, x, y, hue, data, order, hue_order,
   1526              estimator, errorbar, n_boot, units, seed,
   1527              orient, color, palette, saturation, width,
   1528              errcolor, errwidth, capsize, dodge):
   1529     """Initialize the plotter."""
-> 1530     self.establish_variables(x, y, hue, data, orient,
   1531                              order, hue_order, units)
   1532     self.establish_colors(color, palette, saturation)
   1533     self.estimate_statistic(estimator, errorbar, n_boot, seed)

File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:541, in _CategoricalPlotter.establish_variables(self, x, y, hue, data, orient, order, hue_order, units)
    539     if isinstance(var, str):
    540         err = f"Could not interpret input '{var}'"
--> 541         raise ValueError(err)
    543 # Figure out the plotting orientation
    544 orient = infer_orient(
    545     x, y, orient, require_numeric=self.require_numeric
    546 )

ValueError: Could not interpret input 'Age Group Numeric'

<Figure size 1200x600 with 0 Axes>

In [ ]:

# We will remove 'United States' from the 'State' column and then recreate the bar plot.

# Check if there are other states in the dataset besides 'United States'
unique_states = data_1['State'].unique()

# If 'United States' is the only state, the following code will not be able to create a meaningful plot.
# We'll proceed under the assumption that there are other states in the full dataset.

# Filter out the 'United States' entry from the dataset
data_1_no_us = data_1[data_1['State'] != 'United States']

# Now let's create the bar plot without 'United States'
plt.figure(figsize=(12, 6))
barplot_no_us = sns.barplot(x='State', y='COVID-19 Deaths', hue='Age Group Numeric', data=data_1_no_us)
plt.title('COVID-19 Deaths by State and Age Group (excluding United States)')
plt.xlabel('State')
plt.ylabel('COVID-19 Deaths')
plt.legend(title='Age Group')
barplot_no_us.set_xticklabels(barplot_no_us.get_xticklabels(), rotation=90)  # Rotate x-axis labels
plt.tight_layout()  # This will adjust the plot to make sure everything fits without overlapping
plt.show()

In [ ]:

pip install ydata-profiling

In [ ]:

from ydata_profiling import ProfileReport

ProfileReport(data_1)