#!/usr/bin/env python
# coding: utf-8

# # Read, clean, and validate
# > A Summary of lecture "Exploratory Data Analysis in Python", via datacamp
#
# - toc: true
# - badges: true
# - comments: true
# - author: Chanseok Kang
# - categories: [Python, Datacamp]
# - image: images/conception.png

# ## DataFrames and Series
#
# ### Exploring the NSFG data

# In[1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# In[2]:


nsfg = pd.read_hdf('./dataset/nsfg.hdf5', 'nsfg')


# In[3]:


# Display the number of rows and columns
print(nsfg.shape)

# Display the names of the columns
print(nsfg.columns)

# Select column birthwgt_oz1 (the ounces part of the birth weight)
ounces = nsfg['birthwgt_oz1']

# Print the first 5 elements of ounces
print(ounces.head(5))


# ## Clean and Validate
# ### Clean a variable

# In[4]:


# Notebook display cell: the result is only shown when run interactively.
nsfg['nbrnaliv'].value_counts()


# In[5]:


# Replace the value 8 with NaN.
# The result is assigned back to the column instead of calling
# ``replace(..., inplace=True)`` on the selection: the chained in-place form
# operates on a temporary object and does not update ``nsfg`` when pandas
# Copy-on-Write is active (and emits a FutureWarning on pandas 2.2).
nsfg['nbrnaliv'] = nsfg['nbrnaliv'].replace([8], np.nan)

# Print the values and their frequencies
print(nsfg['nbrnaliv'].value_counts())


# ### Compute a variable

# In[6]:


# Notebook display cell: the result is only shown when run interactively.
nsfg['agecon'].describe()


# In[7]:


# Notebook display cell: the result is only shown when run interactively.
nsfg['agepreg'].describe()


# In[8]:


# Select the columns and divide by 100
agecon = nsfg['agecon'] / 100
agepreg = nsfg['agepreg'] / 100

# Compute the difference
preg_length = agepreg - agecon

# Compute summary statistics
print(preg_length.describe())


# ## Filter and visualize
# ### Make a histogram

# In[9]:


# Start a fresh figure so this plot never shares axes with another one
plt.figure()

# Plot the histogram
plt.hist(agecon, bins=20)

# Label the axes
plt.xlabel("Age at conception")
plt.ylabel('Number of pregnancies')
plt.savefig('../images/conception.png')


# In[20]:


# Start a fresh figure: without it, running this file as a script would draw
# the step histogram on top of the filled histogram created above.
plt.figure()

# Plot the histogram
plt.hist(agecon, bins=20, histtype='step')

# Label the axes
plt.xlabel("Age at conception")
plt.ylabel('Number of pregnancies')


# ### Compute birth weight

# In[21]:


def resample_rows_weighted(df, column='wgt2013_2015'):
    """Resample a DataFrame with probabilities proportional to a column.

    Draws ``len(df)`` rows with replacement. Each row's chance of being
    drawn is its value in ``column`` divided by the column total. The draw
    uses NumPy's global random state (``np.random.choice``), so results are
    reproducible only if the caller seeds it beforehand.

    Args:
        df: DataFrame to resample.
        column: Name of the column whose values are used as sampling weights.

    Returns:
        A DataFrame with ``len(df)`` rows. Rows keep their original index
        labels, so the returned index generally contains duplicates.
    """
    # Normalise the weights so they sum to 1, as required for ``p``.
    weights = df[column] / df[column].sum()

    # Sample row *positions* rather than index labels. With a unique index
    # this selects exactly the same rows as sampling labels and using
    # ``.loc``, but it stays correct when ``df`` already has duplicate
    # labels (e.g. the output of a previous resampling), where ``.loc``
    # would return every row matching each drawn label.
    positions = np.random.choice(
        len(df), len(df), replace=True, p=weights.to_numpy()
    )
    return df.iloc[positions]


# In[22]:


# Resample the data
nsfg = resample_rows_weighted(nsfg, 'wgt2013_2015')

# Clean the weight variables (98 and 99 are replaced with NaN)
pounds = nsfg['birthwgt_lb1'].replace([98, 99], np.nan)
ounces = nsfg['birthwgt_oz1'].replace([98, 99], np.nan)

# Compute total birth weight (pounds plus ounces converted to pounds)
birth_weight = pounds + ounces/16


# In[25]:


# Create a Boolean Series for full-term babies
full_term = nsfg['prglngth'] >= 37

# Select the weights of full-term babies
full_term_weight = birth_weight[full_term]

# Compute the mean weight of full-term babies
print(full_term_weight.mean())


# ### Filter

# In[26]:


# Filter full-term babies
full_term = nsfg['prglngth'] >= 37

# Filter single birth
single = nsfg['nbrnaliv'] == 1

# Compute birth weight for single full-term babies
single_full_term_weight = birth_weight[single & full_term]
print('Single full-term mean:', single_full_term_weight.mean())

# Compute birth weight for multiple full-term babies
# NOTE(review): ``~single`` is also True for rows whose 'nbrnaliv' is NaN
# (a NaN never compares equal to 1), so those rows are selected here too.
mult_full_term_weight = birth_weight[~single & full_term]
print('Multiple full-term mean:', mult_full_term_weight.mean())