#!/usr/bin/env python
# coding: utf-8

# # Read, clean, and validate
# > A summary of the lecture "Exploratory Data Analysis in Python", via DataCamp
# 
# - toc: true 
# - badges: true
# - comments: true
# - author: Chanseok Kang
# - categories: [Python, Datacamp]
# - image: images/conception.png

# ## DataFrames and Series
# 

# ### Exploring the NSFG data

# In[1]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# In[2]:


# Load the object stored under the 'nsfg' key of the HDF5 file
nsfg = pd.read_hdf('./dataset/nsfg.hdf5', key='nsfg')


# In[3]:


# Show the dataset dimensions as (rows, columns)
print(nsfg.shape)

# Show the column labels
print(nsfg.columns)

# Pull out the 'birthwgt_oz1' column as `ounces`
ounces = nsfg.loc[:, 'birthwgt_oz1']

# Show the first five values of `ounces`
print(ounces.head())


# ## Clean and Validate

# ### Clean a variable

# In[4]:


# Inspect the value frequencies of 'nbrnaliv' before cleaning
nsfg['nbrnaliv'].value_counts()


# In[5]:


# Replace the value 8 with NaN.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected Series: the in-place form acts on
# an intermediate object, which pandas flags as chained assignment and which
# leaves the DataFrame unchanged when Copy-on-Write is active.
nsfg['nbrnaliv'] = nsfg['nbrnaliv'].replace(8, np.nan)

# Print the values and their frequencies after cleaning
print(nsfg['nbrnaliv'].value_counts())


# ### Compute a variable

# In[6]:


# Summary statistics of the raw 'agecon' column
nsfg['agecon'].describe()


# In[7]:


# Summary statistics of the raw 'agepreg' column
nsfg['agepreg'].describe()


# In[8]:


# Scale both columns down by a factor of 100
agecon = nsfg['agecon'].div(100)
agepreg = nsfg['agepreg'].div(100)

# Element-wise difference between the two scaled columns
preg_length = agepreg.sub(agecon)

# Print summary statistics of the difference
print(preg_length.describe())


# ## Filter and visualize

# ### Make a histogram

# In[9]:


# Draw a 20-bin histogram of `agecon` on pyplot's current axes
conception_ax = plt.gca()
conception_ax.hist(agecon, bins=20)

# Label the axes, then write the current figure to disk
conception_ax.set_xlabel("Age at conception")
conception_ax.set_ylabel('Number of pregnancies')
plt.savefig('../images/conception.png')


# In[20]:


# Draw the same data again with histtype='step' on pyplot's current axes
step_ax = plt.gca()
step_ax.hist(agecon, bins=20, histtype='step')

# Label the axes
step_ax.set_xlabel("Age at conception")
step_ax.set_ylabel('Number of pregnancies')


# ### Compute birth weight

# In[21]:


def resample_rows_weighted(df, column='wgt2013_2015'):
    """Resample the rows of a DataFrame with probability proportional to a column.

    Draws ``len(df)`` rows with replacement through ``np.random.choice``,
    which uses NumPy's global random state; call ``np.random.seed`` first
    for reproducible output. Rows are picked by position, so the result
    has exactly ``len(df)`` rows even when ``df`` contains repeated index
    labels. The input DataFrame is not modified.

    Args:
        df: DataFrame to resample.
        column: Name of the column holding the sampling weights.

    Returns:
        DataFrame with ``len(df)`` rows drawn from ``df``. Sampled rows keep
        their original index labels, so labels may repeat. An empty input
        yields an empty DataFrame.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
        ValueError: Raised by ``np.random.choice`` when the normalized
            weights are not valid probabilities (for example negative or
            NaN entries, or a total weight of zero).
    """
    weights = df[column]
    if len(df) == 0:
        # Nothing to draw from; normalizing would only produce an opaque
        # "probabilities do not sum to 1" error.
        return df.iloc[:0].copy()

    # Division builds a new Series, so the caller's column stays untouched.
    probabilities = weights / weights.sum()

    # Sample row positions rather than index labels: a label-based lookup
    # returns every row sharing a label when the index has duplicates.
    positions = np.random.choice(len(df), size=len(df), replace=True, p=probabilities)
    return df.iloc[positions]


# In[22]:


# Draw a weighted resample of the rows, using 'wgt2013_2015' as the weights
nsfg = resample_rows_weighted(nsfg, column='wgt2013_2015')

# Replace the values 98 and 99 with NaN in both weight columns
pounds = nsfg['birthwgt_lb1'].replace(to_replace=[98, 99], value=np.nan)
ounces = nsfg['birthwgt_oz1'].replace(to_replace=[98, 99], value=np.nan)

# Total birth weight: pounds plus ounces divided by 16
birth_weight = pounds.add(ounces.div(16))


# In[25]:


# Boolean mask of full-term babies ('prglngth' of at least 37)
full_term = nsfg['prglngth'].ge(37)

# Keep only the weights where the mask is True
full_term_weight = birth_weight.loc[full_term]

# Print the mean weight of full-term babies
print(full_term_weight.mean())


# ### Filter

# In[26]:


# Boolean mask of full-term babies ('prglngth' of at least 37)
full_term = nsfg['prglngth'].ge(37)

# Boolean mask of single births ('nbrnaliv' equal to 1)
single = nsfg['nbrnaliv'].eq(1)

# Mean birth weight for single full-term babies
single_and_full_term = single & full_term
single_full_term_weight = birth_weight.loc[single_and_full_term]
print('Single full-term mean:', single_full_term_weight.mean())

# Mean birth weight for multiple full-term babies.
# NOTE: a NaN in 'nbrnaliv' compares unequal to 1, so any such row is
# counted in this "multiple" group by the negated mask.
multiple_and_full_term = (~single) & full_term
mult_full_term_weight = birth_weight.loc[multiple_and_full_term]
print('Multiple full-term mean:', mult_full_term_weight.mean())