''''
-------------------------------------------------------------------------------
Filename   : ch2.ipynb
Date       : 2012-04-30
Author     : C. Vogel
Purpose    : Replicate analysis of height and weight data in Chapter 2 of 
           : _Machine Learning for Hackers_.
Input Data : 01_heights_weights_genders.tsv is available at the book's github 
           : repository at https://github.com/johnmyleswhite/ML_for_Hackers.git
Libraries  : Numpy 1.6.1, Matplotlib 1.1.0, Pandas 0.7.3, scipy 0.10.0, 
           : statsmodels 0.4.0
-------------------------------------------------------------------------------

This notebook is a Python port of the R code in Chapter 2 of _Machine Learning
for Hackers_ by D. Conway and J.M. White.

Running the notebook will produce 9 PNG figures and save them to the working 
directory.

The height/weight dataset CSV file should be located in a /data/ subfolder of 
the working directory. 

For a detailed description of the analysis and the process of porting it
to Python, see: slendrmeans.wordpress.com/will-it-python.
'''

import numpy as np
from pandas import *
import matplotlib.pyplot as plt
import os
import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDE
from statsmodels.nonparametric import lowess
from statsmodels.api import GLM, Logit

# Numeric Summaries
# p. 37

# Import the height and weights data
heights_weights = read_table('data/01_heights_weights_genders.csv', sep = ',', header = 0)

# Assign the heights column to its own series, and describe it.
heights = heights_weights['Height']
heights.describe()

# Means, medians, and modes (p. 38)

def my_mean(x):
    return float(np.sum(x)) / len(x)

def my_median(x):
    '''
    Compute the median of a series x.
    '''
    
    # Get a sorted copy of the values in the series (need to call values
    # otherwise integer indexing messes things up.)
    sorted_x = np.sort(x.values)
    if len(x) % 2 == 0:
        indices = [0.5 * len(x) - 1, 0.5 * len(x)]
        return np.mean(sorted_x[indices])
    else:
        # Ceil(x) - 1 = Floor(x), but this is to make clear that the -1 is to
        # account for 0-based counting.
        index = ceil(0.5 * len(x)) - 1
        return sorted_x.ix[index]

# Check my_mean and my_median against built-ins

my_mean(heights) - heights.mean()

my_median(heights) - heights.median()

# Quantiles (p. 40)

heights.min(), heights.max()

# Range = max - min. Note: np.ptp(heights.values) will do the same thing.
# HT Nathaniel Smith

def my_range(s):
    '''
    Difference between the max and min of an array or Series
    '''
    return s.max() - s.min()

my_range(heights)

# Similarly, pandas doesn't seem to provide multiple quantiles. 
# But (1) the standard ones are available via .describe() and
# (2) creating one is simple.

# To get a single quantile
heights.quantile(.5)

# Function to get arbitrary quantiles of a series.
def my_quantiles(s, prob = (0.0, 0.25, 0.5, 1.0)):
    '''
    Calculate quantiles of a series.

    Parameters:
    -----------
    s : a pandas Series 
    prob : a tuple (or other iterable) of probabilities at 
           which to compute quantiles. Must be an iterable,
           even for a single probability (e.g. prob = (0.50)
           not prob = 0.50).

    Returns:
    --------
    A pandas series with the probabilities as an index.
    '''
    q = [s.quantile(p) for p in prob]
    return Series(q, index = prob)

# With the default prob argument   
my_quantiles(heights)

# With a specific prob argument - here deciles
my_quantiles(heights, prob = arange(0, 1.1, 0.1))

 # Standard deviation and variances

def my_var(x):
    return np.sum((x - x.mean())**2) / (len(x) - 1)

my_var(heights) - heights.var()

def my_sd(x):
    return np.sqrt(my_var(x))

my_sd(heights) - heights.std()

# Exploratory Data Visualization (p. 44)

# Histograms

# 1-inch bins
bins1 = np.arange(heights.min(), heights.max(), 1.)
heights.hist(bins = bins1, fc = 'steelblue')
plt.savefig('height_hist_bins1.png')

# 5-inch bins
bins5 = np.arange(heights.min(), heights.max(), 5.)
heights.hist(bins = bins5, fc = 'steelblue')
plt.savefig('height_hist_bins5.png')

# 0.001-inch bins
bins001 = np.arange(heights.min(), heights.max(), .001)
heights.hist(bins = bins001, fc = 'steelblue')
plt.savefig('height_hist_bins001.png')

# Kernel density estimators, from scipy.stats.

# Create a KDE ojbect
heights_kde = KDE(heights)
# Use fit() to estimate the densities. Default is gaussian kernel 
# using fft. This will provide a "density" attribute.
heights_kde.fit()

# Plot the density of the heights
# Sort inside the plotting so the lines connect nicely.
fig = plt.figure()
plt.plot(heights_kde.support, heights_kde.density)
plt.savefig('heights_density.png')

# Pull out male and female heights as arrays over which to compute densities

heights_m = heights[heights_weights['Gender'] == 'Male'].values
heights_f = heights[heights_weights['Gender'] == 'Female'].values
heights_m_kde = KDE(heights_m)
heights_f_kde = KDE(heights_f)
heights_m_kde.fit()
heights_f_kde.fit()

fig = plt.figure()
plt.plot(heights_m_kde.support, heights_m_kde.density, label = 'Male')
plt.plot(heights_f_kde.support, heights_f_kde.density, label = 'Female')
plt.legend()
plt.savefig('height_density_bysex.png')                

# Do the same thing with weights.
weights_m = heights_weights[heights_weights['Gender'] == 'Male']['Weight'].values
weights_f = heights_weights[heights_weights['Gender'] == 'Female']['Weight'].values
weights_m_kde = KDE(weights_m)
weights_f_kde = KDE(weights_f)
weights_m_kde.fit()
weights_f_kde.fit()


fig = plt.figure()
plt.plot(weights_m_kde.support, weights_f_kde.density, label = 'Male')
plt.plot(weights_f_kde.support, weights_f_kde.density, label = 'Female')
plt.legend()
plt.savefig('weight_density_bysex.png')

# Subplot weight density by sex.
fig, axes = plt.subplots(nrows = 2, ncols = 1, sharex = True, figsize = (9, 6))
plt.subplots_adjust(hspace = 0.1)
axes[0].plot(weights_f_kde.support, weights_f_kde.density, label = 'Female')
axes[0].xaxis.tick_top()
axes[0].legend()
axes[1].plot(weights_m_kde.support, weights_f_kde.density, label = 'Male')
axes[1].legend()
plt.savefig('weight_density_bysex_subplot.png')

# Scatter plot. Pull weight (both sexes) out as a separate array first, like 
# we did with height above.

weights = heights_weights['Weight']
plt.plot(heights, weights, '.k', mew = 0, alpha = .1)
plt.savefig('height_weight_scatter.png')

# Lowess smoothing - this seems to be new functionality not yet in docs (as of 0.40, April 2012).

lowess_line = lowess.lowess(weights, heights)

plt.figure(figsize = (13, 9))
plt.plot(heights, weights, '.', mfc = 'steelblue', mew=0, alpha = .25)
plt.plot(lowess_line[:,0], lowess_line[:, 1],  '-', color = '#461B7E', label = "Lowess fit")
plt.legend(loc = "upper left")
plt.savefig('height_weight_lowess.png')

# Logistic regression of sex on height and weight
# Sex is coded in the binary variable `male`.

# LHS binary variable
male = (heights_weights['Gender'] == 'Male') * 1

# Matrix of predictor variables: hieght and weight from data frame
# into an Nx2 array.
hw_exog = heights_weights[['Height', 'Weight']].values

# Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
# Note I have to add constants to the `exog` matrix. The prepend = True
# argument prevents a warning about future change to the default argument.
logit_model = GLM(male, sm.add_constant(hw_exog, prepend = True), family = sm.families.Binomial(sm.families.links.logit))
logit_model.fit().summary()

# Get the coefficient parameters.
logit_pars = logit_model.fit().params

# Logit model 2: Using the Logit function.
logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend = True))
logit_model2.fit().summary()

# Get the coefficient parameters
logit_pars2 = logit_model2.fit().params

# Compare the two methods again. They give the same parameters.
DataFrame({'GLM' : logit_pars, 'Logit' : logit_pars2})

# Draw a separating line in the [height, weight]-space.
# The line will separate the space into predicted-male
# and predicted-female regions.

# Get the intercept and slope of the line based on the logit coefficients 
intercept = -logit_pars['const'] / logit_pars['x2']
slope =  -logit_pars['x1'] / logit_pars['x2']

# Plot the data and the separating line
# Color code male and female points.
fig = plt.figure(figsize = (10, 8))
plt.plot(heights_f, weights_f, '.', label = 'Female', mew = 0, mfc='coral', alpha = .1)
plt.plot(heights_m, weights_m, '.', label = 'Male', mew = 0, mfc='steelblue', alpha = .1)
plt.plot(array([50, 80]), intercept + slope * array([50, 80]), '-', color = '#461B7E')
plt.legend(loc='upper left')
plt.savefig('height_weight_class.png')