%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

x = np.array([1.00201077, 1.58251956, 0.94515919, 6.48778002, 1.47764604,
              5.18847071, 4.21988095, 2.85971522, 3.40044437, 3.74907745,
              1.18065796, 3.74748775, 3.27328568, 3.19374927, 8.0726155 ,
              0.90326139, 2.34460034, 2.14199217, 3.27446744, 3.58872357,
              1.20611533, 2.16594393, 5.56610242, 4.66479977, 2.3573932 ])
_ = plt.hist(x, bins=10)

precip = pd.read_csv("data/nashville_precip.txt", index_col=0,
                     na_values='NA', sep=r'\s+')
precip.head()

_ = precip.hist(sharex=True, sharey=True, grid=False, figsize=(10, 6))
plt.tight_layout()

from IPython.display import IFrame
IFrame('http://en.wikipedia.org/wiki/Gamma_distribution?useformat=mobile',
       width=600, height=350)

precip.Oct.loc[1960:1970]

precip.fillna(value={'Oct': precip.Oct.mean()}, inplace=True)

precip_mean = precip.mean()
precip_mean

precip_var = precip.var()
precip_var

# Method-of-moments estimates for the gamma distribution:
# alpha = mean^2 / variance; beta = variance / mean (a scale parameter)
alpha_mom = precip_mean ** 2 / precip_var
beta_mom = precip_var / precip_mean

estimates = pd.DataFrame({'alpha_mom': alpha_mom, 'beta_mom': beta_mom})
estimates

from scipy.stats import gamma

# beta_mom is a scale parameter, so it is passed as `scale`, not as `loc`
xvals = np.linspace(0, 10)
yvals = gamma.pdf(xvals, alpha_mom.iloc[0], scale=beta_mom.iloc[0])
precip.Jan.hist(density=True, bins=20)
plt.plot(xvals, yvals)

axs = precip.hist(density=True, figsize=(12, 8), sharex=True, sharey=True,
                  bins=15, grid=False)
for ax in axs.ravel():
    # Get month
    m = ax.get_title()
    # Plot fitted distribution
    x = np.linspace(*ax.get_xlim())
    ax.plot(x, gamma.pdf(x, alpha_mom[m], scale=beta_mom[m]))
    # Annotate with parameter estimates
    label = 'alpha = {0:.2f}\nbeta = {1:.2f}'.format(alpha_mom[m], beta_mom[m])
    ax.annotate(label, xy=(10, 0.2))
plt.tight_layout()

y = np.random.poisson(5, size=100)
plt.hist(y, bins=int(np.sqrt(len(y))), density=True)
plt.xlabel('y'); plt.ylabel('Pr(y)')

# Poisson likelihood of a single observation x given rate lam
poisson_like = lambda x, lam: np.exp(-lam) * (lam**x) / (np.arange(x) + 1).prod()

lam = 6
value = 10
poisson_like(value, lam)

# Sum the likelihood over the sample at two candidate values of lambda
sum(poisson_like(yi, lam) for yi in y)

lam = 8
sum(poisson_like(yi, lam) for yi in y)

lambdas = np.linspace(0, 15)
x = 5
plt.plot(lambdas, [poisson_like(x, l) for l in lambdas])
plt.xlabel(r'$\lambda$')
plt.ylabel(r'L($\lambda$|x={0})'.format(x));

lam = 5
xvals = np.arange(15)
yvals = [poisson_like(x, lam) for x in xvals]
plt.bar(xvals, yvals)
plt.xlabel('x')
plt.ylabel(r'Pr(X|$\lambda$=5)');

from scipy.optimize import newton

# some function
func = lambda x: 3./(1 + 400*np.exp(-2*x)) - 1
xvals = np.linspace(0, 6)
plt.plot(xvals, func(xvals))
plt.text(5.3, 2.1, '$f(x)$', fontsize=16)
# zero line
plt.plot([0, 6], [0, 0], 'k-')
# value at step n
plt.plot([4, 4], [0, func(4)], 'k:')
plt.text(4, -.2, '$x_n$', fontsize=16)
# tangent line
tanline = lambda x: -0.858 + 0.626*x
plt.plot(xvals, tanline(xvals), 'r--')
# point at step n+1
xprime = 0.858/0.626
plt.plot([xprime, xprime], [tanline(xprime), func(xprime)], 'k:')
plt.text(xprime + .1, -.2, '$x_{n+1}$', fontsize=16)

from scipy.special import psi, polygamma

# First and second derivatives of the gamma log-likelihood in the shape
# parameter (psi is the digamma function, polygamma(1, .) the trigamma)
dlgamma = lambda m, log_mean, mean_log: np.log(m) - psi(m) - log_mean + mean_log
dl2gamma = lambda m, *args: 1./m - polygamma(1, m)

# Calculate statistics
log_mean = precip.mean().apply(np.log)
mean_log = precip.apply(np.log).mean()

# Alpha MLE for December (the last column)
alpha_mle = newton(dlgamma, 2, dl2gamma,
                   args=(log_mean.iloc[-1], mean_log.iloc[-1]))
alpha_mle

# The corresponding beta is a *rate* parameter: beta = alpha / mean
beta_mle = alpha_mle / precip.mean().iloc[-1]
beta_mle
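# As a cross-check on the Newton-Raphson estimate, the negative gamma
# log-likelihood can be minimized directly over (shape, rate). This is a
# sketch, not part of the original analysis; the result should agree closely
# with (alpha_mle, beta_mle) above.
from scipy.optimize import minimize
from scipy.special import gammaln

def gamma_nloglike(params, data):
    a, b = params  # shape and rate
    # log f(x) = a*log(b) - log Gamma(a) + (a-1)*log(x) - b*x
    return -np.sum(a*np.log(b) - gammaln(a) + (a - 1)*np.log(data) - b*data)

opt = minimize(gamma_nloglike, x0=[2., 1.],
               args=(precip.Dec.dropna().values,),
               bounds=[(1e-6, None), (1e-6, None)], method='L-BFGS-B')
opt.x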
dec = precip.Dec
dec.hist(density=True, bins=10, grid=False)
x = np.linspace(0, dec.max())
# Method-of-moments fit: beta_mom is a scale, passed as `scale`
plt.plot(x, gamma.pdf(x, alpha_mom.iloc[-1], scale=beta_mom.iloc[-1]), 'm-')
# MLE fit: beta_mle is a rate, so the scale is its reciprocal
plt.plot(x, gamma.pdf(x, alpha_mle, scale=1./beta_mle), 'r--')

# gamma.fit returns (shape, loc, scale); with loc fixed at 0,
# 1/scale is the estimated rate
ahat, _, bhat = gamma.fit(precip.Dec, floc=0)
ahat, 1./bhat

# Some random data
y = np.random.random(15) * 10
y

from scipy.stats import norm

# Create an array of x-values
x = np.linspace(y.min() - 1, y.max() + 1, 100)

# Smoothing parameter
s = 0.4

# Calculate the kernels
kernels = np.transpose([norm.pdf(x, yi, s) for yi in y])
plt.plot(x, kernels, 'k:')
plt.plot(x, kernels.sum(1))
plt.plot(y, np.zeros(len(y)), 'ro', ms=10)

# Create a bi-modal distribution with a mixture of Normals
x1 = np.random.normal(0, 2, 50)
x2 = np.random.normal(4, 1, 50)

# Append by row
x = np.r_[x1, x2]
plt.hist(x, bins=10, density=True)

from scipy.stats import gaussian_kde
density = gaussian_kde(x)
xgrid = np.linspace(x.min(), x.max(), 100)
plt.hist(x, bins=10, density=True)
plt.plot(xgrid, density(xgrid), 'r-')

x = np.array([2.2, 4.3, 5.1, 5.8, 6.4, 8.0])
y = np.array([0.4, 10.1, 14.0, 10.9, 15.4, 18.5])
plt.plot(x, y, 'ro')

# Sum of squared residuals for a straight line
ss = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x) ** 2)
ss([0, 1], x, y)

from scipy.optimize import fmin
b0, b1 = fmin(ss, [0, 1], args=(x, y))
b0, b1

plt.plot(x, y, 'ro')
plt.plot([0, 10], [b0, b0 + b1*10])

plt.plot(x, y, 'ro')
plt.plot([0, 10], [b0, b0 + b1*10])
for xi, yi in zip(x, y):
    plt.plot([xi]*2, [yi, b0 + b1*xi], 'k:')
plt.xlim(2, 9); plt.ylim(0, 20)

# Sum of absolute residuals (least absolute deviations)
sabs = lambda theta, x, y: np.sum(np.abs(y - theta[0] - theta[1]*x))
b0, b1 = fmin(sabs, [0, 1], args=(x, y))
print(b0, b1)
plt.plot(x, y, 'ro')
plt.plot([0, 10], [b0, b0 + b1*10])

# Quadratic model
ss2 = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x - theta[2]*(x**2)) ** 2)
b0, b1, b2 = fmin(ss2, [1, 1, -1], args=(x, y))
print(b0, b1, b2)
plt.plot(x, y, 'ro')
xvals = np.linspace(0, 10, 100)
plt.plot(xvals, b0 + b1*xvals + b2*(xvals**2))

# Cubic model
ss3 = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x - theta[2]*(x**2) - theta[3]*(x**3)) ** 2)

bb = pd.read_csv("data/baseball.csv", index_col=0)
plt.plot(bb.hr, bb.rbi, 'r.')
b0, b1, b2, b3 = fmin(ss3, [0, 1, -1, 0], args=(bb.hr, bb.rbi))
xvals = np.arange(40)
plt.plot(xvals, b0 + b1*xvals + b2*(xvals**2) + b3*(xvals**3))

straight_line = sm.OLS(y, sm.add_constant(x)).fit()
straight_line.summary()

from statsmodels.formula.api import ols as OLS
data = pd.DataFrame(dict(x=x, y=y))
# Quadratic model expressed as a formula
quad_fit = OLS('y ~ x + I(x**2)', data).fit()
quad_fit.summary()

def calc_poly(params, data):
    # Design matrix with one row per power of the input
    x = np.c_[[data**i for i in range(len(params))]]
    return np.dot(params, x)

ssp = lambda theta, x, y: np.sum((y - calc_poly(theta, x)) ** 2)
betas = fmin(ssp, np.zeros(10), args=(x, y), maxiter=int(1e6))
plt.plot(x, y, 'ro')
xvals = np.linspace(0, max(x), 100)
plt.plot(xvals, calc_poly(betas, xvals))

n = len(x)
aic = lambda rss, p, n: n * np.log(rss/(n - p - 1)) + 2*p

RSS1 = ss(fmin(ss, [0, 1], args=(x, y)), x, y)
RSS2 = ss2(fmin(ss2, [1, 1, -1], args=(x, y)), x, y)
print(aic(RSS1, 2, n), aic(RSS2, 3, n))

titanic = pd.read_excel("data/titanic.xls", "titanic")
titanic.name

jitter = np.random.normal(scale=0.02, size=len(titanic))
plt.scatter(np.log(titanic.fare), titanic.survived + jitter, alpha=0.3)
plt.yticks([0, 1])
plt.ylabel("survived")
plt.xlabel("log(fare)")

x = np.log(titanic.fare[titanic.fare > 0])
y = titanic.survived[titanic.fare > 0]
betas_titanic = fmin(ss, [1, 1], args=(x, y))

jitter = np.random.normal(scale=0.02, size=len(titanic))
plt.scatter(np.log(titanic.fare), titanic.survived + jitter, alpha=0.3)
plt.yticks([0, 1])
plt.ylabel("survived")
plt.xlabel("log(fare)")
plt.plot([0, 7], [betas_titanic[0], betas_titanic[0] + betas_titanic[1]*7.])
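# One reason to move beyond the straight-line fit: a line is unbounded, so its
# fitted values need not lie in [0, 1]. A quick check, using the least-squares
# estimates above:
preds = betas_titanic[0] + betas_titanic[1] * x
print(preds.min(), preds.max())  # values outside [0, 1] are not valid probabilities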
logit = lambda p: np.log(p/(1. - p))
unit_interval = np.linspace(0, 1)
plt.plot(unit_interval/(1 - unit_interval), unit_interval)
plt.plot(logit(unit_interval), unit_interval)

invlogit = lambda x: 1. / (1 + np.exp(-x))

def logistic_like(theta, x, y):
    p = invlogit(theta[0] + theta[1] * x)
    # Return negative of log-likelihood
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

x, y = titanic[titanic.fare.notnull()][['fare', 'survived']].values.T
b0, b1 = fmin(logistic_like, [0.5, 0], args=(x, y))
b0, b1

jitter = np.random.normal(scale=0.01, size=len(x))
plt.plot(x, y + jitter, 'r.', alpha=0.3)
plt.yticks([0, .25, .5, .75, 1])
xvals = np.linspace(0, 600)
plt.plot(xvals, invlogit(b0 + b1*xvals))

logistic = sm.GLM(y, sm.add_constant(x), family=sm.families.Binomial()).fit()
logistic.summary()

np.random.permutation(titanic.name)[:5]

random_ind = np.random.randint(0, len(titanic), 5)
titanic.name.iloc[random_ind]

n = 10
R = 1000

# Original sample (n=10)
x = np.random.normal(size=n)

# 1000 bootstrap samples of size 10
s = [x[np.random.randint(0, n, n)].mean() for i in range(R)]
_ = plt.hist(s, bins=30)

boot_mean = np.sum(s)/R
boot_mean

boot_var = ((np.array(s) - boot_mean) ** 2).sum() / (R - 1)
boot_var

# Bootstrap estimate of the bias of the sample mean
boot_mean - np.mean(x)

s_sorted = np.sort(s)
s_sorted[:10]
s_sorted[-10:]

# 95% bootstrap percentile interval (indices must be integers)
alpha = 0.05
s_sorted[[int((R + 1)*alpha/2), int((R + 1)*(1 - alpha/2))]]
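# Equivalently (a sketch), np.percentile recovers essentially the same 95%
# interval from the raw bootstrap means, without manual sorting and index
# arithmetic:
np.percentile(s, [100*alpha/2, 100*(1 - alpha/2)])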