import numpy as np
import pandas as pd

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

x = array([ 1.00201077,  1.58251956,  0.94515919,  6.48778002,  1.47764604,
        5.18847071,  4.21988095,  2.85971522,  3.40044437,  3.74907745,
        1.18065796,  3.74748775,  3.27328568,  3.19374927,  8.0726155 ,
        0.90326139,  2.34460034,  2.14199217,  3.27446744,  3.58872357,
        1.20611533,  2.16594393,  5.56610242,  4.66479977,  2.3573932 ])
_ = hist(x, bins=8)

precip = pd.read_table("data/nashville_precip.txt", index_col=0, na_values='NA', delim_whitespace=True)
precip.head()

_ = precip.hist(sharex=True, sharey=True, grid=False)
tight_layout()

precip.fillna(value={'Oct': precip.Oct.mean()}, inplace=True)

precip_mean = precip.mean()
precip_mean

precip_var = precip.var()
precip_var

alpha_mom = precip_mean ** 2 / precip_var
beta_mom = precip_var / precip_mean

alpha_mom, beta_mom

from scipy.stats.distributions import gamma

hist(precip.Jan, normed=True, bins=20)
plot(linspace(0, 10), gamma.pdf(linspace(0, 10), alpha_mom[0], beta_mom[0]))

axs = precip.hist(normed=True, figsize=(12, 8), sharex=True, sharey=True, bins=15, grid=False)

for ax in axs.ravel():
    
    # Get month
    m = ax.get_title()
    
    # Plot fitted distribution
    x = linspace(*ax.get_xlim())
    ax.plot(x, gamma.pdf(x, alpha_mom[m], beta_mom[m]))
    
    # Annotate with parameter estimates
    label = 'alpha = {0:.2f}\nbeta = {1:.2f}'.format(alpha_mom[m], beta_mom[m])
    ax.annotate(label, xy=(10, 0.2))
    
tight_layout()

y = np.random.poisson(5, size=100)
plt.hist(y, bins=12, normed=True)
xlabel('y'); ylabel('Pr(y)')

poisson_like = lambda x, lam: np.exp(-lam) * (lam**x) / (np.arange(x)+1).prod()

lam = 6
value = 10
poisson_like(value, lam)

np.sum(poisson_like(yi, lam) for yi in y)

lam = 8
np.sum(poisson_like(yi, lam) for yi in y)

lambdas = np.linspace(0,15)
x = 5
plt.plot(lambdas, [poisson_like(x, l) for l in lambdas])
xlabel('$\lambda$')
ylabel('L($\lambda$|x={0})'.format(x))

lam = 5
xvals = arange(15)
plt.bar(xvals, [poisson_like(x, lam) for x in xvals])
xlabel('x')
ylabel('Pr(X|$\lambda$=5)')

from scipy.optimize import newton

# some function
func = lambda x: 3./(1 + 400*np.exp(-2*x)) - 1
xvals = np.linspace(0, 6)
plot(xvals, func(xvals))
text(5.3, 2.1, '$f(x)$', fontsize=16)
# zero line
plot([0,6], [0,0], 'k-')
# value at step n
plot([4,4], [0,func(4)], 'k:')
plt.text(4, -.2, '$x_n$', fontsize=16)
# tangent line
tanline = lambda x: -0.858 + 0.626*x
plot(xvals, tanline(xvals), 'r--')
# point at step n+1
xprime = 0.858/0.626
plot([xprime, xprime], [tanline(xprime), func(xprime)], 'k:')
plt.text(xprime+.1, -.2, '$x_{n+1}$', fontsize=16)

from scipy.special import psi, polygamma

dlgamma = lambda m, log_mean, mean_log: np.log(m) - psi(m) - log_mean + mean_log
dl2gamma = lambda m, *args: 1./m - polygamma(1, m)

# Calculate statistics
log_mean = precip.mean().apply(log)
mean_log = precip.apply(log).mean()

# Alpha MLE for December
alpha_mle = newton(dlgamma, 2, dl2gamma, args=(log_mean[-1], mean_log[-1]))
alpha_mle

beta_mle = alpha_mle/precip.mean()[-1]
beta_mle

dec = precip.Dec
dec.hist(normed=True, bins=10, grid=False)
x = linspace(0, dec.max())
plot(x, gamma.pdf(x, alpha_mom[-1], beta_mom[-1]), 'm-')
plot(x, gamma.pdf(x, alpha_mle, beta_mle), 'r--')

from scipy.stats import gamma

gamma.fit(precip.Dec)

x = np.random.normal(size=10000)
a = -1
x_small = x < a
while x_small.sum():
    x[x_small] = np.random.normal(size=x_small.sum())
    x_small = x < a
    
_ = hist(x, bins=100)

from scipy.stats.distributions import norm

trunc_norm = lambda theta, a, x: -(np.log(norm.pdf(x, theta[0], theta[1])) - 
                                      np.log(1 - norm.cdf(a, theta[0], theta[1]))).sum()

from scipy.optimize import fmin

fmin(trunc_norm, np.array([1,2]), args=(-1, x))

# Some random data
y = np.random.random(15) * 10
y

x = np.linspace(0, 10, 100)
# Smoothing parameter
s = 0.4
# Calculate the kernels
kernels = np.transpose([norm.pdf(x, yi, s) for yi in y])
plot(x, kernels, 'k:')
plot(x, kernels.sum(1))
plot(y, np.zeros(len(y)), 'ro', ms=10)

# Create a bi-modal distribution with a mixture of Normals.
x1 = np.random.normal(0, 3, 50)
x2 = np.random.normal(4, 1, 50)

# Append by row
x = np.r_[x1, x2]

plt.hist(x, bins=8, normed=True)

from scipy.stats import kde

density = kde.gaussian_kde(x)
xgrid = np.linspace(x.min(), x.max(), 100)
plt.hist(x, bins=8, normed=True)
plt.plot(xgrid, density(xgrid), 'r-')

cdystonia = pd.read_csv("data/cdystonia.csv")
cdystonia[cdystonia.obs==6].hist(column='twstrs', by=cdystonia.treat, bins=8)


x = np.array([2.2, 4.3, 5.1, 5.8, 6.4, 8.0])
y = np.array([0.4, 10.1, 14.0, 10.9, 15.4, 18.5])
plot(x,y,'ro')

ss = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x) ** 2)

ss([0,1],x,y)

b0,b1 = fmin(ss, [0,1], args=(x,y))
b0,b1

plot(x, y, 'ro')
plot([0,10], [b0, b0+b1*10])

plot(x, y, 'ro')
plot([0,10], [b0, b0+b1*10])
for xi, yi in zip(x,y):
    plot([xi]*2, [yi, b0+b1*xi], 'k:')
xlim(2, 9); ylim(0, 20)

sabs = lambda theta, x, y: np.sum(np.abs(y - theta[0] - theta[1]*x))
b0,b1 = fmin(sabs, [0,1], args=(x,y))
print b0,b1
plot(x, y, 'ro')
plot([0,10], [b0, b0+b1*10])

ss2 = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x - theta[2]*(x**2)) ** 2)
b0,b1,b2 = fmin(ss2, [1,1,-1], args=(x,y))
print b0,b1,b2
plot(x, y, 'ro')
xvals = np.linspace(0, 10, 100)
plot(xvals, b0 + b1*xvals + b2*(xvals**2))

ss3 = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x - theta[2]*(x**2) - theta[3]*(x**3)) ** 2)

bb = pd.read_csv("data/baseball.csv", index_col=0)
plot(bb.hr, bb.rbi, 'r.')
b0,b1,b2,b3 = fmin(ss3, [0,1,-1,0], args=(bb.hr, bb.rbi))
xvals = arange(40)
plot(xvals, b0 + b1*xvals + b2*(xvals**2) + b3*(xvals**3))

import statsmodels.api as sm

straight_line = sm.OLS(y, sm.add_constant(x)).fit()
straight_line.summary()

from statsmodels.formula.api import ols as OLS

data = pd.DataFrame(dict(x=x, x2=x**2, y=y))
cubic_fit = OLS('y ~ x + x2', data).fit()

cubic_fit.summary()


def calc_poly(params, data):
        x = np.c_[[data**i for i in range(len(params))]]
        return np.dot(params, x)
    
ssp = lambda theta, x, y: np.sum((y - calc_poly(theta, x)) ** 2)
betas = fmin(ssp, np.zeros(10), args=(x,y), maxiter=1e6)
plot(x, y, 'ro')
xvals = np.linspace(0, max(x), 100)
plot(xvals, calc_poly(betas, xvals))

n = len(x)

aic = lambda rss, p, n: n * np.log(rss/(n-p-1)) + 2*p

RSS1 = ss(fmin(ss, [0,1], args=(x,y)), x, y)
RSS2 = ss2(fmin(ss2, [1,1,-1], args=(x,y)), x, y)

print aic(RSS1, 2, n), aic(RSS2, 3, n)

titanic = pd.read_excel("data/titanic.xls", "titanic")
titanic.name

jitter = np.random.normal(scale=0.02, size=len(titanic))
plt.scatter(np.log(titanic.fare), titanic.survived + jitter, alpha=0.3)
yticks([0,1])
ylabel("survived")
xlabel("log(fare)")

x = np.log(titanic.fare[titanic.fare>0])
y = titanic.survived[titanic.fare>0]
betas_titanic = fmin(ss, [1,1], args=(x,y))

jitter = np.random.normal(scale=0.02, size=len(titanic))
plt.scatter(np.log(titanic.fare), titanic.survived + jitter, alpha=0.3)
yticks([0,1])
ylabel("survived")
xlabel("log(fare)")
plt.plot([0,7], [betas_titanic[0], betas_titanic[0] + betas_titanic[1]*7.])

logit = lambda p: np.log(p/(1.-p))
unit_interval = np.linspace(0,1)
plt.plot(unit_interval/(1-unit_interval), unit_interval)

plt.plot(logit(unit_interval), unit_interval)

invlogit = lambda x: 1. / (1 + np.exp(-x))

def logistic_like(theta, x, y):
    p = invlogit(theta[0] + theta[1] * x)
    # Return negative of log-likelihood
    return -np.sum(y * np.log(p) + (1-y) * np.log(1 - p))

x, y = titanic[titanic.fare.notnull()][['fare', 'survived']].values.T

b0,b1 = fmin(logistic_like, [0.5,0], args=(x,y))
b0, b1

jitter = np.random.normal(scale=0.01, size=len(x))
plot(x, y+jitter, 'r.', alpha=0.3)
yticks([0,.25,.5,.75,1])
xvals = np.linspace(0, 600)
plot(xvals, invlogit(b0+b1*xvals))

logistic = sm.GLM(y, sm.add_constant(x), family=sm.families.Binomial()).fit()
logistic.summary()


np.random.permutation(titanic.name)[:5]

random_ind = np.random.randint(0, len(titanic), 5)
titanic.name[random_ind]

n = 10
R = 1000
# Original sample (n=10)
x = np.random.normal(size=n)
# 1000 bootstrap samples of size 10
s = [x[np.random.randint(0,n,n)].mean() for i in range(R)]
_ = hist(s, bins=30)

boot_mean = np.sum(s)/R
boot_mean

boot_var = ((np.array(s) - boot_mean) ** 2).sum() / (R-1)
boot_var

boot_mean - np.mean(x)

s_sorted = np.sort(s)
s_sorted[:10]

s_sorted[-10:]

alpha = 0.05
s_sorted[[(R+1)*alpha/2, (R+1)*(1-alpha/2)]]