%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

x = np.array([1.00201077, 1.58251956, 0.94515919, 6.48778002, 1.47764604,
              5.18847071, 4.21988095, 2.85971522, 3.40044437, 3.74907745,
              1.18065796, 3.74748775, 3.27328568, 3.19374927, 8.0726155 ,
              0.90326139, 2.34460034, 2.14199217, 3.27446744, 3.58872357,
              1.20611533, 2.16594393, 5.56610242, 4.66479977, 2.3573932 ])
_ = plt.hist(x, bins=10)

precip = pd.read_csv("data/nashville_precip.txt", index_col=0,
                     na_values='NA', sep=r'\s+')
precip.head()

_ = precip.hist(sharex=True, sharey=True, grid=False, figsize=(10, 6))
plt.tight_layout()

from IPython.display import IFrame
IFrame('http://en.wikipedia.org/wiki/Gamma_distribution?useformat=mobile',
       width=600, height=350)

precip.Oct.loc[1960:1970]

precip.fillna(value={'Oct': precip.Oct.mean()}, inplace=True)

precip_mean = precip.mean()
precip_mean

precip_var = precip.var()
precip_var

# Method-of-moments estimates for the gamma distribution:
# alpha = mean^2 / variance; beta = variance / mean (a scale parameter)
alpha_mom = precip_mean ** 2 / precip_var
beta_mom = precip_var / precip_mean

estimates = pd.DataFrame({'alpha_mom': alpha_mom, 'beta_mom': beta_mom})
estimates

from scipy.stats import gamma

# beta_mom is a scale parameter, so it is passed as `scale`, not as `loc`
xvals = np.linspace(0, 10)
yvals = gamma.pdf(xvals, alpha_mom.iloc[0], scale=beta_mom.iloc[0])
precip.Jan.hist(density=True, bins=20)
plt.plot(xvals, yvals)

axs = precip.hist(density=True, figsize=(12, 8), sharex=True, sharey=True,
                  bins=15, grid=False)
for ax in axs.ravel():
    # Get month
    m = ax.get_title()
    # Plot fitted distribution
    x = np.linspace(*ax.get_xlim())
    ax.plot(x, gamma.pdf(x, alpha_mom[m], scale=beta_mom[m]))
    # Annotate with parameter estimates
    label = 'alpha = {0:.2f}\nbeta = {1:.2f}'.format(alpha_mom[m], beta_mom[m])
    ax.annotate(label, xy=(10, 0.2))
plt.tight_layout()

y = np.random.poisson(5, size=100)
plt.hist(y, bins=int(np.sqrt(len(y))), density=True)
plt.xlabel('y'); plt.ylabel('Pr(y)')

# Poisson likelihood of a single observation x given rate lam
poisson_like = lambda x, lam: np.exp(-lam) * (lam**x) / (np.arange(x) + 1).prod()

lam = 6
value = 10
poisson_like(value, lam)

# Sum the likelihood over the sample at two candidate values of lambda
sum(poisson_like(yi, lam) for yi in y)

lam = 8
sum(poisson_like(yi, lam) for yi in y)

lambdas = np.linspace(0, 15)
x = 5
plt.plot(lambdas, [poisson_like(x, l) for l in lambdas])
plt.xlabel(r'$\lambda$')
plt.ylabel(r'L($\lambda$|x={0})'.format(x));

lam = 5
xvals = np.arange(15)
yvals = [poisson_like(x, lam) for x in xvals]
plt.bar(xvals, yvals)
plt.xlabel('x')
plt.ylabel(r'Pr(X|$\lambda$=5)');

from scipy.optimize import newton

# some function
func = lambda x: 3./(1 + 400*np.exp(-2*x)) - 1
xvals = np.linspace(0, 6)
plt.plot(xvals, func(xvals))
plt.text(5.3, 2.1, '$f(x)$', fontsize=16)
# zero line
plt.plot([0, 6], [0, 0], 'k-')
# value at step n
plt.plot([4, 4], [0, func(4)], 'k:')
plt.text(4, -.2, '$x_n$', fontsize=16)
# tangent line
tanline = lambda x: -0.858 + 0.626*x
plt.plot(xvals, tanline(xvals), 'r--')
# point at step n+1
xprime = 0.858/0.626
plt.plot([xprime, xprime], [tanline(xprime), func(xprime)], 'k:')
plt.text(xprime + .1, -.2, '$x_{n+1}$', fontsize=16)

from scipy.special import psi, polygamma

# First and second derivatives of the gamma log-likelihood in the shape
# parameter (psi is the digamma function, polygamma(1, .) the trigamma)
dlgamma = lambda m, log_mean, mean_log: np.log(m) - psi(m) - log_mean + mean_log
dl2gamma = lambda m, *args: 1./m - polygamma(1, m)

# Calculate statistics
log_mean = precip.mean().apply(np.log)
mean_log = precip.apply(np.log).mean()

# Alpha MLE for December (the last column)
alpha_mle = newton(dlgamma, 2, dl2gamma,
                   args=(log_mean.iloc[-1], mean_log.iloc[-1]))
alpha_mle

# The corresponding beta is a *rate* parameter: beta = alpha / mean
beta_mle = alpha_mle / precip.mean().iloc[-1]
beta_mle
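# As a cross-check on the Newton-Raphson estimate, the negative gamma
# log-likelihood can be minimized directly over (shape, rate). This is a
# sketch, not part of the original analysis; the result should agree closely
# with (alpha_mle, beta_mle) above.
from scipy.optimize import minimize
from scipy.special import gammaln

def gamma_nloglike(params, data):
    a, b = params  # shape and rate
    # log f(x) = a*log(b) - log Gamma(a) + (a-1)*log(x) - b*x
    return -np.sum(a*np.log(b) - gammaln(a) + (a - 1)*np.log(data) - b*data)

opt = minimize(gamma_nloglike, x0=[2., 1.],
               args=(precip.Dec.dropna().values,),
               bounds=[(1e-6, None), (1e-6, None)], method='L-BFGS-B')
opt.x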
dec = precip.Dec
dec.hist(density=True, bins=10, grid=False)
x = np.linspace(0, dec.max())
# Method-of-moments fit: beta_mom is a scale, passed as `scale`
plt.plot(x, gamma.pdf(x, alpha_mom.iloc[-1], scale=beta_mom.iloc[-1]), 'm-')
# MLE fit: beta_mle is a rate, so the scale is its reciprocal
plt.plot(x, gamma.pdf(x, alpha_mle, scale=1./beta_mle), 'r--')

# gamma.fit returns (shape, loc, scale); with loc fixed at 0,
# 1/scale is the estimated rate
ahat, _, bhat = gamma.fit(precip.Dec, floc=0)
ahat, 1./bhat

# Some random data
y = np.random.random(15) * 10
y

from scipy.stats import norm

# Create an array of x-values
x = np.linspace(y.min() - 1, y.max() + 1, 100)

# Smoothing parameter
s = 0.4

# Calculate the kernels
kernels = np.transpose([norm.pdf(x, yi, s) for yi in y])
plt.plot(x, kernels, 'k:')
plt.plot(x, kernels.sum(1))
plt.plot(y, np.zeros(len(y)), 'ro', ms=10)

# Create a bi-modal distribution with a mixture of Normals
x1 = np.random.normal(0, 2, 50)
x2 = np.random.normal(4, 1, 50)

# Append by row
x = np.r_[x1, x2]
plt.hist(x, bins=10, density=True)

from scipy.stats import gaussian_kde
density = gaussian_kde(x)
xgrid = np.linspace(x.min(), x.max(), 100)
plt.hist(x, bins=10, density=True)
plt.plot(xgrid, density(xgrid), 'r-')

x = np.array([2.2, 4.3, 5.1, 5.8, 6.4, 8.0])
y = np.array([0.4, 10.1, 14.0, 10.9, 15.4, 18.5])
plt.plot(x, y, 'ro')

# Sum of squared residuals for a straight line
ss = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x) ** 2)
ss([0, 1], x, y)

from scipy.optimize import fmin
b0, b1 = fmin(ss, [0, 1], args=(x, y))
b0, b1

plt.plot(x, y, 'ro')
plt.plot([0, 10], [b0, b0 + b1*10])

plt.plot(x, y, 'ro')
plt.plot([0, 10], [b0, b0 + b1*10])
for xi, yi in zip(x, y):
    plt.plot([xi]*2, [yi, b0 + b1*xi], 'k:')
plt.xlim(2, 9); plt.ylim(0, 20)

# Sum of absolute residuals (least absolute deviations)
sabs = lambda theta, x, y: np.sum(np.abs(y - theta[0] - theta[1]*x))
b0, b1 = fmin(sabs, [0, 1], args=(x, y))
print(b0, b1)
plt.plot(x, y, 'ro')
plt.plot([0, 10], [b0, b0 + b1*10])

# Quadratic model
ss2 = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x - theta[2]*(x**2)) ** 2)
b0, b1, b2 = fmin(ss2, [1, 1, -1], args=(x, y))
print(b0, b1, b2)
plt.plot(x, y, 'ro')
xvals = np.linspace(0, 10, 100)
plt.plot(xvals, b0 + b1*xvals + b2*(xvals**2))

# Cubic model
ss3 = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x - theta[2]*(x**2) - theta[3]*(x**3)) ** 2)

bb = pd.read_csv("data/baseball.csv", index_col=0)
plt.plot(bb.hr, bb.rbi, 'r.')
b0, b1, b2, b3 = fmin(ss3, [0, 1, -1, 0], args=(bb.hr, bb.rbi))
xvals = np.arange(40)
plt.plot(xvals, b0 + b1*xvals + b2*(xvals**2) + b3*(xvals**3))

straight_line = sm.OLS(y, sm.add_constant(x)).fit()
straight_line.summary()

from statsmodels.formula.api import ols as OLS
data = pd.DataFrame(dict(x=x, y=y))
# Quadratic model expressed as a formula
quad_fit = OLS('y ~ x + I(x**2)', data).fit()
quad_fit.summary()

def calc_poly(params, data):
    # Design matrix with one row per power of the input
    x = np.c_[[data**i for i in range(len(params))]]
    return np.dot(params, x)

ssp = lambda theta, x, y: np.sum((y - calc_poly(theta, x)) ** 2)
betas = fmin(ssp, np.zeros(10), args=(x, y), maxiter=int(1e6))
plt.plot(x, y, 'ro')
xvals = np.linspace(0, max(x), 100)
plt.plot(xvals, calc_poly(betas, xvals))

n = len(x)
aic = lambda rss, p, n: n * np.log(rss/(n - p - 1)) + 2*p

RSS1 = ss(fmin(ss, [0, 1], args=(x, y)), x, y)
RSS2 = ss2(fmin(ss2, [1, 1, -1], args=(x, y)), x, y)
print(aic(RSS1, 2, n), aic(RSS2, 3, n))

titanic = pd.read_excel("data/titanic.xls", "titanic")
titanic.name

jitter = np.random.normal(scale=0.02, size=len(titanic))
plt.scatter(np.log(titanic.fare), titanic.survived + jitter, alpha=0.3)
plt.yticks([0, 1])
plt.ylabel("survived")
plt.xlabel("log(fare)")

x = np.log(titanic.fare[titanic.fare > 0])
y = titanic.survived[titanic.fare > 0]
betas_titanic = fmin(ss, [1, 1], args=(x, y))

jitter = np.random.normal(scale=0.02, size=len(titanic))
plt.scatter(np.log(titanic.fare), titanic.survived + jitter, alpha=0.3)
plt.yticks([0, 1])
plt.ylabel("survived")
plt.xlabel("log(fare)")
plt.plot([0, 7], [betas_titanic[0], betas_titanic[0] + betas_titanic[1]*7.])
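# One reason to move beyond the straight-line fit: a line is unbounded, so its
# fitted values need not lie in [0, 1]. A quick check, using the least-squares
# estimates above:
preds = betas_titanic[0] + betas_titanic[1] * x
print(preds.min(), preds.max())  # values outside [0, 1] are not valid probabilities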
logit = lambda p: np.log(p/(1. - p))
unit_interval = np.linspace(0, 1)
plt.plot(unit_interval/(1 - unit_interval), unit_interval)
plt.plot(logit(unit_interval), unit_interval)

invlogit = lambda x: 1. / (1 + np.exp(-x))

def logistic_like(theta, x, y):
    p = invlogit(theta[0] + theta[1] * x)
    # Return negative of log-likelihood
    return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))

x, y = titanic[titanic.fare.notnull()][['fare', 'survived']].values.T
b0, b1 = fmin(logistic_like, [0.5, 0], args=(x, y))
b0, b1

jitter = np.random.normal(scale=0.01, size=len(x))
plt.plot(x, y + jitter, 'r.', alpha=0.3)
plt.yticks([0, .25, .5, .75, 1])
xvals = np.linspace(0, 600)
plt.plot(xvals, invlogit(b0 + b1*xvals))

logistic = sm.GLM(y, sm.add_constant(x), family=sm.families.Binomial()).fit()
logistic.summary()

np.random.permutation(titanic.name)[:5]

random_ind = np.random.randint(0, len(titanic), 5)
titanic.name.iloc[random_ind]

n = 10
R = 1000

# Original sample (n=10)
x = np.random.normal(size=n)

# 1000 bootstrap samples of size 10
s = [x[np.random.randint(0, n, n)].mean() for i in range(R)]
_ = plt.hist(s, bins=30)

boot_mean = np.sum(s)/R
boot_mean

boot_var = ((np.array(s) - boot_mean) ** 2).sum() / (R - 1)
boot_var

# Bootstrap estimate of the bias of the sample mean
boot_mean - np.mean(x)

s_sorted = np.sort(s)
s_sorted[:10]
s_sorted[-10:]

# 95% bootstrap percentile interval (indices must be integers)
alpha = 0.05
s_sorted[[int((R + 1)*alpha/2), int((R + 1)*(1 - alpha/2))]]
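# Equivalently (a sketch), np.percentile recovers essentially the same 95%
# interval from the raw bootstrap means, without manual sorting and index
# arithmetic:
np.percentile(s, [100*alpha/2, 100*(1 - alpha/2)])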