import numpy as np import pandas as pd # Set some Pandas options pd.set_option('display.notebook_repr_html', False) pd.set_option('display.max_columns', 20) pd.set_option('display.max_rows', 25) x = array([ 1.00201077, 1.58251956, 0.94515919, 6.48778002, 1.47764604, 5.18847071, 4.21988095, 2.85971522, 3.40044437, 3.74907745, 1.18065796, 3.74748775, 3.27328568, 3.19374927, 8.0726155 , 0.90326139, 2.34460034, 2.14199217, 3.27446744, 3.58872357, 1.20611533, 2.16594393, 5.56610242, 4.66479977, 2.3573932 ]) _ = hist(x, bins=8) precip = pd.read_table("data/nashville_precip.txt", index_col=0, na_values='NA', delim_whitespace=True) precip.head() _ = precip.hist(sharex=True, sharey=True, grid=False) tight_layout() precip.fillna(value={'Oct': precip.Oct.mean()}, inplace=True) precip_mean = precip.mean() precip_mean precip_var = precip.var() precip_var alpha_mom = precip_mean ** 2 / precip_var beta_mom = precip_var / precip_mean alpha_mom, beta_mom from scipy.stats.distributions import gamma hist(precip.Jan, normed=True, bins=20) plot(linspace(0, 10), gamma.pdf(linspace(0, 10), alpha_mom[0], beta_mom[0])) axs = precip.hist(normed=True, figsize=(12, 8), sharex=True, sharey=True, bins=15, grid=False) for ax in axs.ravel(): # Get month m = ax.get_title() # Plot fitted distribution x = linspace(*ax.get_xlim()) ax.plot(x, gamma.pdf(x, alpha_mom[m], beta_mom[m])) # Annotate with parameter estimates label = 'alpha = {0:.2f}\nbeta = {1:.2f}'.format(alpha_mom[m], beta_mom[m]) ax.annotate(label, xy=(10, 0.2)) tight_layout() y = np.random.poisson(5, size=100) plt.hist(y, bins=12, normed=True) xlabel('y'); ylabel('Pr(y)') poisson_like = lambda x, lam: np.exp(-lam) * (lam**x) / (np.arange(x)+1).prod() lam = 6 value = 10 poisson_like(value, lam) np.sum(poisson_like(yi, lam) for yi in y) lam = 8 np.sum(poisson_like(yi, lam) for yi in y) lambdas = np.linspace(0,15) x = 5 plt.plot(lambdas, [poisson_like(x, l) for l in lambdas]) xlabel('$\lambda$') ylabel('L($\lambda$|x={0})'.format(x)) lam = 5 xvals = arange(15) plt.bar(xvals, [poisson_like(x, lam) for x in xvals]) xlabel('x') ylabel('Pr(X|$\lambda$=5)') from scipy.optimize import newton # some function func = lambda x: 3./(1 + 400*np.exp(-2*x)) - 1 xvals = np.linspace(0, 6) plot(xvals, func(xvals)) text(5.3, 2.1, '$f(x)$', fontsize=16) # zero line plot([0,6], [0,0], 'k-') # value at step n plot([4,4], [0,func(4)], 'k:') plt.text(4, -.2, '$x_n$', fontsize=16) # tangent line tanline = lambda x: -0.858 + 0.626*x plot(xvals, tanline(xvals), 'r--') # point at step n+1 xprime = 0.858/0.626 plot([xprime, xprime], [tanline(xprime), func(xprime)], 'k:') plt.text(xprime+.1, -.2, '$x_{n+1}$', fontsize=16) from scipy.special import psi, polygamma dlgamma = lambda m, log_mean, mean_log: np.log(m) - psi(m) - log_mean + mean_log dl2gamma = lambda m, *args: 1./m - polygamma(1, m) # Calculate statistics log_mean = precip.mean().apply(log) mean_log = precip.apply(log).mean() # Alpha MLE for December alpha_mle = newton(dlgamma, 2, dl2gamma, args=(log_mean[-1], mean_log[-1])) alpha_mle beta_mle = alpha_mle/precip.mean()[-1] beta_mle dec = precip.Dec dec.hist(normed=True, bins=10, grid=False) x = linspace(0, dec.max()) plot(x, gamma.pdf(x, alpha_mom[-1], beta_mom[-1]), 'm-') plot(x, gamma.pdf(x, alpha_mle, beta_mle), 'r--') from scipy.stats import gamma gamma.fit(precip.Dec) x = np.random.normal(size=10000) a = -1 x_small = x < a while x_small.sum(): x[x_small] = np.random.normal(size=x_small.sum()) x_small = x < a _ = hist(x, bins=100) from scipy.stats.distributions import norm trunc_norm = lambda theta, a, x: -(np.log(norm.pdf(x, theta[0], theta[1])) - np.log(1 - norm.cdf(a, theta[0], theta[1]))).sum() from scipy.optimize import fmin fmin(trunc_norm, np.array([1,2]), args=(-1, x)) # Some random data y = np.random.random(15) * 10 y x = np.linspace(0, 10, 100) # Smoothing parameter s = 0.4 # Calculate the kernels kernels = np.transpose([norm.pdf(x, yi, s) for yi in y]) plot(x, kernels, 'k:') plot(x, kernels.sum(1)) plot(y, np.zeros(len(y)), 'ro', ms=10) # Create a bi-modal distribution with a mixture of Normals. x1 = np.random.normal(0, 3, 50) x2 = np.random.normal(4, 1, 50) # Append by row x = np.r_[x1, x2] plt.hist(x, bins=8, normed=True) from scipy.stats import kde density = kde.gaussian_kde(x) xgrid = np.linspace(x.min(), x.max(), 100) plt.hist(x, bins=8, normed=True) plt.plot(xgrid, density(xgrid), 'r-') cdystonia = pd.read_csv("data/cdystonia.csv") cdystonia[cdystonia.obs==6].hist(column='twstrs', by=cdystonia.treat, bins=8) x = np.array([2.2, 4.3, 5.1, 5.8, 6.4, 8.0]) y = np.array([0.4, 10.1, 14.0, 10.9, 15.4, 18.5]) plot(x,y,'ro') ss = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x) ** 2) ss([0,1],x,y) b0,b1 = fmin(ss, [0,1], args=(x,y)) b0,b1 plot(x, y, 'ro') plot([0,10], [b0, b0+b1*10]) plot(x, y, 'ro') plot([0,10], [b0, b0+b1*10]) for xi, yi in zip(x,y): plot([xi]*2, [yi, b0+b1*xi], 'k:') xlim(2, 9); ylim(0, 20) sabs = lambda theta, x, y: np.sum(np.abs(y - theta[0] - theta[1]*x)) b0,b1 = fmin(sabs, [0,1], args=(x,y)) print b0,b1 plot(x, y, 'ro') plot([0,10], [b0, b0+b1*10]) ss2 = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x - theta[2]*(x**2)) ** 2) b0,b1,b2 = fmin(ss2, [1,1,-1], args=(x,y)) print b0,b1,b2 plot(x, y, 'ro') xvals = np.linspace(0, 10, 100) plot(xvals, b0 + b1*xvals + b2*(xvals**2)) ss3 = lambda theta, x, y: np.sum((y - theta[0] - theta[1]*x - theta[2]*(x**2) - theta[3]*(x**3)) ** 2) bb = pd.read_csv("data/baseball.csv", index_col=0) plot(bb.hr, bb.rbi, 'r.') b0,b1,b2,b3 = fmin(ss3, [0,1,-1,0], args=(bb.hr, bb.rbi)) xvals = arange(40) plot(xvals, b0 + b1*xvals + b2*(xvals**2) + b3*(xvals**3)) import statsmodels.api as sm straight_line = sm.OLS(y, sm.add_constant(x)).fit() straight_line.summary() from statsmodels.formula.api import ols as OLS data = pd.DataFrame(dict(x=x, x2=x**2, y=y)) cubic_fit = OLS('y ~ x + x2', data).fit() cubic_fit.summary() def calc_poly(params, data): x = np.c_[[data**i for i in range(len(params))]] return np.dot(params, x) ssp = lambda theta, x, y: np.sum((y - calc_poly(theta, x)) ** 2) betas = fmin(ssp, np.zeros(10), args=(x,y), maxiter=1e6) plot(x, y, 'ro') xvals = np.linspace(0, max(x), 100) plot(xvals, calc_poly(betas, xvals)) n = len(x) aic = lambda rss, p, n: n * np.log(rss/(n-p-1)) + 2*p RSS1 = ss(fmin(ss, [0,1], args=(x,y)), x, y) RSS2 = ss2(fmin(ss2, [1,1,-1], args=(x,y)), x, y) print aic(RSS1, 2, n), aic(RSS2, 3, n) titanic = pd.read_excel("data/titanic.xls", "titanic") titanic.name jitter = np.random.normal(scale=0.02, size=len(titanic)) plt.scatter(np.log(titanic.fare), titanic.survived + jitter, alpha=0.3) yticks([0,1]) ylabel("survived") xlabel("log(fare)") x = np.log(titanic.fare[titanic.fare>0]) y = titanic.survived[titanic.fare>0] betas_titanic = fmin(ss, [1,1], args=(x,y)) jitter = np.random.normal(scale=0.02, size=len(titanic)) plt.scatter(np.log(titanic.fare), titanic.survived + jitter, alpha=0.3) yticks([0,1]) ylabel("survived") xlabel("log(fare)") plt.plot([0,7], [betas_titanic[0], betas_titanic[0] + betas_titanic[1]*7.]) logit = lambda p: np.log(p/(1.-p)) unit_interval = np.linspace(0,1) plt.plot(unit_interval/(1-unit_interval), unit_interval) plt.plot(logit(unit_interval), unit_interval) invlogit = lambda x: 1. / (1 + np.exp(-x)) def logistic_like(theta, x, y): p = invlogit(theta[0] + theta[1] * x) # Return negative of log-likelihood return -np.sum(y * np.log(p) + (1-y) * np.log(1 - p)) x, y = titanic[titanic.fare.notnull()][['fare', 'survived']].values.T b0,b1 = fmin(logistic_like, [0.5,0], args=(x,y)) b0, b1 jitter = np.random.normal(scale=0.01, size=len(x)) plot(x, y+jitter, 'r.', alpha=0.3) yticks([0,.25,.5,.75,1]) xvals = np.linspace(0, 600) plot(xvals, invlogit(b0+b1*xvals)) logistic = sm.GLM(y, sm.add_constant(x), family=sm.families.Binomial()).fit() logistic.summary() np.random.permutation(titanic.name)[:5] random_ind = np.random.randint(0, len(titanic), 5) titanic.name[random_ind] n = 10 R = 1000 # Original sample (n=10) x = np.random.normal(size=n) # 1000 bootstrap samples of size 10 s = [x[np.random.randint(0,n,n)].mean() for i in range(R)] _ = hist(s, bins=30) boot_mean = np.sum(s)/R boot_mean boot_var = ((np.array(s) - boot_mean) ** 2).sum() / (R-1) boot_var boot_mean - np.mean(x) s_sorted = np.sort(s) s_sorted[:10] s_sorted[-10:] alpha = 0.05 s_sorted[[(R+1)*alpha/2, (R+1)*(1-alpha/2)]]