#!/usr/bin/env python
# coding: utf-8

# # Discrete

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import binom, bernoulli, multinomial, poisson, norm, uniform
import numpy as np
import pandas as pd


# ## Binomial
#
# Toss a coin $n$ times and count $k$ successes; the probability of heads is $\theta$, $X \in \{0, ..., n\}$.
#
# - **PMF**
# $$Bin(k|n, \theta) = {{n}\choose{k}}\theta^k (1 - \theta)^{n - k}$$

# In[2]:

# setup experiment
n = 5
theta = 0.4
# +1 so the upper end of the plotted support is included
k = x = np.arange(binom.ppf(0.01, n, theta), binom.ppf(0.99, n, theta) + 1)
y = binom.pmf(k, n, theta)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='binom pmf')
ax.vlines(x, ymin=0, ymax=y, colors='b', lw=5, alpha=0.5)


# ## Bernoulli
#
# Toss a coin only once; $X \in \{0, 1\}$.
#
# - **PMF**
# $$Ber(x | \theta) = \theta^{\mathbf{I}(x = 1)} (1 - \theta)^{\mathbf{I}(x = 0)}$$
#
# - **Log-likelihood** (for $n$ independent tosses with $k$ heads)
# $$\ell(\theta) = \log L(\theta) = k\log(\theta) + (n - k)\log(1 - \theta)$$
#
# - **MLE**
# $$\frac{d \ell(\theta)}{d \theta} = 0 \iff \frac{k}{\theta} - \frac{n - k}{1 - \theta} = 0$$
# $$\theta_{MLE} = \frac{k}{n}$$

# In[3]:

# setup experiment
theta = 0.3
# +1 so that both outcomes {0, 1} are shown
k = x = np.arange(bernoulli.ppf(0.01, theta), bernoulli.ppf(0.99, theta) + 1)
y = bernoulli.pmf(k, theta)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='bernoulli pmf')
ax.vlines(x, 0, y, colors='b', lw=5, alpha=0.5)


# ## Multinomial
#
# Toss a $K$-sided die $n$ times; let $\textbf{x} = (x_1, ..., x_K) \in \{0, 1, ..., n\}^K$, where $x_j$ is the number of times side $j$ of the die occurs.
#
# - **PMF**
# $$Mu(\textbf{x} | n, \boldsymbol{\theta}) = {{n}\choose{x_1 ... x_K}} \prod_{j=1}^K \theta_{j}^{x_j}$$
# $${{n}\choose{x_1 ... x_K}} = \frac{n!}{x_1!x_2!...x_K!}$$

# In[4]:

# setup experiment
x = [3, 4]
n = 7
theta = [0.4, 0.6]
# with K = 2 the multinomial reduces to the binomial, so both values agree
print(multinomial.pmf(x, n, theta))
print(binom.pmf(3, 7, 0.4))


# ## Multinoulli
#
# The special case $n = 1$: $\textbf{x}$ is a one-hot encoding, $\textbf{x} \in \{0, 1\}^K$.
#
# $$Mu(\textbf{x} | 1, \boldsymbol{\theta}) = \prod_{j=1}^K \theta_j^{\mathbf{I}(x_j = 1)}$$

# ## Poisson
#
# Counts of rare events such as radioactive decay and traffic accidents. $X \in \{0, 1, ...\}$, $\lambda > 0$, and $e^{-\lambda}$ is the normalization constant.
#
# - **PMF**
# $$Poi(x|\lambda) = e^{-\lambda}\frac{\lambda^x}{x!}$$

# In[5]:

# setup experiment
mu = Lambda = 0.6
# +1 so the upper end of the plotted support is included
k = x = np.arange(poisson.ppf(0.01, Lambda), poisson.ppf(0.99, Lambda) + 1)
y = poisson.pmf(x, Lambda)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='poisson pmf')
ax.vlines(x, 0, y, colors='b', lw=5, alpha=0.5)


# # Continuous

# In[6]:

from scipy.stats import norm, laplace, gamma, pareto, beta, expon, erlang, chi2


# ## Normal
#
# - **Probability density function**
# $$f(x | \mu, \sigma^2) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{(x - \mu)^2}{2\sigma^2}}$$
#
# - **Likelihood**
# $$L(\mu, \sigma^2) = \prod_{i=1}^n f(x_i | \mu, \sigma^2) = \prod_{i=1}^n \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{(x_i - \mu)^2}{2\sigma^2}} = (2\pi\sigma^2)^{-n/2} e^{-\frac{1}{2\sigma^2} \sum_{i=1}^n (x_i - \mu)^2}$$
#
# - **Log-likelihood**
# $$\ln L(\mu, \sigma^2) = \sum_{i=1}^n \ln f(x_i | \mu, \sigma^2) = -\frac{n}{2} \ln(2\pi) - \frac{n}{2} \ln \sigma^2 - \frac{1}{2\sigma^2} \sum_{i=1}^n (x_i - \mu)^2$$
#
# - **MLE**: taking derivatives with respect to $\mu$ and $\sigma^2$ and setting them to zero gives
# $$\hat{\mu} = \frac{1}{n} \sum_{i=1}^n x_i$$
# $$\hat{\sigma}^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \hat{\mu})^2$$
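#
# A quick numeric check of the closed-form estimators (added sketch, not in the original notebook; the true `loc`/`scale` below are arbitrary): `norm.fit` returns the same MLE values.

# In[ ]:

# hypothetical sanity check: MLE computed by hand vs. scipy's norm.fit (also an MLE)
np.random.seed(0)
samples = norm.rvs(loc=2.0, scale=1.5, size=10000)   # assumed true parameters
mu_hat = samples.mean()                              # \hat{mu} = (1/n) sum x_i
sigma2_hat = ((samples - mu_hat) ** 2).mean()        # \hat{sigma}^2 = (1/n) sum (x_i - \hat{mu})^2
mu_fit, std_fit = norm.fit(samples)                  # returns (mean, std) MLE
print("MLE by hand:", mu_hat, sigma2_hat)
print("norm.fit   :", mu_fit, std_fit ** 2)
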
# - **Prior**
# $$p(y)$$
#
# - **Posterior**
# $$p(y|x) = \frac{p(x|y) p(y)}{p(x)}$$
#
# - **Posterior predictive distribution**
# - https://en.wikipedia.org/wiki/Conjugate_prior

# In[7]:

# setup experiment
mu = 0
std = 1
x = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 100)
y = norm.pdf(x, mu, std)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='norm pdf')


# ## Laplace
#
# Heavy tails; also known as the double-sided exponential distribution. $\mu$ is the location and $b$ is the scale.
#
# - **PDF**
# $$Lap(x|\mu, b) = \frac{1}{2b}e^{-\frac{|x - \mu|}{b}}$$
# $$mean = \mu, mode = \mu, var = 2b^2$$

# In[8]:

# setup experiment
mu = 0
b = 1  # scale
x = np.linspace(laplace.ppf(0.01, mu, b), laplace.ppf(0.99, mu, b), 100)
y = laplace.pdf(x, mu, b)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='laplace pdf')


# ## Gamma
#
# A flexible distribution for positive real-valued rv's.
#
# - **PDF**
# $$Ga(T|shape = a, rate = b) = \frac{b^a}{\Gamma(a)}T^{a - 1}e^{-Tb}$$
# $$\Gamma(x) = \int_0^\infty u^{x - 1}e^{-u}du$$
# $$mean = \frac{a}{b}, mode = \frac{a - 1}{b}, var = \frac{a}{b^2}$$

# In[9]:

# setup experiment
a = 1.99
b = 1  # rate; scipy uses scale = 1/b, which is 1 here and can be omitted below
x = np.linspace(gamma.ppf(0.01, a), gamma.ppf(0.99, a), 100)
y = gamma.pdf(x, a)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='gamma pdf')


# - **Exponential**
# $$Expon(x|\lambda) = Ga(x|1, \lambda)$$

# In[10]:

# setup experiment exponential (rate lambda = 1; scipy uses scale = 1/lambda)
lam = 1
x = np.linspace(expon.ppf(0.01, scale=1/lam), expon.ppf(0.99, scale=1/lam), 100)
y = expon.pdf(x, scale=1/lam)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='exponential pdf')


# - **Erlang**
# $$Erlang(x|\lambda) = Ga(x|2, \lambda)$$

# In[11]:

# setup experiment erlang (shape fixed at 2)
a = 2
x = np.linspace(erlang.ppf(0.01, a), erlang.ppf(0.99, a), 100)
y = erlang.pdf(x, a)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='erlang pdf')


# - **Chi-square**
# $$\chi^2(x|\nu) = Ga(x|\frac{\nu}{2}, \frac{1}{2})$$

# In[12]:

# setup experiment chi-square
nu = 55
x = np.linspace(chi2.ppf(0.01, nu), chi2.ppf(0.99, nu), 100)
y = chi2.pdf(x, nu)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='Chi-square pdf')


# ## Beta
#
# $$Beta(x|a, b) = \frac{1}{B(a, b)} x^{a - 1}(1 - x)^{b - 1}$$
# $$B(a, b) = \frac{\Gamma(a)\Gamma(b)}{\Gamma(a + b)}$$
# $$\Gamma(a) = (a - 1)! \ \text{for integer} \ a$$
# $$mean = \frac{a}{a + b}, mode = \frac{a - 1}{a + b - 2}, var = \frac{ab}{(a + b)^2(a + b + 1)}$$

# In[13]:

# setup experiment
a, b = 2.31, 0.627
x = np.linspace(beta.ppf(0.01, a, b), beta.ppf(0.99, a, b), 100)
y = beta.pdf(x, a, b)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='beta pdf')
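
# As an added sketch (not in the original notebook): the Beta distribution is the conjugate prior for the Bernoulli/Binomial parameter $\theta$ (see the conjugate-prior link above). With a $Beta(a_0, b_0)$ prior and $k$ heads out of $n$ tosses, the posterior is $Beta(a_0 + k, b_0 + n - k)$; the prior hyperparameters and data below are hypothetical.

# In[ ]:

# illustrative Beta-Binomial conjugate update (hypothetical numbers)
a0, b0 = 2.0, 2.0        # assumed prior hyperparameters
n_obs, k_obs = 10, 7     # assumed data: 7 heads in 10 tosses
a_post, b_post = a0 + k_obs, b0 + n_obs - k_obs

xs = np.linspace(0.001, 0.999, 200)
fig, ax = plt.subplots(1, 1)
ax.plot(xs, beta.pdf(xs, a0, b0), 'b--', lw=2, label='prior Beta(2, 2)')
ax.plot(xs, beta.pdf(xs, a_post, b_post), 'r-', lw=2, label='posterior Beta(9, 5)')
ax.axvline(k_obs / n_obs, color='k', ls=':', label='MLE k/n')
ax.legend(loc='best')
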
# ## Pareto
#
# Exhibits long tails, also called heavy tails; related to Zipf's law and power laws.
#
# $$Pareto(x|k, m) = km^kx^{-(k + 1)}\mathbf{I} (x \ge m)$$
# $$mean = \frac{km}{k - 1} \ if \ k > 1, mode = m, var = \frac{m^2k}{(k - 1)^2(k - 2)} \ if \ k > 2$$

# In[14]:

# setup experiment
b = 2.62
x = np.linspace(pareto.ppf(0.01, b), pareto.ppf(0.99, b), 100)
y = pareto.pdf(x, b)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='pareto pdf')


# # KL Divergence
#
# $$D_{KL}(p||q) = \sum_{i=1}^N p(x_i) \log\frac{p(x_i)}{q(x_i)}$$

# In[15]:

# Evidence: 100 observations of the values 0..10
x = pd.Series([0]*2 + [1]*3 + [2]*5 + [3]*14 + [4]*16 + [5]*15 + [6]*12 + [7]*8 + [8]*10 + [9]*8 + [10]*7)

# empirical probability of each observation (relative frequency of its value)
p_true = x.value_counts(normalize=True)
p_true = np.array([p_true[val] for val in x])

sns.histplot(x, bins=10, stat="density", label="Evidence")

# Uniform model
p_uniform = uniform.pdf(x, loc=0, scale=11)
plt.plot(x, p_uniform, 'k-', lw=2, label="Uniform")

# Normal model
mu, std = norm.fit(x)
p_norm = norm.pdf(x, mu, std)
plt.plot(x, p_norm, 'r-', lw=2, label="Normal")

plt.legend(loc="best")


# In[16]:

print("Log likelihood (Uniform):", np.log(p_uniform).sum())
print("Log likelihood (Normal):", np.log(p_norm).sum())


# In[17]:

# note: the sums run over all 100 samples, so repeated values contribute one term per occurrence
print("KL (Uniform):", (p_true * np.log(p_true / p_uniform)).sum())
print("KL (Normal):", (p_true * np.log(p_true / p_norm)).sum())


# # Binning

# In[18]:

# "good" training distribution (x, y)
mu = 1
std = 1
x_good = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 100)
y_good = norm.pdf(x_good, mu, std)

# "bad" training distribution (x, y)
mu = -1
std = 2
x_bad = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 50)
y_bad = norm.pdf(x_bad, mu, std)

# plot the two distributions
fig, ax = plt.subplots(1, 1)
ax.plot(x_good, y_good, 'r-', lw=5, alpha=0.6, label='good')
ax.plot(x_bad, y_bad, 'g.', lw=5, alpha=0.6, label='bad')
ax.legend(loc="best")


# In[19]:

from IPython.display import display  # explicit import so display() also works outside Jupyter

# binning by quantiles
qtiles = [0, .25, .5, .75, .9, 1.]
bin_good = pd.qcut(x_good, qtiles)
bin_bad = pd.qcut(x_bad, qtiles)

# calculate the pmf of each bin
pmf_good = bin_good.describe()
pmf_bad = bin_bad.describe()
print("PMF good")
display(pmf_good)
print("PMF bad")
display(pmf_bad)

# for the test data, look up the probability of the bin each point falls into
test_data = np.array([-10, -1, 1, 2, 3])

def map_to_prob(data, pmf):
    """Map each data point to the frequency of the bin that contains it (0 if outside all bins)."""
    res = {}
    for j in data:
        res[j] = 0.0
        for interval in pmf.index:
            if j in interval:
                res[j] = pmf.loc[interval].freqs
                break
    return res

test_prob_good = map_to_prob(test_data, pmf_good)
test_prob_bad = map_to_prob(test_data, pmf_bad)
print("Good:", test_prob_good)
print("Bad:", test_prob_bad)
print("Diff:", np.array(list(test_prob_good.values())) - np.array(list(test_prob_bad.values())))


# In[20]:

# apply the normal model on the same quantile bins: P(bin) = CDF(right edge) - CDF(left edge)
mu = 1
std = 1
cat = []
freqs = []
for b in bin_good.categories:
    cat.append(b)
    freqs.append(norm.cdf(b.right, mu, std) - norm.cdf(b.left, mu, std))

norm_model = pd.DataFrame({"categories": cat, "freqs": freqs})
norm_model = norm_model.set_index("categories")
norm_model
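
# As a closing sketch (added, not in the original): compare the empirical bin frequencies of the "good" data with the normal model's bin probabilities using the KL divergence defined above. Renormalizing the model over the bins is an assumption, since the quantile bins do not cover the whole real line.

# In[ ]:

# KL(empirical bins || normal model bins); both vectors follow the bin order used above
p = pmf_good['freqs'].values.astype(float)    # empirical bin probabilities (sum to 1)
q = norm_model['freqs'].values.astype(float)  # model bin probabilities (mass outside the bins is dropped)
q = q / q.sum()                               # renormalize so q sums to 1 over the bins
print("KL (empirical || normal model):", np.sum(p * np.log(p / q)))
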