#!/usr/bin/env python
# coding: utf-8

# # Discrete

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import binom, bernoulli, multinomial, poisson, norm, uniform
import numpy as np
import pandas as pd


# ## Binomial
#
# Toss a coin $n$ times and count $k$ successes; the probability of heads is $\theta$, $X \in \{0, ..., n\}$.
#
# - **PMF**
# $$Bin(k|n, \theta) = {{n}\choose{k}}\theta^k (1 - \theta)^{n - k}$$

# In[2]:

# setup experiment
n = 5
theta = 0.4
# +1 so the upper end of the plotted support is included
k = x = np.arange(binom.ppf(0.01, n, theta), binom.ppf(0.99, n, theta) + 1)
y = binom.pmf(k, n, theta)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='binom pmf')
ax.vlines(x, ymin=0, ymax=y, colors='b', lw=5, alpha=0.5)


# ## Bernoulli
#
# Toss a coin only once; $X \in \{0, 1\}$.
#
# - **PMF**
# $$Ber(x | \theta) = \theta^{\mathbf{I}(x = 1)} (1 - \theta)^{\mathbf{I}(x = 0)}$$
#
# - **Log-likelihood** (for $n$ independent tosses with $k$ heads)
# $$\ell(\theta) = \log L(\theta) = k\log(\theta) + (n - k)\log(1 - \theta)$$
#
# - **MLE**
# $$\frac{d \ell(\theta)}{d \theta} = 0 \iff \frac{k}{\theta} - \frac{n - k}{1 - \theta} = 0$$
# $$\theta_{MLE} = \frac{k}{n}$$

# In[3]:

# setup experiment
theta = 0.3
# +1 so that both outcomes {0, 1} are shown
k = x = np.arange(bernoulli.ppf(0.01, theta), bernoulli.ppf(0.99, theta) + 1)
y = bernoulli.pmf(k, theta)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='bernoulli pmf')
ax.vlines(x, 0, y, colors='b', lw=5, alpha=0.5)


# ## Multinomial
#
# Toss a $K$-sided die $n$ times; let $\textbf{x} = (x_1, ..., x_K) \in \{0, 1, ..., n\}^K$, where $x_j$ is the number of times side $j$ of the die occurs.
#
# - **PMF**
# $$Mu(\textbf{x} | n, \boldsymbol{\theta}) = {{n}\choose{x_1 ... x_K}} \prod_{j=1}^K \theta_{j}^{x_j}$$
# $${{n}\choose{x_1 ... x_K}} = \frac{n!}{x_1!x_2!...x_K!}$$

# In[4]:

# setup experiment
x = [3, 4]
n = 7
theta = [0.4, 0.6]
# with K = 2 the multinomial reduces to the binomial, so both values agree
print(multinomial.pmf(x, n, theta))
print(binom.pmf(3, 7, 0.4))


# ## Multinoulli
#
# The special case $n = 1$: $\textbf{x}$ is a one-hot encoding, $\textbf{x} \in \{0, 1\}^K$.
#
# $$Mu(\textbf{x} | 1, \boldsymbol{\theta}) = \prod_{j=1}^K \theta_j^{\mathbf{I}(x_j = 1)}$$

# ## Poisson
#
# Counts of rare events such as radioactive decay and traffic accidents. $X \in \{0, 1, ...\}$, $\lambda > 0$, and $e^{-\lambda}$ is the normalization constant.
#
# - **PMF**
# $$Poi(x|\lambda) = e^{-\lambda}\frac{\lambda^x}{x!}$$

# In[5]:

# setup experiment
mu = Lambda = 0.6
# +1 so the upper end of the plotted support is included
k = x = np.arange(poisson.ppf(0.01, Lambda), poisson.ppf(0.99, Lambda) + 1)
y = poisson.pmf(x, Lambda)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'bo', ms=8, label='poisson pmf')
ax.vlines(x, 0, y, colors='b', lw=5, alpha=0.5)


# # Continuous

# In[6]:

from scipy.stats import norm, laplace, gamma, pareto, beta, expon, erlang, chi2


# ## Normal
#
# - **Probability density function**
# $$f(x | \mu, \sigma^2) = \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{(x - \mu)^2}{2\sigma^2}}$$
#
# - **Likelihood**
# $$L(\mu, \sigma^2) = \prod_{i=1}^n f(x_i | \mu, \sigma^2) = \prod_{i=1}^n \frac{1}{\sqrt{2\pi \sigma^2}} e^{-\frac{(x_i - \mu)^2}{2\sigma^2}} = (2\pi\sigma^2)^{-n/2} e^{-\frac{1}{2\sigma^2} \sum_{i=1}^n (x_i - \mu)^2}$$
#
# - **Log-likelihood**
# $$\ln L(\mu, \sigma^2) = \sum_{i=1}^n \ln f(x_i | \mu, \sigma^2) = -\frac{n}{2} \ln(2\pi) - \frac{n}{2} \ln \sigma^2 - \frac{1}{2\sigma^2} \sum_{i=1}^n (x_i - \mu)^2$$
#
# - **MLE**: taking derivatives with respect to $\mu$ and $\sigma^2$ and setting them to zero gives
# $$\hat{\mu} = \frac{1}{n} \sum_{i=1}^n x_i$$
# $$\hat{\sigma}^2 = \frac{1}{n} \sum_{i=1}^n (x_i - \hat{\mu})^2$$
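#
# A quick numeric check of the closed-form estimators (added sketch, not in the original notebook; the true `loc`/`scale` below are arbitrary): `norm.fit` returns the same MLE values.

# In[ ]:

# hypothetical sanity check: MLE computed by hand vs. scipy's norm.fit (also an MLE)
np.random.seed(0)
samples = norm.rvs(loc=2.0, scale=1.5, size=10000)   # assumed true parameters
mu_hat = samples.mean()                              # \hat{mu} = (1/n) sum x_i
sigma2_hat = ((samples - mu_hat) ** 2).mean()        # \hat{sigma}^2 = (1/n) sum (x_i - \hat{mu})^2
mu_fit, std_fit = norm.fit(samples)                  # returns (mean, std) MLE
print("MLE by hand:", mu_hat, sigma2_hat)
print("norm.fit   :", mu_fit, std_fit ** 2)
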
# - **Prior**
# $$p(y)$$
#
# - **Posterior**
# $$p(y|x) = \frac{p(x|y) p(y)}{p(x)}$$
#
# - **Posterior predictive distribution**
# - https://en.wikipedia.org/wiki/Conjugate_prior

# In[7]:

# setup experiment
mu = 0
std = 1
x = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 100)
y = norm.pdf(x, mu, std)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='norm pdf')


# ## Laplace
#
# Heavy tails; also known as the double-sided exponential distribution. $\mu$ is the location and $b$ is the scale.
#
# - **PDF**
# $$Lap(x|\mu, b) = \frac{1}{2b}e^{-\frac{|x - \mu|}{b}}$$
# $$mean = \mu, mode = \mu, var = 2b^2$$

# In[8]:

# setup experiment
mu = 0
b = 1  # scale
x = np.linspace(laplace.ppf(0.01, mu, b), laplace.ppf(0.99, mu, b), 100)
y = laplace.pdf(x, mu, b)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='laplace pdf')


# ## Gamma
#
# A flexible distribution for positive real-valued rv's.
#
# - **PDF**
# $$Ga(T|shape = a, rate = b) = \frac{b^a}{\Gamma(a)}T^{a - 1}e^{-Tb}$$
# $$\Gamma(x) = \int_0^\infty u^{x - 1}e^{-u}du$$
# $$mean = \frac{a}{b}, mode = \frac{a - 1}{b}, var = \frac{a}{b^2}$$

# In[9]:

# setup experiment
a = 1.99
b = 1  # rate; scipy uses scale = 1/b, which is 1 here and can be omitted below
x = np.linspace(gamma.ppf(0.01, a), gamma.ppf(0.99, a), 100)
y = gamma.pdf(x, a)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='gamma pdf')


# - **Exponential**
# $$Expon(x|\lambda) = Ga(x|1, \lambda)$$

# In[10]:

# setup experiment exponential (rate lambda = 1; scipy uses scale = 1/lambda)
lam = 1
x = np.linspace(expon.ppf(0.01, scale=1/lam), expon.ppf(0.99, scale=1/lam), 100)
y = expon.pdf(x, scale=1/lam)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='exponential pdf')


# - **Erlang**
# $$Erlang(x|\lambda) = Ga(x|2, \lambda)$$

# In[11]:

# setup experiment erlang (shape fixed at 2)
a = 2
x = np.linspace(erlang.ppf(0.01, a), erlang.ppf(0.99, a), 100)
y = erlang.pdf(x, a)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='erlang pdf')


# - **Chi-square**
# $$\chi^2(x|\nu) = Ga(x|\frac{\nu}{2}, \frac{1}{2})$$

# In[12]:

# setup experiment chi-square
nu = 55
x = np.linspace(chi2.ppf(0.01, nu), chi2.ppf(0.99, nu), 100)
y = chi2.pdf(x, nu)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='Chi-square pdf')


# ## Beta
#
# $$Beta(x|a, b) = \frac{1}{B(a, b)} x^{a - 1}(1 - x)^{b - 1}$$
# $$B(a, b) = \frac{\Gamma(a)\Gamma(b)}{\Gamma(a + b)}$$
# $$\Gamma(a) = (a - 1)! \ \text{for integer} \ a$$
# $$mean = \frac{a}{a + b}, mode = \frac{a - 1}{a + b - 2}, var = \frac{ab}{(a + b)^2(a + b + 1)}$$

# In[13]:

# setup experiment
a, b = 2.31, 0.627
x = np.linspace(beta.ppf(0.01, a, b), beta.ppf(0.99, a, b), 100)
y = beta.pdf(x, a, b)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='beta pdf')
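
# As an added sketch (not in the original notebook): the Beta distribution is the conjugate prior for the Bernoulli/Binomial parameter $\theta$ (see the conjugate-prior link above). With a $Beta(a_0, b_0)$ prior and $k$ heads out of $n$ tosses, the posterior is $Beta(a_0 + k, b_0 + n - k)$; the prior hyperparameters and data below are hypothetical.

# In[ ]:

# illustrative Beta-Binomial conjugate update (hypothetical numbers)
a0, b0 = 2.0, 2.0        # assumed prior hyperparameters
n_obs, k_obs = 10, 7     # assumed data: 7 heads in 10 tosses
a_post, b_post = a0 + k_obs, b0 + n_obs - k_obs

xs = np.linspace(0.001, 0.999, 200)
fig, ax = plt.subplots(1, 1)
ax.plot(xs, beta.pdf(xs, a0, b0), 'b--', lw=2, label='prior Beta(2, 2)')
ax.plot(xs, beta.pdf(xs, a_post, b_post), 'r-', lw=2, label='posterior Beta(9, 5)')
ax.axvline(k_obs / n_obs, color='k', ls=':', label='MLE k/n')
ax.legend(loc='best')
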
# ## Pareto
#
# Exhibits long tails, also called heavy tails; related to Zipf's law and power laws.
#
# $$Pareto(x|k, m) = km^kx^{-(k + 1)}\mathbf{I} (x \ge m)$$
# $$mean = \frac{km}{k - 1} \ if \ k > 1, mode = m, var = \frac{m^2k}{(k - 1)^2(k - 2)} \ if \ k > 2$$

# In[14]:

# setup experiment
b = 2.62
x = np.linspace(pareto.ppf(0.01, b), pareto.ppf(0.99, b), 100)
y = pareto.pdf(x, b)

# plotting result
fig, ax = plt.subplots(1, 1)
ax.plot(x, y, 'r-', lw=5, alpha=0.6, label='pareto pdf')


# # KL Divergence
#
# $$D_{KL}(p||q) = \sum_{i=1}^N p(x_i) \log\frac{p(x_i)}{q(x_i)}$$

# In[15]:

# Evidence: 100 observations of the values 0..10
x = pd.Series([0]*2 + [1]*3 + [2]*5 + [3]*14 + [4]*16 + [5]*15 + [6]*12 + [7]*8 + [8]*10 + [9]*8 + [10]*7)

# empirical probability of each observation (relative frequency of its value)
p_true = x.value_counts(normalize=True)
p_true = np.array([p_true[val] for val in x])

sns.histplot(x, bins=10, stat="density", label="Evidence")

# Uniform model
p_uniform = uniform.pdf(x, loc=0, scale=11)
plt.plot(x, p_uniform, 'k-', lw=2, label="Uniform")

# Normal model
mu, std = norm.fit(x)
p_norm = norm.pdf(x, mu, std)
plt.plot(x, p_norm, 'r-', lw=2, label="Normal")

plt.legend(loc="best")


# In[16]:

print("Log likelihood (Uniform):", np.log(p_uniform).sum())
print("Log likelihood (Normal):", np.log(p_norm).sum())


# In[17]:

# note: the sums run over all 100 samples, so repeated values contribute one term per occurrence
print("KL (Uniform):", (p_true * np.log(p_true / p_uniform)).sum())
print("KL (Normal):", (p_true * np.log(p_true / p_norm)).sum())


# # Binning

# In[18]:

# "good" training distribution (x, y)
mu = 1
std = 1
x_good = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 100)
y_good = norm.pdf(x_good, mu, std)

# "bad" training distribution (x, y)
mu = -1
std = 2
x_bad = np.linspace(norm.ppf(0.01, mu, std), norm.ppf(0.99, mu, std), 50)
y_bad = norm.pdf(x_bad, mu, std)

# plot the two distributions
fig, ax = plt.subplots(1, 1)
ax.plot(x_good, y_good, 'r-', lw=5, alpha=0.6, label='good')
ax.plot(x_bad, y_bad, 'g.', lw=5, alpha=0.6, label='bad')
ax.legend(loc="best")


# In[19]:

from IPython.display import display  # explicit import so display() also works outside Jupyter

# binning by quantiles
qtiles = [0, .25, .5, .75, .9, 1.]
bin_good = pd.qcut(x_good, qtiles)
bin_bad = pd.qcut(x_bad, qtiles)

# calculate the pmf of each bin
pmf_good = bin_good.describe()
pmf_bad = bin_bad.describe()
print("PMF good")
display(pmf_good)
print("PMF bad")
display(pmf_bad)

# for the test data, look up the probability of the bin each point falls into
test_data = np.array([-10, -1, 1, 2, 3])

def map_to_prob(data, pmf):
    """Map each data point to the frequency of the bin that contains it (0 if outside all bins)."""
    res = {}
    for j in data:
        res[j] = 0.0
        for interval in pmf.index:
            if j in interval:
                res[j] = pmf.loc[interval].freqs
                break
    return res

test_prob_good = map_to_prob(test_data, pmf_good)
test_prob_bad = map_to_prob(test_data, pmf_bad)
print("Good:", test_prob_good)
print("Bad:", test_prob_bad)
print("Diff:", np.array(list(test_prob_good.values())) - np.array(list(test_prob_bad.values())))


# In[20]:

# apply the normal model on the same quantile bins: P(bin) = CDF(right edge) - CDF(left edge)
mu = 1
std = 1
cat = []
freqs = []
for b in bin_good.categories:
    cat.append(b)
    freqs.append(norm.cdf(b.right, mu, std) - norm.cdf(b.left, mu, std))

norm_model = pd.DataFrame({"categories": cat, "freqs": freqs})
norm_model = norm_model.set_index("categories")
norm_model
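
# As a closing sketch (added, not in the original): compare the empirical bin frequencies of the "good" data with the normal model's bin probabilities using the KL divergence defined above. Renormalizing the model over the bins is an assumption, since the quantile bins do not cover the whole real line.

# In[ ]:

# KL(empirical bins || normal model bins); both vectors follow the bin order used above
p = pmf_good['freqs'].values.astype(float)    # empirical bin probabilities (sum to 1)
q = norm_model['freqs'].values.astype(float)  # model bin probabilities (mass outside the bins is dropped)
q = q / q.sum()                               # renormalize so q sums to 1 over the bins
print("KL (empirical || normal model):", np.sum(p * np.log(p / q)))
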