#!/usr/bin/env python
# coding: utf-8

# # Python Data Science Series — 4. Mathematical Statistics
#
# > Where Python meets probability theory and mathematical statistics: the probabilistic programming paradigm has flourished, making Bayesian inference and statistical modeling shine, with new tools still emerging — pymc4, edward2, numpyro
#
# - toc: true
# - badges: true
# - comments: true
# - categories: [jupyter,Python,Data Science]

# ![](4.data-stats/4.data_stats.png)

# Probability theory is nothing but common sense reduced to calculation. — Pierre Laplace, 1812
#
# Definitions of statistics:
#
# 1. "A branch of mathematics dealing with the collection, analysis, interpretation, and presentation of masses of numerical data." (Webster's International Dictionary)
# 2. "The science and art of collecting, summarizing, and analyzing data that are subject to random variation." (A Dictionary of Epidemiology)
# 3. "The science and art of dealing with variability in data." — Fang Jiqian (Chinese statistician, professor at Sun Yat-sen University)
#
# Probabilistic programming in Python:
#
# | Name | First released | Description |
# | :-: | :-: | :-: |
# | [scipy.stats](https://docs.scipy.org/doc/scipy/reference/stats.html) | 2001 | SciPy's probability and statistics module |
# | [PyMC](https://github.com/pymc-devs/pymc3) | 2009 | pure-Python probabilistic programming library (PyMC3 is built on Theano) |
# | [emcee](https://github.com/dfm/emcee) | 2010 | MCMC sampler library |
# | [pystan](https://github.com/stan-dev/pystan) | 2013 | probabilistic programming library built on Stan (C++) |
# | [pomegranate](https://github.com/jmschrei/pomegranate) | 2014 | probabilistic models library implemented in Cython |
# | [ArviZ](https://github.com/arviz-devs/arviz) | 2015 | unified, model-agnostic interface for Bayesian inference |
# | [edward](https://github.com/google/edward2) | 2016 | probabilistic programming language built on TensorFlow (edward2, 2019) |
# | [tf-probability](https://github.com/tensorflow/probability) | 2016 | probabilistic programming library built on TensorFlow |
# | [pyro](https://github.com/pyro-ppl/pyro) | 2017 | probabilistic programming library built on PyTorch |
# | [numpyro](https://github.com/pyro-ppl/numpyro) | 2019 | probabilistic programming library built on NumPy + JAX |
#
# > [*Statistical Computing*](http://www.math.pku.edu.cn/teachers/lidf/docs/statcomp/html/_statcompbook/sim-bootstrap.html) — Li Dongfeng (Associate Professor, School of Mathematical Sciences, Peking University)

# In[1]:


# %matplotlib widget
from matplotlib.font_manager import _rebuild

_rebuild()
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid", {"font.sans-serif": ["SimHei", "Arial"]})
import matplotlib as mpl

mpl.rcParams["figure.max_open_warning"] = 0
import warnings

warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd

pd.set_option("display.max_rows", 100)
import scipy
from scipy import stats
import pymc3 as pm
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_probability import edward2 as ed


# # [Probability Basics](https://github.com/seeingtheory/Seeing-Theory)
#
# The outline below is taken from Brown University's *Seeing Theory*. The Chinese edition has many bugs and its front-end assets suffer from network issues, so I patched it up and host it on a local server.
#
# > Recommended video courses from Zhejiang University: [Probability Theory and Mathematical Statistics](https://www.icourse163.org/course/ZJU-232005) and [Probability and Statistics: Exercises and Case Studies](https://www.icourse163.org/course/ZJU-1001640007)

# ## [Basic Concepts](http://10.2.3.100:8000/basic-probability)
# Random events / expectation / variance

# ## [Classical Probability](http://10.2.3.100:8000/compound-probability)
# Set theory / classical probability / conditional probability

# ## [Probability Distributions](http://10.2.3.100:8000/probability-distributions)
# Random variables / discrete and continuous random variables / the central limit theorem (see the quick simulation below)

# ## [Statistical Inference: the Frequentist School](http://10.2.3.100:8000/frequentist-inference)
# Point estimation / confidence intervals / the bootstrap

# ## [Statistical Inference: the Bayesian School](http://10.2.3.100:8000/bayesian-inference)
# Bayes' theorem / likelihood functions / from prior to posterior

# ## [Regression Analysis](http://10.2.3.100:8000/regression-analysis)
# Least squares / correlation / analysis of variance
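# The central limit theorem from the outline above, checked empirically — a minimal sketch, not part of the *Seeing Theory* material: means of n = 50 standard-uniform draws should be approximately normal with standard deviation $\sigma/\sqrt{n}$.

# In[ ]:


# Uniform(0, 1) has variance 1/12, so the std of a mean of 50 draws is sqrt(1/12/50)
sample_means = np.random.rand(100_000, 50).mean(axis=1)
sample_means.std(), np.sqrt(1 / 12 / 50)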
# ## [Inference vs. prediction](https://www.datascienceblog.net/post/commentary/inference-vs-prediction/)
#
# | | Inference | Prediction |
# |:-:|:-:|:-:|
# | Model selection | reason about the data-generating process; pick the model with the most plausible assumptions | evaluate candidate models; pick the one with the best performance |
# | Model validation | goodness-of-fit tests | loss on a held-out test set |
# | Model application | explain how the data came about | predict outcomes for new features |
# | Everyday question | What happened? What does it look like? Why? | Let's hope tomorrow will be better |

# # [scipy.stats](https://docs.scipy.org/doc/scipy/reference/stats.html)
#
# SciPy's statistics module: common distributions, maximum likelihood estimation, confidence intervals, hypothesis tests, and more.

# ## Random sampling with `numpy.random`

# ### Standard uniform

# In[2]:


np.random.seed(1024)
np.random.rand(3, 4)


# ### Standard normal

# In[3]:


np.random.randn(3, 4)


# ### Parameterized distributions

# #### Discrete distributions

# In[4]:


np.random.poisson(lam=10, size=(10,))


# In[5]:


np.random.binomial(n=10, p=0.6, size=(10,))


# In[6]:


np.random.negative_binomial(n=10, p=0.6, size=(10,))


# In[7]:


np.random.geometric(p=0.6, size=(10,))


# #### Continuous distributions

# In[8]:


np.random.uniform(low=-1, high=1, size=(3, 4))


# In[9]:


np.random.normal(loc=100, scale=15, size=(3, 4))


# In[10]:


np.random.standard_t(df=3, size=(3, 4))


# In[11]:


np.random.beta(a=0.5, b=0.5, size=(10,))


# #### Multivariate distributions

# In[12]:


np.random.multinomial(n=4, pvals=[0.1, 0.2, 0.3, 0.4], size=5)


# In[13]:


np.random.multivariate_normal(mean=[10, 10], cov=np.array([[3, 0.5], [0.5, 2]]), size=5)


# ## Statistical functions in `scipy.stats`
#
# Assume [IQ](https://www.wanweibaike.com/wiki-IQ) follows a normal distribution with mean 100 and standard deviation 15.

# In[14]:


dist = stats.norm(loc=100, scale=15)
dist


# ### Generating random variates

# In[15]:


dist.rvs(10)


# In[16]:


xs = np.linspace(50, 150, 100)
fig, ax = plt.subplots(2, 2, figsize=(12, 6))
fig.subplots_adjust(hspace=0.5, wspace=0.2)
ax[0, 0].plot(xs, dist.pdf(xs))  # probability density function
ax[0, 0].set_title("PDF")
ax[0, 1].plot(xs, dist.cdf(xs))  # cumulative distribution function
ax[0, 1].set_title("CDF")
cdf = np.linspace(0, 1, 100)
ax[1, 0].plot(cdf, dist.ppf(cdf))  # quantile function (inverse CDF)
ax[1, 0].set_title("PPF")
stats.probplot(dist.rvs(100_000), plot=ax[1, 1])
ax[1, 1].set_title("Q-Q plot");


# ### Everyday statistics

# **1**. Yao Ming's IQ is 132. What share of people have a higher IQ?
#
# ![](4.data-stats/yaoming.jpg)

# In[17]:


dist = stats.norm(loc=100, scale=15)


# In[18]:


print(f"Share of people with a higher IQ than Yao Ming: {100 * (1 - dist.cdf(132)):.3f}%")


# In[19]:


# Simulation: draw 100 million people and count the share with an IQ above Yao Ming's
n = int(1e8)
samples = dist.rvs(n)


# In[20]:


print(f"Share of people with a higher IQ than Yao Ming: {100 * np.sum(samples > 132) / n:.3f}%")


# **2**. Some say Einstein's IQ was one in a thousand. What would it be?
#
# ![](4.data-stats/einstein.jpg)
#
# > Recommended: *Does God Play Dice: The Story of Quantum Physics* — popular science on quantum mechanics as gripping as a wuxia novel

# In[21]:


dist.ppf(0.999)


# In[22]:


# Simulation
samples = np.sort(samples)
samples[int(0.999 * n)]


# **3**. What share of people have an IQ in (70, 90]?

# In[23]:


print(f"Share of people with an IQ in (70, 90]: {100 * (dist.cdf(90) - dist.cdf(70)):.2f}%")


# In[24]:


# Simulation
np.sum((samples > 70) & (samples <= 90)) / n
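# `scipy.stats` also provides the hypothesis tests mentioned at the top of this section. A minimal sketch on hypothetical data (not from the original notebook): a one-sample t-test of whether a small IQ sample is consistent with a population mean of 100.

# In[ ]:


iq_sample = stats.norm(loc=100, scale=15).rvs(50)  # hypothetical sample of 50 users
t_stat, p_value = stats.ttest_1samp(iq_sample, popmean=100)
t_stat, p_value  # a large p-value: no evidence that the mean differs from 100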
# **4**. Given an IQ sample from 100 users, compute the maximum likelihood estimates (MLE) of the parameters $\mu$ and $\sigma$, plus the central 95% interval of the fitted distribution.

# In[25]:


data = np.random.normal(110, 15, 100)


# In[26]:


loc, scale = stats.norm.fit(data)
loc, scale


# In[27]:


dist = stats.norm(loc, scale)


# Central 95% interval of the fitted distribution:

# In[28]:


itv95 = dist.interval(0.95)
itv95


# In[29]:


fig, ax = plt.subplots(figsize=(12, 6))
xs = np.linspace(data.min(), data.max(), 100)
ax.hist(data, 12, histtype="stepfilled", density=True, alpha=0.5)
ax.plot(xs, dist.pdf(xs))
ax.plot(itv95, [0.001, 0.001], c="r", linewidth=3);


# # [PyMC](https://github.com/pymc-devs/pymc3)
#
# A probabilistic programming library for Python: Bayesian statistical modeling and probabilistic models, with MCMC sampling and variational inference. PyMC3 is built on Theano; PyMC4 (pre-release) is built on TensorFlow Probability. Open-source tutorial: [Probabilistic Programming and Bayesian Methods for Hackers](https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers)

# In[30]:


pd.Series([d for d in dir(pm.distributions) if d[0].isupper()])


# Distribution objects

# In[31]:


d = pm.Normal.dist(mu=0, sd=1)


# In[32]:


d


# Random sampling

# In[33]:


d.random(size=5)


# Log probability — useful in all kinds of maximum likelihood settings

# In[34]:


d.logp(0).eval()


# ## A/B testing
#
# Put a Uniform(0, 1) prior on experiment A's click-through rate (clicking users / exposed users).

# In[35]:


with pm.Model() as model:
    p = pm.Uniform('p', lower=0, upper=1)


# In[36]:


p


# Sample repeated Bernoulli trials, $X \sim \text{Ber}(p)$

# In[37]:


p_true = 0.05  # true click-through rate
N = 1500
occurrences = stats.bernoulli.rvs(p_true, size=N)
occurrences


# In[38]:


np.sum(occurrences)


# Experiment A's observed click-through rate:

# In[39]:


np.mean(occurrences)


# In[40]:


np.mean(occurrences) == p_true


# Pass the sample to PyMC3 through the `observed` argument:

# In[41]:


with model:
    obs = pm.Bernoulli("obs", p, observed=occurrences)
    step = pm.Metropolis()
    trace = pm.sample(18000, step=step)
    burned_trace = trace[1000:]


# The posterior distribution of $p_A$:

# In[44]:


fig = plt.figure(figsize=(12, 6))
plt.title("Posterior of $p_A$")
plt.vlines(p_true, 0, 90, linestyle="--", label="true $p_A$")
plt.hist(burned_trace["p"], bins=25, histtype="stepfilled", density=True)
plt.legend();
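# Since the Uniform(0, 1) prior equals Beta(1, 1) and the likelihood is Bernoulli, this posterior is also available in closed form: $\text{Beta}(1 + k, 1 + N - k)$, with $k$ the number of clicks. A sanity check of the MCMC draws against the analytic posterior — a sketch reusing `occurrences` and `burned_trace` from above, not part of the original notebook:

# In[ ]:


k = occurrences.sum()
analytic = stats.beta(1 + k, 1 + N - k)  # conjugate posterior
xs_p = np.linspace(0.02, 0.09, 200)
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(burned_trace["p"], bins=25, density=True, alpha=0.5, label="MCMC draws")
ax.plot(xs_p, analytic.pdf(xs_p), c="r", label="Beta(1 + k, 1 + N - k)")
ax.legend();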
# ### Experiment A vs. experiment B
#
# To compare experiments A and B, assume:
#
# 1. Experiment A has 1500 samples and a true click-through rate of $p_A = 0.05$
# 2. Experiment B has 750 samples and a true click-through rate of $p_B = 0.04$
# 3. The true difference is $\text{delta} = p_A - p_B = 0.01$
#
# > Bayesian inference is not sensitive to unequal sample sizes

# In[45]:


true_p_A = 0.05
true_p_B = 0.04
N_A = 1500
N_B = 750
observations_A = stats.bernoulli.rvs(true_p_A, size=N_A)
observations_B = stats.bernoulli.rvs(true_p_B, size=N_B)
print(f"Experiment A: {observations_A[:30]}...")
print(f"Experiment B: {observations_B[:30]}...")


# In[46]:


np.mean(observations_A), np.mean(observations_B)


# In[47]:


with pm.Model() as model:
    p_A = pm.Uniform("p_A", 0, 1)
    p_B = pm.Uniform("p_B", 0, 1)
    delta = pm.Deterministic("delta", p_A - p_B)
    obs_A = pm.Bernoulli("obs_A", p_A, observed=observations_A)
    obs_B = pm.Bernoulli("obs_B", p_B, observed=observations_B)
    step = pm.Metropolis()
    trace = pm.sample(20000, step=step)
    burned_trace = trace[1000:]


# Posterior distributions of the three random variables:

# In[48]:


p_A_samples = burned_trace["p_A"]
p_B_samples = burned_trace["p_B"]
delta_samples = burned_trace["delta"]


# In[ ]:


fig = plt.figure(figsize=(15, 10))
fig.subplots_adjust(hspace=0.5)
ax = plt.subplot(311)
plt.xlim(0, 0.1)
plt.hist(
    p_A_samples,
    histtype="stepfilled",
    bins=25,
    alpha=0.85,
    label="posterior of $p_A$",
    color="#A60628",
    density=True,
)
plt.vlines(true_p_A, 0, 80, linestyle="--", label="true $p_A$")
plt.legend(loc="upper right")
plt.title("Posteriors of $p_A$, $p_B$, and delta")
ax = plt.subplot(312)
plt.xlim(0, 0.1)
plt.hist(
    p_B_samples,
    histtype="stepfilled",
    bins=25,
    alpha=0.85,
    label="posterior of $p_B$",
    color="#467821",
    density=True,
)
plt.vlines(true_p_B, 0, 80, linestyle="--", label="true $p_B$")
plt.legend(loc="upper right")
ax = plt.subplot(313)
plt.hist(
    delta_samples,
    histtype="stepfilled",
    bins=25,
    alpha=0.85,
    label="posterior of delta",
    color="#7A68A6",
    density=True,
)
plt.vlines(true_p_A - true_p_B, 0, 60, linestyle="--", label="true delta")
plt.vlines(0, 0, 60, color="black", alpha=0.2)
plt.legend(loc="upper right");


# In[50]:


fig.savefig("AB_test.png")


# Experiment A has more samples (`N_A = 1500`) than experiment B (`N_B = 750`), so the posterior of $p_B$ is wider than that of $p_A$: we are more uncertain about $p_B$.

# ### Conclusions
#
# Most of the posterior mass of $\text{delta}$ lies above 0, so experiment A very likely outperforms experiment B.
#
# Probability that experiment A performs worse than experiment B:

# In[51]:


np.mean(delta_samples < 0)


# Probability that experiment A performs better than experiment B:

# In[52]:


np.mean(delta_samples > 0)
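# The same conclusion can be read off numerically: `pm.summary` reports posterior means and credible intervals for all three variables (a sketch assuming the `burned_trace` above is still in scope).

# In[ ]:


pm.summary(burned_trace)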
# # Gaussian Process Regression (GPR)
#
# Suppose we observe samples with noise $\epsilon$ drawn from a normal distribution:
#
# $$
# y \sim \mathcal{N}(\mu = f(x), \sigma = \epsilon)
# $$
#
# For linear regression, $f(x) = ax + b$.
#
# A Gaussian process provides a prior over $f$:
#
# $$
# f(x) \sim \mathcal{GP}(\mu_x, K(x, x^T, h))
# $$
#
# where $\mu_x$ is the mean function, $K(x, x^T)$ is the kernel covariance matrix, and $h$ is a smoothing (length-scale) parameter.
#
# > A Gaussian process can generate arbitrary curves (or surfaces); the essential requirement is that the noise is approximately Gaussian. Classical linear regression models can all be viewed as Gaussian process models.

# ## Generative model example

# In[53]:


def gauss_kernel(x, knots, h):
    return np.array([np.exp(-((x - k) ** 2) / (2 * h ** 2)) for k in knots])


plt.figure(figsize=(16, 4))
hs = [0.05, 0.1, 0.5, 1]
x = np.linspace(0, 1, 20)
for i, h in enumerate(hs):
    plt.subplot(1, 4, i + 1)
    for j in range(3):
        plt.plot(x, stats.multivariate_normal.rvs(cov=gauss_kernel(x, x, h)))
    plt.title("h = %.2f" % h)
plt.tight_layout()


# ## Model example

# In[54]:


n = 20
xs = np.r_[
    np.linspace(0, 0.5 * np.pi, 8),
    np.linspace(0.5 * np.pi, 1.5 * np.pi, 4),
    np.linspace(1.5 * np.pi, 2 * np.pi, 8),
]
ys = np.sin(xs) + np.random.normal(0, 0.2, n)
xp = np.c_[np.linspace(0, 2 * np.pi, 100)]


# In[55]:


fig = plt.figure(figsize=(12, 4))
ax = plt.axes()
ax.scatter(xs, ys)
ax.plot(xp, np.sin(xp));


# In[56]:


with pm.Model() as gp_context:
    h = pm.Gamma("h", 2, 0.5)
    c = pm.gp.cov.ExpQuad(1, ls=h)
    gp = pm.gp.Marginal(cov_func=c)
    ϵ = pm.HalfCauchy("e", 1)
    y_est = gp.marginal_likelihood("y_est", X=np.c_[xs], y=ys, noise=ϵ)


# Shapes of the Gamma prior used for the length scale (top) and the Cauchy family behind the Half-Cauchy noise prior (bottom):

# In[ ]:


fig, ax = plt.subplots(2, 1, figsize=(12, 8))
xa = np.linspace(0, 20, 200)
alphas = [1.0, 2.0, 3.0, 7.5]
betas = [0.5, 0.5, 1.0, 1.0]
for a, b in zip(alphas, betas):
    pdf = stats.gamma.pdf(xa, a, scale=1.0 / b)
    ax[0].plot(xa, pdf, label=r"$\alpha$ = {}, $\beta$ = {}".format(a, b))
ax[0].set_xlabel("x", fontsize=12)
ax[0].set_ylabel("f(x)", fontsize=12)
ax[0].legend(loc=1)
xa = np.linspace(0, 5, 200)
for b in [0.5, 1.0, 2.0]:
    pdf = stats.cauchy.pdf(xa, scale=b)
    ax[1].plot(xa, pdf, label=r"$\beta$ = {}".format(b))
ax[1].set_xlabel("x", fontsize=12)
ax[1].set_ylabel("f(x)", fontsize=12)
ax[1].legend(loc=1);


# In[57]:


fig.show()


# In[59]:


with gp_context:
    trace = pm.sample(tune=1000)


# In[60]:


pm.traceplot(trace, figsize=(16, 6));


# In[61]:


with gp_context:
    fp = gp.conditional("fp", xp)
    ppc = pm.sample_posterior_predictive(trace, vars=[fp], samples=100)


# In[62]:


fig, ax = plt.subplots(2, 1, figsize=(12, 6))
ax[0].plot(xp, ppc["fp"].T, c="grey", alpha=0.1)
ax[0].scatter(xs, ys, c="red")
pm.gp.util.plot_gp_dist(ax[1], ppc["fp"], xp, palette="cool")
fig.show();


# # [tensorflow-probability](https://github.com/tensorflow/probability)
#
# A library for Bayesian inference and statistical analysis built on TensorFlow.

# ## Distributions

# In[68]:


tfd = tfp.distributions


# In[69]:


pd.Series([str(x).split(".")[-1][:-2] for x in tfd.distribution.Distribution.__subclasses__()])


# In[70]:


dist = tfd.Normal(loc=100, scale=15)


# In[71]:


x = dist.sample((3, 4))
x


# In[72]:


n = 100
fig, ax = plt.subplots(figsize=(12, 6))
xs = dist.sample(n)
ax.hist(xs, density=True)
xp = tf.linspace(50.0, 150.0, 100)
ax.plot(xp, dist.prob(xp));


# ## Random sampling

# In[73]:


dist = tfd.Normal(loc=[3, 4, 5, 6], scale=0.5)


# In[74]:


dist.sample(5)


# In[75]:


xp = tf.linspace(0.0, 9.0, 100)[:, tf.newaxis]
fig = plt.figure(figsize=(12, 6))
plt.plot(np.tile(xp, dist.batch_shape), dist.prob(xp));


# ## Gaussian mixture model

# In[76]:


gmm = tfd.MixtureSameFamily(
    mixture_distribution=tfd.Categorical(probs=[0.4, 0.1, 0.2, 0.3]),
    components_distribution=tfd.Normal(
        loc=[3.0, 4.0, 5.0, 6.0], scale=[0.1, 0.5, 0.5, 0.1]
    ),
)
xs = gmm.sample(10000)


# In[77]:


fig = plt.figure(figsize=(12, 6))
sns.distplot(xs);


# ## GP regression with TFP

# In[78]:


xs = tf.Variable([0.0, 1.0, 2.0, 5.0, 6.0, 8.0])
ys = tf.sin(xs) + tfd.Normal(loc=0, scale=0.5).sample(xs.shape[0])
xp = tf.linspace(-1.0, 9.0, 100)[:, None]


# In[79]:


kernel = tfp.math.psd_kernels.ExponentiatedQuadratic(length_scale=1.5)
reg = tfd.GaussianProcessRegressionModel(
    kernel, xp, xs[:, tf.newaxis], ys  # xp already carries the feature dimension
)


# In[80]:


ub, lb = reg.mean() + [2 * reg.stddev(), -2 * reg.stddev()]
fig, ax = plt.subplots(figsize=(12, 6))
ax.fill_between(np.ravel(xp), np.ravel(ub), np.ravel(lb), alpha=0.2)
ax.plot(xp, reg.mean(), c="red", linewidth=2)
ax.scatter(xs[:], ys[:], s=50, c="k");
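# Unlike the PyMC3 example, the TFP regression above fixes the kernel hyperparameters. One way to pick `length_scale` — a sketch and my own addition, not part of the original notebook — is to compare the GP marginal likelihood of the observed data under a few candidate scales:

# In[ ]:


# noise variance 0.25 matches the std of 0.5 used to generate ys above
for ls in [0.5, 1.5, 3.0]:
    k = tfp.math.psd_kernels.ExponentiatedQuadratic(length_scale=ls)
    gp_dist = tfd.GaussianProcess(
        k, index_points=xs[:, tf.newaxis], observation_noise_variance=0.25
    )
    print(f"length_scale={ls}: log marginal likelihood = {gp_dist.log_prob(ys).numpy():.2f}")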
# # [Edward2](https://github.com/google/edward2)
#
# Edward2 is Google's *simple* probabilistic programming language, built on the NumPy and TensorFlow ecosystems. Models are written as probabilistic modules that compose with ordinary model computation, enabling flexible training and inference.
#
# Edward2 leverages GPUs for performance. Google's runtime comparison for Bayesian logistic regression, sampled with [NUTS](https://papers.nips.cc/paper/7987-simple-distributed-and-accelerated-probabilistic-programming.pdf) (the No-U-Turn Sampler, an HMC variant):
#
# | System | Runtime (ms) |
# |:-:|:-:|
# | Stan (CPU) | 201.0 |
# | PyMC3 (CPU) | 74.8 |
# | Handwritten TF (CPU) | 66.2 |
# | Edward2 (CPU) | 68.4 |
# | Handwritten TF (1 GPU) | 9.5 |
# | Edward2 (1 GPU) | 9.7 |
# | Edward2 (8 GPU) | 2.3 |

# ## Random variables

# In[81]:


normal_rv = ed.Normal(loc=0., scale=1.)
normal_rv


# In[82]:


normal_rv.distribution.log_prob(1.231)


# In[83]:


dirichlet_rv = ed.Dirichlet(concentration=tf.ones([2, 10]))
dirichlet_rv


# ## Basic operations

# In[84]:


x = ed.Normal(loc=tf.zeros(20), scale=tf.ones(20))
x


# In[85]:


y = 5
x + y, x / y


# In[86]:


tf.tanh(x * y)


# In[87]:


base = tf.range(20)
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(base[:], x[:], c="r", label="x")
ax.scatter(base[:], (x + y)[:], c="g", label="x + y")
ax.scatter(base[:], (x / y)[:], c="b", label="x / y")
ax.scatter(base[:], tf.tanh(x * y)[:], c="k", label="tf.tanh(x * y)")
ax.set_ylim(-5, 10)
ax.legend(loc='upper right');
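# The core Edward2 idiom is writing a model as a plain Python function built from random variables. A minimal sketch with illustrative names (my own example in the style of the project README, not code from the original post):

# In[ ]:


def coin_model(n):
    # Beta(1, 1) prior on the heads probability, followed by n Bernoulli flips
    p = ed.Beta(concentration1=1.0, concentration0=1.0, name="p")
    return ed.Bernoulli(probs=p * tf.ones(n), name="flips")


coin_model(10)  # calling the program draws a joint forward sample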