In [1]:

using Distributions
using StatsPlots

# warm up (少し時間を取られる。その間に下のセルでコードを入力する)
histogram(randn(10^5); norm=true, alpha=0.3, size=(300, 200))
plot!(Normal())

Out[1]:

In [2]:

# コインを20回投げたときの表が出る回数の分布は正規分布で近似される。

n = 20
p = 0.5
sample_of_means = rand(Binomial(n, p), 10^5)
histogram(sample_of_means; norm=true, alpha=0.3, bin=-0.5:n+0.5, label="rand(Binomial($n, $p))")
plot!(Normal(n*p, √(n*p*(1-p))), 0, 20; label="normal approx.", lw=2)

Out[2]:

In [3]:

# 回数n、確率パラメータpの二項分布オブジェクトを次のように作れる。

bin = Binomial(n, p)

Out[3]:

Binomial{Float64}(n=20, p=0.5)

In [4]:

# 確率分布オブジェクトの乱数

rand(bin)

Out[4]:

In [5]:

# 確率分布オブジェクトの乱数を複数個生成

@show rand(bin, 20);

rand(bin, 20) = [7, 14, 13, 12, 7, 7, 9, 13, 4, 10, 13, 13, 11, 10, 8, 12, 9, 8, 9, 12]

In [6]:

# サンプルのヒストグラム
# norm = true で確率密度函数と比較できるようにする。
# alpha = 0.3 で半透明化pp
# bin = -0.5:n+0.5 で 1 刻みの適切なビンを設定

histogram(sample_of_means; norm=true, alpha=0.3, bin=-0.5:n+0.5, label="sample of means")

Out[6]:

In [7]:

# -0.5:n+0.5 は [-0.5, 0.5, 1.5, 2.5, ..., 19.5, 20.5] とほぼ等価

-0.5:n+0.5

Out[7]:

-0.5:1.0:20.5

In [8]:

@show collect(-0.5:n+0.5);

collect(-0.5:n + 0.5) = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5, 16.5, 17.5, 18.5, 19.5, 20.5]

In [9]:

# 平均μ、標準偏差σの正規分布オブジェクトは次にようにして作れる。

normal = Normal(n*p, √(n*p*(1-p)))

Out[9]:

Normal{Float64}(μ=10.0, σ=2.23606797749979)

In [10]:

# 確率分布オブジェクト dist は plot(dist) でプロットできる。
# lw = 2 で2倍の太さでプロット

plot(normal, 0, 20; label="normal dist.", lw=2)

Out[10]:

In [11]:

# 以上を重ねてプロット

histogram(sample_of_means; norm=true, alpha=0.3, bin=-0.5:n+0.5, label="rand(Binomial($n, $p))")
plot!(Normal(n*p, √(n*p*(1-p))), 0, 20; label="normal approx.", lw=2)

Out[11]:

In [12]:

# サイコロを10回振って出た目の平均値の分布は正規分布でよく近似される。

n = 10
M = [mean(rand(1:6, n)) for _ in 1:10^5]
histogram(M; norm=true, alpha=0.3, bin=0.95:0.1:6.05, label="mean(rand(1:6, 10))")
plot!(Normal(mean(1:6), std(1:6; corrected=false)/√n), 1, 6; label="normal approx.", lw=2)

Out[12]:

In [13]:

# 2つの確率パラメータ(比率パラメータ)が等しい二項分布で生成されたサンプルのZ統計量は
# 近似的に標準正規分布に従う。

""""""
function gensample(bin1, bin2)
    m, n = ntrials(bin1), ntrials(bin2)
    a, b = rand(bin1), rand(bin2)
    c, d = m - a, n - b
    a, b, c, d
end

"""Z statistic of 2×2 contingency table"""
function zstat(a, b, c, d)
    a*d - b*c == 0 && return zero(inv(a))
    m, n = a + c, b + d
    N = a + b + c + d
    p = (a + b)/N
    Z = (a/m - b/n) / √(p*(1-p)*(1/m + 1/n))
    Z
end

m, n, p = 30, 20, 0.3
bin1 = Binomial(m, p)
bin2 = Binomial(n, p)
Z = [zstat(gensample(bin1, bin2)...) for _ in 1:10^5]

histogram(Z; norm=true, alpha=0.3, bin=-4.25:0.5:4.25, label="Z statistics")
plot!(Normal(), -4, 4; label="std normal dist.", lw=2)

Out[13]:

In [14]:

@code_warntype zstat(1, 2, 3, 4)

Variables
  #self#::Core.Const(zstat)
  a::Int64
  b::Int64
  c::Int64
  d::Int64
  Z::Float64
  p::Float64
  N::Int64
  n::Int64
  m::Int64

Body::Float64
1 ─       Core.NewvarNode(:(Z))
│         Core.NewvarNode(:(p))
│         Core.NewvarNode(:(N))
│         Core.NewvarNode(:(n))
│         Core.NewvarNode(:(m))
│   %6  = (a * d)::Int64
│   %7  = (b * c)::Int64
│   %8  = (%6 - %7)::Int64
│   %9  = (%8 == 0)::Bool
└──       goto #3 if not %9
2 ─ %11 = Main.inv(a)::Float64
│   %12 = Main.zero(%11)::Core.Const(0.0)
└──       return %12
3 ─ %14 = (a + c)::Int64
│   %15 = (b + d)::Int64
│         (m = %14)
│         (n = %15)
│         (N = a + b + c + d)
│   %19 = (a + b)::Int64
│         (p = %19 / N)
│   %21 = (a / m)::Float64
│   %22 = (b / n)::Float64
│   %23 = (%21 - %22)::Float64
│   %24 = p::Float64
│   %25 = (1 - p)::Float64
│   %26 = (1 / m)::Float64
│   %27 = (1 / n)::Float64
│   %28 = (%26 + %27)::Float64
│   %29 = (%24 * %25 * %28)::Float64
│   %30 = √%29::Float64
│         (Z = %23 / %30)
└──       return Z

In [15]:

# 数式処理を使って、Z統計量の2乗がPearsonのχ²統計量に一致することを確認。
# 上の数値計算で使ったzstat函数をそのまま数式処理で利用する。

using SymPy

@syms a b c d
Z = zstat(a, b, c, d)

Out[15]:

$\begin{equation*}\frac{\frac{a}{a + c} - \frac{b}{b + d}}{\sqrt{\frac{\left(a + b\right) \left(- \frac{a + b}{a + b + c + d} + 1\right) \left(\frac{1}{b + d} + \frac{1}{a + c}\right)}{a + b + c + d}}}\end{equation*}$

In [16]:

# Z^2 |> factor は factor(Z^2) に等価
# 以下の計算結果はPearsonのχ²統計量の有名な表示に一致

Z^2 |> factor

Out[16]:

$\begin{equation*}\frac{\left(a d - b c\right)^{2} \left(a + b + c + d\right)}{\left(a + b\right) \left(a + c\right) \left(b + d\right) \left(c + d\right)}\end{equation*}$

In [17]:

# (O - E)²/E の和で定義されたPearsonのχ²統計量

"""Pearson's χ² statistic of 2×2 contingency table"""
function chisqstat(a, b, c, d)
    a*d - b*c == 0 && return zero(inv(a))
    N = a + b + c + d
    s, f = a + b, c + d
    m, n = a + c, b + d
    Ea, Eb, Ec, Ed = m*s/N, n*s/N, m*f/N, n*f/N
    X² = (a - Ea)^2/Ea + (b - Eb)^2/Eb + (c - Ec)^2/Ec + (d - Ed)^2/Ed
end

X² = chisqstat(a, b, c, d)

Out[17]:

$\begin{equation*}\frac{\left(d - \frac{\left(b + d\right) \left(c + d\right)}{a + b + c + d}\right)^{2} \left(a + b + c + d\right)}{\left(b + d\right) \left(c + d\right)} + \frac{\left(c - \frac{\left(a + c\right) \left(c + d\right)}{a + b + c + d}\right)^{2} \left(a + b + c + d\right)}{\left(a + c\right) \left(c + d\right)} + \frac{\left(b - \frac{\left(a + b\right) \left(b + d\right)}{a + b + c + d}\right)^{2} \left(a + b + c + d\right)}{\left(a + b\right) \left(b + d\right)} + \frac{\left(a - \frac{\left(a + b\right) \left(a + c\right)}{a + b + c + d}\right)^{2} \left(a + b + c + d\right)}{\left(a + b\right) \left(a + c\right)}\end{equation*}$

In [18]:

# 以下の計算結果は2×2の分割表のPearson χ²統計量に関する有名な公式

X² |> factor

Out[18]:

$\begin{equation*}\frac{\left(a d - b c\right)^{2} \left(a + b + c + d\right)}{\left(a + b\right) \left(a + c\right) \left(b + d\right) \left(c + d\right)}\end{equation*}$

In [19]:

# 正確二項検定のP値の定義

# \lessapprox TAB → ⪅
# \approx TAB → ≈
x ⪅ y = x < y || x ≈ y

"""p-value function of exact binomial test"""
pval_exact(dist, k) = sum(pdf(dist, j) for j in support(dist) if pdf(dist, j) ⪅ pdf(dist, k))

Out[19]:

pval_exact (generic function with 1 method)

In [20]:

# 正確二項検定のP値の例　(パラメータpを固定した場合)

n, p = 20, 0.3
bin = Binomial(n, p)

k = support(bin)
y = pval_exact.(bin, k)
plot(k, y; label="")
plot!(; xtick=0:20, ytick=0:0.05:1)
plot!(; xlabel="data k", ylabel="p-value for parameter n = $n, p = $p")

Out[20]:

In [21]:

# 正確二項検定のP値の例　(データkを固定した場合)

n, k = 20, 6

p = 0:0.002:1
y = @. pval_exact(Binomial(n, p), k)
plot(p, y; label="")
plot!(; xtick=0:0.1:1, ytick=0:0.05:1)
plot!(; xlabel="parameter p", ylabel="p-value for data n = $n, k = $k")

Out[21]:

In [22]:

# 信頼区間函数の定義

using Roots

"""generic confidence interval function"""
function confint(pvalfunc, mlefunc, n, k, pmin, pmax; α = 0.05)
    f(p) = pvalfunc(n, p, k) - α
    ci = find_zeros(f, pmin, pmax)
    length(ci) ≥ 2 && return ci
    pmle = mlefunc(n, k)
    first(ci) > pmle ? [pmin, first(ci)] : [first(ci), pmax]
end

"""confidence interval function for exact binomial test"""
function confint_bin(n, k; α = 0.05)
    pvalfunc(n, p, k) = pval_exact(Binomial(n, p), k)
    mlefunc(n, k) = k/n
    pmin, pmax = 0.0, 1.0
    confint(pvalfunc, mlefunc, n, k, pmin, pmax; α)
end

Out[22]:

confint_bin (generic function with 1 method)

In [23]:

# 正確二項検定のデータkを固定した場合のP値函数のプロットに
# 95%信頼区間のプロットを追加

n, k = 20, 6
α = 0.05
ci = confint_bin(n, k; α)

p = 0:0.002:1
y = @. pval_exact(Binomial(n, p), k)
plot(p, y; label="")
plot!(ci, [α, α]; label="$(100(1 - α))% confidence interval", lw=3)
plot!(; xtick=0:0.1:1, ytick=0:0.05:1)
plot!(; xlabel="parameter p", ylabel="p-value for data n = $n, k = $k")

Out[23]:

In [24]:

# データkを動かして信頼区間をプロット (1)

n = 20
k = 0:n
α = 0.05
ci = confint_bin.(n, k; α)
ciL, ciR = first.(ci), last.(ci)

plot(; legend=:topleft)
plot!(k, ciL; label="min. of confidence interval")
plot!(k, ciR; label="max. of confidenceinterval")
plot!(k, ciL; label="$(100(1 - α))% confidence interval (n = $n)", lw=0, frange=ciR, fa=0.1)
plot!(; xtick=0:n, ytick=0:0.1:1)
plot!(; xlabel="data k", ylabel="parameter p")

Out[24]:

In [25]:

# データkを動かして信頼区間をプロット (2)

n = 20
k = 0:n
α = 0.05
ci = confint_bin.(n, k; α)

plot(; legend=:topleft)
for i in eachindex(ci)
    plot!([k[i], k[i]], ci[i]; label="", lw=5)
end
plot!(; xtick=0:n, ytick=0:0.1:1)
plot!(; xlabel="data k", ylabel="parameter p")
title!("$(100(1 - α))% confidence intervals (n = $n)"; titlefontsize=12)

Out[25]:

In [26]:

# データkを動かして信頼区間をプロット (3)

n = 20
k = 0:n
α = 0.05
ci = confint_bin.(n, k; α)

plot(; legend=:topleft)
for i in eachindex(ci)
    plot!([k[i], k[i]], ci[i]; label="", lw=5, c=1)
end
plot!(; xtick=0:n, ytick=0:0.1:1)
plot!(; xlabel="data k", ylabel="parameter p")
title!("$(100(1 - α))% confidence intervals (n = $n)"; titlefontsize=12)

Out[26]:

In [ ]: