safediv(x, y) = iszero(x) ? x : x/y safemult(x, y) = iszero(x) ? x : x*y function chisq(A) r, c = size(A) n = sum(A) M = vec(sum(A, dims=2)) N = vec(sum(A, dims=1)) sum(safediv((A[i,j] - M[i]*N[j]/n)^2, M[i]*N[j]/n) for i in 1:r, j in 1:c) end function gstat(A) r, c = size(A) n = sum(A) M = vec(sum(A, dims=2)) N = vec(sum(A, dims=1)) 2sum(safemult(A[i,j], log(A[i,j]) - log(M[i]*N[j]/n)) for i in 1:r, j in 1:c) end A = [ 1 2 3 2 4 6 ] @show chisq(A) @show gstat(A) B = [ 1 8 1 2 1 6 ] @show chisq(B) @show gstat(B); using SymPy @vars a b c d A = [ a b c d ] chisq(A).factor() using Distributions using Plots # Plots.jlのデフォルトの設定を表示 #@show Plots.reset_defaults() # legendの半透明化 @show default(:bglegend, plot_color(default(:bg), 0.5)) @show default(:fglegend, plot_color(ifelse(isdark(plot_color(default(:bg))), :white, :black), 0.6)); using Base64 displayfile(mime, file; tag="img") = open(file) do f base64 = base64encode(f) display("text/html", """<$(tag) src="data:$(mime);base64,$(base64)"/>""") end pyplotclf() = if backend() == Plots.PyPlotBackend(); PyPlot.clf(); end pyplot(fmt=:svg) ecdf(x; Y=rand(10)) = mean(Y .≤ x) eccdf(x; Y=rand(10)) = mean(Y .≥ x) n = 2^6 dist = Gamma(10, 0.5) Y = rand(dist, n) x = range(mean(dist)-5std(dist), mean(dist)+5std(dist), length=400) plot(size=(400, 270), legend=:outertop) plot!(x, eccdf.(x; Y=Y); label="eccdf of $dist, n = $n") plot!(x, ccdf.(dist, x); label="ccdf of $dist", ls=:dash) equantile(p; Y=randn(100)) = quantile(Y, p) ecquantile(p; Y=randn(100)) = quantile(Y, 1-p) n = 2^6 dist = Gamma(10, 0.5) Y = rand(dist, n) p = range(0, 1, length=200) plot(size=(400, 270), legend=:outertop) plot!(p, ecquantile.(p; Y=Y); label="ecquantile of $dist, n = $n") plot!(p, cquantile.(dist, p); label="cquantile of $dist", ls=:dash) param_indep(p, q) = p .* q' p = [0.2, 0.3, 0.5] q = [0.1, 0.2, 0.3, 0.4] P = param_indep(p, q) @show p @show q @show sum(P) P df_chisq(r, c) = (r-1)*(c-1) df_chisq(P) = prod(size(P) .- 1) @show size(P) @show size(P) .- 1 @show df_chisq(P); function plot_sim(title, PearsonChisq, G_Statistics, df, binstep) chisq_dist = Chisq(df) f(x) = pdf(chisq_dist, x) xmax = 4df + 2 x = range(0, xmax, length=200) bin = range(0, xmax, step=binstep) P1 = plot(xlabel="x") plot!(title=title, titlefontsize=9, legendfontsize=8, guidefontsize=8) histogram!(PearsonChisq; bin=bin, norm=true, alpha=0.3, label="Pearson's χ²-statistics") plot!(x, f.(x); label="pdf of Chisq(df=$(df))") plot!(tickfontsize=7) P2 = plot(xlabel="x") plot!(title=title, titlefontsize=9, legendfontsize=8, guidefontsize=8) histogram!(G_Statistics; bin=bin, norm=true, alpha=0.3, label="G-statistics") plot!(x, f.(x); label="pdf of Chisq(df=$(df))") plot!(tickfontsize=7) P3 = plot(guidefontsize=8) plot!(xlabel="ccdf of Chisq(df=$(df))", ylabel="eccdf of Pearson's χ²-statistics") xx = ccdf.(chisq_dist, x) yy = eccdf.(x; Y=PearsonChisq) plot!(xx, yy; label="") plot!([0,1], [0,1]; label="", color=:black, ls=:dot, alpha=0.5) plot!(xtick=0:0.1:1, ytick=0:0.1:1, tickfontsize=7, xrotation=90) P4 = plot(guidefontsize=8) plot!(xlabel="ccdf of Chisq(df=$(df))", ylabel="eccdf of G-statistics") xx = ccdf.(chisq_dist, x) yy = eccdf.(x; Y=G_Statistics) plot!(xx, yy; label="") plot!([0,1], [0,1]; label="", color=:black, ls=:dot, alpha=0.7) plot!(xtick=0:0.1:1, ytick=0:0.1:1, tickfontsize=7, xrotation=90) α_max = 0.055 x0 = range(cquantile(chisq_dist, α_max), 2xmax, length=200) P5 = plot(guidefontsize=8) plot!(xlabel="ccdf of Chisq(df=$(df))", ylabel="eccdf of Pearson's χ²-statistics") xx = ccdf.(chisq_dist, x0) yy = eccdf.(x0; Y=PearsonChisq) plot!(xx, yy; label="") plot!([0, α_max], [0, α_max]; label="", color=:black, ls=:dot, alpha=0.5) plot!(xtick=0:0.005:1, ytick=0:0.005:1, tickfontsize=7, xrotation=90) P6 = plot(guidefontsize=8) plot!(xlabel="ccdf of Chisq(df=$(df))", ylabel="eccdf of G-statistics") xx = ccdf.(chisq_dist, x0) yy = eccdf.(x0; Y=G_Statistics) plot!(xx, yy; label="") plot!([0, α_max], [0, α_max]; label="", color=:black, ls=:dot, alpha=0.5) plot!(xtick=0:0.005:1, ytick=0:0.005:1, tickfontsize=7, xrotation=90) plot(P1, P3, P5, P2, P4, P6; size=(800, 500), layout=grid(2, 3; widths=[3.2/8, 2.4/8, 2.4/8]) ) end prod_Poisson(Λ) = product_distribution(Poisson.(vec(Λ))) function sim_Poisson(; λ=100, P=param_indep([0.2, 0.3, 0.5], [0.1, 0.2, 0.3, 0.4]), L=10^5) dist = prod_Poisson(λ*P) PearsonChisq = Array{Float64,1}(undef, L) G_Statistics = Array{Float64,1}(undef, L) for l in 1:L A = reshape(rand(dist), size(P)) PearsonChisq[l] = chisq(A) G_Statistics[l] = gstat(A) end PearsonChisq, G_Statistics end function plot_sim_Poisson(; λ=100, P=param_indep([0.2, 0.3, 0.5], [0.1, 0.2, 0.3, 0.4]), L=10^5, binstep=0.5 ) @show expectation = λ*P @show total = sum(expectation) @show r, c = size(expectation) @show df = df_chisq(expectation) @time PearsonChisq, G_Statistics = sim_Poisson(λ=λ, P=P, L=L) title = "$(r)×$(c) Poisson distributions (λ = $(λ))" sleep(0.1) plot_sim(title, PearsonChisq, G_Statistics, df, binstep) end P = param_indep([0.2, 0.3, 0.5], [0.1, 0.2, 0.3, 0.4]) plot_sim_Poisson(λ=50, P=P) plot_sim_Poisson(λ=100, P=P) plot_sim_Poisson(λ=200, P=P) function sim_Multinomial(; n=100, P=param_indep([0.2, 0.3, 0.5], [0.1, 0.2, 0.3, 0.4]), L=10^5) dist = Multinomial(n, vec(P)) PearsonChisq = Array{Float64,1}(undef, L) G_Statistics = Array{Float64,1}(undef, L) for l in 1:L A = reshape(rand(dist), size(P)) PearsonChisq[l] = chisq(A) G_Statistics[l] = gstat(A) end PearsonChisq, G_Statistics end function plot_sim_Multinomial(; n=100, P=param_indep([0.2, 0.3, 0.5], [0.1, 0.2, 0.3, 0.4]), L=10^5, binstep=0.5 ) @show expectation = n*P @show total = sum(expectation) @show r, c = size(expectation) @show df = df_chisq(expectation) @time PearsonChisq, G_Statistics = sim_Multinomial(n=n, P=P, L=L) title = "$(r)×$(c)-nomial distribution (n = $(n))" sleep(0.1) plot_sim(title, PearsonChisq, G_Statistics, df, binstep) end plot_sim_Multinomial(n=50, P=P) plot_sim_Multinomial(n=100, P=P) plot_sim_Multinomial(n=200, P=P) function rand_prod_Multinomial(M, q)    r, c = length(M), length(q)    A = Array{Int, 2}(undef, r, c) for i in 1:r A[i,:] = rand(Multinomial(M[i], q)) end A end M = [20, 30, 50] q = [0.1, 0.2, 0.3, 0.4] rand_prod_Multinomial(M, q) function sim_prod_Multinomial(; M=[20, 30, 50], q=[0.1, 0.2, 0.3, 0.4], L=10^5) PearsonChisq = Array{Float64,1}(undef, L) G_Statistics = Array{Float64,1}(undef, L) for l in 1:L A = rand_prod_Multinomial(M, q) PearsonChisq[l] = chisq(A) G_Statistics[l] = gstat(A) end PearsonChisq, G_Statistics end function plot_sim_prod_Multinomial(; M=[20, 30, 50], q=[0.1, 0.2, 0.3, 0.4], L=10^5, binstep=0.5 ) n = sum(M) @show expectation = M*q' @show total = sum(expectation) @show r, c = size(expectation) @show df = df_chisq(expectation) @time PearsonChisq, G_Statistics = sim_prod_Multinomial(M=M, q=q, L=L) if c == 2 title = "$(r) binomial distributions (n = $n)" elseif c == 3 title = "$(r) trinomial distributions (n = $n)" elseif c == 4 title = "$(r) quadranomial distributions (n = $n)" else title = "$(r) $(c)-nomial distributions (n = $n)" end sleep(0.1) plot_sim(title, PearsonChisq, G_Statistics, df, binstep) end plot_sim_prod_Multinomial(M=div.(M,2), q=q) plot_sim_prod_Multinomial(M=M, q=q) plot_sim_prod_Multinomial(M=2M, q=q) A = [ 2 1 1 0 0 8 3 3 0 0 0 2 1 1 1 0 0 0 1 1 0 0 0 0 1 ] r, c = size(A) n = sum(A) M = vec(sum(A, dims=2)) p = M/n N = vec(sum(A, dims=1)) q = N/n P = param_indep(p, q) df = df_chisq(P) @show chi_squared = chisq(A) @show p_value = ccdf(Chisq(df), chi_squared) n*P plot_sim_Poisson(λ=n, P=P; binstep=1) plot_sim_Multinomial(n=n, P=P; binstep=1) plot_sim_prod_Multinomial(M=M, q=q; binstep=1) plot_sim_prod_Multinomial(M=N, q=p; binstep=1) plot_sim_Poisson(λ=2n, P=P; binstep=1) plot_sim_Multinomial(n=2n, P=P; binstep=1) plot_sim_prod_Multinomial(M=2M, q=q; binstep=1) plot_sim_prod_Multinomial(M=2N, q=p; binstep=1) @show M = [10, 15] @show q = [0.1, 0.9] P = param_indep(M/sum(M), q) E = round.(Int, 2M*q') display("text/html", raw"$\text{期待値} = " * sympy.latex(Sym.(E)) * raw"$") plot_sim_Poisson(λ=50, P=P, binstep=0.2) plot_sim_Multinomial(n=50, P=P, binstep=0.2) plot_sim_prod_Multinomial(M=2M, q=q, binstep=0.2) plot_sim_prod_Multinomial(M=round.(Int, sum(2M)*q), q=M/sum(M), binstep=0.2) E = round.(Int, 4M*q') display("text/html", raw"$\text{期待値} = " * sympy.latex(Sym.(E)) * raw"$") plot_sim_Poisson(λ=100, P=P, binstep=0.2) plot_sim_Multinomial(n=100, P=P, binstep=0.2) plot_sim_prod_Multinomial(M=4M, q=q, binstep=0.2) plot_sim_prod_Multinomial(M=round.(Int, sum(4M)*q), q=M/sum(M), binstep=0.2)