using Pkg; Pkg.activate("../."); Pkg.instantiate();
using IJulia; try IJulia.clear_output(); catch _ end
using Distributions, StatsPlots, SpecialFunctions, Plots, LaTeXStrings, Plots.PlotMeasures
Plots.default(
#linecolor=:red,
linewidth=3,
plot_titlefontsize=16,
guidefontsize=14,
tickfontsize=12,
size = (1000, 800)
)
Problem: We observe the following sequence of heads (outcome $=1$) and tails (outcome $=0$) when tossing the same coin repeatedly $$D=\{1011001\}\,.$$
What is the probability that heads comes up next?
So, usually you select a model for generating one observation $x_n$ and then use (in-)dependence assumptions to combine these models into a likelihood function for the model parameters.
This parameter estimation "recipe" works if the right-hand side (RHS) factors can be evaluated; the computational details can be quite challenging and this is what machine learning is about.
$\Rightarrow$ Machine learning is EASY, apart from computational details :)
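As a concrete illustration (a minimal sketch, not part of the lecture code), here is how independent Bernoulli observation models combine into a likelihood function for the coin-toss data $D$:
D = [1, 0, 1, 1, 0, 0, 1]                              # the observed sequence from the problem statement
likelihood(μ) = prod(μ^x * (1 - μ)^(1 - x) for x in D) # p(D|μ) = ∏ₙ p(xₙ|μ) under i.i.d. assumptions
likelihood(0.5), likelihood(0.4)                       # evaluate the likelihood for two candidate values of μ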
Hence, for equal model priors ($p(m_1)=p(m_2)=0.5$), the Bayes Factor reports the posterior probability ratio for the two models.
In principle, any hard decision on which is the better model has to accept some ad hoc arguments, but Jeffreys (1961) advises the following interpretation of the log-Bayes factor
| $\log_{10} B_{12}$ | Evidence for $m_1$ |
|:---|:---|
| 0 to 0.5 | not worth mentioning |
| 0.5 to 1 | substantial |
| 1 to 2 | strong |
| > 2 | decisive |
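For illustration only, the table can be wrapped in a small lookup helper (the function jeffreys_category is our own naming, not part of the lecture code):
# Map a log10 Bayes factor onto Jeffreys' scale from the table above
function jeffreys_category(log10_B12)
    log10_B12 < 0.5 && return "not worth mentioning"
    log10_B12 < 1.0 && return "substantial"
    log10_B12 < 2.0 && return "strong"
    return "decisive"
end
jeffreys_category(1.3) # "strong" evidence for m_1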
In principle, you now have the recipe in your hands to solve all your prediction/classification/regression etc. problems by the same method:

1. Model specification: choose a model $m$ with data generating distribution $p(x|\theta,m)$ and parameter prior $p(\theta|m)$.
2. Learning: use Bayes rule to find the parameter posterior $p(\theta|D) \propto p(D|\theta)\, p(\theta)$.
3. Prediction: marginalize over the parameter posterior, $p(x|D) = \int p(x|\theta)\, p(\theta|D)\,\mathrm{d}\theta$.
At the beginning of this lesson, we posed the following challenge:
We observe the following sequence of heads (outcome $=1$) and tails (outcome $=0$) when tossing the same coin repeatedly $$D=\{1011001\}\,.$$
What is the probability that heads comes up next? We solve this in the next slides ...
where the Gamma function is sort-of a generalized factorial function. In particular, if $\alpha,\beta$ are integers, then $$\frac{\Gamma(\alpha+\beta)}{\Gamma(\alpha)\Gamma(\beta)} = \frac{(\alpha+\beta-1)!}{(\alpha-1)!\,(\beta-1)!}$$
so we get a closed-form posterior.
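A quick numerical check of this identity (illustrative only; the values $\alpha=3$, $\beta=5$ are arbitrary and SpecialFunctions is already loaded):
α, β = 3, 5
gamma(α + β) / (gamma(α) * gamma(β))                         # Γ(α+β) / (Γ(α)Γ(β)) = 105.0
factorial(α + β - 1) / (factorial(α - 1) * factorial(β - 1)) # (α+β-1)! / ((α-1)!(β-1)!) = 105.0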
params = [
(α=0.1, β=0.1),
(α=1.0, β=1.0),
(α=2.0, β=3.0),
(α=8.0, β=4.0)
]
x = 0:0.01:1
plots = []
for (i, (α, β)) in enumerate(params)
beta_dist = Beta(α, β)
y = pdf.(beta_dist, x)
xlabel = i in [3, 4] ? "μ" : ""
ylabel = i in [1, 3] ? "Density" : ""
push!(plots, plot(x, y, label="α=$α, β=$β", xlabel=xlabel, ylabel=ylabel))
end
plot(plots..., layout=(2, 2), plot_title="PDFs of Beta Distributions", legend=:topleft, link=:both, margin=5mm)
hence the posterior is also beta-distributed as
$$ p(\mu|D) = \mathrm{Beta}(\mu|\,n+\alpha, N-n+\beta)\,. $$

The corresponding posterior prediction for the next toss, $p(x_\bullet=1|D) = \int_0^1 \mu\, p(\mu|D)\,\mathrm{d}\mu = \frac{n+\alpha}{N+\alpha+\beta}$, can be decomposed as $$ p(x_\bullet=1|D) = \underbrace{\frac{\alpha}{\alpha+\beta}}_{\text{prior prediction}} + \underbrace{\frac{N}{N+\alpha+\beta}}_{\text{gain}} \cdot \underbrace{\left( \frac{n}{N} - \frac{\alpha}{\alpha+\beta} \right)}_{\text{prediction error}}\,. $$

Let's interpret this decomposition of the posterior prediction. Before the data $D$ was observed, our model generated a prior prediction $p(x_\bullet=1) = \frac{\alpha}{\alpha+\beta}$. Next, the degree to which the actually observed data matches this prediction is represented by the prediction error $\frac{n}{N} - \frac{\alpha}{\alpha+\beta}$. The prior prediction is then updated to a posterior prediction $p(x_\bullet=1|D)$ by adding a fraction of the prediction error to the prior prediction. Hence, the data plays the role of "correcting" the prior prediction.
Note that, since $0\leq \underbrace{\frac{N}{N+\alpha+\beta}}_{\text{gain}} \lt 1$, the Bayesian prediction lies between (fuses) the prior and data-based predictions.
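To make this concrete, the following sketch computes the posterior prediction for the coin data $D=\{1011001\}$ via this decomposition. The uniform $\mathrm{Beta}(\mu|\alpha=1,\beta=1)$ prior is an assumption for illustration only; under that prior, the answer to the opening challenge is $p(x_\bullet=1|D) = 5/9 \approx 0.56$.
# Posterior prediction for D = {1011001} as prior prediction + gain * prediction error
α, β = 1.0, 1.0                                 # assumed (uniform) prior parameters, for illustration only
D = [1, 0, 1, 1, 0, 0, 1]
N, n = length(D), sum(D)                        # N = 7 tosses, n = 4 heads
prior_prediction = α / (α + β)                  # p(x•=1) before seeing D
prediction_error = n / N - prior_prediction     # data-based prediction minus prior prediction
gain = N / (N + α + β)                          # 0 ≤ gain < 1
posterior_prediction = prior_prediction + gain * prediction_error # equals (n+α)/(N+α+β) = 5/9 ≈ 0.56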
So, this coin is biased!
In order to predict the outcomes of future coin tosses, we'll compare two models.
Both models have the same data generating distribution (again a Bernoulli distribution)
but they have different priors: $$\begin{aligned} p(\mu|m_1) &= \mathrm{Beta}(\mu|\alpha=100,\beta=500) \\ p(\mu|m_2) &= \mathrm{Beta}(\mu|\alpha=8,\beta=13) \\ \end{aligned}$$
(but you are not supposed to know that the real coin generates heads with probability $p(x_n=1|\mu) = \mu = 0.4$).
# computes log10 of Gamma function
function log10gamma(num)
num = convert(BigInt, num)
return log10(gamma(num))
end
μ = 0.4; # specify model parameter
n_tosses = 500 # specify number of coin tosses
samples = rand(n_tosses) .<= μ # simulate n_tosses coin tosses (true = heads, with probability μ)
function handle_coin_toss(prior :: Beta, observation :: Bool)
posterior = Beta(prior.α + observation, prior.β + (1 - observation))
return posterior
end
function log_evidence_prior(prior :: Beta, N :: Int64, n :: Int64)
    # log10 of the model evidence for a Beta-Bernoulli model with n heads out of N tosses:
    # log10 [ B(n+α, N-n+β) / B(α, β) ], written out in terms of Gamma functions
    log_evidence = log10gamma(prior.α + prior.β) - log10gamma(prior.α) - log10gamma(prior.β) +
                   log10gamma(n + prior.α) + log10gamma((N - n) + prior.β) - log10gamma(N + prior.α + prior.β)
    return log_evidence
end
priors = [Beta(100., 500.), Beta(8., 13.)] # specify prior distributions
n_models = length(priors)
posterior_distributions = [[d] for d in priors] # save a sequence of posterior distributions for every prior, starting with the prior itself
log_evidences = [[] for _ in priors] # maintain a vector of log evidences to plot later
for (N, sample) in enumerate(samples) #for every sample we want to update our posterior
for (i, prior) in enumerate(priors) #at every sample we want to update all distributions
posterior = handle_coin_toss(prior, sample) #do bayesian updating
push!(posterior_distributions[i], posterior) #add posterior to vector of posterior distributions
log_evidence = log_evidence_prior(posterior_distributions[i][1], N, sum(samples[1:N])) #compute the log evidence p(x_{1:N}|m) w.r.t. the original prior (stored at index 1) and add to vector
push!(log_evidences[i], log_evidence)
priors[i] = posterior #the prior for the next sample is the posterior from the current sample
end
end
#animate posterior distributions over time in a gif
anim = @animate for i in 1:5:n_tosses
p = plot(title=string("n = ", i))
for j in 1:n_models
plot!(posterior_distributions[j][i+1], xlims = (0, 1), fill=(0, .2,), label=string("Posterior m", j), linewidth=2, ylims=(0,28), xlabel="μ")
end
end
gif(anim, "anim_bay_ct.gif")
(This is an animation. If you are viewing these notes in PDF format you can see the animation at https://nbviewer.org/github/bertdv/BMLIP/blob/master/lessons/notebooks/Bayesian-Machine-Learning.ipynb .)
evidences = map(model -> exp10.(model), log_evidences) # the evidences were computed in log10, so convert back with exp10
anim = @animate for i in 1:n_tosses
p = plot(title=string(L"\frac{p_i(\mathbf{x}_{1:n})}{\sum_i p_i(\mathbf{x}_{1:n})}"," n = ", i), ylims=(0, 1))
total = sum([evidences[j][i] for j in 1:n_models])
bar!([evidences[j][i] / total for j in 1:n_models], group=["Model $j" for j in 1:n_models])
end
gif(anim, "anim_bay_ct_log_evidence.gif")
Consider the task: predict a datum $x$ from an observed data set $D$.
|  | Bayesian | Maximum Likelihood |
|:---|:---|:---|
| 1. Model Specification | Choose a model $m$ with data generating distribution $p(x\vert\theta,m)$ and parameter prior $p(\theta\vert m)$ | Choose a model $m$ with the same data generating distribution $p(x\vert\theta,m)$. No need for priors. |
| 2. Learning | Use Bayes rule to find the parameter posterior: $p(\theta\vert D) \propto p(D\vert\theta)\, p(\theta)$ | By Maximum Likelihood (ML) optimization: $\hat{\theta} = \arg\max_{\theta} p(D\vert\theta)$ |
| 3. Prediction | $p(x\vert D) = \int p(x\vert\theta)\, p(\theta\vert D)\,\mathrm{d}\theta$ | $p(x\vert D) = p(x\vert\hat{\theta})$ |
$\Rightarrow$ Maximum Likelihood estimation is at best an approximation to Bayesian learning, but it is, for good reason, a very popular learning method when lots of data are available.
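A minimal sketch that contrasts the two recipes on the coin data from this lesson (the prior parameters $\alpha=\beta=2$ are an arbitrary choice for illustration):
# ML versus Bayesian prediction for the coin-toss data
D = [1, 0, 1, 1, 0, 0, 1]
N, n = length(D), sum(D)
α, β = 2.0, 2.0                   # assumed Beta prior parameters (illustration only)
μ_ML = n / N                      # 2. Learning (ML): μ̂ = argmax_μ p(D|μ) = n/N ≈ 0.57
p_ML = μ_ML                       # 3. Prediction (ML): p(x•=1|D) = p(x•=1|μ̂)
p_Bayes = (n + α) / (N + α + β)   # 3. Prediction (Bayes): mean of Beta(μ|n+α, N-n+β) ≈ 0.55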
The following Gibbs inequality holds (see Wikipedia for a proof): $$D_{\text{KL}}[q,p] \geq 0 \quad \text{with equality only if } p=q $$
The KL divergence can be interpreted as a measure of the "distance" between two probability distributions, although it is not a true distance metric.
As an aside, note that in general $D_{\text{KL}}[q,p] \neq D_{\text{KL}}[p,q]$. Both divergences are relevant.
function kullback_leibler(q :: Normal, p :: Normal) #Calculates the KL Divergence between two gaussians (see https://stats.stackexchange.com/questions/7440/kl-divergence-between-two-univariate-gaussians for calculations)
return log((p.σ / q.σ)) + ((q.σ)^2 + (p.μ - q.μ)^2) / (2*p.σ^2) - (1. / 2.)
end
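A small usage example (with arbitrarily chosen Gaussians) that illustrates the asymmetry noted above:
q = Normal(0.0, 1.0)
p = Normal(1.0, 2.0)
kullback_leibler(q, p), kullback_leibler(p, q) # D_KL[q,p] ≈ 0.44 whereas D_KL[p,q] ≈ 1.31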
μ_p = 0 #statistics of distributions we'll keep constant (we'll vary the mean of q). Feel free to change these and see what happens
σ_p = 1
σ_q = 1
p = Normal(μ_p, σ_p)
anim = @animate for i in 1:100
μ_seq = [(j / 10.) - 5. + μ_p for j in 1:i] #compute the sequence of means tested so far (to compute sequence of KL divergences)
kl = [kullback_leibler(Normal(μ, σ_q), p) for μ in μ_seq] #compute KL divergence data
viz = plot(right_margin=8mm, title=string(L"D_{KL}(Q || P) = ", round(100 * kl[i]) / 100.), legend=:topleft) #build plot and format title and margins
μ_q = μ_seq[i] #extract mean of current frame from mean sequence
q = Normal(μ_q, σ_q)
plot!(p, xlims = (μ_p - 8, μ_p + 8), fill=(0, .2,), label=string("P"), linewidth=2, ylims=(0,0.5)) #plot p
plot!(q, fill=(0, .2,), label=string("Q"), linewidth=2, ylims=(0,0.5)) #plot q
plot!(twinx(), μ_seq, kl, xticks=:none, ylims=(0, maximum(kl) + 3), linewidth = 3, #plot KL divergence data with different y-axis scale and different legend
legend=:topright,xlims = (μ_p - 8, μ_p + 8), color="green", label=L"D_{KL}(Q || P)")
end
gif(anim, "anim_lat_kl.gif")
open("../../styles/aipstyle.html") do f display("text/html", read(f, String)) end