import pandas as pd
import numpy as np
import itertools as it
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

def color_print(string, bcolor=bcolors.WARNING):
    print(bcolor + string + bcolors.ENDC)
Consider three binary variables $a, b, c \in \{0, 1\}$ having the joint distribution given by the table below. Show by direct evaluation that this distribution has the property that $a$ and $b$ are marginally dependent, so that $p(a,b) \neq p(a)p(b)$, but that they become independent when conditioned on $c$, so that $p(a,b|c) = p(a|c)p(b|c)$, for both $c=0$ and $c=1$.
p = np.array([
[0, 0, 0, 0.192],
[0, 0, 1, 0.144],
[0, 1, 0, 0.048],
[0, 1, 1, 0.216],
[1, 0, 0, 0.192],
[1, 0, 1, 0.064],
[1, 1, 0, 0.048],
[1, 1, 1, 0.096],
])
p = pd.DataFrame(p, columns=["a", "b", "c", "p_abc"])
p = p.set_index(["a", "b", "c"])
# p(a, b)
p.xs(0, level=2) + p.xs(1, level=2)
| a | b | p_abc |
|---|---|---|
| 0.0 | 0.0 | 0.336 |
| 0.0 | 1.0 | 0.264 |
| 1.0 | 0.0 | 0.256 |
| 1.0 | 1.0 | 0.144 |
# p(a)p(b)
p_marg = p.xs(0, level=2) + p.xs(1, level=2)
p_a = p_marg.xs(0, level=-1) + p_marg.xs(1, level=-1)
p_b = p_marg.xs(0, level=0) + p_marg.xs(1, level=0)
p_a["key"] = 1
p_b["key"] = 1
p_marg = pd.merge(p_a.reset_index(), p_b.reset_index(), on="key",
suffixes=("_a", "_b")).drop("key", axis=1)
p_marg.set_index(["a", "b"]).prod(axis=1)
a    b
0.0  0.0    0.3552
     1.0    0.2448
1.0  0.0    0.2368
     1.0    0.1632
dtype: float64
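As an aside, the dummy `key` column above is only a workaround for older pandas; on pandas >= 1.2 the same outer product of the marginals can be written with a cross merge. A minimal sketch (the `p_a_m`/`p_b_m` names are introduced just for this illustration):

# Outer product of the marginals p(a) and p(b) via a cross merge,
# avoiding the dummy "key" column (requires pandas >= 1.2)
p_ab = p.groupby(level=["a", "b"]).sum()              # p(a, b)
p_a_m = p_ab.groupby(level="a").sum().reset_index()   # p(a)
p_b_m = p_ab.groupby(level="b").sum().reset_index()   # p(b)
(pd.merge(p_a_m, p_b_m, how="cross", suffixes=("_a", "_b"))
   .set_index(["a", "b"])
   .prod(axis=1))                                     # p(a) p(b)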
# p(a,b|c=0)
p_marg = p.xs(0, level=-1)
p_marg = p_marg / p_marg.sum()
p_marg
| a | b | p_abc |
|---|---|---|
| 0.0 | 0.0 | 0.4 |
| 0.0 | 1.0 | 0.1 |
| 1.0 | 0.0 | 0.4 |
| 1.0 | 1.0 | 0.1 |
# p(a|c=0)p(b|c=0)
p_a = p_marg.groupby(level="a").sum()
p_b = p_marg.groupby(level="b").sum()
p_a = p_a.reset_index().assign(key=1)
p_b = p_b.reset_index().assign(key=1)
p_fact = (pd.merge(p_a, p_b, on="key", suffixes=("_a", "_b"))
.drop("key", axis=1)
.set_index(["a", "b"]))
p_fact.prod(axis=1)
a    b
0.0  0.0    0.4
     1.0    0.1
1.0  0.0    0.4
     1.0    0.1
dtype: float64
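The exercise asks for both values of $c$; here is the same check for $c=1$ in a compact sketch that reuses the joint table `p` defined above (and NumPy's `allclose` for the comparison):

# p(a,b|c=1) versus p(a|c=1) p(b|c=1)
p_c1 = p.xs(1, level=-1)
p_c1 = p_c1 / p_c1.sum()                        # p(a, b | c=1)
p_a_c1 = p_c1.groupby(level="a").sum()          # p(a | c=1)
p_b_c1 = p_c1.groupby(level="b").sum()          # p(b | c=1)
outer = np.outer(p_a_c1["p_abc"], p_b_c1["p_abc"])
print(np.allclose(p_c1.unstack("b")["p_abc"].values, outer))   # True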
Evaluate the distributions $p(a)$, $p(b | c)$, and $p(c | a)$ corresponding to the joint distribution given in Table 8.2. Hence show by direct evaluation that $p(a, b, c) = p(a)p(c | a)p(b | c)$. Draw the corresponding directed graph.
# p(a)
p_a = p.groupby(level="a").sum()
p_a["key"] = [1, 2]
p_a
| a | p_abc | key |
|---|---|---|
| 0.0 | 0.6 | 1 |
| 1.0 | 0.4 | 2 |
# p(c|a) = p(a,c) / p(a)
p_ac = p.groupby(level=["a", "c"]).sum()
p_c_giv_a = p_ac / p_ac.groupby(level="a").transform("sum")
p_c_giv_a["key"] = [1, 1, 2, 2]
p_c_giv_a
| a | c | p_abc | key |
|---|---|---|---|
| 0.0 | 0.0 | 0.4 | 1 |
| 0.0 | 1.0 | 0.6 | 1 |
| 1.0 | 0.0 | 0.6 | 2 |
| 1.0 | 1.0 | 0.4 | 2 |
# p(b|c) = p(b,c) / p(c)
p_bc = p.groupby(level=["b", "c"]).sum()
p_b_giv_c = p_bc / p_bc.groupby(level="c").transform("sum")
p_b_giv_c["key"] = [1, 2, 1, 2]
p_b_giv_c
| b | c | p_abc | key |
|---|---|---|---|
| 0.0 | 0.0 | 0.8 | 1 |
| 0.0 | 1.0 | 0.4 | 2 |
| 1.0 | 0.0 | 0.2 | 1 |
| 1.0 | 1.0 | 0.6 | 2 |
fact = p_a.merge(p_c_giv_a.reset_index(), on="key")
fact = fact.drop(["key"], axis=1).set_index(["a", "c"])
fact = fact.prod(axis=1)                    # p(a) * p(c|a) = p(a, c)
fact = pd.DataFrame(fact, columns=["p"])
fact["key"] = [1, 2, 1, 2]                  # key now encodes c: 1 -> c=0, 2 -> c=1
fact = fact.reset_index().merge(p_b_giv_c.reset_index(), on="key")
fact = fact.rename({"c_x": "c"}, axis=1).drop(["key", "c_y"], axis=1)
# group by (a, b, c) so the index level order matches p's index in the join below
fact = fact.groupby(["a", "b", "c"]).sum().prod(axis=1)   # p(a) p(c|a) p(b|c)
fact.name = "p_factorized"
Joining the factorized distribution back onto the original table shows that the two columns agree, confirming $p(a, b, c) = p(a)p(c|a)p(b|c)$. (The level order in the groupby above matters: grouping by ["a", "c", "b"] instead makes the join below pair $b$ against $c$, and the columns no longer match.)
p.join(fact, on=["a", "b", "c"])
| a | b | c | p_abc | p_factorized |
|---|---|---|---|---|
| 0.0 | 0.0 | 0.0 | 0.192 | 0.192 |
| 0.0 | 0.0 | 1.0 | 0.144 | 0.144 |
| 0.0 | 1.0 | 0.0 | 0.048 | 0.048 |
| 0.0 | 1.0 | 1.0 | 0.216 | 0.216 |
| 1.0 | 0.0 | 0.0 | 0.192 | 0.192 |
| 1.0 | 0.0 | 1.0 | 0.064 | 0.064 |
| 1.0 | 1.0 | 0.0 | 0.048 | 0.048 |
| 1.0 | 1.0 | 1.0 | 0.096 | 0.096 |
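As a cross-check on the pandas manipulations above, the same factorization can be verified directly with NumPy from the 8-row table (a minimal sketch); the corresponding directed graph is the chain $a \rightarrow c \rightarrow b$.

# Rebuild p(a,b,c) as a 2x2x2 array and verify p(a,b,c) = p(a) p(c|a) p(b|c)
joint = np.zeros((2, 2, 2))
for a_, b_, c_, v in p.reset_index().values:
    joint[int(a_), int(b_), int(c_)] = v
p_a_np = joint.sum(axis=(1, 2))                            # p(a)
p_c_giv_a_np = joint.sum(axis=1) / p_a_np[:, None]         # p(c|a), indexed [a, c]
p_b_giv_c_np = joint.sum(axis=0) / joint.sum(axis=(0, 1))  # p(b|c), indexed [b, c]
fact_np = p_a_np[:, None, None] * p_c_giv_a_np[:, None, :] * p_b_giv_c_np[None, :, :]
print(np.allclose(joint, fact_np))                         # True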
Consider the graphical model represented by the noisy-OR $$ p(y=1|x_1, \ldots, x_M) = 1 - (1 - \mu_0)\prod_{i=1}^M(1 - \mu_i)^{x_i} $$
Show that $p$ can be interpreted as a "soft" (probabilistic) form of the logical OR function (i.e., the function that gives $y=1$ whenever at least one of the $x_i=1$). Discuss the interpretation of $\mu_0$.
# ** Considering two parameters plus the null term: mu_0, mu_1, mu_2 **
mu_0 = 0
x1, x2 = 1, 1
# x1 and x2 are the inputs to the OR; both are switched on here
for mu_1, mu_2 in it.product([0, 1], repeat=2):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2
    print(f"p(y|mu_1={mu_1}, mu_2={mu_2})={p_y}")
p(y|mu_1=0, mu_2=0)=0
p(y|mu_1=0, mu_2=1)=1
p(y|mu_1=1, mu_2=0)=1
p(y|mu_1=1, mu_2=1)=1
# ** Considering four parameters plus the null term: mu_0, mu_1, mu_2, mu_3, mu_4 **
x1, x2, x3, x4 = 1, 1, 1, 1
# All four inputs x1..x4 are switched on
for mu_1, mu_2, mu_3, mu_4 in it.product([0, 1], repeat=4):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2 * (1 - mu_3) ** x3 * (1 - mu_4) ** x4
    print(f"p(y|mu_1={mu_1}, mu_2={mu_2}, mu_3={mu_3}, mu_4={mu_4})={p_y}")
p(y|mu_1=0, mu_2=0, mu_3=0, mu_4=0)=0
p(y|mu_1=0, mu_2=0, mu_3=0, mu_4=1)=1
p(y|mu_1=0, mu_2=0, mu_3=1, mu_4=0)=1
p(y|mu_1=0, mu_2=0, mu_3=1, mu_4=1)=1
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=0)=1
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=1)=1
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=0)=1
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=1)=1
p(y|mu_1=1, mu_2=0, mu_3=0, mu_4=0)=1
p(y|mu_1=1, mu_2=0, mu_3=0, mu_4=1)=1
p(y|mu_1=1, mu_2=0, mu_3=1, mu_4=0)=1
p(y|mu_1=1, mu_2=0, mu_3=1, mu_4=1)=1
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=0)=1
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=1)=1
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=0)=1
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=1)=1
First remarks: with $\mu_0 = 0$ and every input switched on, $p(y=1)$ behaves exactly like the logical OR of the $\mu_i$: it is 0 only when all of them are 0. Next, we switch off $x_1$ and $x_3$, so that only $x_2$ and $x_4$ take part in the OR.
# In this example, we "turn off" x1 and x3, meaning that the OR
# function is computed using only x2 and x4
x1, x2, x3, x4 = 0, 1, 0, 1
# Only x2 and x4 are active, so only mu_2 and mu_4 can affect the result
for mu_1, mu_2, mu_3, mu_4 in it.product([0, 1], repeat=4):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2 * (1 - mu_3) ** x3 * (1 - mu_4) ** x4
    p_y_str = f"p(y|mu_1={mu_1}, mu_2={mu_2}, mu_3={mu_3}, mu_4={mu_4})={p_y}"
    # highlight the configurations for which p(y)=0
    if (mu_2 == 0 or mu_3 == 0) and p_y == 0:
        color_print(p_y_str)
    else:
        print(p_y_str)
p(y|mu_1=0, mu_2=0, mu_3=0, mu_4=0)=0
p(y|mu_1=0, mu_2=0, mu_3=0, mu_4=1)=1
p(y|mu_1=0, mu_2=0, mu_3=1, mu_4=0)=0
p(y|mu_1=0, mu_2=0, mu_3=1, mu_4=1)=1
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=0)=1
p(y|mu_1=0, mu_2=1, mu_3=0, mu_4=1)=1
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=0)=1
p(y|mu_1=0, mu_2=1, mu_3=1, mu_4=1)=1
p(y|mu_1=1, mu_2=0, mu_3=0, mu_4=0)=0
p(y|mu_1=1, mu_2=0, mu_3=0, mu_4=1)=1
p(y|mu_1=1, mu_2=0, mu_3=1, mu_4=0)=0
p(y|mu_1=1, mu_2=0, mu_3=1, mu_4=1)=1
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=0)=1
p(y|mu_1=1, mu_2=1, mu_3=0, mu_4=1)=1
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=0)=1
p(y|mu_1=1, mu_2=1, mu_3=1, mu_4=1)=1
The more general case
$\mu_0$ sets the baseline ("leak") probability: when every active term contributes nothing, $p(y=1) = 1 - (1 - \mu_0) = \mu_0$, so the larger $\mu_0$ is, the closer the otherwise-zero cases get to 1.
x1, x2, x3 = 0, 1, 0
mu_0 = 0.45
# Only x2 is switched on; mu_0 = 0.45 sets the baseline probability
for mu_1, mu_2, mu_3 in it.product([0, 1], repeat=3):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2 * (1 - mu_3) ** x3
    p_y_str = f"p(y|mu_1={mu_1}, mu_2={mu_2}, mu_3={mu_3})={p_y:.2}"
    print(p_y_str)
p(y|mu_1=0, mu_2=0, mu_3=0)=0.45
p(y|mu_1=0, mu_2=0, mu_3=1)=0.45
p(y|mu_1=0, mu_2=1, mu_3=0)=1.0
p(y|mu_1=0, mu_2=1, mu_3=1)=1.0
p(y|mu_1=1, mu_2=0, mu_3=0)=0.45
p(y|mu_1=1, mu_2=0, mu_3=1)=0.45
p(y|mu_1=1, mu_2=1, mu_3=0)=1.0
p(y|mu_1=1, mu_2=1, mu_3=1)=1.0
For $i \geq 1$, $\mu_i$ controls the contribution of input $x_i$: with $\mu_0 = 0$, switching on a single $x_i$ gives $p(y=1) = \mu_i$, and several active inputs combine as $1 - \prod_i (1-\mu_i)^{x_i}$.
mu_1, mu_2, mu_3 = 0.3, 0.2, 0.5
mu_0 = 0
# Sweep over every input configuration with the mu's held fixed
for x1, x2, x3 in it.product([0, 1], repeat=3):
    p_y = 1 - (1 - mu_0) * (1 - mu_1) ** x1 * (1 - mu_2) ** x2 * (1 - mu_3) ** x3
    p_y_str = f"p(y|x1={x1}, x2={x2}, x3={x3})={p_y:.2f}"
    print(p_y_str)
p(y|x1=0, x2=0, x3=0)=0.00
p(y|x1=0, x2=0, x3=1)=0.50
p(y|x1=0, x2=1, x3=0)=0.20
p(y|x1=0, x2=1, x3=1)=0.60
p(y|x1=1, x2=0, x3=0)=0.30
p(y|x1=1, x2=0, x3=1)=0.65
p(y|x1=1, x2=1, x3=0)=0.44
p(y|x1=1, x2=1, x3=1)=0.72
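To wrap up the experiments, a small helper function (an illustrative sketch, not part of the original text) that evaluates the noisy-OR for arbitrary parameters; with $\mu_0 = 0$ and all $\mu_i = 1$ it reduces exactly to the logical OR of the inputs.

def noisy_or(mu_0, mus, xs):
    """p(y=1 | x_1, ..., x_M) for the noisy-OR with leak term mu_0."""
    prod = 1.0 - mu_0
    for mu_i, x_i in zip(mus, xs):
        prod *= (1 - mu_i) ** x_i
    return 1 - prod

# With mu_0 = 0 and every mu_i = 1, the noisy-OR equals the logical OR of the x's
for xs in it.product([0, 1], repeat=3):
    assert noisy_or(0, [1, 1, 1], xs) == float(any(xs))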
Consider the example of the car fuel system, and suppose that instead of observing the state of the fuel gauge $G$ directly, the gauge is seen by the driver $D$, who reports to us the reading of the gauge. This report is either that the gauge shows full ($D=1$) or that it shows empty ($D=0$).
$$ p(D=1|G=1) = 0.9 \qquad p(D=0|G=0) = 0.9 $$
Suppose that the driver tells us that the fuel gauge reads empty ($D=0$).
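The constants `pG0 = 0.315` and `pG0_giv_F0 = 0.81` used below follow by marginalizing the battery state out of the gauge table $p(G=1|B, F)$ that appears further down; a short sketch of that calculation:

# Derive p(G=0) and p(G=0|F=0) from p(G=1|B,F), p(B) and p(F)
pB = {1: 0.9, 0: 0.1}
pF = {1: 0.9, 0: 0.1}
pG1_giv_BF = {(1, 1): 0.8, (1, 0): 0.2, (0, 1): 0.2, (0, 0): 0.1}
pG0 = sum((1 - pG1_giv_BF[b, f]) * pB[b] * pF[f]
          for b in (0, 1) for f in (0, 1))                        # 0.315
pG0_giv_F0 = sum((1 - pG1_giv_BF[b, 0]) * pB[b] for b in (0, 1))  # 0.81
print(pG0, pG0_giv_F0)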
pB1 = 0.9                      # p(B=1): battery charged
pB0 = 1 - pB1
pG0 = 0.315                    # p(G=0): gauge reads empty
pG1 = 1 - pG0
pF1 = 0.9                      # p(F=1): tank full
pF0 = 1 - pF1
pG0_giv_F0 = 0.81              # p(G=0 | F=0)
pG1_giv_F0 = 1 - pG0_giv_F0
pD1_giv_G1 = 0.9               # reliability of the driver's report
pD0_giv_G0 = 0.9
pD0_giv_G1 = 1 - pD1_giv_G1
pD1_giv_G0 = 1 - pD0_giv_G0
pD0 = pG1 * pD0_giv_G1 + pG0 * pD0_giv_G0                                # p(D=0)
pF0_D0 = pF0 * pG0_giv_F0 * pD0_giv_G0 + pF0 * pG1_giv_F0 * pD0_giv_G1  # p(F=0, D=0)
pF0
0.09999999999999998
# The probability that the tank is empty given that the
# driver reports that it reads empty
pF0_D0 / pD0
0.21249999999999997
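For comparison, if the gauge itself were observed to read empty ($G=0$) rather than reported by the driver, Bayes' theorem with the quantities already defined gives a larger posterior, so the driver's unreliability softens the evidence:

# p(F=0 | G=0) = p(G=0 | F=0) p(F=0) / p(G=0)
pF0_giv_G0 = pG0_giv_F0 * pF0 / pG0
print(pF0_giv_G0)   # ~0.257, versus p(F=0|D=0) ~ 0.2125 above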
# p(G=1 | B, F) and its complement p(G=0 | B, F)
pG1_giv_B1F1 = 0.8
pG1_giv_B1F0 = 0.2
pG1_giv_B0F1 = 0.2
pG1_giv_B0F0 = 0.1
pG0_giv_B1F1 = 1 - pG1_giv_B1F1
pG0_giv_B1F0 = 1 - pG1_giv_B1F0
pG0_giv_B0F1 = 1 - pG1_giv_B0F1
pG0_giv_B0F0 = 1 - pG1_giv_B0F0
pG0_giv_B0 = pG0_giv_B0F0 * pF0 + pG0_giv_B0F1 * pF1
pG1_giv_B0 = 1 - pG0_giv_B0
pD0_B0 = pB0 * pG0_giv_B0 * pD0_giv_G0 + pB0 * pG1_giv_B0 * pD0_giv_G1
pF0_D0_B0 = pF0 * pB0 * (pG0_giv_B0F0 * pD0_giv_G0 + pG1_giv_B0F0 * pD0_giv_G1)
# p(F=0 | D=0, B=0): note the division by p(D=0, B=0), not p(D=0)
pF0_D0_B0 / pD0_B0
0.10962566844919786
pF0
0.09999999999999998
Show that there are $2^{M(M-1)/2}$ distinct undirected graphs over a set of $M$ distinct random variables. Draw the eight possibilities for the case of $M=3$.
A fully connected undirected graph (Markov random field, MRF) over $M$ nodes has ${M \choose 2}$ possible edges. Notice
$$ \begin{aligned} {M\choose 2} &= \frac{M!}{(M-2)!\,2!}\\ &=\frac{M(M-1)(M-2)!}{(M-2)!\,2!}\\ &=\frac{M(M-1)}{2} \end{aligned} $$
Therefore, the number of possible edges $E$ is given by $E = M(M-1)/2$.
from scipy.special import comb
for nodes in range(2, 8):
    edges = nodes * (nodes - 1) // 2
    print(f"A {nodes}-node MRF has {edges:>2.0f} edge(s)")
A 2-node MRF has  1 edge(s)
A 3-node MRF has  3 edge(s)
A 4-node MRF has  6 edge(s)
A 5-node MRF has 10 edge(s)
A 6-node MRF has 15 edge(s)
A 7-node MRF has 21 edge(s)
To count how many distinct graphs can be built over an $M$-node MRF, note that a graph with no edges corresponds to choosing 0 of the $E$ possible edges, for which there is exactly one configuration. A graph with exactly one edge can be formed in ${E \choose 1}$ ways and, in general, a graph with exactly $k$ edges can be formed in ${E \choose k}$ ways. The total number of configurations $C$ is therefore the sum over all possible edge counts:
$$ C = \sum_{k=0}^E{E \choose k} $$
for nodes in range(2, 8):
    edges = nodes * (nodes - 1) // 2
    configurations = sum(comb(edges, k) for k in range(edges + 1))
    print(f"A {nodes}-node MRF has {configurations:>7.0f} possible configurations")
A 2-node MRF has       2 possible configurations
A 3-node MRF has       8 possible configurations
A 4-node MRF has      64 possible configurations
A 5-node MRF has    1024 possible configurations
A 6-node MRF has   32768 possible configurations
A 7-node MRF has 2097152 possible configurations
Next, by the binomial theorem, we know that $$ (x + y)^N = \sum_{k=0}^N{N \choose k}x^{N-k}y^k $$
Thus, $$ \begin{aligned} (1 + 1)^N = 2 ^ N &= \sum_{k=0}^N{N \choose k}1^{N-k}1^k \\ &= \sum_{k=0}^N{N \choose k} \end{aligned} $$
Therefore, the total number of configurations can be written as
$$ 2^E = \sum_{k=0}^E{E \choose k} = \sum_{k=0}^{M(M-1)/2}{M(M-1)/2 \choose k}, $$
which implies that there are $2^{M(M-1)/2}$ distinct undirected graphs over $M$ variables.
for nodes in range(2, 8):
    edges = nodes * (nodes - 1) // 2
    configurations = 2 ** edges
    print(f"A {nodes}-node MRF has {configurations:>7.0f} possible configurations")
A 2-node MRF has       2 possible configurations
A 3-node MRF has       8 possible configurations
A 4-node MRF has      64 possible configurations
A 5-node MRF has    1024 possible configurations
A 6-node MRF has   32768 possible configurations
A 7-node MRF has 2097152 possible configurations
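Finally, the eight possibilities for $M=3$ that the exercise asks us to draw can be listed explicitly as the subsets of the three possible edges (a sketch reusing the `itertools` import from the top of the notebook):

# Enumerate every undirected graph over three labelled nodes
nodes = ["x1", "x2", "x3"]
possible_edges = list(it.combinations(nodes, 2))      # the 3 possible edges
all_graphs = [subset
              for r in range(len(possible_edges) + 1)
              for subset in it.combinations(possible_edges, r)]
print(len(all_graphs))                                # 8
for g in all_graphs:
    print(list(g))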