#!/usr/bin/env python
# coding: utf-8
# In[11]:
from IPython.display import Image
Image('../../Python_probability_statistics_machine_learning_2E.png',width=200)
# In[12]:
import numpy as np
np.random.seed(12345)
# # Conditional Expectation and Mean Squared Error
#
# In this section, we work through a detailed example using conditional
# expectation and optimization methods. Suppose we have two fair six-sided dice
# ($X$ and $Y$) and we want to measure the sum of the two variables as $Z=X+Y$.
# Further, let's suppose that given $Z$, we want the best estimate of $X$ in the
# mean-squared-error sense. Thus, we want to minimize the following:
# $$
# J(\alpha) = \sum ( x - \alpha z )^2 \mathbb{P}(x,z)
# $$
# where $\mathbb{P}$ is the probability mass function for this problem.
# The idea is that once we have solved this problem, we will have a function of
# $Z$ that is the minimum MSE estimate of $X$. Substituting $Z=X+Y$ into $J$
# gives:
# $$
# J(\alpha) = \sum ( x - \alpha (x+y) )^2 \mathbb{P}(x,y)
# $$
# Let's work out the steps in Sympy in the following:
# In[13]:
import sympy as S
from sympy.stats import density, E, Die
x=Die('D1',6) # 1st six-sided die
y=Die('D2',6) # 2nd six-sided die
a=S.symbols('a')
z = x+y # sum of 1st and 2nd die
J = E((x-a*(x+y))**2) # expectation
print(S.simplify(J))
# With all that setup we can now use basic calculus to minimize the
# objective function $J$,
# In[14]:
sol,=S.solve(S.diff(J,a),a) # using calculus to minimize
print(sol) # solution is 1/2
# **Programming Tip.**
#
# Sympy has a `stats` module that can do some basic work with expressions
# involving probability densities and expectations. The above code uses its `E`
# function to compute the expectation.
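#
# For instance (a small illustration using only the dice defined above), `E` can
# evaluate arbitrary expressions of these random variables:
# In[ ]:
print(E(x))    # mean of a single fair die: 7/2
print(E(x*y))  # the dice are independent, so this is (7/2)**2 = 49/4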
#
#
#
# The solution, $\alpha=1/2$, says that $z/2$ is the MSE estimate of $X$ given
# $Z$, which means geometrically (interpreting the MSE as a squared distance
# weighted by the probability mass function) that $z/2$ is as *close* to $x$ as
# we are going to get for a given $z$.
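#
# As a quick numerical spot-check (a sketch using only the objective `J`
# already defined above), we can evaluate $J$ at a few values of $\alpha$ and
# confirm that $\alpha=1/2$ gives the smallest value:
# In[ ]:
# evaluate J at candidate values of alpha; the minimum should be at 1/2
for val in [S.Rational(1,4), S.Rational(1,2), S.Rational(3,4)]:
    print('J(%s) = %s' % (val, S.simplify(J.subs(a,val))))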
#
# Let's look at the same problem using the conditional expectation operator $
# \mathbb{E}(\cdot|z) $ and apply it to our definition of $Z$. Then
# $$
# \mathbb{E}(z|z)=\mathbb{E}(x+y|z)=\mathbb{E}(x|z)+\mathbb{E}(y|z)=z
# $$
# using the linearity of the expectation. Now, since by the
# symmetry of the problem (i.e., two identical dice), we have
# $$
# \mathbb{E}(x|z)=\mathbb{E}(y|z)
# $$
# we can plug this in and solve
# $$
# 2 \mathbb{E}(x|z)=z
# $$
# which once again gives,
# $$
# \mathbb{E}(x|z) =\frac{z}{2}
# $$
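#
# We can confirm this directly because Sympy's `E` also accepts a condition as
# its second argument (the same usage appears again later in this section). The
# following quick check, a sketch using the variables defined above, verifies
# that $\mathbb{E}(x|z=k)=k/2$ for each possible value of the sum:
# In[ ]:
# conditional expectation of x given each possible value of the sum z
for k in range(2,13):
    print('E(x|z=%d) = %s' % (k, E(x, S.Eq(z,k))))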
# In[15]:
get_ipython().run_line_magic('matplotlib', 'inline')
from numpy import arange,array
from matplotlib.pylab import subplots, cm
from sympy import Integer
fig,ax = subplots()
v = arange(1,7) + arange(1,7)[:,None] # 6x6 grid of all possible die sums x+y
foo=lambda i: density(z)[Integer(i)].evalf() # some tweaks to get a float out
Zmass=array(list(map(foo,v.flat)),dtype=float).reshape(6,6)
pc=ax.pcolor(arange(1,8),arange(1,8),Zmass,cmap=cm.gray)
_=ax.set_xticks([(i+0.5) for i in range(1,7)])
_=ax.set_xticklabels([str(i) for i in range(1,7)])
_=ax.set_yticks([(i+0.5) for i in range(1,7)])
_=ax.set_yticklabels([str(i) for i in range(1,7)])
for i in range(1,7):
for j in range(1,7):
_=ax.text(i+.5,j+.5,str(i+j),fontsize=18,fontweight='bold',color='goldenrod')
_=ax.set_title(r'Probability Mass for $Z$',fontsize=18)
_=ax.set_xlabel('$X$ values',fontsize=18)
_=ax.set_ylabel('$Y$ values',fontsize=18);
cb=fig.colorbar(pc)
_=cb.ax.set_title(r'Probability',fontsize=12)
fig.savefig('fig-probability/Conditional_expectation_MSE_001.png')
#
# The values of $Z$ are in yellow with the corresponding values for $X$ and $Y$
# on the axes. The gray scale colors indicate the probability mass for each
# value of $Z=X+Y$.
#
# This is equal to the estimate we just found by minimizing the MSE. Let's
# explore this further with [Figure](#fig:Conditional_expectation_MSE_001),
# which shows the values of $Z$ in yellow with the corresponding values for $X$
# and $Y$ on the axes. Suppose $z=2$; then the closest $X$ to this is $X=1$,
# which is what $\mathbb{E}(x|z)=z/2=1$ gives. What happens when $Z=7$? In this
# case, this value is spread out diagonally along the $X$ axis, so if $X=1$,
# then $Z$ is 6 units away, if $X=2$, then $Z$ is 5 units away, and so on.
#
# Now, back to the original question: if we had $Z=7$ and we wanted
# to get as close as we could to it using $X$, then why not choose
# $X=6$, which is only one unit away from $Z$? The problem with doing
# that is that $X=6$ only occurs 1/6 of the time, so we are not likely
# to get it right the other 5/6 of the time. That is, 1/6 of the time
# the estimate is exactly right, but the other 5/6 of the time it is
# far from the actual $X$, so the MSE score is going to be worse.
# Since each value of $X$ from 1 to 6 is equally likely given $Z=7$,
# to play it safe, we choose $7/2$ as the estimate, which is what the
# conditional expectation suggests.
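#
# Before turning to simulation, we can also compute the conditional MSE exactly
# (a quick sketch using the conditional form of `E`): given $z=7$, each value of
# $X$ from 1 to 6 is equally likely, so the estimate $7/2$ should beat $6$.
# In[ ]:
# exact conditional MSE of the candidate estimates 6 and 7/2, given z=7
for c in [S.Integer(6), S.Rational(7,2)]:
    print('estimate %s gives conditional MSE %s' % (c, E((x-c)**2, S.Eq(z,7))))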
#
# We can check this claim with samples using Sympy below:
# In[16]:
import numpy as np
from sympy import stats
# Eq constrains Z
samples_z7 = lambda : stats.sample(x, S.Eq(z,7))
#using 6 as an estimate
mn= np.mean([(6-samples_z7())**2 for i in range(100)])
#7/2 is the MSE estimate
mn0= np.mean([(7/2.-samples_z7())**2 for i in range(100)])
print('MSE=%3.2f using 6 vs MSE=%3.2f using 7/2 ' % (mn,mn0))
# **Programming Tip.**
#
# The `stats.sample(x, S.Eq(z,7))` function call samples the `x` variable subject
# to a condition on the `z` variable. In other words, it generates random samples
# of the `x` die, given that the sum of the outcomes of that die and the `y` die
# adds up to `z==7`.
#
#
#
# Please run the above code repeatedly until you are convinced that
# $\mathbb{E}(x|z)$ gives the lower MSE every time. To push this reasoning,
# let's consider the case where the die is so biased that the outcome of `6`
# is ten times more probable than any of the other outcomes. That is, if each
# of the other outcomes has probability $p$ and $\mathbb{P}(6)=10 p$, then
# $5p + 10p = 1$ gives $p=1/15$, so
# $$
# \mathbb{P}(6) = 2/3
# $$
# whereas $\mathbb{P}(1)=\mathbb{P}(2)=\ldots=\mathbb{P}(5)=1/15$.
# We can explore this using Sympy as in the following:
# In[17]:
# here 6 is ten times more probable than any other outcome
x=stats.FiniteRV('D3',{1:1/15., 2:1/15.,
3:1/15., 4:1/15.,
5:1/15., 6:2/3.})
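# For a quick check (a sketch using the `density` and `E` functions already
# imported above), we can print the biased probability mass function and its
# mean, which is pulled well above the fair-die mean of $7/2$:
# In[ ]:
print(density(x))  # P(6)=2/3, all other outcomes 1/15
print(E(x))        # mean of the biased die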
# As before, we construct the sum of the two dice, and plot the
# corresponding probability mass function in [Figure](#fig:Conditional_expectation_MSE_002). As compared with [Figure](#fig:Conditional_expectation_MSE_001), the probability mass has been shifted
# away from the smaller numbers.
# In[18]:
z = x + y
foo=lambda i: density(z)[S.Integer(i)].evalf() # some tweaks to get a float out
v = np.arange(1,7) + np.arange(1,7)[:,None]
Zmass=np.array(list(map(foo,v.flat)),dtype=float).reshape(6,6)
from matplotlib.pylab import subplots, cm
fig,ax=subplots()
pc=ax.pcolor(np.arange(1,8),np.arange(1,8),Zmass,cmap=cm.gray)
_=ax.set_xticks([(i+0.5) for i in range(1,7)])
_=ax.set_xticklabels([str(i) for i in range(1,7)])
_=ax.set_yticks([(i+0.5) for i in range(1,7)])
_=ax.set_yticklabels([str(i) for i in range(1,7)])
for i in range(1,7):
for j in range(1,7):
_=ax.text(i+.5,j+.5,str(i+j),fontsize=18,fontweight='bold',color='goldenrod')
_=ax.set_title(r'Probability Mass for $Z$; Nonuniform Case',fontsize=13)
_=ax.set_xlabel('$X$ values',fontsize=18)
_=ax.set_ylabel('$Y$ values',fontsize=18);
cb=fig.colorbar(pc)
_=cb.ax.set_title(r'Probability',fontsize=10)
fig.savefig('fig-probability/Conditional_expectation_MSE_002.png')
#
# The values of $Z$ are in yellow with the corresponding values for $X$ and $Y$
# on the axes.
#
#
# Let's see what the conditional expectation says about how we can estimate $X$
# from $Z$.
# In[19]:
E(x, S.Eq(z,7)) # conditional expectation E(x|z=7)
# Now that we have $\mathbb{E}(x|z=7) = 5$, we can generate
# samples as before and see if this gives the minimum MSE.
# In[20]:
samples_z7 = lambda : stats.sample(x, S.Eq(z,7))
#using 6 as an estimate
mn= np.mean([(6-samples_z7())**2 for i in range(100)])
#5 is the MSE estimate
mn0= np.mean([(5-samples_z7())**2 for i in range(100)])
print('MSE=%3.2f using 6 vs MSE=%3.2f using 5 ' % (mn,mn0))
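#
# As in the fair-die case, we can also compute these conditional MSE values
# exactly (a sketch using the conditional form of `E`) instead of estimating
# them from samples:
# In[ ]:
# exact conditional MSE of the candidate estimates 5 and 6, given z=7
for c in [5, 6]:
    print('estimate %s gives conditional MSE %s' % (c, E((x-c)**2, S.Eq(z,7))))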
# Using a simple example, we have emphasized the connection between minimum mean
# squared error problems and conditional expectation. Hopefully, the last two
# figures helped expose the role of the probability mass function. Next, we will
# continue to reveal the power of the conditional expectation as we develop the
# corresponding geometric intuition.