#!/usr/bin/env python
# coding: utf-8

# # Lecture 14: Location, Scale and LOTUS
# 
# ## Stat 110, Prof. Joe Blitzstein, Harvard University
# 
# ----
# ## Standard Normal (from last time...)
# 
# - $\mathcal{Z} \sim \mathcal{N}(0,1)$
# - PDF $\frac{1}{\sqrt{2\pi}} ~~ e^{-\frac{z^2}{2}}$
# - CDF $\Phi$
# - Mean $\mathbb{E}(\mathcal{Z}) = 0$
# - Variance $\operatorname{Var}(\mathcal{Z}) = \mathbb{E}(\mathcal{Z}^2) = 1$
# - Skew (3<sup>rd</sup> moment) $\mathbb{E}(\mathcal{Z}^3) = 0$ (all odd moments are 0, since the integrand is an odd function)
# - $-\mathcal{Z} \sim \mathcal{N}(0,1)$ (by symmetry; this simply flips the bell curve about its mean)
# 
# ... and with the standard normal distribution under our belts, we can now turn to the more general form.
# 
# But first let's revisit variance once more and extend what we know.
# 
# ----
# ## Rules on Variance
# 
# \begin{align}
# & \text{[1]} & \operatorname{Var}(X) &= \mathbb{E}\left( (X - \mathbb{E}X)^2 \right) \\
# & & &= \mathbb{E}X^2 - (\mathbb{E}X)^2 \\
# \\
# & \text{[2]} & \operatorname{Var}(X+c) &= \operatorname{Var}(X) \\
# \\
# & \text{[3]} & \operatorname{Var}(cX) &= c^2 ~~ \operatorname{Var}(X) \\
# \\
# & \text{[4]} & \operatorname{Var}(X+Y) &\neq \operatorname{Var}(X) + \operatorname{Var}(Y) ~~ \text{in general}
# \end{align}
# 
# * We already know $\text{[1]}$.
# * Re $\text{[2]}$, adding a constant $c$ shifts the distribution but has no effect on $\operatorname{Var}(X)$.
# * Re $\text{[3]}$, pulling a scaling constant $c$ out of the variance means you have to square it.
# * $\operatorname{Var}(X) \ge 0$, and $\operatorname{Var}(X)=0$ if and only if $P(X=a) = 1$ for some $a$... _variance can never be negative!_
# * Re $\text{[4]}$, unlike expected value, variance is _not_ linear. But if $X$ and $Y$ are independent, then $\operatorname{Var}(X+Y) = \operatorname{Var}(X) + \operatorname{Var}(Y)$.
# 
# As a case in point for $\text{[4]}$, consider
# 
# \begin{align}
# \operatorname{Var}(X + X) &= \operatorname{Var}(2X) \\
# &= 4 ~~ \operatorname{Var}(X) \\
# &\neq 2 ~~ \operatorname{Var}(X) & \quad \blacksquare \\
# \end{align}
# 
# ... and now we know enough about variance to return to the general form of the normal distribution.
# 
# ----
# ## General Normal Distribution
# 
# ### Description
# 
# Let $X = \mu + \sigma \mathcal{Z}$, where
# 
# - $\mu \in \mathbb{R}$ (also known as _location_)
# - $\sigma \gt 0$ (_standard deviation_, also known as _scale_)
# 
# Then $X \sim \mathcal{N}(\mu, \sigma^2)$.
# 
# ### Expected value
# 
# By linearity, and since $\mathbb{E}(\mathcal{Z}) = 0$, it follows immediately that
# 
# \begin{align}
# \mathbb{E}(X) &= \mu
# \end{align}
# 
# ### Variance
# 
# Using rules $\text{[2]}$ and $\text{[3]}$ above,
# 
# \begin{align}
# \operatorname{Var}(X) = \operatorname{Var}(\mu + \sigma \mathcal{Z}) &= \sigma^2 ~~ \operatorname{Var}(\mathcal{Z}) \\
# &= \sigma^2
# \end{align}
# 
# ----
# ## Standardization
# 
# Solving for $\mathcal{Z}$, we have
# 
# \begin{align}
# \mathcal{Z} &= \frac{X - \mu}{\sigma}
# \end{align}
# 
# ### CDF & PDF
# 
# By _standardizing_ a general normal random variable, we can express its CDF and PDF in terms of $\Phi$.
# 
# Given $X \sim \mathcal{N}(\mu, \sigma^2)$,
# 
# \begin{align}
# \text{cdf} ~~ P(X \le x) &= P\left(\frac{X-\mu}{\sigma} \le \frac{x - \mu}{\sigma}\right) \\
# &= \Phi \left(\frac{x-\mu}{\sigma} \right) \\
# \\
# \Rightarrow \text{pdf} ~~ f(x) &= \frac{d}{dx} \Phi \left(\frac{x-\mu}{\sigma} \right) = \frac{1}{\sigma} ~ \Phi' \left(\frac{x-\mu}{\sigma} \right) = \frac{1}{\sigma \sqrt{2\pi}} ~ e^{-\frac{(x-\mu)^2}{2\sigma^2}} & \quad \text{by the chain rule}
# \end{align}
# 
# ### $-X$
# 
# We can also handle $-X$ by applying what we've just covered:
# 
# \begin{align}
# -X &= -\mu + \sigma (-\mathcal{Z}) \sim \mathcal{N}(-\mu, \sigma^2)
# \end{align}
# 
# ### Linearity?
# 
# Later we will show that if $X_1 \sim \mathcal{N}(\mu_1, \sigma_1^2)$ and $X_2 \sim \mathcal{N}(\mu_2, \sigma_2^2)$ are independent, then $X_1 + X_2 \sim \mathcal{N}(\mu_1 + \mu_2, \sigma_1^2 + \sigma_2^2)$.
# 
# ### $\Phi$ and the 68-95-99.7% Rule
# 
# Since $\Phi$ cannot be expressed in closed form in terms of elementary functions, the 68-95-99.7% rule is a useful rule of thumb.
# 
# If $X \sim \mathcal{N}(\mu, \sigma^2)$, then approximately
# 
# \begin{align}
# P(\lvert X-\mu \rvert &\le \sigma) \approx 0.68 \\
# P(\lvert X-\mu \rvert &\le 2 \sigma) \approx 0.95 \\
# P(\lvert X-\mu \rvert &\le 3 \sigma) \approx 0.997
# \end{align}
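# A quick numerical sanity check of standardization and the 68-95-99.7% rule (a sketch added here, not part of the lecture). It uses `scipy.stats.norm` and NumPy; the values $\mu = 2$, $\sigma = 3$, and $x = 5$ are arbitrary choices for illustration.

import numpy as np
from scipy.stats import norm

mu, sigma = 2.0, 3.0   # arbitrary location and scale, for illustration only

# CDF via standardization: P(X <= x) = Phi((x - mu) / sigma)
x = 5.0
print(norm.cdf((x - mu) / sigma))        # standardize by hand, then apply Phi
print(norm.cdf(x, loc=mu, scale=sigma))  # same value, letting scipy handle location/scale

# 68-95-99.7 rule: P(|X - mu| <= k*sigma) = Phi(k) - Phi(-k)
for k in (1, 2, 3):
    print(k, norm.cdf(k) - norm.cdf(-k))  # ~0.683, ~0.954, ~0.997

# Simulation check that X = mu + sigma * Z has mean mu and variance sigma^2
rng = np.random.default_rng(0)
z = rng.standard_normal(10**6)
x_sim = mu + sigma * z
print(x_sim.mean(), x_sim.var())          # approximately 2 and 9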
# ----
# ## Variance of $\operatorname{Pois}(\lambda)$
# 
# ### Intuition
# 
# Suppose we have the following:
# 
# $\newcommand\T{\Rule{0pt}{1em}{.3em}}$
# \begin{array}{|c|c|c|c|c|c|}
# \hline Prob \T & P_0 & P_1 & P_2 & P_3 & \dots \\\hline
# X \T & 0 & 1 & 2 & 3 & \dots \\\hline
# X^2 \T & 0^2 & 1^2 & 2^2 & 3^2 & \dots \\\hline
# \end{array}
# 
# Each value of $X^2$ occurs with the same probability as the corresponding value of $X$, so we can compute both expectations with the same weights $P(X=x)$:
# 
# \begin{align}
# \mathbb{E}(X) &= \sum_x x ~ P(X=x) \\
# \mathbb{E}(X^2) &= \sum_x x^2 ~ P(X=x) \\
# \end{align}
# 
# ### The case for $\operatorname{Pois}(\lambda)$
# 
# Let $X \sim \operatorname{Pois}(\lambda)$.
# 
# Recall that $\operatorname{Var}(X) = \mathbb{E}X^2 - (\mathbb{E}X)^2$. We know that $\mathbb{E}(X) = \lambda$, so all we need to do is figure out what $\mathbb{E}(X^2)$ is.
# 
# \begin{align}
# \mathbb{E}(X^2) &= \sum_{k=0}^{\infty} k^2 ~ \frac{e^{-\lambda} \lambda^k}{k!} \\
# \\
# \text{recall that } \sum_{k=0}^{\infty} \frac{\lambda^k}{k!} &= e^{\lambda} & \quad \text{Taylor series for } e^{\lambda} \\
# \\
# \sum_{k=1}^{\infty} \frac{k ~ \lambda^{k-1}}{k!} &= e^{\lambda} & \quad \text{differentiate both sides with respect to } \lambda \\
# \sum_{k=1}^{\infty} \frac{k ~ \lambda^{k}}{k!} &= \lambda ~ e^{\lambda} & \quad \text{multiply by } \lambda \text{, replenishing it} \\
# \sum_{k=1}^{\infty} \frac{k^2 ~ \lambda^{k-1}}{k!} &= \lambda ~ e^{\lambda} + e^{\lambda} = e^{\lambda} (\lambda + 1) & \quad \text{differentiate with respect to } \lambda \text{ again} \\
# \sum_{k=1}^{\infty} \frac{k^2 ~ \lambda^{k}}{k!} &= \lambda e^{\lambda} (\lambda + 1) & \quad \text{replenish } \lambda \text{ one last time} \\
# \\
# \therefore \mathbb{E}(X^2) &= \sum_{k=0}^{\infty} k^2 ~ \frac{e^{-\lambda} \lambda^k}{k!} \\
# &= e^{-\lambda} ~ \lambda e^{\lambda} (\lambda + 1) \\
# &= \lambda^2 + \lambda \\
# \\
# \operatorname{Var}(X) &= \mathbb{E}(X^2) - (\mathbb{E}X)^2 \\
# &= \lambda^2 + \lambda - \lambda^2 \\
# &= \lambda & \quad \blacksquare
# \end{align}
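# The series identity above, and hence $\operatorname{Var}(X) = \lambda$, can be checked numerically (a sketch added here, not part of the lecture). The rate $\lambda = 3.5$ and the truncation point of the infinite sum are arbitrary choices.

import math

lam = 3.5   # arbitrary rate, for illustration only

# Key series step from the derivation: sum_{k>=1} k^2 lam^k / k! = lam * e^lam * (lam + 1)
lhs = sum(k**2 * lam**k / math.factorial(k) for k in range(1, 60))   # truncated; the tail is negligible
rhs = lam * math.exp(lam) * (lam + 1)
print(lhs, rhs)

# LOTUS: E(X^2) = e^{-lam} * lhs = lam^2 + lam, so Var(X) = E(X^2) - lam^2 = lam
e_x2 = math.exp(-lam) * lhs
print(e_x2, lam**2 + lam)
print(e_x2 - lam**2)   # ~ 3.5 = lam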
# ----
# ## Variance of $\operatorname{Binom}(n,p)$
# 
# Let $X \sim \operatorname{Binom}(n,p)$.
# 
# $\mathbb{E}(X) = np$.
# 
# Find $\operatorname{Var}(X)$ using all the tricks you have at your disposal.
# 
# ### The path of least resistance
# 
# Let's write $X$ as a sum of indicator random variables; we can do so because $X \sim \operatorname{Binom}(n,p)$ counts the successes in $n$ _independent Bernoulli_ trials.
# 
# \begin{align}
# X &= I_1 + I_2 + \dots + I_n & \quad \text{where } I_j \text{ are i.i.d. } \operatorname{Bern}(p) \\
# \\
# \Rightarrow X^2 &= I_1^2 + I_2^2 + \dots + I_n^2 + 2I_1I_2 + 2I_1I_3 + \dots + 2I_{n-1}I_n & \quad \text{don't worry, this is not as bad as it looks} \\
# \\
# \therefore \mathbb{E}(X^2) &= n ~ \mathbb{E}(I_1^2) + 2 \binom{n}{2} \mathbb{E}(I_1I_2) & \quad \text{by symmetry} \\
# &= n p + 2 \binom{n}{2} \mathbb{E}(I_1I_2) & \quad \text{since } \mathbb{E}(I_j^2) = \mathbb{E}(I_j) = p \\
# &= n p + n (n-1) p^2 & \quad \text{since } I_1I_2 \text{ indicates that trials 1 and 2 are both successes, } \mathbb{E}(I_1I_2) = p^2 \text{ by independence} \\
# &= np + n^2 p^2 - np^2 \\
# \\
# \operatorname{Var}(X) &= \mathbb{E}(X^2) - (\mathbb{E}X)^2 \\
# &= np + n^2 p^2 - np^2 - (np)^2 \\
# &= np - np^2 \\
# &= np(1-p) \\
# &= npq & \quad \blacksquare
# \end{align}
# 
# ----
# ## Variance of $\operatorname{Geom}(p)$
# 
# Let $X \sim \operatorname{Geom}(p)$.
# 
# It has PMF $P(X=k) = q^{k-1}p$ for $k = 1, 2, 3, \dots$, where $q = 1-p$.
# 
# Find $\operatorname{Var}(X)$.
# 
# ### Applying what we know of the Geometric Series
# 
# \begin{align}
# a + ar + ar^2 + ar^3 + \dots &= \sum_{k=0}^{\infty} ar^k = \frac{a}{1-r} & \quad \text{for } \lvert r \rvert \lt 1 \\
# \\
# \therefore 1 + r + r^2 + r^3 + \dots &= \sum_{k=0}^{\infty} r^k = \frac{1}{1-r} & \quad \text{when } a = 1 \\
# \\
# \\
# \text{and since we know } \sum_{k=0}^{\infty} q^k &= \frac{1}{1-q} \\
# \sum_{k=1}^{\infty} k q^{k-1} &= \frac{1}{(1-q)^2} & \quad \text{differentiate with respect to } q \\
# \sum_{k=1}^{\infty} k q^{k} &= \frac{q}{(1-q)^2} & \quad \text{multiply by } q \\
# \sum_{k=1}^{\infty} k^2 q^{k-1} &= - \frac{q+1}{(q-1)^3} = \frac{(-1)(q+1)}{(-1)^3(1-q)^3} = \frac{q+1}{p^3} & \quad \text{differentiate once more with respect to } q \\
# \\
# \Rightarrow \mathbb{E}(X) &= \sum_{k=1}^{\infty} k ~ q^{k-1} p \\
# &= p \sum_{k=1}^{\infty} k q^{k-1} \\
# &= \frac{p}{(1-q)^2} \\
# &= \frac{1}{p} \\
# \\
# \Rightarrow \mathbb{E}(X^2) &= \sum_{k=1}^{\infty} k^2 ~ q^{k-1} p \\
# &= p \sum_{k=1}^{\infty} k^2 q^{k-1} \\
# &= p ~ \frac{q+1}{p^3} \\
# &= \frac{q+1}{p^2} \\
# \\
# \operatorname{Var}(X) &= \mathbb{E}(X^2) - (\mathbb{E}X)^2 \\
# &= \frac{q+1}{p^2} - \left( \frac{1}{p} \right)^2 \\
# &= \frac{q+1}{p^2} - \frac{1}{p^2} \\
# &= \boxed{\frac{q}{p^2}} & \quad \blacksquare
# \end{align}
# 
# ----
# ## Why is LOTUS true?
# 
# Proving LOTUS for the discrete case, we will show $\mathbb{E}(g(X)) = \sum_{x} g(x) \, P(X=x)$.
# 
# Building on what we did when we proved linearity, write $\mathbb{E}(g(X))$ "ungrouped", summing pebble by pebble over the sample space $S$, and then group the pebbles according to the value of $X$:
# 
# \begin{align}
# \mathbb{E}(g(X)) = \underbrace{\sum_{s \in S} g(X(s)) \, P(\{s\})}_{\text{individual pebbles}} &= \sum_{x} \sum_{s: X(s)=x} g(X(s)) \, P(\{s\}) & \quad \text{group the pebbles by the value of } X \\
# &= \sum_{x} g(x) \sum_{s: X(s)=x} P(\{s\}) & \quad g(X(s)) = g(x) \text{ on each group (a "super-pebble")} \\
# &= \sum_{x} g(x) P(X=x) & \quad \blacksquare
# \end{align}
# 
# ----
# View [Lecture 14: Location, Scale, and LOTUS | Statistics 110](http://bit.ly/2CyYFg4) on YouTube.
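# As a closing sanity check (a sketch added here, not part of the lecture), the code below verifies numerically that (1) the "individual pebbles" sum and the grouped "super-pebble" sum in the LOTUS proof agree on a small toy sample space, and (2) LOTUS sums over the PMFs reproduce $\operatorname{Var}(X) = npq$ and $\operatorname{Var}(X) = q/p^2$ for the Binomial and Geometric examples above. The toy pebbles and the values $n = 10$, $p = 0.3$ are arbitrary choices.

from math import comb

# (1) Discrete LOTUS on an explicit pebble space: summing g(X(s)) pebble by pebble
#     equals the grouped sum over the values of X.
S = ["s1", "s2", "s3", "s4"]                          # toy sample space
P = {"s1": 0.1, "s2": 0.2, "s3": 0.3, "s4": 0.4}      # pebble masses (sum to 1)
X = {"s1": 0, "s2": 1, "s3": 1, "s4": 2}              # X(s)
g = lambda x: x**2                                    # here g(X) = X^2

pebble_sum = sum(g(X[s]) * P[s] for s in S)           # individual pebbles
grouped_sum = sum(g(x) * sum(P[s] for s in S if X[s] == x)
                  for x in set(X.values()))           # super-pebbles
print(pebble_sum, grouped_sum)                        # equal, as the proof shows

# (2) Var(X) = E(X^2) - (E X)^2, with both expectations computed as LOTUS sums over a PMF.
def var_from_pmf(support, pmf):
    e_x = sum(k * pmf(k) for k in support)
    e_x2 = sum(k**2 * pmf(k) for k in support)
    return e_x2 - e_x**2

n, p = 10, 0.3
q = 1 - p
print(var_from_pmf(range(n + 1), lambda k: comb(n, k) * p**k * q**(n - k)), n * p * q)  # Binomial: npq
print(var_from_pmf(range(1, 400), lambda k: q**(k - 1) * p), q / p**2)                  # Geometric (truncated): q/p^2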