#!/usr/bin/env python # coding: utf-8 # ### Introduction to Random Variables and Statistical Distributions # In[113]: from scipy.stats import bernoulli, poisson, binom, norm, mvn import numpy as np from matplotlib import pyplot as plt import matplotlib import pandas as pd get_ipython().run_line_magic('matplotlib', 'inline') # In[2]: headimg = plt.imread('../data/quarterheads.jpg') tailimg = plt.imread('../data/quartertails.jpg') # ### Discrete random variables # A discrete random variable can take a finite number of possible values. A toss of a two-sided coin can be thought of as a random variable, or a roll of a 6 sided dice. # # #### Probability distributions for discrete random variables # A random variable is generated from a probability distribution. There are different types of distributions defined for discrete random variables. These include: # - Bernoulli distribution # - Binomial distribution # - Multinoulli distribution # - Poisson distribution # # #### Bernoulli distribution # Bernoulli distribution represents a binary random variable which takes value 1 with success probability $p$ and value 0 with failure probability $q=1-p$. A bernoulli distribution has only one parameter: $p$. # In[180]: theta = 0.5 # let us draw a sample from a bernoulli distribution b = bernoulli.rvs(theta,size=1) print(b) if b[0] == 0: plt.imshow(tailimg) plt.axis('off') else: plt.imshow(headimg) plt.axis('off') # In[183]: # you can also draw samples simultaneously theta = 0.9 samples = bernoulli.rvs(theta,size=1000) print(samples) # count the number of successes (sample = 1). What happens when you change p? print(np.count_nonzero(samples==1)) # In[185]: # plotting the probability mass function for the Bernoulli distribution a = np.arange(2) # domain of the bernoulli variable colors = ['r','g','y','b'] plt.figure(figsize=(12,8)) for i, theta in enumerate([0.1, 0.2, 0.6, 0.7]): ax = plt.subplot(1, 4, i+1) plt.bar(a, bernoulli.pmf(a, theta), label=theta, color=colors[i], alpha=0.2) ax.xaxis.set_ticks(a) plt.legend(loc=0) if i == 0: plt.ylabel("PDF at $k$") plt.suptitle("Bernoulli probability") # #### Binomial distribution # Another popular distribution for a discrete random variable is the binomial distribution. A binomial distribution has two parameters $n$ and $\theta$, where $0 \le \theta \le 1$. The sample generated by a binomial distribution denotes the number of successes observed in a sequence of $n$ binary trials (e.g., toss of a coin) when the probability of each success is $\theta$. # # The samples that are drawn from a binomial distribution range between 0 and $n$. # # The probability distribution is defined as: # \begin{equation} # p(k;n,\theta) = P(X = k) = \binom{n}{k}\theta^k (1 - \theta)^{n-k} # \end{equation} # # In[213]: #sampling from a binomial distribution sample = binom.rvs(20,0.9,1) print(sample) # In[214]: #plotting the pmf for a bernoulli distribution plt.figure(figsize=(12,6)) k = np.arange(0, 22) for p, color in zip([0.1, 0.3, 0.6, 0.8], colors): rv = binom(20, p) plt.plot(k, rv.pmf(k), lw=2, color=color, label=p) plt.fill_between(k, rv.pmf(k), color=color, alpha=0.3) plt.legend() plt.title("Binomial distribution") plt.tight_layout() plt.ylabel("PDF at $k$") plt.xlabel("$k$") # #### Multinoulli distribution # A multinoulli distribution is a generalization of Bernoulli distribution for trials which can take more than two possibilities ($k > 2$). The parameter for multinoulli distribution is a vector ${\bf \theta}$ which has $k$ entries. Each entry $\theta_i$ indicates the probability of observing the category $i$ in a single trial. # In[216]: [1/6.]*6 # In[215]: # generate samples from a multinoulli distribution. Essentially simulated a single roll of dice. Note that the output is a vector of length $k = 6$ np.random.multinomial(1, [1/6.]*6, size=1) # #### Multinomial distribution # A multinomial distribution is a generalization of Binomial distribution for trials which can take more than two possibilities. The parameters for the multinomial distribution is a vector ${\bf \theta}$ and $n$. # # In[4]: # generate samples from a multinomial distribution. Note that the output is a vector of length $k = 6$ np.random.multinomial(20, [1/6.]*6, size=1) # #### Poisson distribution # Typically used to model counts. Domain ($\chi$) is $0 ... \infty$. It has one parameter, $\lambda$. # # The *probability mass function* is given as: # $P(X = k) = \frac{\lambda^ke^{-\lambda}}{k!}$. # # The expected value or mean of $X$ is $\lambda$. # In[219]: lambd = 50 rv = poisson(lambd) # calculate the pmf for different values of k and plot k = np.arange(100) plt.bar(k,rv.pmf(k)) plt.xlim([0,100]) # In[88]: # generating samples from this distribution samples = rv.rvs(1000) h = plt.hist(samples,bins=20,density=True) plt.xlim([0,100]) # ### Continuous random variables # A continuous random variable can take an infinite number of possible values. Several interesting distributions exist: # - alpha An alpha continuous random variable. # - beta A beta continuous random variable. # - gamma A gamma continuous random variable. # - expon An exponential continuous random variable. # - gauss Gaussian random variable # # #### Gaussian distribution # One of the most popular distribution is the Gaussian distribution. This distribution is defined for any number of variables. For a single variable case, the distribution is defined using two parameters: $\mu$ and $\sigma$. $\mu$ or the mean can take any value and $\sigma$ or the standard deviation is $\ge 0$. # # For a continuous distribution, you cannot compute the probability mass at any value of the random variable. Instead, you can compute the density using the probability density function: # $$p(x) = \frac{1}{\sigma\sqrt{2\pi}}\exp[-\frac{1}{2}(\frac{x - \mu}{\sigma})^2]$$ # The random variable represented using a Gaussian distribution can take any value from $-\infty$ to $\infty$. # In[230]: # set the parameters mu = 125 sigma = 8 # draw 1000 samples from this distribution #samples = norm(mu, sigma).rvs(1000) # plot an empirical distribution, i.e., a histogram #h = plt.hist(samples, 30, density=True, alpha=.3) # Compute the density at several instances of the random variable x = np.linspace(90, 170, 10001) # plot the density plt.plot(x, norm(mu, sigma).pdf(x), linewidth=2) # #### Real-world example # Consider heights and weights of a population sample # In[227]: hw = pd.read_csv('../data/heightweight.csv') # In[228]: ax = plt.subplot(111) hw['Weight'].hist(bins=20,ax=ax) ax.set_xlabel('Weight (lbs)') # #### Multi-dimensional or multivariate Gaussian distribution # A distribution can be defined for multivariate random variables. One example is the multivariate Gaussian. In general, the random variable is a $D$ length vector ${\bf X}$. The two parameters of this distribution are a mean vector ${\bf \mu}$ and a covariance matrix $\Sigma$. The pdf at any value of ${\bf X}$ is given by: # $$ # \mathcal{N}({\bf X}| {\bf \mu,\Sigma}) \triangleq \frac{1}{(2\pi)^{D/2}|{\bf \Sigma}|^{D/2}}exp\left[-\frac{1}{2}{\bf (x-\mu)^\top\Sigma^{-1}(x-\mu)}\right] # $$ # Note that if $D = 1$, it reduces to a univariate Gaussian distribution. # In[233]: #define the parameters for D = 2 mu = np.array([10,10]) Sigma = np.array([[4,1.],[1.,1]]) rv = np.random.multivariate_normal(mu,Sigma) #sample some points s = np.random.multivariate_normal(mu,Sigma,1000) fig = plt.figure() plt.subplot(111) plt.scatter(s[:,0],s[:,1]) # add a contour plot smin = np.min(s,axis=0) smax = np.max(s,axis=0) t1=np.linspace(smin[0],smax[0],1000) t2=np.linspace(smin[1],smax[1],1000) # evaluate pdf at each of these mesh points # In[9]: np.cov(s.transpose())