#!/usr/bin/env python # coding: utf-8 # # Block 6: Random utility models # ### Alfred Galichon (NYU & Sciences Po) # ## 'math+econ+code' masterclass on optimal transport and economic applications # © 2018-2021 by Alfred Galichon. Past and present support from NSF grant DMS-1716489, ERC grant CoG-866274, and contributions by Jules Baudet, Pauline Corblet, Gregory Dannay, and James Nesbit are acknowledged. # # #### With python code examples # ### Learning objectives # # * Emax operator and generalized entropy of choice # # * The Daly-Zachary-Williams theorem # # * The MEV class # ### References # # * Galichon (2016). *Optimal Transport Methods in Economics*, App. E. Princeton University Press. # # * McFadden (1981). "Econometric Models of Probabilistic Choice", in C.F. Manski and D. McFadden (eds.), *Structural Analysis of Discrete Data with Econometric Applications*, MIT Press. # # * McFadden (1989). "A Method of Simulated Moments for Estimation of Discrete Response Models Without Numerical # Integration". *Econometrica*. # # * Berry, Levinsohn, and Pakes (1995). "Automobile Prices in Market Equilibrium," *Econometrica*. # # * Train. (2009). *Discrete Choice Methods with Simulation*. 2nd Edition. Cambridge University Press. # # * Galichon and Salanie (2020). "Cupid's Invisible Hands". Preprint (first version 2011). # # * Chiong, Galichon and Shum (2016), "Duality in Discrete Choice Models". *Quantitative Economics*. # # * Galichon. "On the representation of the nested logit model". *Econometric Theory* (forthcoming). # # * Greene and Hensher (1997), *Multinomial Logit and Discrete Choice Models*. # ## A look at our data # The data is taken from Greene and Hensher (1997). 210 individuals are surveyed about their choice of travel mode between Sydney, Canberra and Melbourne, and the various costs (time and money) associated with each alternative. Therefore there are 840 = 4 x 210 observations, which we can stack into `travelmodedataset` a 3 dimensional array whose dimensions are mode,individual,dummy for choice+covariates. # # --- # # First, let's load the libraries we will need and let us get the data files. # In[13]: import numpy as np import os import pandas as pd #thepath = os.path.join(os.getcwd(),'data_mec_optim/demand_travelmode/') thepath = 'https://raw.githubusercontent.com/math-econ-code/mec_optim_2021-01/master/data_mec_optim/demand_travelmode/' travelmode = pd.read_csv(thepath + 'travelmodedata.csv') travelmode['choice'] = np.where(travelmode['choice'] =='yes' , 1, 0) # Our data looks as follows: # In[2]: travelmode.head() # Each individual is surveyed about their perceiption of various costs associated with each of the four alternatives. Hence each individual appears four times in the data set, and there are 210 observations in total: # In[3]: nobs = travelmode.shape[0] ncols = travelmode.shape[1] nbchoices = 4 ninds = int(nobs/nbchoices) ninds # We consutruct a matrix `muhat_i_y` of dummies for various choices (1 if individual `i` chooses alternative `y`, 0 else). # In[4]: muhat_i_y = travelmode['choice'].to_numpy().reshape(ninds,4).T muhat_iy = muhat_i_y.flatten() # We compute the inconditional market shares associated with each of the four travel modes: # In[5]: s_y = travelmode.groupby(['mode']).mean()['choice'].to_frame().sort_index() s_y.transpose() # # Emax operators and demand maps # ## Discrete choice models # # # Assume a consumer is facing a number of options $y\in\mathcal{Y}_{0}=\mathcal{Y}\cup\left\{ 0\right\}$, where $y=0$ is a default option. The consumer is drawing a utility shock which is a vector $\varepsilon =(\varepsilon_{0},\ldots,\varepsilon_{\left\vert \mathcal{Y}\right\vert })\sim\mathbf{P}$ such that the utility of option $y$ is $U_{y}+\varepsilon _{y}$, while the outside option yields utility $\varepsilon_{0}$. # # $U$ is called vector of *systematic utilities*; $\varepsilon$ is called vector of *utility shocks*. # # We assume throughout that $\mathbf{P}$ has a density with respect to the Lebesgue measure, and has full support. # # The preferred option is the one which attains the maximum in # # \begin{align*} # \max_{y\in\mathcal{Y}}\left\{ U_{y}+\varepsilon_{y},\varepsilon_{0}\right\}. # \end{align*} # ## Demand map # # Let $s_{y}=\sigma_{y}\left( U\right) $ be the probability of choosing option $y$, where $\sigma$ is given by # # \begin{align*} # \sigma_{y}\left( U\right) =\Pr(U_{y}+\varepsilon_{y}\geq U_{z}+\varepsilon_{z}\mbox{ for all }z\in\mathcal{Y}_{0}). # \end{align*} # # The map $\sigma$ is called *demand map*, and the vector $s$ is called vector of market shares, or vector of choice probabilities. # # Note that if $s=\sigma(U)$, then $s_{y}>0$ for all $y\in\mathcal{Y}_{0}$ and $\sum_{y\in\mathcal{Y}_{0}}s_{y}=1$. # # Note that because the distribution $\mathbf{P}$ of $\varepsilon$ is continuous, the probability of being indifferent between two options is zero, and hence we could have indifferently replaced weak preference $\geq$ by strict preference $>$. Without this, choice probabilities may not have been well defined. # ### Properties # # * $\sigma_{y}\left( U\right) $ is increasing in $U_{y}$. # # * $\sigma_{y}\left( U\right) $ is weakly decreasing in $U_{y^{\prime}}$ for $y^{\prime}\neq y$. # # * If one replaces $\left( U_{y}\right) $ by $\left( U_{y}+c\right) $, for a constant $c$, one has $\sigma\left( U+c\right) =\sigma\left( U\right) .$ # # ### Normalization # # Because of the last property, we can normalize the utility of one of the alternatives. We will normalize the utility of the utility associated to $y=0$, and hence take # # \begin{align*} # U_{0}=0. # \end{align*} # # Thus in the sequel, $\sigma$ will be seen as a mapping from $\mathbb{R}^{\mathcal{Y}}$ to the set of $\left( s_{y}\right)_{y\in \mathcal{Y}}$ such that $s_{y}>0$ and $\sum_{y\in\mathcal{Y}}s_{y}<1$, and the choice probability of alternative $y=0$ is recovered by # # \begin{align*} # s_{0}=1-\sum_{y\in\mathcal{Y}}s_{y}. # \end{align*} # ### The Daly-Williams-Zachary theorem # # Define the expected indirect utility of consumers by # # \begin{align*} # G(U)=\mathbb{E}\left[ \max_{y\in\mathcal{Y}}(U_{y}+\varepsilon_{y}% # ,\varepsilon_{0})\right] # \end{align*} # # This is called *Emax operator*, a.k.a. *McFadden's surplus function*. # # As the expectation of the maximum of terms which are linear in $U$, $G$ is convex function in $U$ (strictly convex in fact), and # # \begin{align*} # \frac{\partial G}{\partial U_{y}}(U)=\Pr(U_{y}+\varepsilon_{y}\geq # U_{z}+\varepsilon_{z}\mbox{ for all }z\in\mathcal{Y}_{0}). # \end{align*} # # But the right-hand side is simply the probability $s_{y}$ of choosing option $y$; therefore, we get: # # --- # # **Theorem (Daly-Zachary-Williams)** # # The demand map $\sigma$ is the gradient of the Emax operator $G$, that is # # # \begin{align*} # \sigma\left( U\right) =\nabla G(U). # \end{align*} # # # Examples # ## Model 1: the logit model # # Assume that $\mathbf{P}$ is the distribution of i.i.d. *centered type $1$ extreme value* a.k.a. *centered Gumbel* terms, which has c.d.f. # # \begin{align*} # F\left( z\right) =\exp\left( -\exp\left( -x+\gamma\right) \right) # \end{align*} # # where $\gamma=0.5772...$ (Euler's constant). The mean of this distribution is zero. # # Basic fact from extreme value theory: if $(\varepsilon_{y})$ have i.i.d. Gumbel distributions, then $\max\left\{U_{y}+\varepsilon_{y}\right\} $ has the same distribution as $\log\left(\sum_{y=1}^{n}\exp U_{y}\right) +\epsilon$, where $\epsilon$ is also a Gumbel. (Proof of this fact later). # # Notes: # # * This distribution is sometimes called the "Gumbel max" distribution, to contrast it with the distribution of its opposite, which is then called *Gumbel min*. # # * The literature usually calls *standard Gumbel* the distribution with c.d.f. $\exp\left(-\exp\left( -x\right) \right) $; but that distribution has mean $\gamma$, which is why we slightly depart from the convention. # The Emax operator associated with the logit model can be given in closed form as # # \begin{align*} # G(U)=\log\left( 1+\sum_{y\in\mathcal{Y}}\exp(U_{y})\right) # \end{align*} # # where $s_{0}=1-\sum_{y\in\mathcal{Y}}s_{y}$. This is called a *log-partition function* # # As a result, the choice probability of alternative $y$ is proportional to the exponential of the systematic utility associated with $U$, that is # # \begin{align*} # \sigma_{y}\left( U\right) =\frac{\exp U_{y}}{1+\sum_{y^{\prime}% # \in\mathcal{Y}}\exp(U_{y^{\prime}})} # \end{align*} # # which is called a *Gibbs distribution*. # # * Assume that the random utility shock is scaled by a factor $T$. Then # # \begin{align*} # \sigma_{y}\left( U\right) =\frac{\exp\left( U_{y}/T\right) }{1+\sum_{y^{\prime}\in\mathcal{Y}}\exp(U_{y^{\prime}}/T)} # \end{align*} # # which is sometimes called the *soft-max operator* and converges as $T\rightarrow0$ toward # # \begin{align*} # \max_{y\in\mathcal{Y}}\left\{ U_{y},0\right\} . # \end{align*} # ## Generalizing the logit model: the multivariate extreme value (MEV) class # # **See class notes for an introduction using Pickand's representation.** # # Let $\mathbf{F}$ be a cumulative distribution such that function $g$ defined by # # \begin{align*} # g\left( x_{1},...,x_{n}\right) =-\log\mathbf{F}\left( -\log x_{1},...,-\log # x_{n}\right) \label{cdfMEV} # \end{align*} # # is positive homogeneous of degree $1$. (This inverts into $\mathbf{F}\left(u_{1},...,u_{n}\right) =\exp\left( -g\left( e^{-u_{1}},...,e^{-u_{n} }\right) \right) $). We have by a theorem of McFadden (1978): # # --- # **Theorem** # # Let $\left( \varepsilon_{y}\right) _{1\leq y\leq n}$ be a random vector with c.d.f. $\mathbf{F}$, and define # # \begin{align*} # Z=\max_{y=1,...,n}\left\{ U_{y}+\varepsilon_{y}\right\} . # \end{align*} # # Then $Z$ has the same distribution as $\log g\left( e^{U_{1}},...,e^{U_{n}}\right) +\gamma+\epsilon$, where $\epsilon$ is a standard Gumbel. In particular, # \begin{align*} # \mathbb{E}\left[ \max_{y=1,...,n}\left\{ U_{y}+\varepsilon_{y}\right\} # \right] =\log g\left( e^{U_{1}},...,e^{U_{n}}\right) +\gamma # \end{align*} # where $\gamma$ is the Euler constant $\gamma\simeq0.5772$. # # --- # # **Proof** # # Let $F_{Z}$ be the c.d.f. of $Z=\max_{y=1,\ldots,n}\left\{ U_{y}+\varepsilon_{y}\right\} $. One has # # \begin{align*} # F_{Z}\left( z\right) & =\Pr\left( \max_{y=1,\ldots,n}\left\{ # U_{y}+\varepsilon_{y}\right\} \leq z\right) =\Pr\left( \forall # y:~\varepsilon_{y}\leq z-U_{y}\right) \\ # & =\mathbf{F}\left( z-U_{1},...,z-U_{n}\right) =\exp\left( -g\left( # e^{U_{1}-z},...,e^{U_{n}-z}\right) \right) \\ # & =\exp\left( -e^{-z}g\left( e^{U_{1}},...,e^{U_{n}}\right) \right) # =\varphi\left( z-\log g\left( e^{U_{1}},...,e^{U_{n}}\right) -\gamma # \right) # \end{align*} # # where $\varphi\left( z\right) :=\exp\left( -e^{-\left( z-\gamma\right) }\right) $ is the cdf of the standard Gumbel distribution. Hence $Z$ has the distribution of $\log g\left( e^{U_{1}},...,e^{U_{n}}\right) +\gamma +\epsilon$, where $\epsilon$ is a standard Gumbel. # As a result, the choice probability of alternative $y$ is # # \begin{align*} # \sigma_{y}\left( U\right) =\frac{\frac{\partial g}{\partial x_{y}}\left(e^{U_{1}},...,e^{U_{n}}\right) }{g\left( e^{U_{1}},...,e^{U_{n}}\right) # }e^{U_{y}}. # \end{align*} # # The MEV\ framework has several commonly used examples: logit, nested logit, mixture of logit... # # We just saw the logit model, in which $g\left( x_{1},...,x_{n}\right) =e^{-\gamma}\sum_{y=1}^{n}x_{y}$. In this case, the distribution of # # \begin{align*} # Z=\max_{y=1,\ldots,n}\left\{ U_{y}+\varepsilon_{y}\right\} # \end{align*} # # is $\log\sum_{y=1}^{n}e^{U_{y}}+\epsilon$, where $\epsilon$ is a standard Gumbel. # ## Model 2: the nested logit model # # The nested logit model is an instance of MEV model where alternatives can be grouped in nests. Eg, people choose their means of transportation (nest), and within this nest, a particular operator. # # **See class notes for an introduction using positive-stable distributions representation.** # # # Let $\mathcal{X}$ be the set of nests and assume that for each nest $x$, there is a set $\mathcal{Y}_{x}$ alternatives. Let $U_{xy}$ be utility from alternative $y$ in nest $x$, and $\lambda_{x}\in\left[ 0,1\right] $ and # # \begin{align*} # g\left( U_{xy}\right) =e^{-\gamma}\sum_{x\in\mathcal{X}}\left( \sum # _{y\in\mathcal{Y}_{x}}U_{xy}^{1/\lambda_{x}}\right) ^{\lambda_{x}}. # \end{align*} # # In this case, # # \begin{align*} # G\left( U\right) & =\mathbb{E}\left[ \max_{x\in\mathcal{X}}\max # _{y\in\mathcal{Y}_{x}}\left\{ U_{xy}+\varepsilon_{xy}\right\} \right] =\log\sum_{x\in\mathcal{X}}\left( \sum_{y\in\mathcal{Y}_{x}}e^{U_{xy}/\lambda_{x}}\right) ^{\lambda_{x}}\\ # \sigma_{xy}\left( U\right) & =\frac{\left( \sum_{y\in\mathcal{Y}_{x}}e^{U_{xy}/\lambda_{x}}\right) ^{\lambda_{x}}}{\sum_{x\in\mathcal{X}}\left(\sum_{y\in\mathcal{Y}_{x}}e^{U_{xy}/\lambda_{x}}\right) ^{\lambda_{x}}}\frac{e^{U_{xy}/\lambda_{x}}}{\left( \sum_{y\in\mathcal{Y}_{x}}e^{U_{xy}/\lambda_{x}}\right)} # \end{align*} # # so the demand map has an interesting interpretation as "choice of nest then choice of alternative". # Assume that $\left( \varepsilon_{1},\varepsilon_{2}\right) $ have a nested logit distribution with two nests, that is, their cdf is given by # # \begin{align*} # \mathbf{F}\left( u_{1},u_{2}\right) =\exp\left( -e^{-\gamma}\left(e^{-u_{1}/\lambda}+e^{-u_{2}/\lambda}\right) ^{\lambda}\right) . # \end{align*} # # Particular cases: # # * When $\lambda=1$, $\varepsilon_{1}$ and $\varepsilon_{2}$ are independent and one recovers the logit model. # # * When $\lambda\rightarrow0$, $\mathbf{F}\left( u_{1},u_{2}\right) =\exp\left( -e^{-\gamma}e^{\max\left\{ -u_{1},-u_{2}\right\} }\right)=\min\left\{ \mathbf{F}\left( u_{1}\right) ,\mathbf{F}\left( u_{2}\right) \right\} $ and therefore $\varepsilon_{1}$ and $\varepsilon_{2}$ are perfectly correlated. # # In general one can show that # # \begin{align*} # \lambda=\sqrt{1-cor\left( \varepsilon_{1},\varepsilon_{2}\right) }% # \end{align*} # # This formula, due to Tiago de Oliviera, is now easy to prove using the positive-stable representation. # ## Other popular models (beyond MEV) # # * Probit model (later) # # * Berry-Pakes' pure characteristics model (later) # # * Berry-Levinsohn-Pakes' mixed logit coefficient model (later) # # Demand inversion # In many settings, the econometrician observes the market shares $s_{y}$ and wants to deduce the corresponding vector of systematic utilities. That is, we would like to solve: # # --- # **Problem**. # # Given a vector $s$ with positive entries satisfying $\sum_{y\in\mathcal{Y}}s_{y}<1$, characterize and compute the set # # \begin{align*} # \sigma^{-1}\left( s\right) =\left\{ U\in\mathbb{R}^{\mathcal{Y}}% # :\sigma\left( U\right) =s\right\} . # \end{align*} # # --- # # This problem is called "demand inversion", or "conditional choice # probability inversion", or "identification problem". It is a central issue in econometrics/industrial organization and will be a key building block for matching models. # ## Demand inversion via convex analysis # # Galichon and Salanié (2011) suggested to use convex analysis for the inversion of discrete choice problems. Indeed, if $G$ is strictly convex and $C^{1}$, then # # \begin{align*} # \sigma^{-1}\left( s\right) =\nabla G^{-1}(s)=\nabla G^{\ast}\left( # s\right), # \end{align*} # # which leads Galichon and Salanié to define the *entropy of choice* by # # \begin{align*} # G^{\ast}(s)=\max_{U}\left\{ \sum_{y\in\mathcal{Y}}s_{y}U_{y}-G(U)\right\}. # \label{eq:constrG}% # \end{align*} # # In terms of convex analysis, $G^{\ast}$ is the Legendre-Fenchel transform of $G$. It follows from the envelope theorem that $\sigma^{-1}\left( s\right) $ is the vector $U$ such that # # \begin{align*} # U\in\arg\max_{U}\left\{ \sum_{y\in\mathcal{Y}}s_{y}U_{y}-G(U)\right\} . # \end{align*} # ## Entropy of choice # # Convex duality implies that if $s$ and $U$ are related by $s\in\partial G\left( U\right)$, then # # \begin{align*} # G(U)=\sum_{y\in\mathcal{Y}}s_{y}U_{y}-G_{x}^{\ast}(s). \label{fenchel}% # \end{align*} # # But letting $Y=\arg\max_{y}\left\{ U_{y}+\varepsilon_{y}\right\}$, $G\left( U\right) =\mathbb{E}\left[ U_{Y}+\varepsilon_{Y}\right]$ implies # # \begin{align*} # G(U)=\sum_{y\in\mathcal{Y}}s_{y}U_{y}+\mathbb{E}\left[ \varepsilon # _{Y}\right] , # \end{align*} # # thus one has # # \begin{align*} # G^{\ast}(s)=-\mathbb{E}\left[ \varepsilon_{Y}\right] . \label{Fenchel}% # \end{align*} # # Hence, the entropy of choice $G^{\ast}\left( s\right) $ is interpreted as minus\ the expected amount of heterogeneity needed to rationalize the choice probabilities $s$. # ## Examples of entropy of choice and identification # ### Logit model # # Then # # \begin{align*} # G^{\ast}\left( s\right) =s_{0}\log(s_{0})+\sum_{y\in\mathcal{Y}}s_{y}\log s_{y} # \end{align*} # # where $s_{0}=1-\sum_{y\in\mathcal{Y}}s_{y}$. Hence, $G^{\ast}$ is a bona fide entropy function when $\mathbf{P}$ is Gumbel--hence the name of *entropy of choice* in general. # # As a result, # # \begin{align*} # \sigma_{y}^{-1}\left( s\right) =\log\frac{s_{y}}{s_{0}} # \end{align*} # # which is the celebrated *log-odds ratio formula*: the log of the odds of alternatives $y$ and $0$ identify the difference between the systematic utilities of these alternatives. # **Back to our dataset**. Ignoring any of the information we have on individual, and thus assuming there is a representative consumer with logit random utility, we shall retrieve the systematic utilities from the market shares. # # Define "car" as the default alternative. The utilities in the logit model are obtained by the log-odds ratio formula: # In[7]: Ulogit = np.log(s_y/s_y.loc[['car']].to_numpy()[0]) Ulogit.transpose() # ### Nested logit model # The entropy of choice $G^{\ast}$ in the nested logit model is given by # # \begin{align*} # G^{\ast}\left( s\right) =\sum_{x\in\mathcal{X}}\sum_{y\in\mathcal{Y}_{x}}\lambda_{x}s_{xy}\ln s_{xy}+\sum_{x\in\mathcal{X}}\left( 1-\lambda_{x}\right) \left( \sum_{z\in\mathcal{Y}_{x}}s_{xz}\right) \ln\left(\sum_{z\in\mathcal{Y}_{x}}s_{xz}\right) \label{Gstarnestedlogit} # \end{align*} # # if $s_{xy}\geq0$ and $\sum_{x\in\mathcal{X}}\sum_{y\in\mathcal{Y}_{x}}s_{xy}=1$, $G^{\ast}\left( s\right) =+\infty$ otherwise. # # Identification in the nested logit model: with normalization $\sum _{x\in\mathcal{X}}\left( \sum_{y\in\mathcal{Y}_{x}}e^{U_{xy}/\lambda_{x}}\right) ^{\lambda_{x}}=1$, one has $s_{xy}=\left( \sum_{y\in\mathcal{Y}_{x}}e^{U_{xy}/\lambda_{x}}\right) ^{\lambda_{x}-1}e^{U_{xy}/\lambda_{x}}$, thus $\sum_{y\in\mathcal{Y}_{x}}e^{U_{xy}/\lambda_{x}}=\left( \sum _{y\in\mathcal{Y}_{x}}s_{xy}\right) ^{1/\lambda_{x}}$, therefore # # \begin{align*} # U_{xy}=\lambda_{x}\log s_{xy}-\left( \lambda_{x}-1\right) \log\sum # _{y\in\mathcal{Y}_{x}}s_{xy}. # \end{align*} # **Back to our dataset**. Still ignoring any observation about individual agents, let us retrieve the systematic utilities from the market shares in the nested logit model with two nests, "noncar" and "car", and taking $\lambda=0.5$ in both nests. Do: # # In[11]: λ = [0.5, 0.5] nest = Ulogit.index.isin(['car']) Unocar = λ[0] * np.log(s_y[~nest]) - (λ[0] - 1) * np.log(np.sum(s_y[~nest])) Ucar = λ[1]*np.log(s_y[nest]) - (λ[1] - 1) * np.log(np.sum(s_y[nest])) display(Unocar.transpose()) display(Ucar.transpose()) # This yields the systematic utilities (nested logit): # In[12]: Unested = (pd.concat([Unocar,Ucar]) - Ucar.to_numpy()[0]).sort_index() Unested.transpose()