#!/usr/bin/env python # coding: utf-8 # # Block 10: the gravity equation # ### Alfred Galichon (NYU & Sciences Po) # ## 'math+econ+code' masterclass on optimal transport and economic applications # © 2018-2021 by Alfred Galichon. Past and present support from NSF grant DMS-1716489, ERC grant CoG-866274, and contributions by Jules Baudet, Pauline Corblet, Gregory Dannay, and James Nesbit are acknowledged. # # #### With python code examples # ## Learning objectives # # * Regularized optimal transport # # * The gravity equation # # * Generalized linear models # # * Pseudo-Poisson maximum likelihood estimation # ## References # # * Anderson and van Wincoop (2003). "Gravity with Gravitas: A Solution to the Border Puzzle". *American Economic Review*. # # * Head and Mayer (2014). "Gravity Equations: Workhorse, Toolkit and Cookbook". *Handbook of International Economics*. # # * Choo and Siow (2005). "Who marries whom and why". *Journal of Political Economy*. # # * Gourieroux, Trognon, Monfort (1984). "Pseudo Maximum Likelihood Methods: Theory". *Econometrica*. # # * McCullagh and Nelder (1989). *Generalized Linear Models*. Chapman and Hall/CRC. # # * Santos Silva and Tenreyro (2006). "The Log of Gravity". *Review of Economics and Statistics*. # # * Yotov et al. (2011). *An advanced guide to trade policy analysis*. WTO. # # * Guimares and Portugal (2012). "Real Wages and the Business Cycle: Accounting for Worker, Firm, and Job Title Heterogeneity". *AEJ: Macro*. # # * Dupuy and G (2014), "Personality traits and the marriage market". *Journal of Political Economy*. # # * Dupuy, G and Sun (2019), "Estimating matching affinity matrix under low-rank constraints". *Information and Inference*. # # * Carlier, Dupuy, Galichon and Sun "SISTA: learning optimal transport costs under sparsity constraints." *Communications on Pure and Applied Mathematics* (forthcoming). # # Motivation # # The gravity equation is a very useful tool for explaining trade flows by various measures of proximity between countries. # # A number of regressors have been proposed. They include: geographic distance, common official languague, common colonial past, share of common religions, etc. # # The dependent variable is the volume of exports from country $i$ to country $n$, for each pair of country $\left( i,n\right)$. # # Today, we shall see a close connection between gravity models of international trade and separable matching models. # # --- # # To start with, let's load some of the libraries we shall need. # In[1]: import numpy as np import os import pandas as pd import string as str import math import sys import time from scipy import optimize, special # !python -m pip install -i https://pypi.gurobi.com gurobipy ## only if Gurobi not here import gurobipy as grb # And let's load our data, which comes from the book *An Advanced Guide to Trade Policy Analysis: The Structural Gravity Mode*, by Yotov et al. We will estimate the gravity model using optimal transport as well as using Poisson regression. # In[2]: #thepath = os.path.join(os.getcwd(),'data_mec_optim/gravity_wtodata/') thepath = 'https://raw.githubusercontent.com/math-econ-code/mec_optim_2021-01/master/data_mec_optim/gravity_wtodata/' tradedata = pd.read_csv(os.path.join(thepath ,'1_TraditionalGravity_from_WTO_book.csv'), sep=',') tradedata = tradedata[['exporter', 'importer','year', 'trade', 'DIST','ln_DIST', 'CNTG', 'LANG', 'CLNY']] tradedata.sort_values(['year','exporter','importer'], inplace = True) tradedata.reset_index(inplace = True, drop = True) nbt = len(tradedata['year'].unique()) nbi = len(tradedata['importer'].unique()) nbk = 4 tradedata.head() # Consistent the common practice, we only look at the flows of trade between pairs of distinct countries: # In[3]: tradedata.loc[np.where(tradedata['importer']==tradedata['exporter'],True, False),['DIST', 'ln_DIST', 'CNTG', 'LANG', 'CLNY']]=0 # Let's prepare the data so that we can use it. We want to construct # * $D_{ni,t}^k$ which is the $k$th pairwise discrepancy measure between importer $n$ and exporter $i$ at time $t$ # # * $X_{n,t}$ total value of expenditure of importer $n$ at time $t$ # # * $Y_{i,t}$ total value of production of exporter $i$ at time $t$ # In[4]: Xhatnit = [] Dnikt = [] years = tradedata['year'].unique() for t, year in enumerate(years): tradedata_year = tradedata[tradedata['year']==year] Xhatnit.append(tradedata_year.pivot(index = 'exporter', columns = 'importer', values ='trade').values) np.fill_diagonal(Xhatnit[t],0) Dnikt.append(tradedata_year[[ 'ln_DIST', 'CNTG', 'LANG', 'CLNY']].values) Xnt = np.zeros((nbi,nbt)) Yit = np.zeros((nbi,nbt)) for t in range(nbt): Xnt[:,t] = Xhatnit[t].sum(axis = 1) Yit[:,t] = Xhatnit[t].sum(axis = 0) totalmass_t = sum(Xhatnit).sum(axis=(0,1))/nbt pihat_nit = Xhatnit/totalmass_t # Let's standardize the data and construct some useful objects: # In[5]: meanD_k = np.asmatrix([mat.mean(axis = 0) for mat in Dnikt]).mean(axis = 0) sdD_k = np.asmatrix([mat.std(axis = 0,ddof = 1) for mat in Dnikt]).mean(axis = 0) Dnikt = [(mat - meanD_k)/sdD_k for mat in Dnikt] p_nt = Xnt/totalmass_t q_nt = Yit/totalmass_t IX = np.repeat(1, nbi).reshape(nbi,1) tIY = np.repeat(1, nbi).reshape(1,nbi) f_nit = [] g_nit = [] for t in range(nbt): f_nit.append(p_nt[:,t].reshape(nbi,1).dot(tIY)) g_nit.append(IX.dot(q_nt[:,t].reshape(1,nbi))) # ### Regularized optimal transport # # Consider the optimal transport duality # # \begin{align*} # \max_{\pi\in\mathcal{M}\left( P,Q\right) }\sum_{xy}\pi_{xy}\Phi_{xy}=\min_{u_{x}+v_{y}\geq\Phi_{xy}}\sum_{x\in\mathcal{X}}p_{x}u_{x}+\sum_{y\in\mathcal{Y}}q_{y}v_{y} # \end{align*} # # Now let's assume that we are adding an entropy to the primal objective function. For any $\sigma>0$, we get # # \begin{align*} # & \max_{\pi\in\mathcal{M}\left( P,Q\right) }\sum_{xy}\pi_{xy}\Phi_{xy}-\sigma\sum_{xy}\pi_{xy}\ln\pi_{xy}\\ # & =\min_{u,v}\sum_{x\in\mathcal{X}}p_{x}u_{x}+\sum_{y\in\mathcal{Y}}q_{y}v_{y}+\sigma\sum_{xy}\exp\left( \frac{\Phi_{xy}-u_{x}-v_{y}-\sigma}{\sigma}\right) # \end{align*} # # The latter problem is an unconstrained convex optimization problem. But the most efficient numerical computation technique is often coordinate descent, i.e. alternate between minimization in $u$ and minimization in $v$. # ### Iterated fitting # # Maximize wrt to $u$ yields # # \begin{align*} # e^{-u_{x}/\sigma}=\frac{p_{x}}{\sum_{y}\exp\left( \frac{\Phi_{xy}-v_{y}-\sigma}{\sigma}\right) } # \end{align*} # # and wrt $v$ yields # # \begin{align*} # e^{-v_{y}/\sigma}=\frac{q_{y}}{\sum_{x}\exp\left( \frac{\Phi_{xy}-v_{y}-\sigma}{\sigma}\right) } # \end{align*} # # It is called the "iterated projection fitting procedure" (ipfp), aka "matrix scaling", "RAS algorithm", "Sinkhorn-Knopp algorithm", "Kruithof's method", "Furness procedure", "biproportional fitting procedure", "Bregman's procedure". See survey in Idel (2016). # # Maybe the most often reinvented algorithm in applied mathematics. Recently rediscovered in a machine learning context. # ### Econometrics of matching # # The goal is to estimate the matching surplus $\Phi_{xy}$. For this, take a linear parameterization # # \begin{align*} # \Phi_{xy}^{\beta}=\sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}. # \end{align*} # # Following Choo and Siow (2006), Galichon and Salanie (2011) introduce logit heterogeneity in individual preferences and show that the equilibrium now maximizes the *regularized Monge-Kantorovich problem* # # \begin{align*} # W\left( \beta\right) =\max_{\pi\in\mathcal{M}\left( P,Q\right) }\sum_{xy}\pi_{xy}\Phi_{xy}^{\beta}-\sigma\sum_{xy}\pi_{xy}\ln\pi_{xy} # \end{align*} # # By duality, $W\left( \beta\right) $ can be expressed # # \begin{align*} # W\left( \beta\right) =\min_{u,v}\sum_{x}p_{x}u_{x}+\sum_{y}q_{y}v_{y}+\sigma\sum_{xy}\exp\left( \frac{\Phi_{xy}^{\beta}-u_{x}-v_{y}-\sigma}{\sigma}\right) # \end{align*} # # and w.l.o.g. can set $\sigma=1$ and drop the additive constant $-\sigma$ in the $\exp$. # ### Estimation # # We observe the actual matching $\hat{\pi}_{xy}$. Note that $\partial W/ \partial\beta^{k}=\sum_{xy}\pi_{xy}\phi_{xy}^{k},$\ hence $\beta$ is estimated by running # # # \begin{align*} # \min_{u,v,\beta}\sum_{x}p_{x}u_{x}+\sum_{y}q_{y}v_{y}+\sum_{xy}\exp\left(\Phi_{xy}^{\beta}-u_{x}-v_{y}\right) -\sum_{xy,k}\hat{\pi}_{xy}\beta_{k}\phi_{xy}^{k} # \end{align*} # # which is still a convex optimization problem. # # This is actually the objective function of the log-likelihood in a Poisson regression with $x$ and $y$ fixed effects, where we assume # # \begin{align*} # \pi_{xy}|xy\sim Poisson\left( \exp\left( \sum_{k=1}^{K}\beta_{k}\phi # _{xy}^{k}-u_{x}-v_{y}\right) \right) . # \end{align*} # ### Poisson regression with fixed effects # # Let $\theta=\left( \beta,u,v\right) $ and $Z=\left( \phi,D^{x},D^{y}\right) $ where $D_{x^{\prime}y^{\prime}}^{x}=1\left\{ x=x^{\prime}\right\} $ and $D_{x^{\prime}y^{\prime}}^{y}=1\left\{ y=y^{\prime}\right\}$ are $x$-and $y$-dummies. Let $m_{xy}\left( Z;\theta\right) =\exp\left(\theta^{\intercal}Z_{xy}\right) $ be the parameter of the Poisson distribution. # # The conditional likelihood of $\hat{\pi}_{xy}$ given $Z_{xy}$ is # # \begin{align*} # l_{xy}\left( \hat{\pi}_{xy};\theta\right) & =\hat{\pi}_{xy}\log m_{xy}\left( Z;\theta\right) -m_{xy}\left( Z;\theta\right) \\ # & =\hat{\pi}_{xy}\left( \theta^{\intercal}Z_{xy}\right) -\exp\left(\theta^{\intercal}Z_{xy}\right) \\ # & =\hat{\pi}_{xy}\left( \sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}-u_{x}-v_{y}\right) -\exp\left( \sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}-u_{x}-v_{y}\right) # \end{align*} # # Summing over $x$ and $y$, the sample log-likelihood is # # \begin{align*} # \sum_{xy}\hat{\pi}_{xy}\sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}-\sum_{x}p_{x}u_{x}-\sum_{y}q_{y}v_{y}-\sum_{xy}\exp\left( \sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}-u_{x}-v_{y}\right) # \end{align*} # # hence we recover the [objective function](#objFun). # ### From Poisson to pseudo-Poisson # # If $\pi_{xy}|xy$ is Poisson, then $\mathbb{E}\left[\pi_{xy}\right]=m_{xy}\left( Z_{xy};\theta\right) =\mathbb{V}ar\left( \pi_{xy}\right) $. While it makes sense to assume the former equality, the latter is a rather strong assumption. # # For estimation purposes, $\hat{\theta}$ is obtained by # # \begin{align*} # \max_{\theta}\sum_{xy}l\left( \hat{\pi}_{xy};\theta\right) =\sum_{xy}\left(\hat{\pi}_{xy}\left( \theta^{\intercal}Z_{xy}\right) -\exp\left(\theta^{\intercal}Z_{xy}\right) \right) # \end{align*} # # however, for inference purposes, one shall not assume the Poisson distribution. Instead # # \begin{align*} # \sqrt{N}\left( \hat{\theta}-\theta\right) \Longrightarrow\left(A_{0}\right) ^{-1}B_{0}\left( A_{0}\right) ^{-1} # \end{align*} # # where $N=\left\vert \mathcal{X}\right\vert \times\left\vert \mathcal{Y}\right\vert $ and $A_{0}$ and $B_{0}$ are estimated by # # \begin{align*} # \hat{A}_{0} & =N^{-1}\sum_{xy}D_{\theta\theta}^{2}l\left( \hat{\pi}_{xy};\hat{\theta}\right) =N^{-1}\sum_{xy}\exp\left( \hat{\theta}^{\intercal}Z_{xy}\right) Z_{xy}Z_{xy}^{\intercal}\\ # \hat{B}_{0} & =N^{-1}\sum_{xy}\left( \hat{\pi}_{xy}-\exp\left( \hat{\theta}^{\intercal}Z_{xy}\right) \right) ^{2}Z_{xy}Z_{xy}^{\intercal}. # \end{align*} # # # # DG XXX ESTIMATION HERE, USING EXP FAMILIES # # The gravity equation # # "Structural gravity equation" (Anderson and van Wincoop, 2003) as exposited in Head and Mayer (2014) # handbook chapter: # # \begin{align*} # X_{ni}=\underset{S_{i}}{\underbrace{\frac{Y_{i}}{\Omega_{i}}}}\underset{M_{n}}{\underbrace{\frac{X_{n}}{\Psi_{n}}}}\Phi_{ni}% # \end{align*} # # where $n$=importer, $i$=exporter, $X_{ni}$=trade flow from $i$ to $n$, $Y_{i}=\sum_{n}X_{ni}$ is value of production, $X_{n}=\sum_{i}X_{ni}$ is importers' expenditures, and $\phi_{ni}$=bilateral accessibility of $n$ to $i$. # # $\Omega_{i}$ and $\Psi_{n}$ are \textquotedblleft multilateral resistances\textquotedblright, satisfying the set of implicit equations # # \begin{align*} # \Psi_{n}=\sum_{i}\frac{\Phi_{ni}Y_{i}}{\Omega_{i}}\text{ and }\Omega_{i}% # =\sum_{n}\frac{\Phi_{ni}X_{n}}{\Psi_{n}}% # \end{align*} # # These are exactly the same equations as those of the regularized OT. # ## Explaining trade # # Parameterize $\Phi_{ni}=\exp\left( \sum_{k=1}^{K}\beta_{k}D_{ni}^{k}\right) $, where the $D_{ni}^{k}$ are $K$ pairwise measures of distance between $n$ and $i$. We have # # \begin{align*} # X_{ni}=\exp\left( \sum_{k=1}^{K}\beta_{k}D_{ni}^{k}-s_{i}-m_{n}\right) # \end{align*} # # where fixed effects $s_{i}=-\ln S_{i}$ and $m_{n}=-\ln M_{n}$ are adjusted by # # \begin{align*} # \sum_{i}X_{ni}=Y_{i}\text{ and }\sum_{n}X_{ni}=X_{n}. # \end{align*} # # Standard choices of $D_{ni}^{k}$'s: # # * Logarithm of bilateral distance between $n$ and $i$ # # * Indicator of contiguous borders; of common official language; of # colonial ties # # * Trade policy variables: presence of a regional trade agreement; tariffs # # * Could include many other measures of proximity, e.g. measure of genetic/cultural distance, intensity of communications, etc. # We will solve this model by fixing a $\beta$ and solving the matching problem using IPFP. Then in an outer loop we will solve for the $\beta$ which minimizes the distance between model and empirical moments. # In[6]: sigma = 1 maxiterIpfp = 1000 maxiter = 500 tolIpfp = 1e-12 tolDescent = 1e-12 t_s = 0.03 iterCount = 0 contIter = True v_it = np.zeros((nbi, nbt)) beta_k = np.repeat(0, nbk) thegrad = np.repeat(0, nbk) pi_nit = [] theval_old = -math.inf ptm = time.time() while(contIter): #print("Iteration", iterCount) for t in range(nbt): #print("Year", t) D_ij_k = Dnikt[t] Phi = D_ij_k.dot(beta_k.reshape(nbk,1)).reshape(nbi,nbi) contIpfp = True iterIpfp = 0 v = v_it[:, t].reshape(1,nbi) f = f_nit[t] g = g_nit[t] K = np.exp(Phi/sigma) np.fill_diagonal(K,0) fK = np.multiply(f,K) gK = np.multiply(g,K) while(contIpfp): iterIpfp = iterIpfp + 1 u = sigma * np.log(np.sum(np.multiply(gK,np.exp((-IX.dot(v))/sigma)), axis = 1)).flatten() vnext = sigma * np.log(np.sum(np.multiply(fK,np.exp((-u.T.dot(tIY))/sigma)), axis = 0)) error = np.max(np.abs(np.sum(np.multiply(gK,np.exp((-IX.dot(vnext) - u.T.dot(tIY))/sigma)), axis = 1) - 1)) if (error < tolIpfp or iterIpfp >= maxiterIpfp): contIpfp = False v = vnext v_it[:,t] = np.asarray(v)[0] fgK = np.multiply(f,gK) pi_nit.append(np.multiply(fgK,np.exp((-IX.dot(v) - u.T.dot(tIY))/sigma))) thegrad = thegrad + (pi_nit[t]-pihat_nit[t]).flatten(order = 'F').dot(D_ij_k) beta_k = beta_k - t_s * thegrad nonzero_pi_nit = np.concatenate(pi_nit).ravel()[np.where(np.concatenate(pi_nit).ravel()>0, True, False)] theval = float(np.sum(np.multiply(thegrad,beta_k), axis = 1)) - sigma * float(np.sum(np.multiply(nonzero_pi_nit, np.log(nonzero_pi_nit)),axis=(0,1))) iterCount = iterCount + 1 if (iterCount > maxiter or np.abs(theval - theval_old) < tolDescent): contIter = False theval_old = theval thegrad = np.repeat(0, nbk) pi_nit = [] diff = time.time() - ptm print('Time elapsed = ', diff, 's.') # In[7]: beta_k = beta_k/sdD_k print(beta_k) # We recover the PPML estimates on Table 1 p. 42 of [Yotov et al.'s book](https://www.wto.org/english/res_e/booksp_e/advancedwtounctad2016_e.pdf) # ### Comparison XXXXXXXXXXXXXXXXXXXX # We can compare the results and speed of our computation to that of Poisson regression packages. As a warning, these give the same results, but at the cost of a much longer run time, so use at your own risk. We can solve instead using the Poisson regression from the glm package. XXXXXXXXXXXX