#!/usr/bin/env python
# coding: utf-8
# #
Block 10: the gravity equation
# ### Alfred Galichon (NYU & Sciences Po)
# ## 'math+econ+code' masterclass on optimal transport and economic applications
# © 2018-2021 by Alfred Galichon. Past and present support from NSF grant DMS-1716489, ERC grant CoG-866274, and contributions by Jules Baudet, Pauline Corblet, Gregory Dannay, and James Nesbit are acknowledged.
#
# #### With python code examples
# ## Learning objectives
#
# * Regularized optimal transport
#
# * The gravity equation
#
# * Generalized linear models
#
# * Pseudo-Poisson maximum likelihood estimation
# ## References
#
# * Anderson and van Wincoop (2003). "Gravity with Gravitas: A Solution to the Border Puzzle". *American Economic Review*.
#
# * Head and Mayer (2014). "Gravity Equations: Workhorse, Toolkit and Cookbook". *Handbook of International Economics*.
#
# * Choo and Siow (2005). "Who marries whom and why". *Journal of Political Economy*.
#
# * Gourieroux, Trognon, Monfort (1984). "Pseudo Maximum Likelihood Methods: Theory". *Econometrica*.
#
# * McCullagh and Nelder (1989). *Generalized Linear Models*. Chapman and Hall/CRC.
#
# * Santos Silva and Tenreyro (2006). "The Log of Gravity". *Review of Economics and Statistics*.
#
# * Yotov et al. (2011). *An advanced guide to trade policy analysis*. WTO.
#
# * Guimares and Portugal (2012). "Real Wages and the Business Cycle: Accounting for Worker, Firm, and Job Title Heterogeneity". *AEJ: Macro*.
#
# * Dupuy and G (2014), "Personality traits and the marriage market". *Journal of Political Economy*.
#
# * Dupuy, G and Sun (2019), "Estimating matching affinity matrix under low-rank constraints". *Information and Inference*.
#
# * Carlier, Dupuy, Galichon and Sun "SISTA: learning optimal transport costs under sparsity constraints." *Communications on Pure and Applied Mathematics* (forthcoming).
# # Motivation
#
# The gravity equation is a very useful tool for explaining trade flows by various measures of proximity between countries.
#
# A number of regressors have been proposed. They include: geographic distance, common official languague, common colonial past, share of common religions, etc.
#
# The dependent variable is the volume of exports from country $i$ to country $n$, for each pair of country $\left( i,n\right)$.
#
# Today, we shall see a close connection between gravity models of international trade and separable matching models.
#
# ---
#
# To start with, let's load some of the libraries we shall need.
# In[1]:
import numpy as np
import os
import pandas as pd
import string as str
import math
import sys
import time
from scipy import optimize, special
# !python -m pip install -i https://pypi.gurobi.com gurobipy ## only if Gurobi not here
import gurobipy as grb
# And let's load our data, which comes from the book *An Advanced Guide to Trade Policy Analysis: The Structural Gravity Mode*, by Yotov et al. We will estimate the gravity model using optimal transport as well as using Poisson regression.
# In[2]:
#thepath = os.path.join(os.getcwd(),'data_mec_optim/gravity_wtodata/')
thepath = 'https://raw.githubusercontent.com/math-econ-code/mec_optim_2021-01/master/data_mec_optim/gravity_wtodata/'
tradedata = pd.read_csv(os.path.join(thepath ,'1_TraditionalGravity_from_WTO_book.csv'), sep=',')
tradedata = tradedata[['exporter', 'importer','year', 'trade', 'DIST','ln_DIST', 'CNTG', 'LANG', 'CLNY']]
tradedata.sort_values(['year','exporter','importer'], inplace = True)
tradedata.reset_index(inplace = True, drop = True)
nbt = len(tradedata['year'].unique())
nbi = len(tradedata['importer'].unique())
nbk = 4
tradedata.head()
# Consistent the common practice, we only look at the flows of trade between pairs of distinct countries:
# In[3]:
tradedata.loc[np.where(tradedata['importer']==tradedata['exporter'],True, False),['DIST', 'ln_DIST', 'CNTG', 'LANG', 'CLNY']]=0
# Let's prepare the data so that we can use it. We want to construct
# * $D_{ni,t}^k$ which is the $k$th pairwise discrepancy measure between importer $n$ and exporter $i$ at time $t$
#
# * $X_{n,t}$ total value of expenditure of importer $n$ at time $t$
#
# * $Y_{i,t}$ total value of production of exporter $i$ at time $t$
# In[4]:
Xhatnit = []
Dnikt = []
years = tradedata['year'].unique()
for t, year in enumerate(years):
tradedata_year = tradedata[tradedata['year']==year]
Xhatnit.append(tradedata_year.pivot(index = 'exporter', columns = 'importer', values ='trade').values)
np.fill_diagonal(Xhatnit[t],0)
Dnikt.append(tradedata_year[[ 'ln_DIST', 'CNTG', 'LANG', 'CLNY']].values)
Xnt = np.zeros((nbi,nbt))
Yit = np.zeros((nbi,nbt))
for t in range(nbt):
Xnt[:,t] = Xhatnit[t].sum(axis = 1)
Yit[:,t] = Xhatnit[t].sum(axis = 0)
totalmass_t = sum(Xhatnit).sum(axis=(0,1))/nbt
pihat_nit = Xhatnit/totalmass_t
# Let's standardize the data and construct some useful objects:
# In[5]:
meanD_k = np.asmatrix([mat.mean(axis = 0) for mat in Dnikt]).mean(axis = 0)
sdD_k = np.asmatrix([mat.std(axis = 0,ddof = 1) for mat in Dnikt]).mean(axis = 0)
Dnikt = [(mat - meanD_k)/sdD_k for mat in Dnikt]
p_nt = Xnt/totalmass_t
q_nt = Yit/totalmass_t
IX = np.repeat(1, nbi).reshape(nbi,1)
tIY = np.repeat(1, nbi).reshape(1,nbi)
f_nit = []
g_nit = []
for t in range(nbt):
f_nit.append(p_nt[:,t].reshape(nbi,1).dot(tIY))
g_nit.append(IX.dot(q_nt[:,t].reshape(1,nbi)))
# ### Regularized optimal transport
#
# Consider the optimal transport duality
#
# \begin{align*}
# \max_{\pi\in\mathcal{M}\left( P,Q\right) }\sum_{xy}\pi_{xy}\Phi_{xy}=\min_{u_{x}+v_{y}\geq\Phi_{xy}}\sum_{x\in\mathcal{X}}p_{x}u_{x}+\sum_{y\in\mathcal{Y}}q_{y}v_{y}
# \end{align*}
#
# Now let's assume that we are adding an entropy to the primal objective function. For any $\sigma>0$, we get
#
# \begin{align*}
# & \max_{\pi\in\mathcal{M}\left( P,Q\right) }\sum_{xy}\pi_{xy}\Phi_{xy}-\sigma\sum_{xy}\pi_{xy}\ln\pi_{xy}\\
# & =\min_{u,v}\sum_{x\in\mathcal{X}}p_{x}u_{x}+\sum_{y\in\mathcal{Y}}q_{y}v_{y}+\sigma\sum_{xy}\exp\left( \frac{\Phi_{xy}-u_{x}-v_{y}-\sigma}{\sigma}\right)
# \end{align*}
#
# The latter problem is an unconstrained convex optimization problem. But the most efficient numerical computation technique is often coordinate descent, i.e. alternate between minimization in $u$ and minimization in $v$.
# ### Iterated fitting
#
# Maximize wrt to $u$ yields
#
# \begin{align*}
# e^{-u_{x}/\sigma}=\frac{p_{x}}{\sum_{y}\exp\left( \frac{\Phi_{xy}-v_{y}-\sigma}{\sigma}\right) }
# \end{align*}
#
# and wrt $v$ yields
#
# \begin{align*}
# e^{-v_{y}/\sigma}=\frac{q_{y}}{\sum_{x}\exp\left( \frac{\Phi_{xy}-v_{y}-\sigma}{\sigma}\right) }
# \end{align*}
#
# It is called the "iterated projection fitting procedure" (ipfp), aka "matrix scaling", "RAS algorithm", "Sinkhorn-Knopp algorithm", "Kruithof's method", "Furness procedure", "biproportional fitting procedure", "Bregman's procedure". See survey in Idel (2016).
#
# Maybe the most often reinvented algorithm in applied mathematics. Recently rediscovered in a machine learning context.
# ### Econometrics of matching
#
# The goal is to estimate the matching surplus $\Phi_{xy}$. For this, take a linear parameterization
#
# \begin{align*}
# \Phi_{xy}^{\beta}=\sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}.
# \end{align*}
#
# Following Choo and Siow (2006), Galichon and Salanie (2011) introduce logit heterogeneity in individual preferences and show that the equilibrium now maximizes the *regularized Monge-Kantorovich problem*
#
# \begin{align*}
# W\left( \beta\right) =\max_{\pi\in\mathcal{M}\left( P,Q\right) }\sum_{xy}\pi_{xy}\Phi_{xy}^{\beta}-\sigma\sum_{xy}\pi_{xy}\ln\pi_{xy}
# \end{align*}
#
# By duality, $W\left( \beta\right) $ can be expressed
#
# \begin{align*}
# W\left( \beta\right) =\min_{u,v}\sum_{x}p_{x}u_{x}+\sum_{y}q_{y}v_{y}+\sigma\sum_{xy}\exp\left( \frac{\Phi_{xy}^{\beta}-u_{x}-v_{y}-\sigma}{\sigma}\right)
# \end{align*}
#
# and w.l.o.g. can set $\sigma=1$ and drop the additive constant $-\sigma$ in the $\exp$.
# ### Estimation
#
# We observe the actual matching $\hat{\pi}_{xy}$. Note that $\partial W/ \partial\beta^{k}=\sum_{xy}\pi_{xy}\phi_{xy}^{k},$\ hence $\beta$ is estimated by running
#
#
# \begin{align*}
# \min_{u,v,\beta}\sum_{x}p_{x}u_{x}+\sum_{y}q_{y}v_{y}+\sum_{xy}\exp\left(\Phi_{xy}^{\beta}-u_{x}-v_{y}\right) -\sum_{xy,k}\hat{\pi}_{xy}\beta_{k}\phi_{xy}^{k}
# \end{align*}
#
# which is still a convex optimization problem.
#
# This is actually the objective function of the log-likelihood in a Poisson regression with $x$ and $y$ fixed effects, where we assume
#
# \begin{align*}
# \pi_{xy}|xy\sim Poisson\left( \exp\left( \sum_{k=1}^{K}\beta_{k}\phi
# _{xy}^{k}-u_{x}-v_{y}\right) \right) .
# \end{align*}
# ### Poisson regression with fixed effects
#
# Let $\theta=\left( \beta,u,v\right) $ and $Z=\left( \phi,D^{x},D^{y}\right) $ where $D_{x^{\prime}y^{\prime}}^{x}=1\left\{ x=x^{\prime}\right\} $ and $D_{x^{\prime}y^{\prime}}^{y}=1\left\{ y=y^{\prime}\right\}$ are $x$-and $y$-dummies. Let $m_{xy}\left( Z;\theta\right) =\exp\left(\theta^{\intercal}Z_{xy}\right) $ be the parameter of the Poisson distribution.
#
# The conditional likelihood of $\hat{\pi}_{xy}$ given $Z_{xy}$ is
#
# \begin{align*}
# l_{xy}\left( \hat{\pi}_{xy};\theta\right) & =\hat{\pi}_{xy}\log m_{xy}\left( Z;\theta\right) -m_{xy}\left( Z;\theta\right) \\
# & =\hat{\pi}_{xy}\left( \theta^{\intercal}Z_{xy}\right) -\exp\left(\theta^{\intercal}Z_{xy}\right) \\
# & =\hat{\pi}_{xy}\left( \sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}-u_{x}-v_{y}\right) -\exp\left( \sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}-u_{x}-v_{y}\right)
# \end{align*}
#
# Summing over $x$ and $y$, the sample log-likelihood is
#
# \begin{align*}
# \sum_{xy}\hat{\pi}_{xy}\sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}-\sum_{x}p_{x}u_{x}-\sum_{y}q_{y}v_{y}-\sum_{xy}\exp\left( \sum_{k=1}^{K}\beta_{k}\phi_{xy}^{k}-u_{x}-v_{y}\right)
# \end{align*}
#
# hence we recover the [objective function](#objFun).
# ### From Poisson to pseudo-Poisson
#
# If $\pi_{xy}|xy$ is Poisson, then $\mathbb{E}\left[\pi_{xy}\right]=m_{xy}\left( Z_{xy};\theta\right) =\mathbb{V}ar\left( \pi_{xy}\right) $. While it makes sense to assume the former equality, the latter is a rather strong assumption.
#
# For estimation purposes, $\hat{\theta}$ is obtained by
#
# \begin{align*}
# \max_{\theta}\sum_{xy}l\left( \hat{\pi}_{xy};\theta\right) =\sum_{xy}\left(\hat{\pi}_{xy}\left( \theta^{\intercal}Z_{xy}\right) -\exp\left(\theta^{\intercal}Z_{xy}\right) \right)
# \end{align*}
#
# however, for inference purposes, one shall not assume the Poisson distribution. Instead
#
# \begin{align*}
# \sqrt{N}\left( \hat{\theta}-\theta\right) \Longrightarrow\left(A_{0}\right) ^{-1}B_{0}\left( A_{0}\right) ^{-1}
# \end{align*}
#
# where $N=\left\vert \mathcal{X}\right\vert \times\left\vert \mathcal{Y}\right\vert $ and $A_{0}$ and $B_{0}$ are estimated by
#
# \begin{align*}
# \hat{A}_{0} & =N^{-1}\sum_{xy}D_{\theta\theta}^{2}l\left( \hat{\pi}_{xy};\hat{\theta}\right) =N^{-1}\sum_{xy}\exp\left( \hat{\theta}^{\intercal}Z_{xy}\right) Z_{xy}Z_{xy}^{\intercal}\\
# \hat{B}_{0} & =N^{-1}\sum_{xy}\left( \hat{\pi}_{xy}-\exp\left( \hat{\theta}^{\intercal}Z_{xy}\right) \right) ^{2}Z_{xy}Z_{xy}^{\intercal}.
# \end{align*}
#
#
# # DG XXX ESTIMATION HERE, USING EXP FAMILIES
# # The gravity equation
#
# "Structural gravity equation" (Anderson and van Wincoop, 2003) as exposited in Head and Mayer (2014)
# handbook chapter:
#
# \begin{align*}
# X_{ni}=\underset{S_{i}}{\underbrace{\frac{Y_{i}}{\Omega_{i}}}}\underset{M_{n}}{\underbrace{\frac{X_{n}}{\Psi_{n}}}}\Phi_{ni}%
# \end{align*}
#
# where $n$=importer, $i$=exporter, $X_{ni}$=trade flow from $i$ to $n$, $Y_{i}=\sum_{n}X_{ni}$ is value of production, $X_{n}=\sum_{i}X_{ni}$ is importers' expenditures, and $\phi_{ni}$=bilateral accessibility of $n$ to $i$.
#
# $\Omega_{i}$ and $\Psi_{n}$ are \textquotedblleft multilateral resistances\textquotedblright, satisfying the set of implicit equations
#
# \begin{align*}
# \Psi_{n}=\sum_{i}\frac{\Phi_{ni}Y_{i}}{\Omega_{i}}\text{ and }\Omega_{i}%
# =\sum_{n}\frac{\Phi_{ni}X_{n}}{\Psi_{n}}%
# \end{align*}
#
# These are exactly the same equations as those of the regularized OT.
# ## Explaining trade
#
# Parameterize $\Phi_{ni}=\exp\left( \sum_{k=1}^{K}\beta_{k}D_{ni}^{k}\right) $, where the $D_{ni}^{k}$ are $K$ pairwise measures of distance between $n$ and $i$. We have
#
# \begin{align*}
# X_{ni}=\exp\left( \sum_{k=1}^{K}\beta_{k}D_{ni}^{k}-s_{i}-m_{n}\right)
# \end{align*}
#
# where fixed effects $s_{i}=-\ln S_{i}$ and $m_{n}=-\ln M_{n}$ are adjusted by
#
# \begin{align*}
# \sum_{i}X_{ni}=Y_{i}\text{ and }\sum_{n}X_{ni}=X_{n}.
# \end{align*}
#
# Standard choices of $D_{ni}^{k}$'s:
#
# * Logarithm of bilateral distance between $n$ and $i$
#
# * Indicator of contiguous borders; of common official language; of
# colonial ties
#
# * Trade policy variables: presence of a regional trade agreement; tariffs
#
# * Could include many other measures of proximity, e.g. measure of genetic/cultural distance, intensity of communications, etc.
# We will solve this model by fixing a $\beta$ and solving the matching problem using IPFP. Then in an outer loop we will solve for the $\beta$ which minimizes the distance between model and empirical moments.
# In[6]:
sigma = 1
maxiterIpfp = 1000
maxiter = 500
tolIpfp = 1e-12
tolDescent = 1e-12
t_s = 0.03
iterCount = 0
contIter = True
v_it = np.zeros((nbi, nbt))
beta_k = np.repeat(0, nbk)
thegrad = np.repeat(0, nbk)
pi_nit = []
theval_old = -math.inf
ptm = time.time()
while(contIter):
#print("Iteration", iterCount)
for t in range(nbt):
#print("Year", t)
D_ij_k = Dnikt[t]
Phi = D_ij_k.dot(beta_k.reshape(nbk,1)).reshape(nbi,nbi)
contIpfp = True
iterIpfp = 0
v = v_it[:, t].reshape(1,nbi)
f = f_nit[t]
g = g_nit[t]
K = np.exp(Phi/sigma)
np.fill_diagonal(K,0)
fK = np.multiply(f,K)
gK = np.multiply(g,K)
while(contIpfp):
iterIpfp = iterIpfp + 1
u = sigma * np.log(np.sum(np.multiply(gK,np.exp((-IX.dot(v))/sigma)), axis = 1)).flatten()
vnext = sigma * np.log(np.sum(np.multiply(fK,np.exp((-u.T.dot(tIY))/sigma)), axis = 0))
error = np.max(np.abs(np.sum(np.multiply(gK,np.exp((-IX.dot(vnext) - u.T.dot(tIY))/sigma)), axis = 1) - 1))
if (error < tolIpfp or iterIpfp >= maxiterIpfp):
contIpfp = False
v = vnext
v_it[:,t] = np.asarray(v)[0]
fgK = np.multiply(f,gK)
pi_nit.append(np.multiply(fgK,np.exp((-IX.dot(v) - u.T.dot(tIY))/sigma)))
thegrad = thegrad + (pi_nit[t]-pihat_nit[t]).flatten(order = 'F').dot(D_ij_k)
beta_k = beta_k - t_s * thegrad
nonzero_pi_nit = np.concatenate(pi_nit).ravel()[np.where(np.concatenate(pi_nit).ravel()>0, True, False)]
theval = float(np.sum(np.multiply(thegrad,beta_k), axis = 1)) - sigma * float(np.sum(np.multiply(nonzero_pi_nit, np.log(nonzero_pi_nit)),axis=(0,1)))
iterCount = iterCount + 1
if (iterCount > maxiter or np.abs(theval - theval_old) < tolDescent):
contIter = False
theval_old = theval
thegrad = np.repeat(0, nbk)
pi_nit = []
diff = time.time() - ptm
print('Time elapsed = ', diff, 's.')
# In[7]:
beta_k = beta_k/sdD_k
print(beta_k)
# We recover the PPML estimates on Table 1 p. 42 of [Yotov et al.'s book](https://www.wto.org/english/res_e/booksp_e/advancedwtounctad2016_e.pdf)
# ### Comparison XXXXXXXXXXXXXXXXXXXX
# We can compare the results and speed of our computation to that of Poisson regression packages. As a warning, these give the same results, but at the cost of a much longer run time, so use at your own risk. We can solve instead using the Poisson regression from the glm package. XXXXXXXXXXXX