This notebook provides all the steps needed to replicate the results of our paper "Expanding the measurement of culture with a sample of two billion humans", published in the Journal of the Royal Society Interface 19:20220085 (2022).
Let's start by importing the required packages
#%pylab --no-import-all
%matplotlib inline
import sys, os, time
import numpy as np
import pandas as pd
pd.set_option('display.width', 160)
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as mpatches
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, manhattan_distances, pairwise_distances
from scipy.stats import zscore
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy import spatial, stats
from scipy.stats import zscore
import MantelTest.MantelTest as MantelTest
import re
import seaborn as sns
Let's set up our paths
# Analysis configuration.
# `mytype` and `cut` name the output sub-directories below; later cells also
# use them to pick the `<mytype>_tot` column and the side of the median.
mytype = 'penetration'
cut = 'above'

# Input locations.
pathfb = './data/'
pathfbor = './data/OriginalData/'
pathregs = pathfb + 'Regs/'
# NOTE: this evaluates to './data//DemographicData/' (double slash); the
# string is kept unchanged.
pathsamples = pathfb + '/DemographicData/'

# Output location: <pathregs>/Representativeness/<mytype>/<cut>/
pathout = pathregs + 'Representativeness/' + mytype + '/' + cut + '/'
pathshare = pathout

# makedirs creates any missing parent directories (os.mkdir would raise
# FileNotFoundError if e.g. './data/Regs/' did not exist yet), and
# exist_ok=True makes re-running the cell a no-op.
os.makedirs(pathfbor, exist_ok=True)
os.makedirs(pathout, exist_ok=True)
# Type of distance measure
m = 'Cos'

# Full dataset
df = pd.read_stata(pathregs + 'AllDistsFull.dta')

# Distances; columns whose name ends in 'uk' or 'usa', or contains 'cognate',
# are removed in place.
mypairs = pd.read_stata(pathregs + 'AllDists.dta')
unwanted_cols = [
    col for col in mypairs.columns
    if col.endswith(('uk', 'usa')) or 'cognate' in col
]
mypairs.drop(columns=unwanted_cols, inplace=True)

# Country samples: sheets at index 1 and 2 of the workbook (only empty cells
# are read as missing), plus the Stata file with the representativeness data.
samples_workbook = pathsamples + 'SampleCountries.xlsx'
excel_opts = dict(keep_default_na=False, na_values=[''])
sample_all = pd.read_excel(samples_workbook, sheet_name=1, **excel_opts)
sample_wvs = pd.read_excel(samples_workbook, sheet_name=2, **excel_opts)
sample_repr = pd.read_stata(pathsamples + 'representative_all.dta')
/Users/ozak/anaconda3/envs/GeoPython39env/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Unknown extension is not supported and will be removed warn(msg)
sample_repr  # display the frame loaded from representative_all.dta
countrycode3 | pop_tot | shareWDI_age | shareWDI_gender | countryname | fbpenetration | WVS | shareFB_age | shareFB_gender | countrynameFB | penetrationWDI_age | penetration_tot | diffgender | diffage | pctage | pctgender | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | CHN | 1.386395e+09 | 0.288391 | 0.514821 | China | 0.00 | 1.0 | 0.657890 | 0.615631 | CN | 0.002946 | 0.004544 | 0.100810 | 0.369499 | 0.052015 | 0.007033 |
1 | TCD | 1.489999e+07 | 0.553384 | 0.499404 | Chad | 0.02 | NaN | 0.649067 | 0.834220 | TD | 0.015872 | 0.009396 | 0.334817 | 0.095683 | 0.079544 | 0.016224 |
2 | TKM | 5.758075e+06 | 0.409793 | 0.490754 | Turkmenistan | 0.00 | NaN | 0.327031 | 0.472757 | TM | 0.004564 | 0.011462 | 0.017997 | 0.082761 | 0.093721 | 0.019765 |
3 | SSD | 1.257571e+07 | 0.515470 | 0.499250 | South Sudan | 0.01 | NaN | 0.538432 | 0.783775 | SS | 0.031825 | 0.017494 | 0.284525 | 0.022962 | 0.108701 | 0.027796 |
4 | CAF | 4.659080e+06 | 0.529828 | 0.492986 | Central African Republic | 0.02 | NaN | 0.519295 | 0.709647 | CF | 0.043283 | 0.023610 | 0.216661 | 0.010534 | 0.122169 | 0.041294 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
171 | BRN | 4.286970e+05 | 0.348671 | 0.517849 | Brunei Darussalam | 0.81 | NaN | 0.434901 | 0.563942 | BN | 1.139706 | 1.003039 | 0.046093 | 0.086230 | NaN | NaN |
172 | BHR | 1.492584e+06 | 0.339478 | 0.660007 | Bahrain | 0.70 | NaN | 0.460658 | 0.737863 | BH | 0.920223 | 1.004969 | 0.077856 | 0.121181 | NaN | NaN |
173 | KWT | 4.136528e+06 | 0.271252 | 0.589644 | Kuwait | 0.74 | NaN | 0.433159 | 0.708900 | KW | 0.940266 | 1.039519 | 0.119256 | 0.161907 | NaN | NaN |
174 | ARE | 9.400145e+06 | 0.286527 | 0.759823 | United Arab Emirates | 0.91 | NaN | 0.475820 | 0.742728 | AE | 1.122771 | 1.170195 | 0.017095 | 0.189293 | NaN | NaN |
175 | QAT | 2.639211e+06 | 0.369710 | 0.791233 | Qatar | 0.85 | NaN | 0.571068 | 0.762519 | QA | 1.083616 | 1.250374 | 0.028714 | 0.201358 | NaN | NaN |
176 rows × 16 columns
# Generate cutoffs
# Flag (1/0) the countries whose `<mytype>_tot` value lies strictly above the
# median when cut == 'above', and strictly below it for any other `cut`.
# Two medians are used: over all rows, and over the rows with WVS == 1.
total_values = sample_repr[mytype + '_tot']
beyond_cutoff = total_values.gt if cut == 'above' else total_values.lt
median_all = total_values.median()
median_wvs = total_values[sample_repr.WVS == 1].median()
sample_repr['diff' + mytype + '_median'] = beyond_cutoff(median_all).astype(int)
sample_repr['diff' + mytype + '_median_wvs'] = beyond_cutoff(median_wvs).astype(int)

# Country-code lists for each sample
mysample_all = sample_all.countrycode2.unique()
mysample_wvs = sample_wvs.countrycode2.unique()
# Only the countries flagged against the WVS-based median are retained
mysample_repr = sample_repr.loc[
    sample_repr['diff' + mytype + '_median_wvs'] == 1
].countrynameFB.unique()

# Drop countries not in sample
mysample = mysample_repr
# Vectorized membership test. NaN is removed from the lookup values because
# isin() would otherwise match a missing code against a missing sample entry;
# rows with a missing ISO code must never be kept.
sample_codes = pd.Series(mysample).dropna()
mypairs = mypairs.loc[
    mypairs.ISO_CODE_1.isin(sample_codes) & mypairs.ISO_CODE_2.isin(sample_codes)
].copy()
df = df.loc[
    df.ISO_CODE_1.isin(sample_codes) & df.ISO_CODE_2.isin(sample_codes)
].copy()
# Font sizes shared by every figure in the notebook
SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 24, 28, 32

# One entry per rc group; each keyword is the setting changed in that group.
rc_font_settings = (
    ('font', {'size': SMALL_SIZE}),            # default text sizes
    ('axes', {'titlesize': SMALL_SIZE,         # axes title
              'labelsize': MEDIUM_SIZE}),      # x and y labels
    ('xtick', {'labelsize': SMALL_SIZE}),      # x tick labels
    ('ytick', {'labelsize': SMALL_SIZE}),      # y tick labels
    ('legend', {'fontsize': SMALL_SIZE}),      # legend text
    ('figure', {'titlesize': BIGGER_SIZE}),    # figure title
)
for rc_group, rc_kwargs in rc_font_settings:
    plt.rc(rc_group, **rc_kwargs)