In [1]:
import pandas as pd

df = pd.DataFrame(data = [[2018, "a", "b", 500, 1500, 700],
                          [2018, "a", "c", 500, 400, 50],
                          [2018, "b", "a", 1500, 500, 700],
                          [2018, "b", "d", 1500, 750, 200],
                          [2018, "c", "d", 400, 750, 375]],
                  columns = ["year", "exporter", "importer", "importer_gdp", "exporter_gdp", "distance"])
df
Out[1]:
year exporter importer importer_gdp exporter_gdp distance
0 2018 a b 500 1500 700
1 2018 a c 500 400 50
2 2018 b a 1500 500 700
3 2018 b d 1500 750 200
4 2018 c d 400 750 375
In [2]:
df['check_string'] = df.apply(lambda row: ''.join(sorted([row['exporter'], row['importer']])), axis=1)
df
Out[2]:
year exporter importer importer_gdp exporter_gdp distance check_string
0 2018 a b 500 1500 700 ab
1 2018 a c 500 400 50 ac
2 2018 b a 1500 500 700 ab
3 2018 b d 1500 750 200 bd
4 2018 c d 400 750 375 cd
In [3]:
df.drop_duplicates(subset="check_string", inplace=True)
df
Out[3]:
year exporter importer importer_gdp exporter_gdp distance check_string
0 2018 a b 500 1500 700 ab
1 2018 a c 500 400 50 ac
3 2018 b d 1500 750 200 bd
4 2018 c d 400 750 375 cd
In [4]:
export_group = df.groupby("exporter")

df2 = pd.DataFrame(
                    1/(export_group.importer_gdp.sum()/export_group.distance.sum())
                  ).reset_index().rename(columns={0:"remoteness"})
df2
Out[4]:
exporter remoteness
0 a 0.750000
1 b 0.133333
2 c 0.937500
In [5]:
df = df.merge(df2, how="left", on="exporter")
df
Out[5]:
year exporter importer importer_gdp exporter_gdp distance check_string remoteness
0 2018 a b 500 1500 700 ab 0.750000
1 2018 a c 500 400 50 ac 0.750000
2 2018 b d 1500 750 200 bd 0.133333
3 2018 c d 400 750 375 cd 0.937500