Import packages and read the cleaned beacon dataset into pandas:
from collections import defaultdict
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import json
import time
import os
import re
# Load the cleaned beacon dataset from data/beacon_active.csv.
beaconActive = pd.read_csv(
    os.path.join('data', 'beacon_active.csv'),
)
Rename the beaconActive DataFrame's 'country_consolidated' variable as 'tld' (top-level domain):
beaconActive.rename(columns={'country_consolidated': 'tld'}, inplace=True)

# Read the World Bank workbook, keep only the economy, income group and domain
# columns, and give them the names used in the rest of the analysis.
wbGroups = (
    pd.read_excel(os.path.join('data', 'wb_tlds.xlsx'))
    [['Economy', 'Income group', 'Domain']]
    .rename(columns={
        'Economy': 'country',
        'Income group': 'income_group',
        'Domain': 'tld',
    })
)

# Count the missing values in each column.
wbGroups.isna().sum()
country 0 income_group 0 tld 1 dtype: int64
# One row has no 'tld' value; display it.
wbGroups.loc[wbGroups['tld'].isna()]
country | income_group | tld | |
---|---|---|---|
141 | Namibia | Upper middle income | NaN |
# Namibia is missing! Its top-level domain 'NA' was read into pandas as NaN,
# so put the literal string 'NA' back into the World Bank 'tld' column.
wbGroups['tld'] = wbGroups['tld'].fillna('NA')

# Do the same for Namibian contexts in the beacon data, identified by '.na/'
# in the OAI URL. regex=False makes the dot a literal dot; as a regex, '.na/'
# would also match any URL containing e.g. 'ana/' and mislabel it as Namibia.
# na=False treats a missing URL as "no match" instead of propagating NaN.
is_namibian = beaconActive['oai_url'].str.contains('.na/', regex=False, na=False)
beaconActive.loc[is_namibian, 'tld'] = 'NA'
Filter the World Bank income group dataset to include only those countries that match beaconActive:
# Keep only the World Bank rows whose 'tld' also occurs in beaconActive.
# reset_index() renumbers the rows and stores the old labels in an 'index'
# column.
wbGroups = wbGroups.loc[
    wbGroups['tld'].isin(beaconActive['tld'])
].reset_index()

# Summarise the filtered frame.
wbGroups.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 139 entries, 0 to 138 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 index 139 non-null int64 1 country 139 non-null object 2 income_group 139 non-null object 3 tld 139 non-null object dtypes: int64(1), object(3) memory usage: 4.5+ KB
# Show every row whose 'tld' repeats one seen earlier in the frame.
wbGroups.loc[wbGroups.duplicated(subset='tld')]
index | country | income_group | tld | |
---|---|---|---|---|
134 | 216 | Kosovo | Upper middle income | AL |
# Kosovo shares a top-level domain with Albania, and both belong to the same
# income_group. The merge key is 'tld', so keep only the first row per domain.
# Dropping by duplicate 'tld' rather than by a hard-coded row label keeps this
# step correct if the input data (and therefore the row numbering) changes.
wbGroups.drop_duplicates(subset='tld', keep='first', inplace=True)

# Drop the 'index' column
wbGroups.drop(columns='index', inplace=True)

# List the income groups present in the data.
for group in wbGroups['income_group'].unique():
    print(group)
Low income Lower middle income Upper middle income High income Unclassified
Change Venezuela's income classification from 'Unclassified' (2021) to its previous classification (2020):
# Relabel every income group containing 'Unclassified' as
# 'Upper middle income'; all other labels are left as they are.
wbGroups['income_group'] = np.where(
    wbGroups['income_group'].str.contains('Unclassified'),
    'Upper middle income',
    wbGroups['income_group'],
)

# Attach country and income group to each beacon row that has a matching 'tld'.
incomeGroups = pd.merge(beaconActive, wbGroups, how='inner', on='tld')
print(incomeGroups.shape)
(25651, 40)
# Count the missing values in each column of the merged frame.
incomeGroups.isna().sum()
Unnamed: 0 0 oai_url 0 application 0 version 34 admin_email 658 earliest_datestamp 99 repository_name 4139 set_spec 0 context_name 0 stats_id 0 total_record_count 0 issn 2819 country_marc 2856 country_issn 2931 country_tld 5276 country_ip 191 tld 0 last_completed_update 22 first_beacon 0 last_beacon 0 last_oai_response 99 unresponsive_endpoint 0 unresponsive_context 0 record_count_2010 0 record_count_2011 0 record_count_2012 0 record_count_2013 0 record_count_2014 0 record_count_2015 0 record_count_2016 0 record_count_2017 0 record_count_2018 0 record_count_2019 0 record_count_2020 0 record_count_2021 0 issn_1 2819 issn_2 17232 journal_url 0 country 0 income_group 0 dtype: int64
# Plot the ten countries with the most active journals using OJS
sns.set(font_scale=1.25)
sns.set_style('whitegrid')

# Count journals per country once and reuse the result for both axes.
top_countries = incomeGroups['country'].value_counts().head(10)
# Denominator for the percentage labels, taken from the data rather than
# hard-coded so the labels stay correct if the dataset changes.
total_journals = len(incomeGroups)

fig, ax = plt.subplots()
# x and y are already vectors, so no `data=` frame is passed.
inc = sns.barplot(x=top_countries.to_numpy(),
                  y=top_countries.index.to_numpy(),
                  color='grey',
                  ax=ax)
ax.set(xlim=(0, 15500),
       xlabel='Active journals using OJS',
       ylabel='Country')
sns.despine(bottom=True)

tick_positions = [2500, 5000, 7500, 10000, 12500]
plt.xticks(tick_positions, [f'{tick:,}' for tick in tick_positions])

# Label each bar with its count (thousands-separated) and share of the total.
# A single format string handles every magnitude, so 4-digit counts keep their
# comma like the 5-digit ones.
# NOTE(review): the percent sign sits outside the parentheses, e.g. '(12.3)%',
# as in the original labels - confirm this is the intended style.
for p in inc.patches:
    count = int(p.get_width())
    percent = round((count / total_journals) * 100, 1)
    inc.text(p.get_x() + p.get_width() + 200,
             p.get_y() + p.get_height() - 0.125,
             f'{count:,} ({percent})%',
             ha='left',
             weight='bold')

# Make sure the output directory exists before saving the figure.
os.makedirs('vis', exist_ok=True)
inc.figure.savefig(os.path.join('vis', 'OJScountries.png'), bbox_inches='tight')
# Plot the distribution of OJS journals by World Bank income group
# Denominator for the percentage labels, taken from the data rather than
# hard-coded so the labels stay correct if the dataset changes.
total_journals = len(incomeGroups)

fig, ax = plt.subplots()
inc = sns.countplot(y='income_group',
                    order=['Low income',
                           'Lower middle income',
                           'Upper middle income',
                           'High income'],
                    data=incomeGroups,
                    color='grey',
                    ax=ax)
ax.set(xlim=(0, 17500),
       xlabel='Active journals using OJS',
       ylabel='World Bank income group')
sns.despine(bottom=True)

tick_positions = [2500, 5000, 7500, 10000, 12500, 15000]
plt.xticks(tick_positions, [f'{tick:,}' for tick in tick_positions])

# Label each bar with its count (thousands-separated) and share of the total.
# A single format string handles every magnitude, so small groups also get a
# percentage and 4-digit counts keep their comma.
# NOTE(review): the percent sign sits outside the parentheses, e.g. '(12.3)%',
# as in the original labels - confirm this is the intended style.
for p in inc.patches:
    count = int(p.get_width())
    percent = round((count / total_journals) * 100, 1)
    inc.text(p.get_x() + p.get_width() + 250,
             p.get_y() + p.get_height() - 0.3,
             f'{count:,} ({percent})%',
             ha='left',
             weight='bold')

# Make sure the output directory exists before saving the figure.
os.makedirs('vis', exist_ok=True)
inc.figure.savefig(os.path.join('vis', 'OJSincome_groups.png'), bbox_inches='tight')