This link navigates to a Google doc with examples of online journals using OJS to publish open access articles in 60 different languages.
Import packages:
from collections import defaultdict
from collections import Counter
from lxml import html
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import pandas as pd
import numpy as np
import json
import time
import re
import os
Initialize gcld3:
import gcld3
classifier = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=10000)
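A quick sanity check of the classifier on a short string (a minimal sketch; the sample sentence is arbitrary, and .language and .is_reliable are the result attributes this notebook relies on):
sample = classifier.FindLanguage(text="Dette er en dansk sætning.") #classify a short Danish sentence
print(sample.language, sample.is_reliable) #expected output: da True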
Store a list of gcld3 language codes corresponding to the 60 different publishing languages in use across OJS journals (except Faroese and Balochi, which are unsupported by gcld3):
known_langs = ['af', 'al', 'ar', 'bg', 'bg-Latn', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'el-Latn', 'en', 'es', 'et',
'eu','fa', 'fi', 'fil', 'fr', 'gd', 'gl', 'hi', 'hi-Latn', 'hr', 'hu', 'hy', 'id', 'ig', 'is', 'it',
'ja','ja-Latn', 'ka', 'kk', 'ko', 'ku', 'lt', 'mk', 'ms', 'ne', 'nl', 'no', 'pl', 'pt', 'ro', 'ru',
'ru-Latn','si','sk', 'sl', 'sr', 'sv', 'sw', 'ta', 'th', 'tr', 'uk', 'ur', 'uz', 'vi', 'zh', 'zh-Latn']
Create a function that scans the metadata dump line by line, matches each article's source ISSN against a filter list, and uses gcld3 to classify up to 100 recent abstracts per journal:
def classify_abstracts(path_to_dump, classifier, issn_filter, lang_filter):
metadata_pattern = '<metadata>.+</metadata>'
issn2langs = defaultdict(list) #defaultdict of lists
dcount = defaultdict(int)
#Full processing
with open(path_to_dump, 'r') as f:
article_count = 0
for line in f:
content = re.search(metadata_pattern, line, re.MULTILINE | re.DOTALL)
if content:
tree = html.fromstring(content.group())
for article in tree.xpath('//metadata'):
article_count += 1
for source in article.xpath('.//source'):
source_copy = str(source.text)
                        source_copy = re.sub(r'\s', '', source_copy) #strip all whitespace (raw string avoids an invalid-escape warning)
if source_copy in issn_filter:
issn = source_copy
for description in article.xpath('.//description'):
                                if dcount[issn] < 100 and description.text is not None: #classify at most 100 abstracts per journal; skip empty descriptions
                                    pred_ = classifier.FindLanguage(text=description.text) #run gcld3
if pred_.is_reliable and pred_.language in lang_filter:
#if the language prediction is reliable and in a known OJS language
issn2langs[issn].append(pred_.language)
#append to a list of language predictions for the journal
dcount[issn] += 1
del pred_
                #free memory as we go: delete previously parsed siblings (lxml fast-iteration idiom)
                while tree.getprevious() is not None:
                    del tree.getparent()[0]
del content
print(f"Articles scanned: {article_count}")
print(f"Journals classified: {len(issn2langs)}")
print(f"Missing issns: {set(issn_filter) - set(list(issn2langs.keys()))}")
return issn2langs
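To make the parsing step concrete, here is a minimal sketch of the same extraction logic applied to a fabricated one-line record (the sample string and its field values are hypothetical; the <metadata>, <source>, and <description> tags are the ones the function above searches for):
sample = '<record><metadata><source>1234-5678</source><description>Contoh abstrak.</description></metadata></record>'
match = re.search('<metadata>.+</metadata>', sample)
tree = html.fromstring(match.group())
print(tree.xpath('.//source')[0].text) #1234-5678
print(tree.xpath('.//description')[0].text) #Contoh abstrak.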
path_to_dump = os.path.join('data', 'datadump.txt')
path_to_beacon = os.path.join('data', 'beacon_active.csv')
with open(path_to_beacon, 'r') as f:
df = pd.read_csv(f)
df = df[~df['issn_1'].duplicated()]
issn_filter = [i for i in df['issn_1'].tolist() if isinstance(i, str)]
print(len(issn_filter))
22809
%time issn2langs = classify_abstracts(path_to_dump, classifier, issn_filter, lang_filter=known_langs)
Articles scanned: 7960979
Journals classified: 22559
Missing issns: {'2501-9430', '1047-6857', '2364-3714', '2525-8281', '1856-6073', '1678-9059', '2451-3962', '2358-1069', '2540-8445', '2721-5148', '2798-6241', '2655-2469', '2721-6020', '2734-9314', '2528-4967', '2734-9330', '1411-6340', '2775-5592', '2302-8432', '2355-1720', '2477-5029', '2503-3417', '1668-8708', '2615-6911', '2721-9976', '2598-0637', '1096-746X', '2451-1862', '2222-6737', '2775-1937', '2153-4012', '2600-5689', '1411-2280', '2722-3736', '2086-7840', '2183-0134', '1747-7387', '1693-4458', '2538-399X', '2501-2428', '0436-0265', '2716-3679', '2316-5324', '2655-948X', '2597-8985', '2621-783X', '2722-7960', '2620-5068', '1549-4497', '2598-9626', '2722-7111', '2723-1186', '2580-3123', '2460-352X', '2686-4908', '2622-5867', '1693-461X', '1754-4270', '2685-2799', '2614-4042', '2085-8744', '0124-1192', '2359-1382', '2622-6138', '2602-0254', '2668-9928', '1216-6804', '0103-4979', '0797-8952', '2215-9827', '2723-5319', '2339-1499', '1981-6979', '2656-1565', '1705-9100', '2337-5973', '2549-4317', '2291-8639', '2027-7636', '1987-037X', '1314-586X', '2745-6889', '2723-4088', '2442-9910', '2541-7207', '2501-5915', '2655-6065', '2549-2454', '2716-5043', '2338-0683', '2490-1199', '2715-2138', '2715-5889', '1997-3837', '0258-2724', '2447-7028', '2085-8205', '2238-8494', '2501-7136', '2334-1645', '2526-7744', '2722-0516', '2668-1056', '2375-7817', '2331-6950', '2722-2012', '0315-3681', '0216-7395', '2089-8118', '2734-5475', '2714-6278', '2734-9349', '2621-0622', '0798-0329', '2599-0543', '1978-0125', '2501-1111', '2708-7530', '1693-7619', '2654-5667', '2229-5674', '2735-9417', '2238-944X', '2525-2003', '1806-4280', '2675-4142', '2721-5016', '2502-471X', '2655-6324', '2732-3587', '1978-2403', '2723-7443', '2477-5258', '2620-5726', '2710-0898', '0216-7298', '2489-5512', '2501-9988', '2252-4797', '2350-0123', '2715-4971', '2579-9193', '2252-5262', '2715-1018', '2338-3720', '1081-1451', '1679-4605', '2094-1277', '0940-7855', '1982-6109', '2184-7193', '2615-2037', '2746-6434', '1412-0712', '2537-1754', '2294-9844', '2596-1837', '2668-9758', '2716-408X', '2722-5089', '1979-052X', '2501-1235', '2720-9903', '2406-8802', '2501-8590', '2460-8076', '2366-9217', '2461-0623', '2559-7914', '2698-5446', '1531-0167', '2580-6912', '2359-5965', '2764-1066', '2721-8511', '2685-6123', '2088-4605', '2548-7523', '2312-2528', '2734-9306', '2447-0899', '1980-5772', '2337-568X', '2302-934X', '2763-8669', '2595-9026', '2447-3472', '0126-074X', '2620-5505', '2711-4716', '2734-9357', '2460-3236', '2549-6778', '1412-226X', '2356-5225', '2684-9062', '2303-1409', '2614-719X', '2721-3315', '2622-9765', '2541-6030', '0125-9326', '2527-5445', '2540-9417', '2007-3380', '2699-5433', '2594-813X', '2179-8168', '1678-4944', '1411-2973', '2623-162X', '2174-7210', '2561-7141', '2597-5277', '2599-0551', '2716-0394', '2549-3485', '2615-8396', '2549-3361', '2528-1569', '2076-6327', '0208-5712', '2501-9120', '2686-2565', '1693-024X', '2797-5967', '2198-9397', '2089-6980', '2710-091X', '2406-8616', '2685-712x', '2601-971X', '2616-2504', '1669-726X', '2540-9808', '1412-4246', '2807-9256', '2408-350X', '2685-6425', '2616-6291', '2601-1972', '2614-5944', '2628-7129', '2685-161X', '1806-1230', '2807-887X', '2303-002X', '2685-5070', '0124-0625', '2252-7141'}
CPU times: user 1h 22min 58s, sys: 55 s, total: 1h 23min 53s
Wall time: 1h 23min 54s
Sanity check:
with open(os.path.join('data', 'issn2langs.json'), 'w') as outfile:
json.dump(issn2langs, outfile)
print(type(issn2langs))
for k, v in issn2langs.items():
print(k) #issn for one journal
print(v) #list of gcld3 language classifications for most recent 100 or fewer articles published in journal
break
<class 'collections.defaultdict'>
2715-2502
['id', 'en', 'id', 'id', 'id', 'id', 'id', 'en', 'id', 'en', 'en', 'id', 'id', 'ms', 'id', 'en', 'id', 'en', 'id', 'id', 'en']
Create a DataFrame, AA, that joins gcld3-predicted language codes with data from the beacon. This DataFrame will be used to verify gcld3 language predictions for each journal.
#reload dict {issn: list of gcld3-predicted language codes}
with open(os.path.join('data','issn2langs.json'), 'r') as infile:
issn2langs = json.load(infile)
#load dict {issn: text sample of concatenated titles and abstracts}
with open(os.path.join('data','issn2payload.json'), 'r') as infile:
issn2payload = json.load(infile)
#{issn: primary language}
l1 = {}
for k, v in issn2langs.items():
l1[k] = Counter(v).most_common(1)[0][0]
#{issn: secondary language}
l2 = {}
for k, v in issn2langs.items():
try:
l2[k] = Counter(v).most_common(2)[1][0]
except IndexError:
l2[k] = None
continue
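For reference, a quick illustration of the Counter.most_common indexing used above (the sample list is hypothetical):
sample = Counter(['id', 'en', 'id'])
print(sample.most_common(2)) #[('id', 2), ('en', 1)]: [0][0] is the primary language, [1][0] the secondary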
# issn | primary language predicted by gcld3 | secondary language predicted by gcld3
dfL = pd.DataFrame({'issn': l1.keys(),
'pred_1': l1.values(),
'pred_2': l2.values(),
})
# issn | text sample of concatenated titles and abstracts
dfP = pd.DataFrame({'issn': issn2payload.keys(),
'text':issn2payload.values()})
# issn | primary language | secondary language | text sample
dfA = pd.merge(dfL, dfP, how='outer')
#load beacon data
with open('data/beacon_active.csv', 'r') as infile:
bA = pd.read_csv(infile)
#select beacon columns useful for language verification
bA = bA[['context_name', 'issn_1', 'issn_2', 'country_consolidated', 'journal_url']].copy()
#rename then merge into AA
bA.rename(columns = {'issn_1':'issn',
'issn_2':'issn_alt',
'country_consolidated':'tld'}, inplace = True)
AA = pd.merge(dfA, bA, on='issn')
#AA.to_csv('data/AA.csv')
#deduplicate issns
AA = AA[~AA['issn'].duplicated()]
#lowercase top-level domains
AA['tld'] = AA['tld'].str.lower()
AA['pred_1'].unique()
array(['id', 'af', 'en', 'ms', 'es', 'ar', 'pt', 'th', 'ca', 'el', 'uk', 'it', 'fr', 'is', 'de', 'no', 'ja', 'ru', 'tr', 'sv', 'hi', 'pl', 'sr', 'sl', 'cs', 'da', 'vi', 'lt', 'hu', 'hr', 'mk', 'zh', 'ta', 'kk', 'sw', 'gd', 'sk', 'et', 'fa', 'bs', 'eu', 'ro', 'bg', 'fil', 'ka', 'hy', 'uz', 'nl', 'fi', 'ne', 'ig', nan], dtype=object)
Function for applying gcld3 to journal titles and article-level text samples:
def tag_language(s):
l = classifier.FindLanguage(text=s)
if l.is_reliable:
return l.language
#assign plain lists (not pd.Series) so values align positionally; AA's index has gaps after deduplication
AA['title_language'] = [tag_language(s) if isinstance(s, str) else None for s in AA['context_name']]
AA['text_language'] = [tag_language(s) if isinstance(s, str) else None for s in AA['text']]
Function for adding journal issns and double-checked language predictions to a cleaned dict, clean_d:
def add2dict(d, issns, langs, corrections=False):
    count = 0
    corrs = 0
    for issn, l2 in zip(issns, langs):
        if corrections:
            #overwrite unconditionally; note that every entry passed with
            #corrections=True is counted as a correction, even if its issn
            #was not already in d (see the flagged outputs below)
            d[issn] = l2
            corrs += 1
            continue
        else:
            if issn in d:
                continue #never silently overwrite an existing entry
            else:
                d[issn] = l2
                count += 1
    print(f"{count} journal(s) added;\n{corrs} journals corrected;\n{len(d)} journals cleaned and stored in total")
clean_d = {}
add2dict(d=clean_d,
issns=['2445-6144', '2710-4850'],
langs=['Faroese', 'Balochi'])
2 journal(s) added; 0 journals corrected; 2 journals cleaned and stored in total
If a journal has a non-null pred_1 value but a null pred_2 value, it is probably publishing in language pred_1 (because the most recent 100 article abstracts were all tagged with a single language code, pred_1):
add2dict(d=clean_d,
issns=AA[(AA['pred_1'].notnull()) & (AA['pred_2'].isna())].issn.tolist(),
langs=AA[(AA['pred_1'].notnull()) & (AA['pred_2'].isna())].pred_1.tolist())
5791 journal(s) added; 0 journals corrected; 5793 journals cleaned and stored in total
If gcld3 predicted the same language for a journal's title title_language, article metadata text_language, and abstracts from the most recent 100 articles published pred_1, add the journal:
add2dict(d=clean_d,
issns=AA[(AA['pred_1'] == AA['text_language']) & (AA['pred_1'] == AA['title_language'])].issn.tolist(),
langs=AA[(AA['pred_1'] == AA['text_language']) & (AA['pred_1'] == AA['title_language'])].pred_1.tolist())
1172 journal(s) added; 0 journals corrected; 6965 journals cleaned and stored in total
If a journal's secondary language prediction pred_2 matches its top-level domain tld and the primary language predicted by gcld3 pred_1 is not English, pred_1 is probably an error, so add pred_2:
add2dict(d=clean_d,
issns=AA[(AA['pred_2'] == AA['tld']) & (AA['pred_1'] != 'en')].issn.tolist(),
langs=AA[(AA['pred_2'] == AA['tld']) & (AA['pred_1'] != 'en')].pred_2.tolist())
116 journal(s) added; 0 journals corrected; 7081 journals cleaned and stored in total
Check whether journals with an af Afrikaans primary language classification actually have an African top-level domain tld:
#store list of African country top-level domains
af_tlds = ['AO','BI','BJ','BF','BW','CF','CI','CM','CD','CG','KM','CV','ER','ET','GA','GH',
'GN','GM','GW','GQ','KE','LR','LS','MG','ML','MZ','MR','MU','MW','NA','NE','NG',
'RW','SD','SN','SL','SO','SS','ST','SZ','SC','TD','TG','TZ','UG','ZA','ZM','ZW']
AA['is_af'] = AA['tld'].isin([i.lower() for i in af_tlds]) #flag journals with African tlds
#AA[((AA['pred_1'] == 'af') | (AA['pred_2'] == 'af')) & (AA['is_af'] == True)]
add2dict(d=clean_d,
issns=['1013-1116', '0041-4751', '0041-476X', '0254-3486', '2006-1390'],
langs=['af', 'af', 'af', 'af', 'en'])
5 journal(s) added; 0 journals corrected; 7086 journals cleaned and stored in total
Since the remaining journals with pred_1 == af are misclassifications, add pred_2 instead:
add2dict(d=clean_d,
issns=AA[(AA['pred_1'] == 'af')].issn.tolist(),
langs=AA[(AA['pred_1'] == 'af')].pred_2.tolist())
70 journal(s) added; 0 journals corrected; 7156 journals cleaned and stored in total
If journals with an es Spanish primary language classification also have a Latin American top-level domain tld, they are probably publishing in Spanish:
latam_tlds = ['AW','AR','AG','BS','BZ','BO', #'BR' excluded: Brazilian journals publish in Portuguese
'BB','CL','CO','CR','CU','CW','KY','DM','DO','EC','GD','GT','GY','HN','HT','JM','KN','LC',
'GP','MX','NI','PA','PE','PR','PY','SV','SR','AN','SX','TC','TT','UY','VC','VE','VG','VI']
AA['is_latam'] = AA['tld'].isin([i.lower() for i in latam_tlds]) #flag journals with Latin American tlds
AA['is_latam'].value_counts()
False    20123
True      2468
Name: is_latam, dtype: int64
add2dict(d=clean_d,
issns=AA[(AA['pred_1'] == 'es') & (AA['is_latam'] == True)].issn.tolist(),
langs=AA[(AA['pred_1'] == 'es') & (AA['is_latam'] == True)].pred_1.tolist())
1681 journal(s) added; 0 journals corrected; 8837 journals cleaned and stored in total
Add journals with primary language prediction ja and top-level domain jp:
add2dict(d=clean_d,
issns=AA[(AA['pred_1'] == 'ja') & (AA['tld'] == 'jp')].issn.tolist(),
langs=AA[(AA['pred_1'] == 'ja') & (AA['tld'] == 'jp')].pred_1.tolist())
2 journal(s) added; 0 journals corrected; 8839 journals cleaned and stored in total
The remaining journals with pred_1 == ja are misclassifications, so add pred_2 instead:
add2dict(d=clean_d,
issns=AA[(AA['pred_1'] == 'ja')].issn.tolist(),
langs=AA[(AA['pred_1'] == 'ja')].pred_2.tolist())
63 journal(s) added; 0 journals corrected; 8902 journals cleaned and stored in total
#pd.set_option("display.max_rows", None)
#AA[(AA['pred_1'] == 'fr') & (AA['pred_2'].notnull())]
add2dict(d=clean_d,
issns=['0008-4123', '2341-0868', '2595-6752', '1026-2881', '2368-8076', '2665-7716', '0702-7818',
'1496-7308', '0002-4805', '1544-4953', '1499-6677', '0705-3657', '2605-0285'],
langs=['en', 'es', 'pt', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en'])
13 journal(s) added; 0 journals corrected; 8915 journals cleaned and stored in total
Add journals with primary language prediction ms and top-level domain my:
add2dict(d=clean_d,
issns=AA[(AA['pred_1'] == 'ms') & (AA['tld'] == 'my')].issn.tolist(),
langs=AA[(AA['pred_1'] == 'ms') & (AA['tld'] == 'my')].pred_1.tolist())
36 journal(s) added; 0 journals corrected; 8951 journals cleaned and stored in total
Add journals with primary language prediction pt and top-level domain br:
add2dict(d=clean_d,
issns=AA[(AA['pred_1'] == 'pt') & (AA['tld'] == 'br')].issn.tolist(),
langs=AA[(AA['pred_1'] == 'pt') & (AA['tld'] == 'br')].pred_1.tolist())
1762 journal(s) added; 0 journals corrected; 10713 journals cleaned and stored in total
Add journals with primary language prediction de and top-level domain de:
add2dict(d=clean_d,
issns=AA[(AA['pred_1'] == 'de') & (AA['tld'] == 'de')].issn.tolist(),
langs=AA[(AA['pred_1'] == 'de') & (AA['tld'] == 'de')].pred_1.tolist())
69 journal(s) added; 0 journals corrected; 10782 journals cleaned and stored in total
Add journals with primary language prediction hi or hi-Latn and top-level domain in (note the parentheses: without them, & binds tighter than | and the tld condition is not applied to both codes):
add2dict(d=clean_d,
         issns=AA[((AA['pred_1'] == 'hi') | (AA['pred_1'] == 'hi-Latn')) & (AA['tld'] == 'in')].issn.tolist(),
         langs=AA[((AA['pred_1'] == 'hi') | (AA['pred_1'] == 'hi-Latn')) & (AA['tld'] == 'in')].pred_1.tolist())
2 journal(s) added; 0 journals corrected; 10784 journals cleaned and stored in total
#AA[(AA['tld'] == 'ph') & (AA['pred_2'].notnull())]
add2dict(clean_d,
issns=['0012-2858', '2244-6001'],
langs=['en', 'fil'],
corrections=True)
0 journal(s) added; 2 journals corrected; 10784 journals cleaned and stored in total
#AA[(AA['tld'] == 'dk') & (AA['pred_2'].notnull())]
add2dict(d=clean_d,
issns=['2596-6200', '0909-0533', '2446-0591', '2446-3981', '2597-0704',
'1603-8509', '2246-2589', '2244-9140', '1904-5565', '0029-1528'],
langs=['da', 'da', 'da', 'en', 'da', 'da', 'da', 'da', 'da', 'da'])
9 journal(s) added; 0 journals corrected; 10793 journals cleaned and stored in total
#AA[AA['context_name'].str.contains('musi[ck]', regex=True, case=False)]
add2dict(d=clean_d,
issns=['0354-818X', '2312-2528'],
langs=['sr', 'en'])
2 journal(s) added; 0 journals corrected; 10795 journals cleaned and stored in total
add2dict(clean_d,
issns=['0011-3735'], #current musicology, which gcld3 classified as Japanese and Gaelic
langs=['en'], #https://currentmusicology.columbia.edu/
corrections=True)
0 journal(s) added; 1 journals corrected; 10795 journals cleaned and stored in total
Manually check the gd Scottish Gaelic contexts:
#AA[(AA['pred_1'] == 'gd') | (AA['pred_2'] == 'gd')]
add2dict(clean_d,
issns=['1754-4270', #perhaps the only true Scottish Gaelic journal
'0035-6867', '2675-1127', '0957-5286'],
langs=['gd', 'it', 'pt', 'en'])
4 journal(s) added; 0 journals corrected; 10799 journals cleaned and stored in total
add2dict(clean_d,
issns=['1805-9511', '0252-9076', '2563-562X'],
langs=['cs', 'es', 'en'],
corrections=True)
0 journal(s) added; 3 journals corrected; 10799 journals cleaned and stored in total
Correct journals with top-level domain cz:
#AA[AA['tld'] == 'cz']
add2dict(clean_d,
issns=['0009-2770', '2336-2766', '2336-3630', '1805-9511', '2336-4378',
'1802-3983', '1804-5383', '1804-6665', '2694-9288'],
langs=['cs', 'cs', 'uk', 'cs', 'cs', 'cs', 'cs', 'cs', 'cs'],
corrections=True)
0 journal(s) added; 9 journals corrected; 10804 journals cleaned and stored in total
Correct journals with top-level domain al:
#pd.set_option("display.max_rows", None)
#AA[AA['tld']=='al']
add2dict(clean_d,
issns=['2523-6636'],
langs=['sq'],
corrections=True)
0 journal(s) added; 1 journals corrected; 10804 journals cleaned and stored in total
Correct journals with top-level domain pk:
#AA[AA['tld']=='pk']
#The output below is an error -- 11 journals added, 33 journals corrected
add2dict(clean_d,
issns=['2663-6255', '0430-4055', '1813-775X', '2707-1200', '2411-6211', '1995-7904', '2520-5021',
'1998-4472', '2708-8235', '2073-5146', '2707-6903', '2664-4959', '2708-8847', '1816-5389',
'2709-8885', '2664-1461', '2305-1345', '2708-6577', '2618-1355', '1818-9296', '2664-0023',
'2709-6076', '2709-7641', '2709-4022', '2709-4162', '2710-0227', '2710-0812', '2410-8065',
'2521-408X', '2707-6288', '2617-9075', '2305-154X', '2710-2475', '2710-5180', '2707-7225',
'2523-0093', '2521-8948', '2788-4627', '2709-7617', '2073-3674', '2413-7480', '2415-5500',
'2519-6618', '2518-5330'],
langs=['ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur',
'ur', 'Balochi', 'ur', 'ur', 'ur', 'ur', 'ar', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur',
'ur', 'ur', 'ur', 'ur', 'ar', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ur', 'ar', 'ur', 'ur'],
corrections=True)
0 journal(s) added; 44 journals corrected; 10815 journals cleaned and stored in total
Correct journals classified as ar Arabic:
#AA[(AA['pred_1'] == 'ar') | (AA['pred_2'] == 'ar')]
#The output below is an error -- 20 journals added, 5 journals corrected
add2dict(clean_d,
issns=['1693-3257', '2700-8355', '2600-7398', '2410-1036', '2600-7398', '2223-859X', '1996-9546',
'2071-9728', '1026-3748', '1026-3721', '1995-8005', '2460-5360', '2716-5515', '0552-265X',
'1994-473X', '2663-7405', '2437-0789', '2522-3259', '2522-6460', '2421-9843', '2520-7431',
'2664-4673', '2706-9524', '1658-7030', '1658-3116'],
langs=['ar', 'ar', 'ar', 'ku', 'ar', 'ar', 'ar', 'ar', 'ar', 'ar', 'ar', 'id', 'ar', 'ar', 'ar',
'ar', 'ar', 'ar', 'ar', 'ar', 'ar', 'ar', 'ar', 'ar', 'ar'],
corrections=True)
0 journal(s) added; 25 journals corrected; 10835 journals cleaned and stored in total
Correct journals with top-level domain by:
#AA[AA['tld']=='by']
add2dict(clean_d,
issns=['2222-8853', '2789-195X', '2415-7198'],
langs=['by', 'en', 'uk'],
corrections=True)
0 journal(s) added; 3 journals corrected; 10837 journals cleaned and stored in total
If a journal's primary language prediction pred_1 and top-level domain tld match, add the journal:
add2dict(d=clean_d,
issns=AA[AA['pred_1'] == AA['tld']].issn.tolist(),
langs=AA[AA['pred_1'] == AA['tld']].pred_1.tolist())
5298 journal(s) added; 0 journals corrected; 16135 journals cleaned and stored in total
If a journal has an Eastern European top-level domain tld and its secondary language prediction pred_2 matches its title_language, it probably publishes in its secondary language (e.g., Ukrainian) but translates article metadata into English. So, add the secondary language on the assumption that text_language == en is an error due to translated article titles and abstracts:
ee_tlds = ['AL','AD','AM','AT','BG','BA','BY','CY','CZ','EE','GE','GR','GL','HR',
'HU','LT','LV','MD','MK','ME','PL','RO','RU','RS','SK','SI','UA']
AA['is_ee'] = AA['tld'].isin([i.lower() for i in ee_tlds]) #flag journals with Eastern European tlds
#AA[(AA['pred_2'] == AA['title_language']) & (AA['pred_2'].notnull()) & (AA['is_ee'] == True)]
add2dict(d=clean_d,
issns=AA[(AA['pred_2'] == AA['title_language']) & (AA['pred_2'].notnull()) & \
(AA['is_ee'] == True)].issn.tolist(),
langs=AA[(AA['pred_2'] == AA['title_language']) & (AA['pred_2'].notnull()) & \
(AA['is_ee'] == True)].pred_2.tolist())
127 journal(s) added; 0 journals corrected; 16262 journals cleaned and stored in total
Add all remaining journals with a non-null pred_1 to complete clean_d:
add2dict(d=clean_d,
issns=AA[AA['pred_1'].notnull()].issn.tolist(),
langs=AA[AA['pred_1'].notnull()].pred_1.tolist())
6299 journal(s) added; 0 journals corrected; 22561 journals cleaned and stored in total
Map gcld3 language codes to full language names:
codes = {'af':'Afrikaans', 'ar':'Arabic', 'bg':'Bulgarian', 'bg-Latn':'Bulgarian', 'bs':'Bosnian', 'by':'Belarusian',
'ca':'Catalan', 'cs':'Czech', 'da':'Danish', 'de':'German', 'el':'Greek', 'el-Latn':'Greek', 'en':'English',
'es':'Spanish', 'et':'Estonian', 'eu':'Basque', 'fa':'Persian', 'fi':'Finnish', 'fil':'Filipino',
'fr':'French', 'ga':'Irish', 'gd':'Scottish Gaelic', 'gl':'Galician', 'hi':'Hindi','hi-Latn':'Hindi',
'hr':'Croatian', 'hu':'Hungarian', 'hy':'Armenian', 'id':'Indonesian', 'ig':'Igbo', 'is':'Icelandic',
'it':'Italian', 'iw':'Hebrew', 'ja':'Japanese', 'ja-Latn':'Japanese', 'ka':'Georgian', 'kk':'Kazakh',
'ko':'Korean', 'ku':'Kurdish', 'la':'Latin', 'lt':'Lithuanian', 'lv':'Latvian', 'mk':'Macedonian',
'ms':'Malay', 'my':'Burmese', 'ne':'Nepali', 'nl':'Dutch', 'no':'Norwegian', 'pl':'Polish',
'pt':'Portuguese', 'ro':'Romanian', 'ru':'Russian', 'ru-Latn':'Russian', 'sd':'Sindhi', 'si':'Sinhala',
'sk':'Slovak', 'sl':'Slovenian', 'sq':'Albanian', 'sr':'Serbian', 'sv':'Swedish', 'sw':'Swahili',
'ta':'Tamil', 'tg':'Tajik', 'th':'Thai', 'tr':'Turkish', 'uk':'Ukrainian', 'ur':'Urdu', 'uz':'Uzbek',
'vi':'Vietnamese', 'zh':'Chinese', 'zh-Latn':'Chinese'}
Get primary language counts for the entire sample of journals, to be visualized using matplotlib and seaborn:
issn2primary = pd.DataFrame({'issn': list(clean_d.keys()),
'gcld3_code': [c if len(c) == 2 else None for c in clean_d.values()],
'language': [codes[l] if codes.get(l) else l for l in list(clean_d.values())]})
issn2primary = issn2primary[issn2primary['language'].notnull()]
#Get a series of value counts for the language codes
ls = issn2primary['language'].value_counts(sort=True, ascending=False)
print(f"Total: {ls.sum()} journals")
Total: 22561 journals
ojsLangs = AA[['issn', 'issn_alt', 'context_name', 'journal_url']].merge(issn2primary, how='left', on='issn')
ojsLangs = ojsLangs[ojsLangs['language'].notnull()].sort_values(by=['journal_url'])
print(ojsLangs.shape)
ojsLangs.head()
(22561, 6)
|       | issn      | issn_alt  | context_name                        | journal_url                                       | gcld3_code | language   |
|-------|-----------|-----------|-------------------------------------|---------------------------------------------------|------------|------------|
| 17064 | 1018-2888 | 2709-7951 | Diagnóstico                         | http://142.44.242.51/index.php/diagnostico        | es         | Spanish    |
| 16672 | 2236-3785 | NaN       | Revista Ciências em Saúde           | http://186.225.220.186:7474/ojs/index.php/rcsf... | pt         | Portuguese |
| 9466  | 2077-1371 | 2077-1460 | Biosfera                            | http://21bs.ru/index.php/bio                      | en         | English    |
| 5560  | 2089-4686 | 2548-5970 | 2-TRIK: TUNAS-TUNAS RISET KESEHATAN | http://2trik.jurnalelektronik.com/index.php/2trik | id         | Indonesian |
| 9854  | 2705-0513 | NaN       | 工程技术研究                        | http://2winpub.usp-pl.com/index.php/ETR           | zh         | Chinese    |
ojsLangs.to_csv(os.path.join('data', 'OJS_languages_v3.csv'), index=False)
%matplotlib inline
sns.set(font_scale=1.25, style='whitegrid')
fig, ax = plt.subplots()
ld = sns.barplot(x=ls.values[:10],
y=ls.index[:10],
orient='h',
color='grey')
ax.set(xlim=(0, 12000),
xlabel='Active journals using OJS',
ylabel='Language')#,
#title='Primary language employed by journals using OJS ($\it{n}$ = 21,874)')
sns.despine(bottom=True)
plt.xticks([2000, 4000, 6000, 8000, 10000],
           ['2,000', '4,000', '6,000', '8,000', '10,000'])
for p in ld.patches:
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.15
    percent = round((p.get_width() / 22561) * 100, 1)
    value = f"{int(p.get_width()):,} ({percent}%)" #comma-grouped count with percentage share
    ld.text(_x + 150, _y, value, ha='left', weight='bold')
fig.savefig(os.path.join('vis', 'OJS_primary_languages.png'), bbox_inches='tight')
%matplotlib inline
fig, ax = plt.subplots(figsize=(8,16))
mult = sns.barplot(y=list(ls.sort_values().index),
x=list(ls.sort_values().values),
orient='h',
color='grey')
sns.despine(bottom=True)
ax.set(xlim=(0, 12500),
xlabel='Active journals using OJS',
ylabel='Language',
       title=r'Primary language employed by journals using OJS ($\it{n}$ = 22,561)', #raw string so \it is not treated as an escape
visible=True)
plt.xticks([0, 2000, 4000, 6000, 8000, 10000, 12000],
           ['0', '2,000', '4,000', '6,000', '8,000', '10,000', '12,000'])
for p in mult.patches:
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.175
    percent = round((p.get_width() / 22561) * 100, 2)
    value = f"{int(p.get_width()):,} ({percent}%)" #comma-grouped count with percentage share
    mult.text(_x + 125, _y, value, ha='left', weight='bold')
fig.savefig(os.path.join('vis', 'OJS_languages_v3.png'), bbox_inches='tight')
In the chart below, each bar represents the proportion of journals that published 5 or more articles in each of their publishing languages. The decision boundary of 5 was chosen to match the decision boundary for active journals (>=5 articles published per year).
def classify_journals_multi(issn2langs, decision_boundary):
multilingual = defaultdict(list)
for k, v in issn2langs.items():
for lang in Counter(v).items():
if lang[1] >= decision_boundary:
                #if the number of article abstracts tagged with a given language (e.g., 'en') meets the boundary
multilingual[k].append(lang[0]) #Append the language to a list for the journal
multilingual_counts = defaultdict(int)
array_lengths = []
for v in multilingual.values():
if v:
multiplier = len(v)
array_lengths.append(multiplier)
if multiplier >= 3:
multilingual_counts['Multi- (3+ languages)'] += 1
elif multiplier == 2:
multilingual_counts['Bi- (2 languages)'] += 1
elif multiplier == 1:
multilingual_counts['Mono- (1 language)'] += 1
else:
continue
total = 0
for v in multilingual_counts.values():
total += v
print('Total: {} journals'.format(total))
print('Average number of languages per journal: {}'.format(np.array(array_lengths).mean()))
return pd.Series(multilingual_counts).sort_values(ascending=False)
multi5 = classify_journals_multi(issn2langs, decision_boundary=5)
Total: 22382 journals
Average number of languages per journal: 1.6994013046197838
%matplotlib inline
fig, ax = plt.subplots()
mult = sns.barplot(y=multi5.index,
x=multi5.values,
orient='h',
color='grey')
ax.set(xlim=(0, 12000),
xlabel='Active journals using OJS',
ylabel='*-lingual journals')#,
#title='Number of languages employed by journals using OJS ($\it{n}$ = 22,382)')
sns.despine(bottom=True)
plt.xticks([2000, 4000, 6000, 8000, 10000],
           ['2,000', '4,000', '6,000', '8,000', '10,000'])
for p in mult.patches:
    _x = p.get_x() + p.get_width()
    _y = p.get_y() + p.get_height() - 0.35
    percent = round((p.get_width() / 22382) * 100, 1)
    value = f"{int(p.get_width()):,} ({percent}%)" #comma-grouped count with percentage share
    mult.text(_x + 150, _y, value, ha='left', weight='bold')
fig.savefig(os.path.join('vis', 'OJS_multilingual5.png'), bbox_inches='tight')