#!/usr/bin/env python
# coding: utf-8

# # 3.1 Multilingualism among Journals using OJS
#
# ### Notebook objectives:
# 1. Determine for each journal in the subset (n=22,561) the languages in which they published more than 5 articles
# 2. Classify journals based on whether they published more than 5 articles in multiple languages
# 3. Double-check error-prone Indonesian journal classifications

# In[1]:


import json
import os
import re
from collections import Counter, defaultdict

import numpy as np
import pandas as pd


# Load previously determined lists of gcld3 language codes for each journal, represented by ISSN:

# In[2]:


# Decode explicitly as UTF-8 (the JSON interchange encoding) so the result
# does not depend on the platform's locale encoding.
with open(os.path.join("data", "issn2langs.json"), "r", encoding="utf-8") as infile:
    issn2langs = json.load(infile)


# Load a .csv with previously determined primary language classifications for each journal:

# In[3]:


# NOTE(review): the CSV is assumed to be UTF-8 encoded, matching this file's
# declared coding -- confirm if the data was exported elsewhere.
with open(os.path.join("data", "OJS_languages_v3.csv"), "r", encoding="utf-8") as infile:
    ojs = pd.read_csv(infile)


# In[4]:


ojs.info()


# In[5]:


# Map each ISSN to its primary gcld3 language code. If an ISSN occurs more
# than once in the table, the last row wins (dict construction semantics).
issn2mono = dict(zip(ojs["issn"].tolist(), ojs["gcld3_code"].tolist()))


# An eventual visualization will only include the four main languages of OJS users.
# These are English, Indonesian, Spanish, Portuguese, and a placeholder category, "Other":

# In[6]:


allowed_langs = ["en", "id", "es", "pt"]  # +Other, "xx"


# Loop over the ISSNs and produce combinations of language codes for each journal:

# In[7]:


OTHER_LANG = "xx"  # placeholder code for every language outside allowed_langs

# A language counts for a journal only when it has MORE than this many articles.
MIN_ARTICLES = 5

# Afrikaans and Japanese are common gcld3 errors; ignore these languages
# because <10 of the journals actually publish in them.
GCLD3_FALSE_POSITIVES = {"af", "ja"}

# Unusual combinations involving Indonesian whose journals are collected in
# id_check for manual verification.
ID_COMBOS_TO_CHECK = {
    ("id", "pt"),
    ("en", "es", "id", "pt"),
    ("en", "es", "id", "xx"),
    ("en", "es", "id"),
    ("en", "id", "pt"),
}

d = defaultdict(int)  # language-combination tuple -> number of journals
count = 0  # number of journals processed
id_check = []  # ISSNs whose combination needs a manual double-check

for k, v in issn2langs.items():
    # Seed the combination with the journal's primary language: a stable code
    # for each allowed language, "xx" for anything else.
    # Raises KeyError if the ISSN has no entry in issn2mono.
    primary = issn2mono[k]
    langs = {primary if primary in allowed_langs else OTHER_LANG}

    # v holds the journal's gcld3 codes (presumably one per article), so the
    # Counter gives the number of occurrences of each language.
    for code, n_articles in Counter(v).items():
        if n_articles <= MIN_ARTICLES:  # apply the "more than 5 articles" criterion
            continue
        if code in allowed_langs:  # one of the four main languages
            langs.add(code)
        elif code in GCLD3_FALSE_POSITIVES:
            continue
        else:
            langs.add(OTHER_LANG)  # other languages

    # Sort so the same set of languages always yields the same dictionary key.
    langtup = tuple(sorted(langs))
    d[langtup] += 1
    count += 1

    # checking indonesian journals
    if langtup in ID_COMBOS_TO_CHECK:
        id_check.append(k)


# In[8]:


print(count)


# An additional two journals which are not present in the data, publishing in Balochi and Faroese,
# will be manually added to "Other," or "xx".

# Language combinations, some of which need to be double-checked:

# In[9]:


# Shortest combinations first; the sort is stable, so combinations of equal
# length keep the order in which they were first encountered.
for k in sorted(d, key=len):
    print(k, d[k])


# These ISSNs feature unusual combinations of Indonesian, Spanish, and Portuguese classifications. Each was manually checked by querying issn.org: https://portal.issn.org/

# In[10]:


print(id_check)