#!/usr/bin/env python
# coding: utf-8

# # 5. Journals indexed in Scopus by their disciplinary distribution

# ### Notebook objectives:
# 1. Determine the disciplinary distribution of Scopus journals for the sake of comparison to OJS.

# *Updated 9/22/2022

# In[1]:


get_ipython().system('python --version')


# In[2]:


import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import requests
import json
import os
import re
from easynmt import EasyNMT
from tqdm import tqdm


# In[3]:


# Read data/scopus_jan2021.xlsx and keep one row per distinct
# (Print-ISSN, E-ISSN) pair.
scopus = pd.read_excel(os.path.join('data', 'scopus_jan2021.xlsx'))
scopus = scopus.drop_duplicates(subset=["Print-ISSN", "E-ISSN"])


# ### Education isn't a defined subject area in the Scopus data, so I approximate the number of Education journals using a multilingual string search for "education," "teach," and "learn" in journal titles:

# In[4]:


# Collect every distinct language code that appears in the language column.
# Each cell may hold several codes separated by non-letter characters.
raw_lang_cells = scopus['Article language in source (three-letter ISO language codes)'].unique().tolist()
langs = sorted({
    code
    for cell in raw_lang_cells
    if isinstance(cell, str)  # skip NaN / non-text cells
    for code in re.split(r"[^a-zA-Z]+", cell)
    # re.split emits "" when a cell starts or ends with a separator; an empty
    # code would match every row in the str.contains() count below.
    if code
})
print(len(langs), "\n", langs)


# In[5]:


# Count, per language code, the rows whose column-7 text contains that code.
# NOTE(review): column 7 is presumably the language column named in In[4] -- confirm.
lang_column = scopus.iloc[:, 7]
print([f"{lang}: {lang_column.str.contains(lang).sum()}" for lang in langs])


# #### Save a list of ISO-639-1 language codes with Latin scripts, because the Scopus data only feature titles written in Latin scripts:
# (Transliteration is used by Scopus, but transliterating "Education" to Chinese "Jiaoyu" returns no titles. I will skip transliteration because the success rate seems so low.)
# In[6]:


iso639_1 = ["af", "ca", "cs", "da", "nl", "et", "fi", "fr", "de", "hu", "id", "it",
            "lv", "lt", "no", "pl", "pt", "ro", "sr", "sk", "sl", "es", "sv", "tr"]


# In[7]:


model = EasyNMT("opus-mt")


# In[8]:


# Translate the three English seed words into each target language and print
# the results. Languages for which translate() raises OSError are skipped.
doc = ["education", "learn", "teach"]
edu = []
for code in iso639_1:
    try:
        edu.extend(model.translate(doc, target_lang=code))
    except OSError:
        continue
print(edu)


# In[9]:


# Case-insensitive count of rows whose column-1 text matches any stem below.
# NOTE(review): "insenga" looks like a typo for Italian "insegna"; left as-is
# because the hard-coded count in In[10] was presumably produced with this
# exact pattern -- confirm and re-run before changing it.
scopus.iloc[:, 1].str.contains(
    "ducat|teach|learn|opvoeding|onderrig|educaci|enseny|vzdela|ucit se|"
    "uddanel|undervis|onderwijs|leren|lesgev|haridus|opetami|koulutus|"
    "opettaa|apprend|enseign|bildung|lernen|unterricht|oktatas|tanulj|"
    "tanarno|pendidikan|belajar|mengajar|istruzione|imparare|insenga|"
    "invata|vyuco|aprend|ensena|utbild|lara dig|lara ut|jiaoyu",
    regex=True, case=False).sum()


# In[10]:


# NOTE(review): the numerators in In[10]-In[15] and the denominator 41957 are
# hand-copied constants; presumably the outputs of the preceding cells and the
# row count of `scopus` -- confirm they still match the data.
ed = 1097 / 41957
print(f"Education journals: {round(ed*100, 1)}%")


# In[11]:


print(scopus.columns[23:])
scopus.iloc[:, 23:].info()


# In[12]:


scopus[(scopus["1700\nComputer Science"].notnull()) | (scopus["2200\nEngineering"].notnull())].shape


# In[13]:


csen = 6752 / 41957
print(f"% CS & Engineering journals: {round(csen*100, 1)}%")


# In[14]:


scopus[scopus["2600\nMathematics"].notnull()].shape


# In[15]:


math = 1929 / 41957
print(f"% Math journals: {round(math*100, 1)}%")


# In[16]:


medh = scopus[scopus["Top level:\n\nHealth Sciences"].notnull()].shape[0] / 41957
print(f"% Med-Health journals: {round(medh*100, 1)}%")


# ### Use OpenAlex to try and disaggregate the "Social Sciences" journals:

# In[17]:


# (row index, "NNNN-NNNN") pairs for Social Sciences rows that have an E-ISSN.
# NOTE(review): if an ISSN was parsed as a number, str() may have lost leading
# zeros and the hyphen would land in the wrong place -- confirm column dtype.
e_socsci = scopus[(scopus["Top level:\n\nSocial Sciences"].notnull()) & (scopus["E-ISSN"].notnull())]["E-ISSN"]
e_socsci = list(zip(e_socsci.index, [str(issn)[:4] + "-" + str(issn)[4:] for issn in e_socsci]))


# In[18]:


# Same as above, for the Print-ISSN column.
print_socsci = scopus[(scopus["Top level:\n\nSocial Sciences"].notnull()) & (scopus["Print-ISSN"].notnull())]["Print-ISSN"]
print_socsci = list(zip(print_socsci.index, [str(issn)[:4] + "-" + str(issn)[4:] for issn in print_socsci]))


# In[19]:


# E-ISSN pairs come first, so a row with both ISSNs is looked up twice and its
# E-ISSN result is seen first.
ss_issns = e_socsci + print_socsci


# In[20]:


def get_subjects(list_of_tuples):
    """Query OpenAlex for each ISSN and collect the first listed concept.

    Args:
        list_of_tuples: iterable of (row index, hyphenated ISSN string) pairs.

    Returns:
        A pair ``(idx2subject, error_issns)``. ``idx2subject`` is a list of
        ``(row index, display_name)`` tuples, one per successful lookup, where
        ``display_name`` comes from the first ``x_concepts`` entry of the
        response. ``error_issns`` lists the ISSNs whose lookup failed.
    """
    idx2subject = []
    error_issns = []
    # Everything a single lookup can reasonably raise: network/timeout errors,
    # undecodable or non-JSON bodies (ValueError), and responses that lack a
    # usable "x_concepts" list (KeyError / IndexError / TypeError).
    lookup_errors = (requests.RequestException, ValueError, KeyError,
                     IndexError, TypeError)
    for i, v in tqdm(list_of_tuples):
        # NOTE(review): OpenAlex may have renamed this endpoint since the
        # notebook was written -- confirm it still resolves.
        query = "https://api.openalex.org/venues/issn:" + v
        try:
            # A timeout keeps one stalled connection from hanging the loop.
            response = json.loads(
                requests.get(query, timeout=30).content.decode()
            )
            subject = response["x_concepts"][0]["display_name"]
        except lookup_errors:
            error_issns.append(v)
            # Skip the append below: there is no subject for this ISSN, and
            # reusing the previous iteration's value would mislabel the row.
            continue
        idx2subject.append((i, subject))
    return idx2subject, error_issns


# In[21]:


idx2subject, errors = get_subjects(ss_issns)


# In[22]:


with open(os.path.join("data", "idx2subject_ss.json"), "w") as outfile:
    json.dump(idx2subject, outfile)


# In[23]:


print({t[1] for t in idx2subject})


# In[25]:


# d maps each row index to the first subject seen for it.
d = {}
for idx, subject in idx2subject:
    d.setdefault(idx, subject)

# NOTE(review): these three tallies run over every lookup result, so a row
# with both an E-ISSN and a Print-ISSN hit is counted twice; the tallies in
# In[28] run over `d` (once per row). Confirm which is intended.
lookup_subjects = [subject for _, subject in idx2subject]
phil = lookup_subjects.count("Philosophy")
ling = lookup_subjects.count("Linguistics")
hum = lookup_subjects.count("Humanities")
print(f"% Philosophy journals: {round(phil / 41957 * 100, 1)}%")
print(f"% Linguistics journals: {round(ling / 41957 * 100, 1)}%")
print(f"% Humanities journals: {round(hum / 41957 * 100, 1)}%")
print(f"Phil: {phil}, Ling: {ling}, Hum: {hum}")


# In[26]:


print(len(d))


# In[28]:


# Per-row tallies (one subject per row index).
row_subjects = list(d.values())
hist = row_subjects.count("History")
art = row_subjects.count("Art")
visa = row_subjects.count("Visual arts")
soc = row_subjects.count("Sociology")
print(f"% History journals: {round(hist / 41957 * 100, 1)}%")
print(f"% Art + Visual Arts journals: {round((art+visa) / 41957 * 100, 1)}%")
print(f"% Sociology journals: {round(soc / 41957 * 100, 1)}%")
print(f"Hist: {hist}, Art: {art}, Vis Art: {visa}, Soc: {soc}")


# ### Final step: I need to give a rough estimate of the proportions of Scopus-indexed journals falling under the rubrics of "Language, communication, and culture" and "Philosophy and religion."
# First, I will assume that a "Philosophy" classification from OpenAlex genuinely indicates a philosophy journal only if the journal is also classified by Scopus as "Arts and Humanities":

# In[30]:


# Restrict `scopus` to the rows that appear in at least one lookup result.
looked_up_rows = {pair[0] for pair in idx2subject}
alex = scopus.loc[scopus.index.isin(list(looked_up_rows))]
print(alex.shape)


# In[32]:


# Row indices whose recorded subject in `d` is "Philosophy".
phil_indices = [idx for idx in d if d[idx] == "Philosophy"]
print(len(phil_indices))


# In[34]:


# Philosophy rows that also have a value in the "Arts and Humanities" column.
is_phil_row = alex.index.isin(phil_indices)
has_arts_hum = alex["1200\nArts and Humanities"].notnull()
philn = len(alex.loc[is_phil_row & has_arts_hum])
print(philn)
print(f"% Philosophy journals, double checked: {round(philn / 41957 * 100, 1)}")


# As for Scopus's "Arts and Humanities" journals which aren't labeled "Philosophy," "Arts," or "Visual arts" by OpenAlex, those provide a rough estimate for the number of journals in "Language, communication, and culture." That is to say, these are just "Humanities" journals:

# In[41]:


# Row indices whose recorded subject in `d` is "Art" or "Visual arts".
art_indices = [idx for idx, subject in d.items() if subject in ("Art", "Visual arts")]


# In[42]:


print(len(art_indices))


# In[45]:


# "Arts and Humanities" rows that are neither art nor philosophy rows.
is_art_row = alex.index.isin(art_indices)
is_phil_row = alex.index.isin(phil_indices)
has_arts_hum = alex["1200\nArts and Humanities"].notnull()
lcc = len(alex.loc[has_arts_hum & ~(is_art_row | is_phil_row)])
print(lcc)
print(f"% Language, communication, and culture journals, double checked: {round(lcc / 41957 * 100, 1)}")