*Updated 9/22/2022
!python --version
Python 3.10.5
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import requests
import json
import os
import re
from easynmt import EasyNMT
from tqdm import tqdm
scopus = pd.read_excel(os.path.join('data', 'scopus_jan2021.xlsx'))
scopus = scopus.drop_duplicates(subset=["Print-ISSN", "E-ISSN"])
langs = scopus['Article language in source (three-letter ISO language codes)'].unique().tolist()
langs = [re.split(r"[^a-zA-Z]+", l) for l in langs if isinstance(l, str)]
langs = sorted(list(set([l for subl in langs for l in subl])))
print(len(langs), "\n", langs)
49 ['AFR', 'ARA', 'ARM', 'AZE', 'BAQ', 'BOS', 'BUL', 'CAT', 'CHI', 'CHN', 'CZE', 'DAN', 'DUT', 'ENF', 'ENG', 'EST', 'FIN', 'FRE', 'GER', 'GLE', 'GLG', 'GRE', 'HEB', 'HUN', 'ICE', 'IND', 'ITA', 'JPN', 'KOR', 'LAV', 'LIT', 'MAC', 'MAO', 'MAY', 'NOR', 'PER', 'POL', 'POR', 'RUM', 'RUS', 'SCC', 'SCR', 'SLO', 'SLV', 'SPA', 'SWE', 'THA', 'TUR', 'UKR']
print([f"{lang}: {scopus.iloc[:, 7].str.contains(lang).sum()}" for lang in langs])
['AFR: 13', 'ARA: 24', 'ARM: 1', 'AZE: 3', 'BAQ: 6', 'BOS: 9', 'BUL: 18', 'CAT: 33', 'CHI: 562', 'CHN: 1', 'CZE: 131', 'DAN: 14', 'DUT: 71', 'ENF: 1', 'ENG: 27125', 'EST: 19', 'FIN: 18', 'FRE: 1197', 'GER: 997', 'GLE: 5', 'GLG: 2', 'GRE: 38', 'HEB: 7', 'HUN: 44', 'ICE: 3', 'IND: 4', 'ITA: 535', 'JPN: 199', 'KOR: 69', 'LAV: 8', 'LIT: 19', 'MAC: 2', 'MAO: 1', 'MAY: 12', 'NOR: 20', 'PER: 46', 'POL: 190', 'POR: 474', 'RUM: 53', 'RUS: 412', 'SCC: 15', 'SCR: 109', 'SLO: 64', 'SLV: 61', 'SPA: 1348', 'SWE: 23', 'THA: 3', 'TUR: 134', 'UKR: 26']
(Transliteration is used by Scopus, but transliterating "Education" to Chinese "Jiaoyu" returns no titles. I will skip transliteration because the success rate seems so low.)
iso639_1 = ["af", "ca", "cs", "da", "nl", "et", "fi", "fr", "de", "hu", "id", "it", "lv", "lt", "no", "pl",
"pt", "ro", "sr", "sk", "sl", "es", "sv", "tr"]
model = EasyNMT("opus-mt")
doc = ["education",
"learn",
"teach"]
edu = []
for code in iso639_1:
try:
edu.extend(model.translate(doc, target_lang=code))
except OSError:
continue
print(edu)
/Users/jball/opt/anaconda3/envs/tmp/lib/python3.10/site-packages/transformers/models/marian/tokenization_marian.py:194: UserWarning: Recommended: pip install sacremoses. warnings.warn("Recommended: pip install sacremoses.") /Users/jball/opt/anaconda3/envs/tmp/lib/python3.10/site-packages/transformers/generation_utils.py:1227: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 512 (`self.config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation. warnings.warn( Exception: Helsinki-NLP/opus-mt-en-lv is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`. Exception: Helsinki-NLP/opus-mt-en-lt is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`. Exception: Helsinki-NLP/opus-mt-en-no is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`. Exception: Helsinki-NLP/opus-mt-en-pl is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`. Exception: Helsinki-NLP/opus-mt-en-pt is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`. Exception: Helsinki-NLP/opus-mt-en-sr is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`. Exception: Helsinki-NLP/opus-mt-en-sl is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`. Exception: Helsinki-NLP/opus-mt-en-tr is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models' If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.
['Opvoeding', 'leer', 'onderrig', 'educació', 'learn', 'ensenyeu', 'vzdělávání', 'učit se', 'Učit', 'uddannelse', 'lær', 'underviser', 'onderwijs', 'leren', 'lesgeven', 'haridus', 'õpi', 'õpetamine', 'koulutus', 'opi', 'opettaa', 'éducation', 'apprendre', 'enseigner', 'Bildung', 'lernen', 'Unterricht', 'oktatás', 'tanulj!', 'Tanárnő!', 'pendidikan', 'belajar', 'mengajar', 'istruzione', 'imparare', 'Insegna', 'educaţie', 'Învaţă', 'Predă', 'vzdelávanie', 'učiť sa', 'vyučovať', 'Educación', 'aprender', 'enseñar', 'utbildning', 'lära dig', 'lära ut']
scopus.iloc[:, 1].str.contains(
"ducat|teach|learn|opvoeding|onderrig|educaci|enseny|vzdela|ucit se|uddanel|undervis|onderwijs|leren|lesgev|haridus|opetami|koulutus|opettaa|apprend|enseign|bildung|lernen|unterricht|oktatas|tanulj|tanarno|pendidikan|belajar|mengajar|istruzione|imparare|insenga|invata|vyuco|aprend|ensena|utbild|lara dig|lara ut|jiaoyu",
regex=True, case=False).sum()
1097
ed = 1097 / 41957
print(f"Education journals: {round(ed*100, 1)}%")
Education journals: 2.6%
print(scopus.columns[23:])
scopus.iloc[:, 23:].info()
Index(['Top level:\n\nLife Sciences', 'Top level:\n\nSocial Sciences', 'Top level:\n\nPhysical Sciences', 'Top level:\n\nHealth Sciences', '1000 \nGeneral', '1100\nAgricultural and Biological Sciences', '1200\nArts and Humanities', '1300\nBiochemistry, Genetics and Molecular Biology', '1400\nBusiness, Management and Accounting', '1500\nChemical Engineering', '1600\nChemistry', '1700\nComputer Science', '1800\nDecision Sciences', '1900\nEarth and Planetary Sciences', '2000\nEconomics, Econometrics and Finance', '2100\nEnergy', '2200\nEngineering', '2300\nEnvironmental Science', '2400\nImmunology and Microbiology', '2500\nMaterials Science', '2600\nMathematics', '2700\nMedicine', '2800\nNeuroscience', '2900\nNursing', '3000\nPharmacology, Toxicology and Pharmaceutics', '3100\nPhysics and Astronomy', '3200\nPsychology', '3300\nSocial Sciences', '3400\nVeterinary', '3500\nDentistry', '3600\nHealth Professions'], dtype='object') <class 'pandas.core.frame.DataFrame'> Int64Index: 41958 entries, 0 to 42473 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Top level: Life Sciences 7666 non-null object 1 Top level: Social Sciences 13940 non-null object 2 Top level: Physical Sciences 14039 non-null object 3 Top level: Health Sciences 14744 non-null object 4 1000 General 148 non-null object 5 1100 Agricultural and Biological Sciences 3031 non-null object 6 1200 Arts and Humanities 5424 non-null object 7 1300 Biochemistry, Genetics and Molecular Biology 3105 non-null object 8 1400 Business, Management and Accounting 2007 non-null object 9 1500 Chemical Engineering 1081 non-null object 10 1600 Chemistry 1312 non-null object 11 1700 Computer Science 2274 non-null object 12 1800 Decision Sciences 513 non-null object 13 1900 Earth and Planetary Sciences 2250 non-null object 14 2000 Economics, Econometrics and Finance 1412 non-null object 15 2100 Energy 717 non-null object 16 2200 Engineering 5236 non-null object 17 2300 Environmental Science 2636 non-null object 18 2400 Immunology and Microbiology 902 non-null object 19 2500 Materials Science 1917 non-null object 20 2600 Mathematics 1929 non-null object 21 2700 Medicine 13779 non-null object 22 2800 Neuroscience 793 non-null object 23 2900 Nursing 897 non-null object 24 3000 Pharmacology, Toxicology and Pharmaceutics 1231 non-null object 25 3100 Physics and Astronomy 1542 non-null object 26 3200 Psychology 1566 non-null object 27 3300 Social Sciences 8704 non-null object 28 3400 Veterinary 317 non-null object 29 3500 Dentistry 257 non-null object 30 3600 Health Professions 702 non-null object dtypes: object(31) memory usage: 10.2+ MB
scopus[(scopus["1700\nComputer Science"].notnull()) |
(scopus["2200\nEngineering"].notnull())].shape
(6752, 54)
csen = 6752 / 41957
print(f"% CS & Engineering journals: {round(csen*100, 1)}%")
% CS & Engineering journals: 16.1%
scopus[scopus["2600\nMathematics"].notnull()].shape
(1929, 54)
math = 1929 / 41957
print(f"% Math journals: {round(math*100, 1)}%")
% Math journals: 4.6%
medh = scopus[scopus["Top level:\n\nHealth Sciences"].notnull()].shape[0] / 41957
print(f"% Med-Health journals: {round(medh*100, 1)}%")
% Med-Health journals: 35.1%
e_socsci = scopus[(scopus["Top level:\n\nSocial Sciences"].notnull()) &
(scopus["E-ISSN"].notnull())]["E-ISSN"]
e_socsci = list(zip(e_socsci.index, [str(issn)[:4] + "-" + str(issn)[4:] for issn in e_socsci]))
print_socsci = scopus[(scopus["Top level:\n\nSocial Sciences"].notnull()) &
(scopus["Print-ISSN"].notnull())]["Print-ISSN"]
print_socsci = list(zip(print_socsci.index, [str(issn)[:4] + "-" + str(issn)[4:] for issn in print_socsci]))
ss_issns = e_socsci + print_socsci
def get_subjects(list_of_tuples):
idx2subject = []
error_issns = []
for i, v in tqdm(list_of_tuples):
query = "https://api.openalex.org/venues/issn:" + v
try:
response = json.loads(
requests.get(query).content.decode()
)
subject = response["x_concepts"][0]["display_name"]
except:
error_issns.append(v)
idx2subject.append(
(i, subject)
)
return idx2subject, error_issns
idx2subject, errors = get_subjects(ss_issns)
100%|███████████████████████████████████| 19908/19908 [2:39:52<00:00, 2.08it/s]
with open(os.path.join("data", "idx2subject_ss.json"), "w") as outfile:
json.dump(idx2subject, outfile)
print(set([t[1] for t in idx2subject]))
{'History', 'Ecology', 'Art', 'Linguistics', 'Genetics', 'Humanities', 'Common value auction', 'Business', 'Macroeconomics', 'Economics', 'Chemistry', 'Visual arts', 'Biology', 'Physics', 'Demographic economics', 'Finance', 'Materials science', 'Economic geography', 'Population', 'Computer science', 'Psychology', 'Law', 'Nanotechnology', 'Thermodynamics', 'Outbreak', 'Geology', 'Environmental science', 'Astronomy', 'Poison control', 'Geophysics', 'Political science', 'Medicine', 'Nursing', 'Mathematics', 'Geography', 'Archaeology', 'Philosophy', 'Engineering', 'Monetary policy', 'Sociology'}
d = {}
phil = 0
ling = 0
hum = 0
for idx, subject in idx2subject:
if idx not in d:
d[idx] = subject
match subject:
case "Philosophy":
phil += 1
case "Linguistics":
ling += 1
case "Humanities":
hum += 1
print(f"% Philosophy journals: {round(phil / 41957 * 100, 1)}%")
print(f"% Linguistics journals: {round(ling / 41957 * 100, 1)}%")
print(f"% Humanities journals: {round(hum / 41957 * 100, 1)}%")
print(f"Phil: {phil}, Ling: {ling}, Hum: {hum}")
% Philosophy journals: 5.6% % Linguistics journals: 0.0% % Humanities journals: 0.1% Phil: 2365, Ling: 3, Hum: 23
print(len(d))
13940
hist = 0
art = 0
visa = 0
soc = 0
for idx, subject in d.items():
match subject:
case "History":
hist += 1
case "Art":
art += 1
case "Visual arts":
visa += 1
case "Sociology":
soc += 1
print(f"% History journals: {round(hist / 41957 * 100, 1)}%")
print(f"% Art + Visual Arts journals: {round((art+visa) / 41957 * 100, 1)}%")
print(f"% Sociology journals: {round(soc / 41957 * 100, 1)}%")
print(f"Hist: {hist}, Art: {art}, Vis Art: {visa}, Soc: {soc}")
% History journals: 1.9% % Art + Visual Arts journals: 3.4% % Sociology journals: 1.6% Hist: 792, Art: 1437, Vis Art: 1, Soc: 656
First, I will assume that a "Philosophy" classification from OpenAlex genuinely indicates a philosophy journal only if the journal is also classified by Scopus as "Arts and Humanities":
alex = scopus[scopus.index.isin(
list(set([t[0] for t in idx2subject]))
)]
print(alex.shape)
(13940, 54)
phil_indices = [idx for idx, subject in d.items() if subject == "Philosophy"]
print(len(phil_indices))
2365
philn = alex[
(alex.index.isin(phil_indices)) & (alex["1200\nArts and Humanities"].notnull())
].shape[0]
print(philn)
print(f"% Philosophy journals, double checked: {round(philn / 41957 * 100, 1)}")
1694 % Philosophy journals, double checked: 4.0
As for Scopus's "Arts and Humanities" journals which aren't labeled "Philosophy," "Arts," or "Visual arts" by OpenAlex, those provide a rough estimate for the number of journals in "Language, communication, and culture." That is to say, these are just "Humanities" journals:
art_indices = [idx for idx, subject in d.items() if subject == "Art" or subject == "Visual arts"]
print(len(art_indices))
1438
lcc = alex[
(alex["1200\nArts and Humanities"].notnull()) &
(~alex.index.isin(art_indices)) &
(~alex.index.isin(phil_indices))
].shape[0]
print(lcc)
print(f"% Language, communication, and culture journals, double checked: {round(lcc / 41957 * 100, 1)}")
2493 % Language, communication, and culture journals, double checked: 5.9