Building on a dataset we previously released of citations with identifiers across all Wikipedia language editions, we explore the distribution of DOIs cited in Wikipedia by topic and accessibility.
We assign a topic to each publication by looking at the main topic(s) of the Wikipedia article that cites it. Topics are determined by matching each article with its WikiProject, and assigning the corresponding top-level topic according to the WikiProject hierarchy. This is done using the drafttopic tool from the Wikimedia Foundation's Scoring Platform team.
We determine the accessibility of each publication by looking up the DOI in data provided by Unpaywall. The DOI is marked as:
Note that this analysis only takes into account the openness of the canonical version of a scholarly paper citation as identified by a DOI. Citation templates used in Wikipedia articles often complement a DOI with a link to an accessible version, when one has been identified.
Further documentation on the format of the data can be found in the parent dataset: https://doi.org/10.6084/m9.figshare.1299540.v10
'''
import useful libraries
'''
import pandas as pd
import numpy as np
from bokeh.models import ColumnDataSource, LabelSet, HoverTool,Range1d, NumeralTickFormatter
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from bokeh.transform import stack
import math
import operator
import sys
import warnings
# Silence all warnings unless the interpreter was started with explicit
# -W options (in which case sys.warnoptions is non-empty).
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Use the input file provided or substitute with your own path.
inputfile = 'data/all_data_forplot.tsv'
def generate_open_language_plot(dataframe, open_type, topic):
    '''
    Draw a scatter plot with one circle per row of `dataframe` matching `topic`:
    x is the value of the `open_type` column, y is the `total` column, and each
    circle is labelled with the row's `wiki` value. The plot is shown inline
    (output_notebook + show).

    takes as input:
    dataframe - the pandas data frame containing the data; the columns read here are
                'topic', 'wiki', 'total', 'open' and 'available'
    open_type - 'open' if you want to visualize completely open access statistics;
                'available' if you want to visualize statistics about publications having free copies available;
    topic - a string corresponding to one of the topics, or 'all' if you want to have
            a complete overview across topics ('all' is looked up as 'all_topics'). Choose between:
            'Africa', 'Americas', 'Article improvement and grading', 'Arts', 'Biology', 'Bodies of water',
            'Broadcasting', 'Business and economics', 'Chemistry', 'Cities', 'Contents systems','Countries',
            'Crafts and hobbies', 'Economics', 'Education','Entertainment', 'Europe', 'Files',
            'Food and drink', 'Geosciences','History and society', 'Information science', 'Internet culture',
            'Landforms', 'Language and literature', 'Maintenance', 'Maps', 'Mathematics', 'Media', 'Medicine',
            'Meteorology', 'Military and warfare', 'Music', 'Performing arts','Philosophy and religion',
            'Physics', 'Plastic arts','Politics and government', 'Science', 'Space', 'Sports','Technology',
            'Time', 'Transportation', 'all'

    returns:
    None. If `open_type` is not recognised, or no row matches `topic`, a message
    is printed and nothing is drawn.
    '''
    if open_type == 'open':
        text = 'open access'
    elif open_type == 'available':
        text = 'available open access'
    else:
        print('wrong accessibility type')
        return

    if topic == 'all':
        topic = 'all_topics'
        TITLE = "Percentage of " + text + " publications for all topics"
    else:
        TITLE = "Percentage of " + text + " publications for topic " + topic

    # load the data for one specific topic (computed once, reused below)
    selection = dataframe.loc[dataframe['topic'] == topic]
    if selection.empty:
        # an unknown topic would otherwise crash when taking the maximum below
        print('no data for topic ' + topic)
        return
    source = ColumnDataSource(selection)

    # prepare interaction tools
    tools = "pan,wheel_zoom,box_zoom,reset,save".split(',')
    hover = HoverTool(tooltips=[
        ("language", "@wiki"),
        ("total scholarly publications:", "@{total}{0}"),
        ("% open publications", "@{open}{0.00%}"),
        ("% open access available publications", "@{available}{0.00%}")])
    tools.append(hover)

    # prepare the plot figure: log scale on y when the largest total exceeds 500,
    # linear scale otherwise
    figure_options = dict(tools=tools, toolbar_location="above", logo="grey",
                          plot_width=800, plot_height=600, title=TITLE)
    if selection['total'].max() > 500:
        figure_options['y_axis_type'] = "log"
    p = figure(**figure_options)

    # prepare plot background, axes labels and line colors
    p.background_fill_color = "#ffffff"  # change if you don't want white background
    p.xaxis.axis_label = "percentage of " + text + " publications"
    p.yaxis.axis_label = "total number of scholarly publications"
    p.grid.grid_line_color = "gray"

    # choose format for axes
    p.yaxis[0].formatter = NumeralTickFormatter(format="0")
    p.xaxis[0].formatter = NumeralTickFormatter(format="0.00%")

    # draw the circles; change colors here
    p.circle(open_type, "total", size=10,
             source=source, line_color="#005693", line_width=1,
             line_alpha=0.7, fill_alpha=0.5, fill_color="#23a3ff")
    labels = LabelSet(x=open_type, y="total", text="wiki", y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)

    # draws the plot
    output_notebook()
    show(p)
def generate_open_topic_plot(df, topics, open_type, lan):
    '''
    Draw a scatter plot with one circle per topic: x is the mean of the
    `open_type` column and y is the sum of the `total` column over the rows of
    `df` matching that topic and the requested wiki. The plot is shown inline
    (output_notebook + show).

    takes as input:
    df - the pandas data frame containing the data; the columns read here are
         'topic', 'wiki', 'total' and the column named by `open_type`
    topics - iterable of topic names to plot; the entry 'all_topics' is skipped
    open_type - 'open' if you want to visualize completely open access statistics;
                'available' if you want to visualize statistics about publications having free copies available;
    lan - a string corresponding to one of the languages for which we have data, or 'all' if you want to have
          a complete overview across all languages ('all' is looked up as 'all_languages'; any other value
          is compared verbatim against the 'wiki' column). Choose between:
          'ace', 'af', 'ak', 'als', 'am', 'an', 'ang', 'ar', 'arz', 'as', 'ast', 'av', 'az', 'azb', 'ba',
          'bar', 'bat_smg', 'bcl', 'be', 'be_x_old', 'bg', 'bh', 'bjn', 'bn', 'bo', 'bpy', 'br', 'bs', 'bxr',
          'cbk_zam', 'cdo', 'ce', 'ceb', 'chr', 'ckb', 'co', 'cs', 'csb', 'cu', 'cv', 'cy', 'da', 'de', 'diq',
          'dsb', 'dty', 'dv', 'ee', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'fi', 'fo', 'fr',
          'frr', 'fy', 'ga', 'gag', 'gan', 'gd', 'gl', 'gn', 'gom', 'gu', 'gv', 'ha', 'hak', 'hi', 'hif', 'hr',
          'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ie', 'ig', 'ilo', 'io', 'is', 'it', 'ja', 'jam', 'jbo', 'jv',
          'ka', 'kab', 'kk', 'km', 'kn', 'ko', 'koi', 'krc', 'ku', 'kv', 'ky', 'la', 'lad', 'lb', 'lez', 'lg',
          'li', 'lij', 'lmo', 'ln', 'lo', 'lrc', 'lt', 'lv', 'mai', 'map_bms', 'mdf', 'mg', 'mhr', 'min', 'mk',
          'ml', 'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl', 'my', 'myv', 'mzn', 'na', 'nah', 'nds', 'nds_nl', 'ne',
          'new', 'nl', 'nn', 'no', 'nov', 'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pa', 'pam', 'pap',
          'pcd', 'pfl', 'pi', 'pih', 'pl', 'pms', 'pnb', 'ps', 'pt', 'qu', 'rm', 'ro', 'roa_tara', 'ru', 'rue',
          'rw', 'sa', 'sah', 'sc', 'scn', 'sco', 'sd', 'se', 'sh', 'si', 'simple', 'sk', 'sl', 'sm', 'sn', 'so',
          'sq', 'sr', 'ss', 'st', 'stq', 'su', 'sv', 'sw', 'szl', 'ta', 'tcy', 'te', 'test', 'test2', 'tet', 'tg',
          'th', 'ti', 'tl', 'tn', 'tr', 'ts', 'tt', 'ug', 'uk', 'ur', 'uz', 'vec', 'vep', 'vi', 'vls', 'vo', 'war',
          'wo', 'wuu', 'xh', 'xmf', 'yi', 'yo', 'za', 'zh', 'zh_classical', 'zh_min_nan', 'zh_yue','all'

    returns:
    None. If `open_type` is not recognised, or there is no topic left to plot,
    a message is printed and nothing is drawn.
    '''
    if open_type == 'open':
        text = 'open access'
    elif open_type == 'available':
        text = 'available open access'
    else:
        print('wrong accessibility type')
        return

    # The title and the 'wiki' value to filter on depend only on `lan`, so they
    # are decided once here rather than inside the per-topic loop.
    if lan == 'all':
        wiki_key = 'all_languages'
        TITLE = "Percentage of " + text + " publications by topic for all languages"
    else:
        wiki_key = lan
        TITLE = "Percentage of " + text + " publications for " + lan + ".Wikipedia"

    # we are now going to gather, for each topic, the mean percentage and the
    # summed publication count over the rows of the selected wiki
    wiki_rows = df.loc[df['wiki'] == wiki_key]
    dic = {}
    counts = {}
    for topic in topics:
        if topic == 'all_topics':
            continue
        topic_rows = wiki_rows.loc[wiki_rows['topic'] == topic]
        dic[topic] = np.mean(topic_rows[open_type])
        counts[topic] = np.sum(topic_rows['total'])

    if not dic:
        # nothing to draw; taking the maximum of an empty column would fail below
        print('no topics to plot')
        return

    # dict views are materialised as lists so the columns are built the same way
    # on Python 2 and Python 3
    plot_data = pd.DataFrame(data={'topics': list(dic.keys()),
                                   'counts': list(counts.values()),
                                   'perc': list(dic.values())})

    # prepare interaction tools
    tools = "pan,wheel_zoom,box_zoom,reset,save".split(',')
    hover = HoverTool(tooltips=[
        ("topic", "@topics"),
        ("total scholarly publications:", "@{counts}{0}"),
        ("% " + text + " publications", "@{perc}{0.00%}")])
    tools.append(hover)

    # prepare the plot figure: log scale on y when the largest count exceeds 200,
    # linear scale otherwise
    figure_options = dict(tools=tools, toolbar_location="above", logo="grey",
                          plot_width=800, plot_height=600, title=TITLE)
    if plot_data['counts'].max() > 200:
        figure_options['y_axis_type'] = "log"
    p = figure(**figure_options)
    p.background_fill_color = "#ffffff"

    # change axes labels according to whether we analyze one language or all languages, prepare axes
    if lan == 'all':
        p.xaxis.axis_label = "average percentage of " + text + " publications across languages"
        p.yaxis.axis_label = "sum of all scholarly publications across languages"
    else:
        p.xaxis.axis_label = "percentage of " + text + " publications"
        p.yaxis.axis_label = "total number of scholarly publications"
    p.x_range = Range1d(0, 1.1)
    p.yaxis[0].formatter = NumeralTickFormatter(format="0")
    p.xaxis[0].formatter = NumeralTickFormatter(format="0.00%")
    p.grid.grid_line_color = "gray"

    # draw the circles; change colors here
    source = ColumnDataSource(plot_data)
    p.circle("perc", "counts", size=10, source=source, line_color="#8B0A50",
             line_width=1, line_alpha=0.7, fill_alpha=0.5, fill_color="#cd1076")
    labels = LabelSet(x="perc", y="counts", text="topics", y_offset=8,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)

    # draw plot
    output_notebook()
    show(p)
def generate_comparison_plot(dataframe, topics):
    '''
    Draw a horizontal stacked bar chart with one bar per topic, stacking the
    'open', 'available' and 'closed' columns of the rows of `dataframe` whose
    'wiki' value is 'all_languages'. The plot is shown inline
    (output_notebook + show).

    takes as input:
    dataframe - the pandas data frame containing the data; the columns read here are
                'wiki', 'topic', 'total', 'open', 'available' and 'closed'
    topics - the topic names used as the categorical y range of the figure

    returns:
    None. If `dataframe` has no 'all_languages' row, a message is printed and
    nothing is drawn.
    '''
    # Select from the `dataframe` argument (not from a module-level variable),
    # so the function plots what the caller actually passed in.
    all_languages = dataframe.loc[dataframe['wiki'] == 'all_languages']
    if all_languages.empty:
        print('no all_languages data to plot')
        return

    TITLE = "Percentage of open, available, and closed publications for all topics"
    header = ['open', 'available', 'closed']

    # prepare interaction tools
    tools = "pan,wheel_zoom,box_zoom,reset,save".split(',')
    hover = HoverTool(tooltips=[
        ("topic", "@topic"),
        ("total scholarly publications:", "@{total}{0}"),
        ("% open publications", "@{open}{0.00%}"),
        ("% OA available publications", "@{available}{0.00%}"),
        ("% paywalled publications", "@{closed}{0.00%}")])
    tools.append(hover)

    p = figure(y_range=topics, tools=tools, toolbar_location="above", logo="grey",
               plot_width=800, title=TITLE)

    # one stacked segment per column in `header`, in that order
    source = ColumnDataSource(all_languages)
    p.hbar_stack(header, y='topic', height=0.5, source=source,
                 color=['#5F9E99', '#ffb06e', '#e72d66'])
    p.legend.visible = True

    # draws the plot
    output_notebook()
    show(p)
We now read the input data and store the sets of languages and topics available (for future use)
# Read the tab-separated input table. pd.read_csv replaces the deprecated
# DataFrame.from_csv, which recent pandas releases no longer provide.
df = pd.read_csv(inputfile, sep='\t', index_col=None)

# Sorted list of the distinct values of the 'wiki' column.
wikis = list(np.sort(list(set(df['wiki'].tolist()))))

# Distinct values of the 'topic' column, minus the two entries that are not
# plotted as topics, in reverse alphabetical order. Filtering (rather than
# deleting by index) does not fail when one of the entries is absent.
excluded_topics = ('all_topics', 'Article improvement and grading')
topics = np.array([name for name in set(df['topic'].tolist())
                   if name not in excluded_topics])
topics = np.sort(topics)[::-1]
We now generate the distribution of languages over the real accessibility ('open') and potential accessibility ('available') of their publications, for all topics
# One language-distribution plot per accessibility type, across all topics.
for accessibility in ('open', 'available'):
    generate_open_language_plot(df, accessibility, 'all')
We can also look at the distribution of all topics over the accessibility of their publications, for all languages
# One topic-distribution plot per accessibility type, across all languages.
for accessibility in ('open', 'available'):
    generate_open_topic_plot(df, topics, accessibility, 'all')
We can also look at the three levels of 'openness' together, for all topics
We see from the language distribution plot above that, among the languages with the highest number of publications, "Belarusian" Wikipedia is very open; by contrast, we see that "Khmer" Wikipedia is less open; let's see their breakdown by topic:
# Topic breakdown of open publications for two individual wikis.
for wiki_name in ('bewiki', 'kmwiki'):
    generate_open_topic_plot(df, topics, 'open', wiki_name)
We see from the topic distribution plot above that "Space" is the most open topic based on scholarly articles cited, while, for example "Chemistry" has fewer open publications; let's see their breakdown by language: