Bibliometricians need an overall picture of the DataCite data corpus in order to carry out their analyses. To provide this, we will create a notebook with descriptive statistics about the DataCite data corpus, broken down by the dimensions of interest (viz. discipline, career status, usage, and citations). This document takes the first step towards that notebook by querying the DOI index directly and exploring the descriptive statistics that will be used.
I have split the descriptive statistics into two main sections, one per use case: discipline and career status. Each section then breaks the data down by further dimensions: citations and usage.
I found that we have a limited number of datasets with disciplinary information that also have citations, views, and downloads. We must implement methods to enrich our metadata in order to obtain a significant sample. The proxy approach is a good way to enrich the metadata with discipline information, and it would help provide a larger data corpus to the bibliometricians. However, the fact that none of the disciplinary repositories is sending usage reports for its datasets might limit the usefulness of that data.
Installing and importing packages.
%%capture
# Install required Python packages
!pip install dfply altair altair_saver vega altair_viewer dash==1.16.3
import json
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from dfply import *
import altair.vega.v5 as alt
import altair.vegalite.v4 as lite
from altair_saver import save
# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
_transport = RequestsHTTPTransport(
    url='https://api.datacite.org/graphql',
    use_json=True,
)

client = Client(
    transport=_transport,
    fetch_schema_from_transport=True,
)
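Before running the real queries, it is worth a quick check that the client can reach the API. A minimal throwaway query (any cheap query would do):
# Sanity check: should print a large integer rather than raise a transport error
print(client.execute(gql("{ datasets { totalCount } }"))["datasets"]["totalCount"])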
We obtain all the data from the DataCite GraphQL API. Apart from the overall total, all the queries are for dataset DOIs that include Field of Science (FOS) information in their metadata. We have five queries: the total number of datasets, the number of datasets with FOS metadata, and the datasets with FOS metadata that have citations, views, or downloads.
# Query parameters: restrict to DOIs whose subjects use the
# Fields of Science and Technology (FOS) classification scheme.
query_params = {
    "query": "subjects.subjectScheme:\"Fields of Science and Technology (FOS)\"",
}
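The same filter can be cross-checked against the DataCite REST API. This is an optional sanity check, assuming the REST endpoint accepts the same Lucene-style query string:
# Optional cross-check via the REST API (requests is already imported above)
resp = requests.get(
    "https://api.datacite.org/dois",
    params={
        "query": query_params["query"],
        "resource-type-id": "dataset",
        "page[size]": 1,  # we only need meta.total, not the records
    },
)
print(resp.json()["meta"]["total"])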
datasetsQuery = gql("""query {
  datasets {
    totalCount
  }
}
""")

fOSQuery = gql("""query getOutputs($query: String) {
  datasets(query: $query) {
    totalCount
  }
}
""")
citationsQuery = gql("""query getOutputs($query: String) {
  datasets(query: $query, hasCitations: 1) {
    totalCount
    fieldsOfScience {
      title
      count
    }
    published {
      title
      count
    }
    licenses {
      title
      count
    }
    affiliations {
      title
      count
    }
  }
}
""")
viewsQuery = gql("""query getOutputs($query: String) {
  datasets(query: $query, hasViews: 1) {
    totalCount
    fieldsOfScience {
      title
      count
    }
    published {
      title
      count
    }
    licenses {
      title
      count
    }
    affiliations {
      title
      count
    }
  }
}
""")
downloadsQuery = gql("""query getOutputs($query: String) {
  datasets(query: $query, hasDownloads: 1) {
    totalCount
    fieldsOfScience {
      title
      count
    }
    published {
      title
      count
    }
    licenses {
      title
      count
    }
    affiliations {
      title
      count
    }
  }
}
""")
def get_data(data_type):
    """Gets the data from the GraphQL API into an object

    Parameters:
    data_type (string): Controlled vocabulary for the type of data

    Returns:
    object: The "datasets" object of the GraphQL response
    """
    if data_type == "citations":
        return client.execute(citationsQuery, variable_values=json.dumps(query_params))["datasets"]
    elif data_type == "views":
        return client.execute(viewsQuery, variable_values=json.dumps(query_params))["datasets"]
    elif data_type == "downloads":
        return client.execute(downloadsQuery, variable_values=json.dumps(query_params))["datasets"]
    elif data_type == "fos":
        return client.execute(fOSQuery, variable_values=json.dumps(query_params))["datasets"]
    else:
        # datasetsQuery takes no variables and returns the unfiltered total
        return client.execute(datasetsQuery)["datasets"]
usage = get_data("views")
citations = get_data("citations")
datasets = get_data("datasets")
fos = get_data("fos")
Simple transformations convert the GraphQL response into a dataframe that can be used in visualisations and tables.
def transform_distributions(dataframe, total):
    """Adds to each item the attributes needed for the visualisations

    Parameters:
    dataframe (dataframe): A dataframe with all the items
    total (int): The total count used to compute percentages

    Returns:
    dataframe: The same dataframe with a new "perc" column
    """
    if dataframe is None:
        return pd.DataFrame()
    else:
        return (dataframe >>
            mutate(
                perc = (X['count'] / total) * 100
            )
        )
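For readers who would rather avoid the dfply dependency, an equivalent sketch in plain pandas:
# Equivalent without dfply: add the percentage column with DataFrame.assign
def transform_distributions_pandas(dataframe, total):
    if dataframe is None:
        return pd.DataFrame()
    return dataframe.assign(perc=dataframe['count'] / total * 100)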
def processTable(data, facet):
    """Tabulates one facet (e.g. "licenses") of a get_data response,
    returning a dataframe with percentages, or None if the facet is missing or empty.
    """
    if not data.get(facet):
        return None
    else:
        table = pd.DataFrame(data[facet], columns=data[facet][0].keys())
        return transform_distributions(table, data['totalCount'])
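Since every response is tabulated along the same four facets, a small convenience loop can render them all at once (display and Markdown were imported above; this helper is optional and not used in the cells below):
# Render every facet table of one response in a single cell
def show_all_facets(data):
    for facet in ["fieldsOfScience", "published", "licenses", "affiliations"]:
        display(Markdown(f"**{facet}**"))
        display(processTable(data, facet))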
fig = go.Figure(go.Indicator(
    mode = "number",  # no delta reference is set, so show the number only
    value = datasets["totalCount"],
    title = {'text': "Datasets"},
    domain = {'x': [0, 1], 'y': [0, 1]}))
fig.update_layout(paper_bgcolor = "lightgray")
fig.show()

fig = go.Figure(go.Indicator(
    mode = "number",
    value = fos["totalCount"],
    title = {'text': "Datasets with FOS"},
    domain = {'x': [0, 1], 'y': [0, 1]}))
fig.update_layout(paper_bgcolor = "lightgray")
fig.show()
I have split the descriptive statistics into two subsections, one per type of data: citations and usage.
perc = 100 * (citations["totalCount"] / fos["totalCount"])

fig = go.Figure(go.Indicator(
    mode = "number",
    value = citations["totalCount"],
    title = {'text': f"Cited Datasets ({perc:.2f}%)"},
    domain = {'x': [0, 1], 'y': [0, 1]}))
fig.update_layout(paper_bgcolor = "lightgray")
fig.show()
processTable(citations, "affiliations")
 | title | count | perc |
---|---|---|---|
0 | Rice University | 5 | 0.601685 |
1 | University of California, Berkeley | 5 | 0.601685 |
2 | University of Melbourne | 5 | 0.601685 |
3 | Utah State University | 4 | 0.481348 |
4 | University of California System | 4 | 0.481348 |
5 | French National Centre for Scientific Research | 4 | 0.481348 |
6 | University of Florida | 4 | 0.481348 |
7 | University of Arizona | 4 | 0.481348 |
8 | Cornell University | 4 | 0.481348 |
9 | University of Sheffield | 4 | 0.481348 |
processTable(citations, "fieldsOfScience")
 | title | count | perc |
---|---|---|---|
0 | Earth and related environmental sciences | 349 | 41.997593 |
1 | Sociology | 146 | 17.569194 |
2 | Biological sciences | 142 | 17.087846 |
3 | Social sciences | 55 | 6.618532 |
4 | Clinical medicine | 30 | 3.610108 |
5 | Computer and information sciences | 23 | 2.767750 |
6 | Health sciences | 21 | 2.527076 |
7 | Languages and literature | 16 | 1.925391 |
8 | Psychology | 15 | 1.805054 |
9 | Physical sciences | 12 | 1.444043 |
processTable(citations, "published")
 | title | count | perc |
---|---|---|---|
0 | 2020 | 150 | 18.050542 |
1 | 2019 | 67 | 8.062575 |
2 | 2018 | 78 | 9.386282 |
3 | 2017 | 45 | 5.415162 |
4 | 2016 | 65 | 7.821901 |
5 | 2015 | 51 | 6.137184 |
6 | 2014 | 29 | 3.489771 |
7 | 2013 | 21 | 2.527076 |
8 | 2012 | 19 | 2.286402 |
9 | 2011 | 105 | 12.635379 |
10 | 2010 | 17 | 2.045728 |
processTable(citations, "licenses")
 | title | count | perc |
---|---|---|---|
0 | CC0-1.0 | 210 | 25.270758 |
1 | CC-BY-4.0 | 129 | 15.523466 |
2 | CC-BY-3.0 | 4 | 0.481348 |
3 | cc-by-nd-2.0 | 3 | 0.361011 |
4 | MIT | 2 | 0.240674 |
5 | CC-BY-NC-4.0 | 1 | 0.120337 |
6 | CC-BY-NC-ND-4.0 | 1 | 0.120337 |
7 | CC-BY-NC-SA-4.0 | 1 | 0.120337 |
8 | CC-BY-SA-4.0 | 1 | 0.120337 |
9 | GPL-3.0 | 1 | 0.120337 |
perc = 100 * (usage["totalCount"] / fos["totalCount"])

fig = go.Figure(go.Indicator(
    mode = "number",
    value = usage["totalCount"],
    title = {'text': f"Viewed Datasets ({perc:.2f}%)"},
    domain = {'x': [0, 1], 'y': [0, 1]}))
fig.update_layout(paper_bgcolor = "lightgray")
fig.show()
processTable(usage, "fieldsOfScience")
 | title | count | perc |
---|---|---|---|
0 | Sociology | 166 | 68.032787 |
1 | Biological sciences | 39 | 15.983607 |
2 | Clinical medicine | 13 | 5.327869 |
3 | Health sciences | 9 | 3.688525 |
4 | Computer and information sciences | 4 | 1.639344 |
5 | Chemical engineering | 2 | 0.819672 |
6 | Earth and related environmental sciences | 2 | 0.819672 |
7 | Languages and literature | 2 | 0.819672 |
8 | Medical biotechnology | 2 | 0.819672 |
9 | Chemical sciences | 1 | 0.409836 |
processTable(usage, "affiliations")
 | title | count | perc |
---|---|---|---|
0 | University of California, Berkeley | 7 | 2.868852 |
1 | Rice University | 5 | 2.049180 |
2 | Utah State University | 5 | 2.049180 |
3 | University of Melbourne | 5 | 2.049180 |
4 | Harvard University | 5 | 2.049180 |
5 | University of Helsinki | 5 | 2.049180 |
6 | University of Sheffield | 5 | 2.049180 |
7 | University of Montana | 4 | 1.639344 |
8 | Princeton University | 4 | 1.639344 |
9 | University of California System | 4 | 1.639344 |
processTable(usage, "published")
 | title | count | perc |
---|---|---|---|
0 | 2020 | 23 | 9.426230 |
1 | 2019 | 53 | 21.721311 |
2 | 2018 | 31 | 12.704918 |
3 | 2017 | 21 | 8.606557 |
4 | 2016 | 37 | 15.163934 |
5 | 2015 | 26 | 10.655738 |
6 | 2014 | 19 | 7.786885 |
7 | 2013 | 11 | 4.508197 |
8 | 2012 | 13 | 5.327869 |
9 | 2011 | 4 | 1.639344 |
10 | 2010 | 2 | 0.819672 |
processTable(usage, "licenses")
 | title | count | perc |
---|---|---|---|
0 | CC0-1.0 | 226 | 92.622951 |
1 | CC-BY-4.0 | 2 | 0.819672 |
def vega_donut_template(data):
    """Injects data into the vega specification

    Parameters:
    data (array): Array of nodes

    Returns:
    VegaSpec: Specification with data
    """
    return """
{
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "description": "A simple donut chart with embedded data.",
  "padding": {"left": 55, "top": 10, "right": 10, "bottom": 10},
  "width": 200,
  "height": 200,
  "data": {
    "values": """ + data + """
  },
  "layer": [
    {
      "mark": {
        "type": "arc",
        "innerRadius": 68,
        "outerRadius": 90,
        "cursor": "pointer",
        "tooltip": true
      },
      "encoding": {
        "theta": {
          "field": "count",
          "type": "quantitative",
          "sort": "descending"
        },
        "color": {
          "field": "title",
          "type": "nominal",
          "title": "year",
          "scale": {
            "range": [
              "#fccde5",
              "#fdb462",
              "#fb8072",
              "#b3de69",
              "#bc80bd",
              "#8dd3c7",
              "#ffed6f",
              "#d9d9d9",
              "#ffffb3",
              "#bebada",
              "#80b1d3"
            ],
            "domain": [
              "2020",
              "2019",
              "2018",
              "2017",
              "2016",
              "2015",
              "2014",
              "2013",
              "2012",
              "2011",
              "2010"
            ]
          }
        }
      }
    },
    {
      "mark": {
        "type": "text",
        "fill": "#767676",
        "align": "center",
        "baseline": "middle",
        "fontSize": 27
      },
      "encoding": {"text": {"value": "33"}}
    }
  ],
  "view": {"stroke": "none"}
}
"""
def vega_grid_template(total, discipline_distribution, affiliation_distribution):
    """Concatenates three sub-specifications into one vertical grid

    Parameters:
    total (string): Vega-Lite fragment for the totals chart
    discipline_distribution (string): Fragment for the discipline chart
    affiliation_distribution (string): Fragment for the affiliation chart

    Returns:
    VegaSpec: Specification combining the three charts
    """
    return """
{
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "description": "Vertically concatenated charts of the dataset totals and their distributions.",
  "vconcat": [
    {
      """ + total + """
    },
    {
      """ + discipline_distribution + """
    },
    {
      """ + affiliation_distribution + """
    }
  ]
}
"""
# Use a response that includes the "published" facet; get_data("") returns totals only
chart = lite.VegaLite(json.loads(vega_donut_template(json.dumps(citations["published"]))))
chart
def vega_hist_template(data):
    """Injects data into the vega specification

    Parameters:
    data (array): Array of nodes

    Returns:
    VegaSpec: Specification with data
    """
    return """
{
  "$schema": "https://vega.github.io/schema/vega-lite/v4.json",
  "data": {"values": """ + data + """},
  "padding": {"left": 5, "top": 5, "right": 5, "bottom": 5},
  "transform": [
    {"calculate": "toNumber(datum.title)", "as": "period"},
    {"calculate": "toNumber(datum.title)+1", "as": "bin_end"},
    {"filter": "toNumber(datum.title) >= 2010"}
  ],
  "width": 242,
  "mark": {"type": "bar", "cursor": "pointer", "tooltip": true},
  "selection": {
    "highlight": {"type": "single", "empty": "none", "on": "mouseover"}
  },
  "encoding": {
    "x": {
      "field": "period",
      "bin": {"binned": true, "step": 1, "maxbins": 11},
      "type": "quantitative",
      "axis": {
        "format": "1"
      },
      "scale": {"domain": [2010, 2021]}
    },
    "x2": {"field": "bin_end"},
    "y": {
      "field": "count",
      "type": "quantitative",
      "axis": {"format": ",f", "tickMinStep": 1}
    },
    "color": {
      "field": "count",
      "scale": {"range": ["#1abc9c"]},
      "type": "nominal",
      "legend": null,
      "condition": [{"selection": "highlight", "value": "#34495e"}]
    }
  },
  "config": {
    "view": {"stroke": null},
    "axis": {"grid": false, "labelFlush": false}
  }
}
"""
chart = lite.VegaLite(json.loads(vega_hist_template(json.dumps(citations["published"]))))
chart
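If a spec needs to be reused outside the notebook (for example, rendered with vega-embed on a web page), writing the composed JSON to disk is enough; the filename here is arbitrary:
# Persist the composed histogram spec for reuse outside the notebook
with open("published_histogram.vl.json", "w") as f:
    json.dump(json.loads(vega_hist_template(json.dumps(citations["published"]))), f, indent=2)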