![]() |
FREYA WP2 User Story 1 | As a data center, I want to see the citations of publications that use my repository for the underlying data, so that I can demonstrate the impact of our repository. |
---|---|---|
It is important for repositories of scientific data to monitor and report on the impact of the data they store. One useful proxy for that impact is the number of citations of the deposited data by the publications that accompany or use it.
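This notebook uses the DataCite GraphQL API to retrieve data (a.k.a. works) and their citations from three different repositories: PANGAEA, DRYAD and the Global Biodiversity Information Facility (GBIF), using polarstern, butterfly and Lake Malawi as example queries respectively.

Goal: By the end of this notebook you should be able to:
- retrieve the works matching a query from each repository, together with their counts;
- for each repository and query, plot the number of works per year, display the works in a table (including their citations, views and downloads), and download them as a BibTeX file;
- for a single work, retrieve its citations, display them in a table, and download them as a BibTeX file.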
%%capture
# Install required Python packages
!pip install gql requests numpy pandas
# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
# HTTP transport pointed at the DataCite GraphQL endpoint.
_transport = RequestsHTTPTransport(url='https://api.datacite.org/graphql', use_json=True)

# GraphQL client used by every query in this notebook; it is configured to
# fetch the schema through the transport above.
client = Client(fetch_schema_from_transport=True, transport=_transport)
Define the GraphQL query to find all works from the PANGAEA, DRYAD and Global Biodiversity Information Facility (GBIF) repositories using the keywords polarstern, butterfly and Lake Malawi respectively.
# Generate the GraphQL query
# Values for the query variables: one repository id and one search keyword per
# repository. The keys match the variable names declared in the query
# signature below (without the leading `$`), and the '<repo>_keyword' /
# '<repo>_repository' naming is relied on by later cells for lookups.
query_params = {
"pangaea_repository" : "pangaea.repository",
"pangaea_keyword" : "polarstern",
"dryad_repository" : "dryad.dryad",
"dryad_keyword" : "butterfly",
"gbif_repository" : "gbif.gbif",
"gbif_keyword" : "Lake Malawi",
}
# A single request with three aliased `repository` selections (pangaea, dryad,
# gbif). Each selection asks for the same fields: repository id, name and
# citationCount, plus the works matching the keyword (totalCount, per-year
# `published` facet, and per-work details including the BibTeX record).
query = gql("""query getWorksByRepositoryAndKeyword(
$pangaea_repository: ID!, $pangaea_keyword: String!,
$dryad_repository: ID!, $dryad_keyword: String!
$gbif_repository: ID!, $gbif_keyword: String!
)
{
pangaea: repository(id: $pangaea_repository) {
id
name
citationCount
works(query: $pangaea_keyword) {
totalCount
published {
title
count
}
nodes {
id
type
publicationYear
bibtex
titles {
title
}
citationCount
viewCount
downloadCount
}
}
},
dryad: repository(id: $dryad_repository) {
id
name
citationCount
works(query: $dryad_keyword) {
totalCount
published {
title
count
}
nodes {
id
type
publicationYear
bibtex
titles {
title
}
citationCount
viewCount
downloadCount
}
}
},
gbif: repository(id: $gbif_repository) {
id
name
citationCount
works(query: $gbif_keyword) {
totalCount
published {
title
count
}
nodes {
id
type
publicationYear
bibtex
titles {
title
}
citationCount
viewCount
downloadCount
}
}
}
}
""")
Run the above query via the GraphQL client
import json
# Pass the variables as a plain dict: gql documents `variable_values` as a
# mapping and serialises it into the request itself, so pre-encoding it with
# json.dumps would send the variables as one JSON string instead of an object.
data = client.execute(query, variable_values=query_params)
For each repository, display the total number of works matching the respective query.
# Report how many works matched each repository's keyword, and keep the
# per-repository `works` payloads in a dict keyed by repository alias.
works = {}
for repo in ('pangaea', 'dryad', 'gbif'):
    repo_data = data[repo]
    works[repo] = repo_data['works']
    keyword = query_params['%s_keyword' % repo]
    total = str(works[repo]['totalCount'])
    print("The number of works for query '%s' in repository %s:\n%s" % (keyword, repo_data['name'], total))
For each repository, display the repository's total number of citations (the repository-level `citationCount`, which is not restricted to the works matching the query).
# Print the repository-level citation count for each repository
for repo in ('pangaea', 'dryad', 'gbif'):
    repo_data = data[repo]
    citation_total = str(repo_data['citationCount'])
    print("The total number of citations for repository %s:\n%s" % (repo_data['name'], citation_total))
For each repository, display a bar plot showing the counts of works matching the respective query, across years.
# Plot the total number of datasets to date, by year
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import numpy as np
for repo in ['pangaea', 'dryad', 'gbif']:
    works = data[repo]['works']
    name = data[repo]['name']
    plt.rcdefaults()

    # Pair every year with its own count. Sorting the years separately from
    # the counts would mismatch them whenever the facet entries are not
    # already in ascending year order.
    counts_by_year = {int(s['title']): s['count'] for s in works['published']}
    if not counts_by_year:
        # No publication-year facet returned: there is nothing to plot
        print("No works retrieved via query '%s' from %s" % (query_params["%s_keyword" % repo], name))
        continue

    # All consecutive years between min and max year (inclusive, hence the +1)
    all_years = list(range(min(counts_by_year), max(counts_by_year) + 1))
    # Years without any works get an explicit zero so the x axis has no gaps
    num_outputs = [counts_by_year.get(year, 0) for year in all_years]

    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    x_pos = np.arange(len(all_years))
    ax.bar(x_pos, num_outputs, align='center', color='blue', edgecolor='black', linewidth=1, alpha=0.5)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(all_years, rotation='vertical')
    ax.set_ylabel('Number of works per Year')
    ax.set_xlabel('Year')
    ax.set_title("Number of works retrieved via query '%s' from %s" % (query_params["%s_keyword" % repo], name))
    plt.show()
For each repository and query, display the works in an HTML table, including the number of their citations, views and downloads.
import html
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render one HTML table per repository listing the works returned by the query
for repo in ['pangaea', 'dryad', 'gbif']:
    works = data[repo]['works']
    name = data[repo]['name']
    outputs = [['ID', 'Type', 'Publication Year', 'Titles', 'Number of Citations', 'Number of Views', 'Number of Downloads']]
    for r in works['nodes']:
        # Link text is the id without its first three '/'-separated parts
        # (for a https://doi.org/... id that leaves the bare DOI).
        work_label = '/'.join(r['id'].split("/")[3:])
        work_link = '<a href="%s">%s</a>' % (html.escape(r['id']), html.escape(work_label))
        # Text coming from the API is escaped so that markup characters in a
        # title cannot break (or inject into) the table.
        titles = html.escape('; '.join(s['title'] for s in r['titles']))
        outputs.append([
            work_link,
            html.escape(str(r['type'])),
            str(r['publicationYear']),
            titles,
            str(r['citationCount']),
            str(r['viewCount']),
            str(r['downloadCount']),
        ])

    # Display outputs as html table
    header_row = '<tr><th style="text-align:center;">' + '</th><th style="text-align:center;">'.join(outputs[0]) + '</th></tr>'
    body_rows = ''.join(
        '<tr><td style="text-align:left;">' + '</td><td style="text-align:left;">'.join(row) + '</td></tr>'
        for row in outputs[1:]
    )
    caption = '<caption><b>"%s" works from %s</b></caption>' % (html.escape(query_params["%s_keyword" % repo]), html.escape(name))
    html_table = '<html><table>' + caption + header_row + body_rows + '</table></html>'
    display(HTML(html_table))
Download the works in a single BibTeX file per repository
import pandas as pd
from IPython.display import Javascript
from requests.utils import requote_uri
import json

# For each repository, trigger a browser download of one .bib file containing
# the BibTeX records of all works returned by the query.
for repo in ['pangaea', 'dryad', 'gbif']:
    works = data[repo]['works']
    # Keep the records verbatim; works without a BibTeX record are skipped
    bibtex_data = [r['bibtex'] for r in works['nodes'] if r['bibtex']]
    bibtex_text = '\n'.join(bibtex_data) + '\n'
    filename = '%s_%s.bib' % (query_params["%s_repository" % repo], requote_uri(query_params["%s_keyword" % repo]))
    # json.dumps yields a valid JavaScript string literal, escaping quotes,
    # backslashes (common in BibTeX/LaTeX) and newlines, so the records reach
    # the file unchanged.
    js_download = """
    var bibtex = %s;
    var filename = %s;
    var blob = new Blob([bibtex], { type: 'application/x-bibtex;charset=utf-8;' });
    if (navigator.msSaveBlob) { // IE 10+
        navigator.msSaveBlob(blob, filename);
    } else {
        var link = document.createElement("a");
        if (link.download !== undefined) { // feature detection
            // Browsers that support HTML5 download attribute
            var url = URL.createObjectURL(blob);
            link.setAttribute("href", url);
            link.setAttribute("download", filename);
            link.style.visibility = 'hidden';
            document.body.appendChild(link);
            link.click();
            document.body.removeChild(link);
        }
    }
    """ % (json.dumps(bibtex_text), json.dumps(filename))
    display(Javascript(js_download))
The query will retrieve citations for IUCN Red List assessment occurrence data for freshwater species native to the Lake Malawi/Nyasa/Niassa Catchment.
# Generate the GraphQL query: Get citations for a specific work from the repository
# "id" is the DOI URL of the work whose citations are requested;
# "maxCitations" is passed to `citations(first: ...)` in the query below.
citations_query_params = {
"id" : "https://doi.org/10.15468/1z5fn8",
"maxCitations" : 75
}
# Requests the work's own id, titles, type and publication year, plus its
# citations: their totalCount and, per citing work, id, type, year,
# repository, titles, BibTeX record and citation/view/download counts.
citation_query = gql("""query getCitationsByWorkId($id: ID!, $maxCitations: Int!)
{
work(id: $id) {
id
titles {
title
}
type
publicationYear
citations(first: $maxCitations) {
totalCount
nodes {
id
type
publicationYear
repository {
id
name
}
titles {
title
}
bibtex
citationCount
viewCount
downloadCount
}
}
}
}
""")
Run the above query
import json
# Pass the variables as a plain dict: gql documents `variable_values` as a
# mapping and serialises it into the request itself, so pre-encoding it with
# json.dumps would send the variables as one JSON string instead of an object.
citations = client.execute(citation_query, variable_values=citations_query_params)
Display the number of citations of IUCN Red List assessment occurrence data for freshwater species native to the Lake Malawi/Nyasa/Niassa Catchment.
# Pull the citations payload out of the response and report its total count
cited_work = citations['work']
citations_data = cited_work['citations']
work_id = citations_query_params["id"]
print("The number of citations for work %s:\n%s" % (work_id, str(citations_data['totalCount'])))
Display citations of IUCN Red List assessment occurrence data for freshwater species native to the Lake Malawi/Nyasa/Niassa Catchment in an HTML table, including the number of their respective citations, views and downloads.
import html
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Get details for each citation
outputs = [['ID', 'Type', 'Publication Year', 'Titles', 'Number of Citations', 'Number of Views', 'Number of Downloads']]
for r in citations_data['nodes']:
    # Link text is the id without its first three '/'-separated parts
    # (for a https://doi.org/... id that leaves the bare DOI).
    citation_label = '/'.join(r['id'].split("/")[3:])
    citation_id = '<a href="%s">%s</a>' % (html.escape(r['id']), html.escape(citation_label))
    # Text coming from the API is escaped so that markup characters in a
    # title cannot break (or inject into) the table.
    titles = html.escape('; '.join(s['title'] for s in r['titles']))
    outputs.append([
        citation_id,
        html.escape(str(r['type'])),
        str(r['publicationYear']),
        titles,
        str(r['citationCount']),
        str(r['viewCount']),
        str(r['downloadCount']),
    ])

# Display outputs as html table
cited_label = '/'.join(citations_query_params['id'].split("/")[3:])
id_href = '<a href="%s">%s</a>' % (html.escape(citations_query_params['id']), html.escape(cited_label))
caption = '<caption><b>Citations of %s from %s</b></caption>' % (id_href, html.escape(query_params["gbif_repository"]))
header_row = '<tr><th style="text-align:center;">' + '</th><th style="text-align:center;">'.join(outputs[0]) + '</th></tr>'
body_rows = ''.join(
    '<tr><td style="text-align:left;">' + '</td><td style="text-align:left;">'.join(row) + '</td></tr>'
    for row in outputs[1:]
)
html_table = '<html><table>' + caption + header_row + body_rows + '</table></html>'
display(HTML(html_table))
Download the citations of IUCN Red List assessment occurrence data for freshwater species native to the Lake Malawi/Nyasa/Niassa Catchment in a single BibTeX file.
import pandas as pd
from IPython.display import Javascript
from requests.utils import requote_uri
import json

# Trigger a browser download of one .bib file containing the BibTeX records of
# the works that cite citations_query_params['id'].
# The records come from the citations response (citations_data), not from the
# `works` variable left over from the per-repository cells above.
bibtex_data = [r['bibtex'] for r in citations_data['nodes'] if r['bibtex']]
bibtex_text = '\n'.join(bibtex_data) + '\n'
id_label = '/'.join(citations_query_params['id'].split("/")[3:])
filename = '%s.bib' % requote_uri(id_label)
# json.dumps yields a valid JavaScript string literal, escaping quotes,
# backslashes (common in BibTeX/LaTeX) and newlines, so the records reach the
# file unchanged.
js_download = """
var bibtex = %s;
var filename = %s;
var blob = new Blob([bibtex], { type: 'application/x-bibtex;charset=utf-8;' });
if (navigator.msSaveBlob) { // IE 10+
    navigator.msSaveBlob(blob, filename);
} else {
    var link = document.createElement("a");
    if (link.download !== undefined) { // feature detection
        // Browsers that support HTML5 download attribute
        var url = URL.createObjectURL(blob);
        link.setAttribute("href", url);
        link.setAttribute("download", filename);
        link.style.visibility = 'hidden';
        document.body.appendChild(link);
        link.click();
        document.body.removeChild(link);
    }
}
""" % (json.dumps(bibtex_text), json.dumps(filename))
display(Javascript(js_download))