Notebook

	FREYA WP2 User Story 1	As a data center, I want to see the citations of publications that use my repository for the underlying data, so that I can demonstrate the impact of our repository.

It is important for repositories of scientific data to monitor and report on the impact of the data they store. One useful proxy for that impact is the number of citations of publications accompanying the deposited data.

This notebook uses the DataCite GraphQL API to retrieve data (a.k.a. works) and their citations from three different repositories: PANGAEA, DRYAD and Global Biodiversity Information Facility, using polarstern, butterfly and Lake Malawi as example queries respectively.

Goal: By the end of this notebook you should be able to:

Retrieve works for a chosen repository and query, along with associated metrics such as citation, view and download counts;
Visualise the work counts over time, e.g. as a bar plot of the number of works per year;
Present the works in a tabular format and download them in a single BibTeX file;
For a given work, retrieve all the citations, present them in a tabular format and then download them in a single BibTeX file.

Install libraries and prepare GraphQL client¶

In [ ]:

%%capture
# Install required Python packages
!pip install gql requests numpy pandas

In [ ]:

# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

# HTTP transport pointed at the public DataCite GraphQL endpoint
# (use_json=True is handed to the transport unchanged).
_transport = RequestsHTTPTransport(url='https://api.datacite.org/graphql', use_json=True)

# Client shared by the query cells below; it is asked to fetch the schema
# from the transport.
client = Client(transport=_transport, fetch_schema_from_transport=True)

Define and run the GraphQL query¶

Define the GraphQL query to find all works from the PANGAEA, DRYAD and Global Biodiversity Information Facility (GBIF) repositories, using the keywords polarstern, butterfly and Lake Malawi respectively.

In [ ]:

# Generate the GraphQL query
# query_params supplies a value for every variable declared by the query below.
# For each repository alias (pangaea, dryad, gbif) there are two entries:
#   "<alias>_repository" - the repository ID passed to repository(id: ...)
#   "<alias>_keyword"    - the search term passed to works(query: ...)
# Later cells look these keys up as '%s_keyword' % repo and "%s_repository" % repo,
# so the key names must stay in step with the aliases used in the query.
query_params = {
    "pangaea_repository" : "pangaea.repository",
    "pangaea_keyword" : "polarstern",
    "dryad_repository" : "dryad.dryad",
    "dryad_keyword" : "butterfly",
    "gbif_repository" : "gbif.gbif",
    "gbif_keyword" : "Lake Malawi", 
}

# A single request covers all three repositories, aliased as pangaea, dryad and gbif.
# Every alias selects the same fields:
#   - on the repository: id, name, citationCount
#   - on the works matching the keyword: totalCount, the `published` facet
#     (title/count pairs; the plotting cell reads `title` as a year) and, per work
#     node, id, type, publicationYear, bibtex, titles and the citation/view/download counts.
# The string is sent as-is, so nothing may be added inside the literal without
# changing the request.
query = gql("""query getWorksByRepositoryAndKeyword(
    $pangaea_repository: ID!, $pangaea_keyword: String!,
    $dryad_repository: ID!, $dryad_keyword: String!
    $gbif_repository: ID!, $gbif_keyword: String!
    )
{
  pangaea: repository(id: $pangaea_repository) {
    id
    name
    citationCount
    works(query: $pangaea_keyword) {
      totalCount
      published {
        title
        count
      }
      nodes {
        id
        type
        publicationYear
        bibtex
        titles {
          title
        }
        citationCount
        viewCount
        downloadCount
      }
    }
  },
  dryad: repository(id: $dryad_repository) {
    id
    name
    citationCount
    works(query: $dryad_keyword) {
      totalCount
      published {
        title
        count
      }
      nodes {
        id
        type
        publicationYear
        bibtex
        titles {
          title
        }
        citationCount
        viewCount
        downloadCount
      }
    }
  },
  gbif: repository(id: $gbif_repository) {
    id
    name
    citationCount
    works(query: $gbif_keyword) {
      totalCount
      published {
        title
        count
      }
      nodes {
        id
        type
        publicationYear
        bibtex
        titles {
          title
        }
        citationCount
        viewCount
        downloadCount
      }
    }
  }
}
""")

Run the above query via the GraphQL client

In [ ]:

import json
# Pass the variables as a plain dict: gql documents `variable_values` as a
# mapping and serialises the request itself. Pre-encoding it with json.dumps()
# would put the variables into the request as one JSON string instead of an object.
data = client.execute(query, variable_values=query_params)

Display the number of works¶

For each repository, display the total number of works matching the respective query.

In [ ]:

# Report, for each repository, how many works matched that repository's keyword
works = {}
for repo in ('pangaea', 'dryad', 'gbif'):
    repository = data[repo]
    works[repo] = repository['works']
    keyword = query_params['%s_keyword' % repo]
    total = str(works[repo]['totalCount'])
    print("The number of works for query '%s' in repository %s:\n%s" % (keyword, repository['name'], total))

Display the number of citations of the works¶

For each repository, display the citation count reported for the repository (the `citationCount` field is requested on the repository itself, not on the works matching the query).

In [ ]:

# Print the citation count that the query returned for each repository
for repo in ('pangaea', 'dryad', 'gbif'):
    repository = data[repo]
    citation_total = str(repository['citationCount'])
    print("The total number of citations for repository %s:\n%s" % (repository['name'], citation_total))
    

Plot the number of works per year¶

For each repository, display a bar plot showing the counts of works matching the respective query, across years.

In [ ]:

# Plot the total number of datasets to date, by year
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import numpy as np

# Draw one bar chart per repository: number of matching works per publication year
for repo in ['pangaea', 'dryad', 'gbif']:
    works = data[repo]['works']
    name = data[repo]['name']
    keyword = query_params["%s_keyword" % repo]
    plt.rcdefaults()

    # Map each year to its count so a year and its count can never get out of
    # step, whatever order the `published` facet is returned in.
    count_by_year = {int(s['title']): s['count'] for s in works['published']}
    if not count_by_year:
        # Nothing to plot, and min()/max() below would fail on an empty facet
        print("No works to plot for query '%s' in repository %s" % (keyword, name))
        continue

    # All consecutive years between the min and max year (both inclusive);
    # years for which no works were reported get a count of 0
    all_years = list(range(min(count_by_year), max(count_by_year) + 1))
    num_outputs = [count_by_year.get(year, 0) for year in all_years]

    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    x_pos = np.arange(len(all_years))
    ax.bar(x_pos, num_outputs, align='center', color='blue', edgecolor='black', linewidth=1, alpha=0.5)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(all_years, rotation='vertical')
    ax.set_ylabel('Number of works per Year')
    ax.set_xlabel('Year')
    ax.set_title("Number of works retrieved via query '%s' from %s" % (keyword, name))
    plt.show()

Display works in tabular format¶

For each repository and query, display the works in an HTML table, including the number of their citations, views and downloads.

In [ ]:

from IPython.core.display import display, HTML

# Render, for each repository, an HTML table of the works returned by the query
import html  # standard library; escapes API-supplied text before it is embedded in markup

for repo in ['pangaea', 'dryad', 'gbif']:
    works = data[repo]['works']
    name = data[repo]['name']
    keyword = query_params["%s_keyword" % repo]
    header = ['ID', 'Type', 'Publication Year', 'Titles', 'Number of Citations', 'Number of Views', 'Number of Downloads']

    rows = []
    for r in works['nodes']:
        # The link text drops the first three '/'-separated parts of the id,
        # e.g. the 'https://doi.org/' prefix of an id like https://doi.org/10.15468/1z5fn8
        link_text = '/'.join(r['id'].split("/")[3:])
        work_link = '<a href="%s">%s</a>' % (html.escape(r['id'], quote=True), html.escape(link_text))
        # Titles and types come from the API, so they are escaped rather than
        # trusted as markup; any tags they contain are shown literally.
        titles = '; '.join(html.escape(str(s['title'])) for s in r['titles'])
        rows.append([
            work_link,
            html.escape(str(r['type'])),
            str(r['publicationYear']),
            titles,
            str(r['citationCount']),
            str(r['viewCount']),
            str(r['downloadCount']),
        ])

    # Assemble the table as a fragment (no <html> wrapper: it is inserted into
    # an existing page by display())
    caption = '<caption><b>"%s" works from %s</b></caption>' % (html.escape(keyword), html.escape(str(name)))
    header_row = '<tr>' + ''.join('<th style="text-align:center;">%s</th>' % h for h in header) + '</tr>'
    body_rows = ''.join(
        '<tr>' + ''.join('<td style="text-align:left;">%s</td>' % cell for cell in row) + '</tr>'
        for row in rows
    )
    display(HTML('<table>' + caption + header_row + body_rows + '</table>'))

Download works in BibTeX format¶

Download the works in a single BibTeX file per repository

In [ ]:

import pandas as pd
from IPython.display import Javascript
from requests.utils import requote_uri

# For each repository, download the BibTeX entries of its works as a single .bib file
import json  # standard library; json.dumps() produces a fully escaped JavaScript string literal

# Browser-side snippet that offers `content` as a download named `filename`.
# Both %s placeholders receive ready-made JavaScript string literals (quotes
# included), so quotes, backslashes and newlines in the data cannot break the script.
_BIBTEX_DOWNLOAD_JS = """
var content = %s;
var filename = %s;
var blob = new Blob([content], { type: 'application/x-bibtex;charset=utf-8;' });
if (navigator.msSaveBlob) { // IE 10+
    navigator.msSaveBlob(blob, filename);
} else {
    var link = document.createElement("a");
    if (link.download !== undefined) { // feature detection
        // Browsers that support HTML5 download attribute
        var url = URL.createObjectURL(blob);
        link.setAttribute("href", url);
        link.setAttribute("download", filename);
        link.style.visibility = 'hidden';
        document.body.appendChild(link);
        link.click();
        document.body.removeChild(link);
    }
}
"""

for repo in ['pangaea', 'dryad', 'gbif']:
    works = data[repo]['works']
    # Keep only works that actually carry a BibTeX record; drop carriage
    # returns so the file uses plain '\n' line endings
    entries = [r['bibtex'].replace('\r', '') for r in works['nodes'] if r['bibtex']]
    if not entries:
        print("No BibTeX entries to download for repository %s" % query_params["%s_repository" % repo])
        continue

    # One entry after the other, exactly as returned (quotes and backslashes preserved)
    bibtex_text = '\n'.join(entries) + '\n'
    filename = '%s_%s.bib' % (query_params["%s_repository" % repo], requote_uri(query_params["%s_keyword" % repo]))
    display(Javascript(_BIBTEX_DOWNLOAD_JS % (json.dumps(bibtex_text), json.dumps(filename))))

Define and run GraphQL query to retrieve citations for a single work¶

The query will retrieve citations for IUCN Red List assessment occurrence data for freshwater species native to the Lake Malawi/Nyasa/Niassa Catchment.

In [ ]:

# Generate the GraphQL query: Get citations for a specific work from the repository
# citations_query_params supplies the two variables declared by the query below:
#   "id"           - the work whose citations are requested (passed to work(id: ...))
#   "maxCitations" - passed as the `first` argument of `citations`
#                    (presumably caps how many citation nodes are returned, so
#                    totalCount may exceed the number of nodes - confirm against the API)
citations_query_params = {
    "id" : "https://doi.org/10.15468/1z5fn8",
    "maxCitations" : 75
}

# The query selects the work's own id, titles, type and publicationYear, and for its
# citations the totalCount plus, per citing work, id, type, publicationYear,
# repository (id, name), titles, bibtex and the citation/view/download counts.
# The string is sent as-is, so nothing may be added inside the literal without
# changing the request.
citation_query = gql("""query getCitationsByWorkId($id: ID!, $maxCitations: Int!)
{
  work(id: $id) {
    id
    titles {
      title
    }
    type
    publicationYear
    citations(first: $maxCitations) {
      totalCount
      nodes {
        id
        type
        publicationYear
        repository {
          id
          name
        }
        titles {
          title
        }
        bibtex
        citationCount
        viewCount
        downloadCount
      }
    }
  }
}
""")

Run the above query

In [ ]:

import json
# Pass the variables as a plain dict: gql documents `variable_values` as a
# mapping and serialises the request itself. Pre-encoding it with json.dumps()
# would put the variables into the request as one JSON string instead of an object.
citations = client.execute(citation_query, variable_values=citations_query_params)

Display the number of citations¶

Display the number of citations of IUCN Red List assessment occurrence data for freshwater species native to the Lake Malawi/Nyasa/Niassa Catchment.

In [ ]:

# Report the total number of citations returned for the queried work
citations_data = citations['work']['citations']
citation_total = str(citations_data['totalCount'])
print("The number of citations for work %s:\n%s" % (citations_query_params["id"], citation_total))

Display citations in tabular format¶

Display citations of IUCN Red List assessment occurrence data for freshwater species native to the Lake Malawi/Nyasa/Niassa Catchment in an HTML table, including the number of their respective citations, views and downloads.

In [ ]:

from IPython.core.display import display, HTML

# Render an HTML table of the citations returned for citations_query_params['id']
import html  # standard library; escapes API-supplied text before it is embedded in markup

header = ['ID', 'Type', 'Publication Year', 'Titles', 'Number of Citations', 'Number of Views', 'Number of Downloads']

rows = []
for r in citations_data['nodes']:
    # The link text drops the first three '/'-separated parts of the id,
    # e.g. the 'https://doi.org/' prefix of an id like https://doi.org/10.15468/1z5fn8
    link_text = '/'.join(r['id'].split("/")[3:])
    citation_id = '<a href="%s">%s</a>' % (html.escape(r['id'], quote=True), html.escape(link_text))
    # Titles and types come from the API, so they are escaped rather than
    # trusted as markup; any tags they contain are shown literally.
    titles = '; '.join(html.escape(str(s['title'])) for s in r['titles'])
    rows.append([
        citation_id,
        html.escape(str(r['type'])),
        str(r['publicationYear']),
        titles,
        str(r['citationCount']),
        str(r['viewCount']),
        str(r['downloadCount']),
    ])

# Caption links back to the cited work and names the GBIF repository id from query_params
cited_id = citations_query_params['id']
id_href = '<a href="%s">%s</a>' % (html.escape(cited_id, quote=True), html.escape('/'.join(cited_id.split("/")[3:])))
caption = '<caption><b>Citations of %s from %s</b></caption>' % (id_href, html.escape(query_params["gbif_repository"]))

# Assemble the table as a fragment (no <html> wrapper: it is inserted into
# an existing page by display())
header_row = '<tr>' + ''.join('<th style="text-align:center;">%s</th>' % h for h in header) + '</tr>'
body_rows = ''.join(
    '<tr>' + ''.join('<td style="text-align:left;">%s</td>' % cell for cell in row) + '</tr>'
    for row in rows
)
display(HTML('<table>' + caption + header_row + body_rows + '</table>'))

Download citations in BibTeX format¶

Download the citations of IUCN Red List assessment occurrence data for freshwater species native to the Lake Malawi/Nyasa/Niassa Catchment in a single BibTeX file.

In [ ]:

import pandas as pd
from IPython.display import Javascript
from requests.utils import requote_uri

# Download the BibTeX entries of the citations of citations_query_params['id'] as a single .bib file
import json  # standard library; json.dumps() produces a fully escaped JavaScript string literal

# Browser-side snippet that offers `content` as a download named `filename`.
# Both %s placeholders receive ready-made JavaScript string literals (quotes
# included), so quotes, backslashes and newlines in the data cannot break the script.
_BIBTEX_DOWNLOAD_JS = """
var content = %s;
var filename = %s;
var blob = new Blob([content], { type: 'application/x-bibtex;charset=utf-8;' });
if (navigator.msSaveBlob) { // IE 10+
    navigator.msSaveBlob(blob, filename);
} else {
    var link = document.createElement("a");
    if (link.download !== undefined) { // feature detection
        // Browsers that support HTML5 download attribute
        var url = URL.createObjectURL(blob);
        link.setAttribute("href", url);
        link.setAttribute("download", filename);
        link.style.visibility = 'hidden';
        document.body.appendChild(link);
        link.click();
        document.body.removeChild(link);
    }
}
"""

# Take the entries from the citation nodes of the queried work (not from the
# `works` variable left over from the per-repository cells). Citations without
# a BibTeX record are skipped; carriage returns are dropped so the file uses
# plain '\n' line endings.
entries = [r['bibtex'].replace('\r', '') for r in citations_data['nodes'] if r['bibtex']]
id_label = '/'.join(citations_query_params['id'].split("/")[3:])

if entries:
    # One entry after the other, exactly as returned (quotes and backslashes preserved)
    bibtex_text = '\n'.join(entries) + '\n'
    filename = '%s.bib' % requote_uri(id_label)
    display(Javascript(_BIBTEX_DOWNLOAD_JS % (json.dumps(bibtex_text), json.dumps(filename))))
else:
    print("No BibTeX entries to download for work %s" % citations_query_params['id'])

In [ ]: