#!/usr/bin/env python
# coding: utf-8
# ![FREYA Logo](https://github.com/datacite/pidgraph-notebooks-python/blob/master/images/freya_200x121.png?raw=true) | [FREYA](https://www.project-freya.eu/en) WP2 [User Story 2](https://github.com/datacite/freya/issues/63) | As a software author, I want to be able to see the citations of my software aggregated across all versions, so that I see a complete picture of reuse.
# :------------- | :------------- | :-------------
#
# The software development process involves versioned releases. Consequently, different versions of a piece of software may be used for scientific discovery and thus referenced in publications. To quantify the impact of the software, its author must be able to capture its reuse across all of its versions.
# This notebook uses the [DataCite GraphQL API](https://api.datacite.org/graphql) to retrieve metadata about software titled: [Calculation Package: Inverting topography for landscape evolution model process representation](https://doi.org/10.5281/zenodo.2799488), including all its versions, so that its overall reuse can be quantified.
#
# **Goal**: By the end of this notebook, for a given software you should be able to display:
# - Counts of citations, views and downloads metrics, aggregated across all versions of the software
# - An interactive stacked bar plot showing how the metric counts of each version contribute to the corresponding aggregated metric counts.
# ## Install libraries and prepare GraphQL client
# In[ ]:
# Install the third-party packages this notebook imports. pandas is listed
# explicitly because the plotting cell imports it directly; relying on it
# being preinstalled breaks on a fresh environment. The 'capture' cell magic
# is used so that pip's output is not shown in the notebook.
get_ipython().run_cell_magic('capture', '', '# Install required Python packages\n!pip install gql requests numpy pandas plotly\n')
# In[ ]:
# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
# HTTP transport pointed at the DataCite GraphQL endpoint; payloads are sent as JSON.
_transport = RequestsHTTPTransport(url='https://api.datacite.org/graphql', use_json=True)

# GraphQL client bound to that transport; the schema is fetched through the
# transport itself rather than supplied locally.
client = Client(transport=_transport, fetch_schema_from_transport=True)
# ## Define and run GraphQL query
# Define the GraphQL query to retrieve metadata for the software titled: [Calculation Package: Inverting topography for landscape evolution model process representation](https://doi.org/10.5281/zenodo.2799488) using its DOI.
# In[ ]:
# Variables for the query below: the DOI of the software to look up.
query_params = dict(softwareId="https://doi.org/10.5281/zenodo.2799488")

# GraphQL document requesting the software's own metadata and metric counts,
# plus the same fields for every entry under `versions`.
_software_query_text = """query getSoftware($softwareId: ID!)
{
software(id: $softwareId) {
id
titles {
title
}
publicationYear
citations {
nodes {
id
titles {
title
}
}
}
version
versionCount
versionOfCount
citationCount
downloadCount
viewCount
versions {
nodes {
id
version
publicationYear
titles {
title
}
citations {
nodes {
id
titles {
title
}
}
}
version
versionCount
versionOfCount
citationCount
downloadCount
viewCount
}
}
}
}
"""

# Parse the text into a query object that the client can execute.
query = gql(_software_query_text)
# Run the above query via the GraphQL client
# In[ ]:
import json
# Pass the variables as a plain dict: gql documents `variable_values` as a
# dictionary and serialises it itself, so pre-encoding it with json.dumps
# hands the transport a string instead of the expected mapping.
data = client.execute(query, variable_values=query_params)
# ## Display total software metrics
# Display total number of citations, views and downloads across all versions of software: [Calculation Package: Inverting topography for landscape evolution model process representation](https://doi.org/10.5281/zenodo.2799488).
# In[ ]:
# Total each metric across all versions of the software
software = data['software']
version_nodes = software['versions']['nodes']

# One entry per metric: the sum of that metric over every version node
metricCounts = {
    metric_name: sum(version_node[metric_name] for version_node in version_nodes)
    for metric_name in ('citationCount', 'viewCount', 'downloadCount')
}

# Render the totals as the rows of a two-column Markdown table
tableBody = "".join(
    "%s | **%s**\n" % (metric_name, str(total))
    for metric_name, total in metricCounts.items()
)

# Only show the heading and table when there is at least one row
if tableBody:
    display(Markdown("Aggregated metric counts for software: [Calculation Package: Inverting topography for landscape evolution model process representation](https://doi.org/10.5281/zenodo.2799488) across all its versions:"))
    display(Markdown("|Metric | Aggregated Count|\n|---|---|\n%s" % tableBody))
# ## Plot metric counts per software version
# Plot a stacked bar plot showing how the individual versions of the software [Calculation Package: Inverting topography for landscape evolution model process representation](https://doi.org/10.5281/zenodo.2799488) contribute their metric counts to the corresponding aggregated totals.
# In[ ]:
import plotly.io as pio
import plotly.express as px
from IPython.display import IFrame
import pandas as pd
# Adapted from: https://stackoverflow.com/questions/58766305/is-there-any-way-to-implement-stacked-or-grouped-bar-charts-in-plotly-express
def px_stacked_bar(df, color_name='Metric', y_name='Metrics', **pxargs):
    """Draw the columns of *df* as a stacked bar chart using Plotly Express.

    The frame is melted to long form: the index becomes the x-axis category,
    each original column name becomes a value of the ``color_name`` column,
    and the cell values go into the ``y_name`` column.

    Args:
        df: DataFrame whose index holds the x-axis categories and whose
            columns are the series to stack.
        color_name: Name given to the melted column of original column names.
        y_name: Name given to the melted value column.
        **pxargs: Extra keyword arguments forwarded to ``px.bar``. May include
            ``color_discrete_sequence`` to override the default Pastel1 palette.

    Returns:
        The figure returned by ``px.bar``.
    """
    idx_col = df.index.name
    if idx_col is None:
        # An unnamed index would give melt() id_vars=None, which melts the
        # index column too and leaves px.bar without an x column. Give the
        # index an explicit name so it survives as the identifier column.
        idx_col = 'index'
        df = df.rename_axis(idx_col)
    long_df = pd.melt(df.reset_index(), id_vars=idx_col,
                      var_name=color_name, value_name=y_name)
    # For Plotly colour sequences see: https://plotly.com/python/discrete-color/
    # setdefault lets a caller supply their own palette without triggering a
    # duplicate-keyword TypeError.
    pxargs.setdefault('color_discrete_sequence', px.colors.qualitative.Pastel1)
    return px.bar(long_df, x=idx_col, y=y_name, color=color_name, **pxargs)
# Collect metric counts
software = data['software']
# Version string of the queried DOI itself (not of the individual version nodes)
version = software['version']
version_nodes = software['versions']['nodes']

# Dicts for the stacked bar plot, keyed by row number:
# key 0 is the aggregate over all versions, keys 1..n are the individual versions.
labels = {0: 'All Software Versions'}
citationCounts = {0: 0}
viewCounts = {0: 0}
downloadCounts = {0: 0}

for versionCnt, node in enumerate(version_nodes, start=1):
    # Label each row with that node's own version string. Using the parent
    # software's version here would give every row the same version string.
    labels[versionCnt] = '%s (%s)' % (node['version'], node['publicationYear'])

    # Record this version's counts under its own key ...
    citationCounts[versionCnt] = node['citationCount']
    viewCounts[versionCnt] = node['viewCount']
    downloadCounts[versionCnt] = node['downloadCount']

    # ... and add them to the aggregated counts (key: 0)
    citationCounts[0] += node['citationCount']
    viewCounts[0] += node['viewCount']
    downloadCounts[0] += node['downloadCount']
# Assemble one column per series; rows are matched up by their shared integer keys
plot_columns = {
    'Software/Versions': labels,
    'Citations': citationCounts,
    'Views': viewCounts,
    'Downloads': downloadCounts,
}
df = pd.DataFrame(plot_columns)

# Use the labels as the index so they become the x-axis categories
labelled_df = df.set_index('Software/Versions')
fig = px_stacked_bar(labelled_df, y_name="Counts")

# Make both the plotting area and the surrounding paper transparent
fig.update_layout(
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

# Save the interactive plot as a standalone html file
pio.write_html(fig, file='out.html')

# Show a caption, then embed the saved html file in a 500x500 frame
display(Markdown("Citations, views and downloads counts for software: [Calculation Package: Inverting topography for landscape evolution model process representation](https://doi.org/10.5281/zenodo.2799488) across all its versions, shown as stacked bar plot:"))
IFrame(src="./out.html", width=500, height=500)