#!/usr/bin/env python # coding: utf-8 # ![FREYA Logo](https://github.com/datacite/pidgraph-notebooks-python/blob/master/images/freya_200x121.png?raw=true) | [FREYA](https://www.project-freya.eu/en) WP2 [User Story 4](https://github.com/datacite/pidgraph-notebooks-python/issues/8) | As a funder I want to see how many of the research outputs funded by me have an open license enabling reuse, so that I am sure I properly support Open Science. # :------------- | :------------- | :------------- # # Funders that support open research are interested in monitoring the extent of open access given to the outputs of grants they award - while the grant is active as well as retrospectively.

# This notebook uses the [DataCite GraphQL API](https://api.datacite.org/graphql) to retrieve and report license types of outputs of the following funders to date: # - [DFG (Deutsche Forschungsgemeinschaft, Germany)](https://doi.org/10.13039/501100001659) # - [ANR (Agence Nationale de la Recherche, France)](https://doi.org/10.13039/501100001665) # - [SNF (Schweizerischer Nationalfonds zur Förderung der Wissenschaftlichen Forschung, Switzerland)](https://doi.org/10.13039/501100001711) # # **Goal**: By the end of this notebook you should be able to: # - Retrieve licenses across all output types for three different funders; # - Plot interactive bar plots showing for each funder respectively the proportion of outputs: # - issued under a given license type (including no license); # - per output type ("Dataset" and "Text"), issued under a given license type (including no license).
# Please note that "Text" output type includes publications.

# ## Install libraries and prepare GraphQL client

# In[1]:

# pandas and plotly are installed explicitly because the plotting cells of
# this notebook import them; the previously listed packages are kept as-is.
get_ipython().run_cell_magic('capture', '', '# Install required Python packages\n!pip install gql requests numpy plotnine pandas plotly\n')

# In[2]:

# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

_transport = RequestsHTTPTransport(
    url='https://api.datacite.org/graphql',
    use_json=True,
)

client = Client(
    transport=_transport,
    fetch_schema_from_transport=True,
)

# ## Define and run GraphQL query
# Define the GraphQL query to find all outputs and associated licenses for three different funders:
# [DFG (Deutsche Forschungsgemeinschaft, Germany)](https://doi.org/10.13039/501100001659),
# [ANR (Agence Nationale de la Recherche, France)](https://doi.org/10.13039/501100001665) and
# [SNF (Schweizerischer Nationalfonds zur Förderung der Wissenschaftlichen Forschung, Switzerland)](https://doi.org/10.13039/501100001711).

# In[26]:

# Generate the GraphQL query: find all outputs and their associated licenses (where available)
# for three different funders, identified by funder1, funder2 and funder3.
query_params = {
    "funder1": "https://doi.org/10.13039/501100001659",
    "funder2": "https://doi.org/10.13039/501100001665",
    "funder3": "https://doi.org/10.13039/501100001711",
}

# Short labels for the plots, keyed by the funder id returned in the query result.
funderId2Acronym = {
    "https://doi.org/10.13039/501100001659": "DFG",
    "https://doi.org/10.13039/501100001665": "ANR",
    "https://doi.org/10.13039/501100001711": "SNF",
}

# Each funder is requested three times under different aliases:
#   funderN        - all works,
#   funderNDataset - works with resourceTypeId "Dataset",
#   funderNText    - works with resourceTypeId "Text".
# The plotting cells look results up by exactly these alias names.
query = gql("""query getGrantOutputsForFundersById(
    $funder1: ID!,
    $funder2: ID!,
    $funder3: ID!
)
{
  funder1: funder(id: $funder1) {
    name
    id
    works {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  },
  funder2: funder(id: $funder2) {
    name
    id
    works {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  },
  funder3: funder(id: $funder3) {
    name
    id
    works {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  },
  funder1Dataset: funder(id: $funder1) {
    name
    id
    works(resourceTypeId: "Dataset") {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  },
  funder1Text: funder(id: $funder1) {
    name
    id
    works(resourceTypeId: "Text") {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  },
  funder2Dataset: funder(id: $funder2) {
    name
    id
    works(resourceTypeId: "Dataset") {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  },
  funder2Text: funder(id: $funder2) {
    name
    id
    works(resourceTypeId: "Text") {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  },
  funder3Dataset: funder(id: $funder3) {
    name
    id
    works(resourceTypeId: "Dataset") {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  },
  funder3Text: funder(id: $funder3) {
    name
    id
    works(resourceTypeId: "Text") {
      totalCount
      licenses {
        id
        title
        count
      }
    }
  }
}
""")

# Run the above query via the GraphQL client

# In[27]:

import json

# variable_values is passed as a plain dict: the client serialises the request
# payload itself, so pre-encoding with json.dumps would send the variables as
# a JSON *string* rather than as a JSON object.
data = client.execute(query, variable_values=query_params)

# ## Display bar plot of number of outputs per license type and funder.
# Plot an interactive bar plot showing the proportion of outputs issued under a given license type, for each funder.
# In[28]:

import plotly.io as pio
import plotly.express as px
from IPython.display import IFrame
import pandas as pd
from operator import itemgetter
import re


# Adapted from: https://stackoverflow.com/questions/58766305/is-there-any-way-to-implement-stacked-or-grouped-bar-charts-in-plotly-express
def px_stacked_bar(df, color_name='License Type', y_name='Metrics', **pxargs):
    """Return a Plotly Express bar figure built from a wide-format DataFrame.

    The frame is melted to long form: the index (whose name is used as the
    x-axis column) identifies the bar, every other column becomes one colour
    segment, and the cell values become the bar heights.

    Args:
        df: DataFrame whose index name is the x-axis column name and whose
            columns are the colour categories.
        color_name: Name given to the melted "category" column (legend title).
        y_name: Name given to the melted value column (y-axis title).
        **pxargs: Extra keyword arguments forwarded to ``px.bar``.

    Returns:
        The figure returned by ``px.bar``.
    """
    idx_col = df.index.name
    m = pd.melt(df.reset_index(), id_vars=idx_col, var_name=color_name, value_name=y_name)
    # For Plotly colour sequences see: https://plotly.com/python/discrete-color/
    # Applied as a default only, so a caller may pass its own
    # color_discrete_sequence without triggering a duplicate-keyword TypeError.
    pxargs.setdefault('color_discrete_sequence', px.colors.qualitative.Pastel1)
    return px.bar(m, x=idx_col, y=y_name, color=color_name, **pxargs)


def get_grouped_license_type(licenseId):
    """Map a license identifier to a coarse license group.

    Args:
        licenseId: License identifier string, or None.

    Returns:
        "cc-by" if the identifier contains "cc-by-", "cc0" if it contains
        "cc0-", "other" for any other identifier, and None when licenseId
        is None.
    """
    # The None check must come first: substring matching on None raises
    # TypeError, which previously made the None case unreachable.
    if licenseId is None:
        return None
    if 'cc-by-' in licenseId:
        return "cc-by"
    if 'cc0-' in licenseId:
        return "cc0"
    return "other"


queries = ['funder1', 'funder2', 'funder3']

# Map each license type to a dict that in turn maps the position of the output's bar in plot
# to the count of outputs corresponding to that license type.
licenseType2Pos2Count = {}

# Under the assumption of one license per work, for each funder licenseType2Pos2Count["No license"] is instantiated
# with the totalCount of works for that funder. Any work counts for a license found in funder['works']['licenses']
# will be subtracted from licenseType2Pos2Count["No license"] for that funder, in the end leaving the number of
# works with no license.
def _count_works_by_license(works):
    """Tally one GraphQL ``works`` result by license group.

    Args:
        works: Mapping with a ``totalCount`` entry and a ``licenses`` list
            whose items each carry ``id`` and ``count``.

    Returns:
        Dict mapping license group ("cc-by", "cc0", "other") to the summed
        work count, plus a "No license" entry (always present, listed first)
        holding ``totalCount`` minus all licensed counts.
    """
    unlicensedCount = works['totalCount']
    licenseType2Count = {"No license": 0}
    for licenseEntry in works['licenses']:
        entryId = licenseEntry['id']
        # An entry without an id still stands for licensed works, so it is
        # grouped under "other" rather than being passed on as None.
        licenseType = "other" if entryId is None else get_grouped_license_type(entryId)
        licenseType2Count[licenseType] = licenseType2Count.get(licenseType, 0) + licenseEntry['count']
        unlicensedCount -= licenseEntry['count']
    # The subtraction assumes one license per work; works carrying several
    # licenses would push the remainder below zero, and a negative bar is
    # meaningless, so clamp at 0.
    licenseType2Count["No license"] = max(unlicensedCount, 0)
    return licenseType2Count


# Funder results that are actually present, in query order. Bar positions are
# derived from this single list so that labels, "No license" counts and license
# counts can never fall out of step with each other.
# Loop variables below deliberately avoid the names `query` (the GraphQL
# document) and `license` (a builtin) so neither gets overwritten.
funderResults = [data[alias] for alias in queries if alias in data and data[alias] is not None]
barPositions = range(len(funderResults))

# Populate license type counts per funder
# labels contains funder labels in bar plot - each bar corresponds to a single funder
labels = {}
licenseType2Pos2Count["No license"] = dict.fromkeys(barPositions, 0)
for pos, funderResult in enumerate(funderResults):
    labels[pos] = funderId2Acronym[funderResult['id']]
    for licenseType, outputCount in _count_works_by_license(funderResult['works']).items():
        if licenseType not in licenseType2Pos2Count:
            # Initialise the license's counts for each funder
            licenseType2Pos2Count[licenseType] = dict.fromkeys(barPositions, 0)
        licenseType2Pos2Count[licenseType][pos] = outputCount

# Create stacked bar plot
x_name = "Funders"
dfDict = {x_name: labels}
dfDict.update(licenseType2Pos2Count)
df = pd.DataFrame(dfDict)
fig = px_stacked_bar(df.set_index(x_name), y_name="Output Counts")

# Set plot background to transparent
fig.update_layout({
    'plot_bgcolor': 'rgba(0, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)'
})

# Write interactive plot out to html file
pio.write_html(fig, file='out.html')

# Display plot from the saved html file
# NOTE(review): the line breaks at the start of the Markdown messages in this
# cell and the next are reproduced as "\n"; if they were originally HTML <br>
# tags, restore those.
display(Markdown("\nLicense types of all funder's outputs to date, shown as a stacked bar plot - one bar per funder:"))
# display() is used explicitly so the frame is also shown when this code is
# not the last expression of a notebook cell.
display(IFrame(src="./out.html", width=500, height=500))

# ## Plot output counts per license type, funder and output type
# Plot an interactive bar plot showing, for each funder and output type ("Dataset" and "Text"),
# the number of outputs issued under a given license type.

# In[29]:

import plotly.express as px
import re


def xstr(s):
    """Return 'General' for None, otherwise ``str(s)``."""
    return 'General' if s is None else str(s)


# Populate license type counts per funder
funderQueryLabels = ['funder1', 'funder2', 'funder3']
outputTypeLabels = ["Dataset", "Text"]
funder2resType2licenceType2outputCount = {}
# funderAcronym2Name is needed for the plot legend - as funder names are too long to be shown in the plot itself
funderAcronym2Name = {}

# Collect license type counts data into funder2resType2licenceType2outputCount
for funderQueryLabel in funderQueryLabels:
    for outputType in outputTypeLabels:
        # Result aliases are "<funder label><output type>", e.g. "funder1Dataset".
        alias = funderQueryLabel + outputType
        if alias not in data or data[alias] is None:
            continue
        funderResult = data[alias]
        funderAcronym = funderId2Acronym[funderResult['id']]
        funderAcronym2Name[funderAcronym] = funderResult['name']
        # "No license" starts at the totalCount of works for that funder and
        # outputType and is reduced by every licensed count (see helper).
        resType2Counts = funder2resType2licenceType2outputCount.setdefault(funderAcronym, {})
        resType2Counts[outputType] = _count_works_by_license(funderResult['works'])

# Populate data structures for faceted stacked bar plot: one row per
# (funder, output type, license type) combination.
funders, outputTypes, licenseTypes, outputCounts = [], [], [], []
for funderAcronym, resType2Counts in funder2resType2licenceType2outputCount.items():
    for outputType, licenseType2Count in resType2Counts.items():
        for licenseType, outputCount in licenseType2Count.items():
            funders.append(funderAcronym)
            outputTypes.append(outputType)
            licenseTypes.append(licenseType)
            outputCounts.append(outputCount)

dfDict = {"Funder": funders,
          "Output Type": outputTypes,
          "License": licenseTypes,
          "Output Count": outputCounts}
df1 = pd.DataFrame(dfDict)

# Create funders legend
tableBody = "".join(
    "%s | %s\n" % (funderAcronym, funderName)
    for funderAcronym, funderName in funderAcronym2Name.items()
)

fig2 = px.bar(df1, x="Output Type", y="Output Count", color="License",
              barmode="stack", facet_row="Funder")
fig2.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

# Write interactive plot out to html file
pio.write_html(fig2, file='out2.html')

# Display plot from the saved html file
markDownContent = (
    "\nFor each funder, the plot below shows counts of all outputs to date of type %s, corresponding to a given license type."
    "\nFull information is shown when you mouse-over a bar."
    "\n"
)
display(Markdown(markDownContent % " or ".join(outputTypeLabels)))
display(Markdown("| Acronym | Funder Name|\n|---|---|\n%s" % tableBody))
display(IFrame(src="./out2.html", width=500, height=700))