#!/usr/bin/env python
# coding: utf-8

# ![FREYA Logo](https://github.com/datacite/pidgraph-notebooks-python/blob/master/images/freya_200x121.png?raw=true) | [FREYA](https://www.project-freya.eu/en) | WP2 [User Story 6](https://www.pidforum.org/t/pid-graph-graphql-example-disambiguate-researchers/931): As a researcher, I am looking for more information about another researcher with a common name, but don’t know his/her ORCID ID.
# :------------- | :------------- | :-------------
#
# It is important to be able to locate a researcher of interest even though their ORCID ID is unknown. For example, a reader of a scientific publication may wish to find out more about one of the authors, where the publisher has not cross-referenced that author's name to ORCID.

#
# This notebook uses the [DataCite GraphQL API](https://api.datacite.org/graphql) to disambiguate a researcher name via a *funnel* approach:
# * First, all researcher records matching the query "John AND Smith" are retrieved, and an alphabetically sorted list of affiliations and the corresponding researcher names is displayed;
# * Then the notebook simulates the user selecting one of the affiliations (in our case "University of Arizona"), and then performs a more detailed query: "John AND Smith AND University of Arizona". The second query retrieves and displays a much smaller set of results, now also containing the researcher's publications, thus helping the user pinpoint the researcher of interest more easily.
#
# **Goal**: By the end of this notebook, you should be able to successfully disambiguate a researcher name of interest.

# ## Install libraries and prepare GraphQL client

# In[228]:


get_ipython().run_cell_magic('capture', '', '# Install required Python packages\n!pip install gql requests\n')


# In[229]:


# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

_transport = RequestsHTTPTransport(
    url='https://api.datacite.org/graphql',
    use_json=True,
)

client = Client(
    transport=_transport,
    fetch_schema_from_transport=True,
)


# ## Define and run GraphQL query
# Define the GraphQL query to find all researchers (with their affiliations) whose records match the query "John AND Smith":

# In[231]:


# Generate the GraphQL query to retrieve up to 100 researchers per page matching query "John AND Smith"
query_params = {
    "query": "John AND Smith",
    "max_researchers": 100,
    # An empty cursor requests the first page; it is advanced while paginating
    "query_end_cursor": "",
}

query_str = """query getResearchersByName(
    $query: String!,
    $max_researchers: Int!,
    $query_end_cursor : String!
    )
{
  people(query: $query, first: $max_researchers, after: $query_end_cursor) {
    totalCount
    pageInfo {
      hasNextPage
      endCursor
    }
    nodes {
      id
      givenName
      familyName
      name
      affiliation {
        name
      }
    }
  }
}
"""


# Run the above query via the GraphQL client

# In[232]:


import json

# Initialise overall data dict that will store results
data = {}

# The query text never changes between pages, so parse it only once
query = gql(query_str)

# Keep retrieving results until there are no more results left
while True:
    # Variables are passed as a dict; the transport takes care of JSON encoding
    res = client.execute(query, variable_values=query_params)
    # Always read the page that was just fetched, so that the pagination info
    # below never refers to an undefined or stale `people` variable
    people = res["people"]
    if "people" not in data:
        # First page: keep the whole response (including totalCount)
        data = res
    else:
        # Subsequent pages: only accumulate the researcher nodes
        data["people"]["nodes"].extend(people["nodes"])
    pageInfo = people["pageInfo"]
    # Stop when the server reports no further page, or gives no cursor to continue from
    if not pageInfo["hasNextPage"] or pageInfo["endCursor"] is None:
        break
    query_params["query_end_cursor"] = pageInfo["endCursor"]


# ## List researcher details
# List in tabular format affiliations and the corresponding researcher names. This allows the user to select one of the affiliations to use in a more detailed query (see below) that also retrieves publications.
# In[234]:


# Collect names and affiliations for the researchers found

def testIfPresentCaseInsensitive(solrQuery, fieldValueLowerCase):
    """Test whether a field value matches a Solr-style query, ignoring case.

    The query is split on " AND " into groups that must all match. Within a
    group, space-separated terms are alternatives (logical OR). A term matches
    when its lower-cased form is a substring of the field value.

    Args:
        solrQuery: Query string, e.g. "John AND Smith".
        fieldValueLowerCase: Field value to test; the caller must already have
            lower-cased it.

    Returns:
        True if every AND-group has at least one matching term, else False.
    """
    return all(
        any(term.lower() in fieldValueLowerCase for term in orTerms.split(" "))
        for orTerms in solrQuery.split(" AND ")
    )


people = data['people']
# Maps affiliation name -> set of names of the researchers listing it
af2Names = {}
totalCount = 0
for node in people['nodes']:
    # A record without a name cannot match the name query; treat it as empty
    name = node['name'] or ""
    # TODO: Remove if we manage to search only individual fields
    if not testIfPresentCaseInsensitive(query_params['query'], name.lower()):
        continue
    totalCount += 1
    for af in node['affiliation']:
        affiliation = af['name']
        # Skip unnamed affiliations: they cannot be selected by the user and
        # a None key would make the sort below fail
        if not affiliation:
            continue
        af2Names.setdefault(affiliation, set()).add(name)

# Sort both affiliations and names so the table is stable between runs
tableBody = "".join(
    af + " | " + ', '.join(sorted(names)) + "\n"
    for af, names in sorted(af2Names.items())
)

display(Markdown("Total number of researchers found: **%d**<br>The list of researchers by affiliation is as follows:" % totalCount))
display(Markdown(""))
display(Markdown("| Affiliation | Researcher Names |\n|---|---|\n%s" % tableBody))


# In[235]:


# Generate the GraphQL query to retrieve all researchers matching query "John AND Smith" and affiliation "University of Arizona", now with works
name_query = "John AND Smith"
affiliation_query = "\"University of Arizona\""
query_params1 = {
    "query": name_query + " AND " + affiliation_query,
    "max_researchers": 10,
    # An empty cursor requests the first page; it is advanced while paginating
    "query_end_cursor": "",
}

query_str = """query getResearchersByName(
    $query: String!,
    $max_researchers: Int!,
    $query_end_cursor : String!
    )
{
  people(query: $query, first: $max_researchers, after: $query_end_cursor) {
    totalCount
    pageInfo {
      hasNextPage
      endCursor
    }
    nodes {
      id
      givenName
      familyName
      name
      affiliation {
        name
      }
      works(first: 3) {
        nodes {
          id
          publicationYear
          publisher
          titles {
            title
          }
          creators {
            id
            name
            affiliation {
              id
              name
            }
          }
          subjects {
            subject
          }
        }
      }
    }
  }
}
"""


# Run the above query via the GraphQL client

# In[236]:


import json

# Initialise overall data dict that will store results
data1 = {}

# The query text never changes between pages, so parse it only once
query = gql(query_str)

# Keep retrieving results until there are no more results left
while True:
    # Variables are passed as a dict; the transport takes care of JSON encoding
    res = client.execute(query, variable_values=query_params1)
    # Always read the page just fetched; otherwise the first iteration would
    # consult the pagination info left over from the previous query
    people = res["people"]
    if "people" not in data1:
        # First page: keep the whole response (including totalCount)
        data1 = res
    else:
        # Subsequent pages: only accumulate the researcher nodes
        data1["people"]["nodes"].extend(people["nodes"])
    pageInfo = people["pageInfo"]
    # Stop when the server reports no further page, or gives no cursor to continue from
    if not pageInfo["hasNextPage"] or pageInfo["endCursor"] is None:
        break
    # Advance the cursor of THIS query's parameters; updating the first
    # query's parameters would re-request the same page forever
    query_params1["query_end_cursor"] = pageInfo["endCursor"]


# In[237]:


from textwrap import shorten

# Collect all relevant details for the researchers found
# A set is used so that identical rows are shown only once
tableBody = set()
people = data1['people']
for node in people['nodes']:
    orcid_id = node['id']
    # Guard against missing values so string concatenation below cannot fail
    firstName = node['givenName'] or ""
    surname = node['familyName'] or ""
    name = node['name'] or ""
    # TODO: Remove if we manage to search only individual fields
    if not testIfPresentCaseInsensitive(name_query, name.lower()):
        continue

    orcidHref = ""
    if orcid_id:
        orcidHref = "[" + name + "](" + orcid_id + ")"

    affiliations = [affiliation['name'] for affiliation in node['affiliation']]

    works = ""
    if 'works' in node:
        for work in node['works']['nodes']:
            titles = [
                shorten(title['title'], width=50, placeholder="...")
                for title in work['titles']
            ]
            # Restrict display to the first author only
            creators = work['creators']
            authors = ""
            if creators:
                first = creators[0]
                if first['id'] is not None:
                    authors = "[" + first['name'] + "](" + first['id'] + ")"
                else:
                    authors = first['name']
                if len(creators) > 1:
                    authors += " et al."
            # <br> (not a newline) separates works: a literal newline would
            # terminate the Markdown table row
            works += (authors + " (" + str(work['publicationYear']) + ") ["
                      + ', '.join(titles) + "](" + work['id'] + ") *"
                      + (work['publisher'] or "") + "*<br>")

    tableBody.add(firstName + " | " + surname + " | " + orcidHref + " | "
                  + '<br>'.join(sorted(affiliations)) + " | " + works)

# Sort the rows so the table order is stable between runs
display(Markdown("| First Name | Surname | Link to ORCID | Affiliations | Works | \n|---|---|---|---|---|\n%s" % '\n'.join(sorted(tableBody))))


# In[ ]: