#!/usr/bin/env python
# coding: utf-8

#  | [FREYA](https://www.project-freya.eu/en) | WP2 [User Story 6](https://www.pidforum.org/t/pid-graph-graphql-example-disambiguate-researchers/931): As a researcher, I am looking for more information about another researcher with a common name, but don’t know his/her ORCID ID.
# :------------- | :------------- | :-------------
#
# It is important to be able to locate a researcher of interest even though their ORCID ID is unknown. For example, a reader of a scientific publication may wish to find out more about one of the authors, where the publisher has not cross-referenced that author's name to ORCID.
#
# This notebook uses the [DataCite GraphQL API](https://api.datacite.org/graphql) to disambiguate a researcher name via a *funnel* approach:
# * First, all researcher records matching the query "John AND Smith" are retrieved, and an alphabetically sorted list of affiliations and the corresponding researcher names is displayed;
# * Then the notebook simulates the user selecting one of the affiliations (in our case "University of Arizona"), and then performs a more detailed query: "John AND Smith AND University of Arizona". The second query retrieves and displays a much smaller set of results, now also containing the researcher's publications, thus helping the user pinpoint the researcher of interest more easily.
#
# **Goal**: By the end of this notebook, you should be able to successfully disambiguate a researcher name of interest.

# ## Install libraries and prepare GraphQL client

# In[228]:

get_ipython().run_cell_magic('capture', '', '# Install required Python packages\n!pip install gql requests\n')

# In[229]:

# Prepare the GraphQL client
import requests
from IPython.display import display, Markdown
from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport

_transport = RequestsHTTPTransport(
    url='https://api.datacite.org/graphql',
    use_json=True,
)

client = Client(
    transport=_transport,
    fetch_schema_from_transport=True,
)

# ## Define and run GraphQL query
# Define the GraphQL query to find all researchers whose records match the query "John AND Smith":

# In[231]:

# Generate the GraphQL query to retrieve up to 100 researchers matching query "John and Smith"
# The cursor starts out empty and is overwritten with pageInfo.endCursor
# after each page by the retrieval loop below.
query_params = {
    "query" : "John AND Smith",
    "max_researchers" : 100,
    "query_end_cursor" : ""
}

query_str = """query getResearchersByName(
    $query: String!,
    $max_researchers: Int!,
    $query_end_cursor : String!
    ) {
  people(query: $query, first: $max_researchers, after: $query_end_cursor) {
    totalCount
    pageInfo {
      hasNextPage
      endCursor
    }
    nodes {
      id
      givenName
      familyName
      name
      affiliation {
        name
      }
    }
  }
}
"""

# Run the above query via the GraphQL client

# In[232]:

import json

found_next_page = True

# Parse the query document once; it is identical for every page, only the
# variables (the cursor) change between requests.
query = gql(query_str)

# Initialise overall data dict that will store results
data = {}

# Keep retrieving results until there are no more results left
while True:
    # Pass the variables as a dict (the form gql documents for
    # variable_values) rather than as a pre-serialised JSON string.
    res = client.execute(query, variable_values=query_params)

    # Always read the page that was just fetched. Binding `people` only when
    # merging a later page left it undefined (or stale from an earlier run)
    # while processing the first page.
    people = res["people"]
    if "people" not in data:
        # First page: keep the whole response (totalCount, pageInfo, nodes)
        data = res
    else:
        # Later pages: append their nodes to the accumulated result
        data["people"]["nodes"].extend(people["nodes"])

    pageInfo = people["pageInfo"]
    # Stop on the last page, or when no cursor is available to continue from
    if not pageInfo["hasNextPage"] or pageInfo["endCursor"] is None:
        break
    query_params["query_end_cursor"] = pageInfo["endCursor"]

# ## List researcher details
# List in tabular format affiliations and the corresponding researcher names. This allows the user to select one of the affiliations to use in a more detailed query (see below) that also retrieves publications.
# In[234]:

# Collect names and affiliations for the researchers found


def testIfPresentCaseInsensitive(solrQuery, fieldValueLowerCase):
    """Test if a field value matches a simplified Solr-style query.

    The query is split on the literal " AND " into groups that must all
    match; within a group, whitespace separates alternative terms (logical
    OR). A term matches when its lower-cased form is a substring of the
    field value.

    Args:
        solrQuery: Query string, e.g. "John AND Smith".
        fieldValueLowerCase: Field value to test; the caller must already
            have lower-cased it, since only the query terms are lower-cased
            here.

    Returns:
        True if every AND-group has at least one matching term, False
        otherwise. A group without any terms (e.g. an empty query) imposes
        no constraint.
    """
    for orTerms in solrQuery.split(" AND "):
        # split() without an argument drops the empty strings produced by
        # repeated, leading or trailing spaces. An empty term is a substring
        # of every value and would make the whole group match trivially.
        terms = orTerms.split()
        if terms and not any(term.lower() in fieldValueLowerCase for term in terms):
            return False
    return True


people = data['people']
# Maps affiliation name -> set of researcher names seen with that affiliation
af2Names = {}
totalCount = 0
for node in people['nodes']:
    name = node['name']
    # TODO: Remove if we manage to search only individual fields
    if not testIfPresentCaseInsensitive(query_params['query'], name.lower()):
        continue
    totalCount += 1
    for af in node['affiliation']:
        af2Names.setdefault(af['name'], set()).add(name)

# One table row per affiliation, in alphabetical order. The names within a
# row are sorted as well so the output is stable across runs (set iteration
# order is not).
tableBody = "".join(
    af + " | " + ', '.join(sorted(names)) + "\n"
    for af, names in sorted(af2Names.items())
)

# NOTE(review): the statement below is cut off at the end of this source line
# and its remainder is not visible here. It is kept verbatim as a comment
# (this whole line was comment text in the file as found) so nothing is lost;
# restore it together with its continuation.
# display(Markdown("Total number of researchers found: **%d**