#!/usr/bin/env python # coding: utf-8 # ## Querying Nexus knowledge graph using SPARQL # # The goal of this notebook is to learn the basics of SPARQL. Only the READ part of SPARQL will be exposed. # # ## Prerequisites # # This notebook assumes you've created a project within the AWS deployment of Nexus. If not follow the Blue Brain Nexus [Quick Start tutorial](https://bluebrain.github.io/nexus/docs/tutorial/getting-started/quick-start/index.html). # ## Overview # # You'll work through the following steps: # # 1. Create a sparql wrapper around your project's SparqlView # 2. Explore and navigate data using the SPARQL query language # # ## Step 1: Create a sparql wrapper around your project's SparqlView # Every project in Blue Brain Nexus comes with a SparqlView enabling to navigate the data as a graph and to query it using the W3C SPARQL Language. The address of such SparqlView is https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/\$PROJECTLABEL/graph/sparql for a project withe label \$PROJECTLABEL. The address of a SparqlView is also called a **SPARQL endpoint**. # In[ ]: #Configuration for the Nexus deployment nexus_deployment = "https://sandbox.bluebrainnexus.io/v1" token= "your token here" org ="tutorialnexus" project ="$PROJECTLABEL" headers = {} # In[ ]: #Let install sparqlwrapper which a python wrapper around sparql client get_ipython().system('pip install git+https://github.com/RDFLib/sparqlwrapper') # In[ ]: # Utility functions to create sparql wrapper around a sparql endpoint from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV import requests def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format= JSON, token=None): sparql_client = SPARQLWrapper(sparql_endpoint) #sparql_client.addCustomHttpHeader("Content-Type", "application/sparql-query") if token: sparql_client.addCustomHttpHeader("Authorization","Bearer {}".format(token)) sparql_client.setMethod(http_query_method) sparql_client.setReturnFormat(result_format) if http_query_method == POST: sparql_client.setRequestMethod(POSTDIRECTLY) return sparql_client # In[ ]: # Utility functions import pandas as pd pd.set_option('display.max_colwidth', -1) # Convert SPARQL results into a Pandas data frame def sparql2dataframe(json_sparql_results): cols = json_sparql_results['head']['vars'] out = [] for row in json_sparql_results['results']['bindings']: item = [] for c in cols: item.append(row.get(c, {}).get('value')) out.append(item) return pd.DataFrame(out, columns=cols) # Send a query using a sparql wrapper def query_sparql(query, sparql_client): sparql_client.setQuery(query) result_object = sparql_client.query() if sparql_client.returnFormat == JSON: return result_object._convertJSON() return result_object.convert() # In[ ]: # Let create a sparql wrapper around the project sparql view sparqlview_endpoint = nexus_deployment+"/views/"+org+"/"+project+"/graph/sparql" sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token,http_query_method= POST, result_format=JSON) # ## Step 2: Explore and navigate data using the SPARQL query language # # Let write our first query. # In[ ]: select_all_query = """ SELECT ?s ?p ?o WHERE { ?s ?p ?o } OFFSET 0 LIMIT 5 """ nexus_results = query_sparql(select_all_query,sparqlview_wrapper) nexus_df =sparql2dataframe(nexus_results) nexus_df.head() # Most SPARQL queries you'll see will have the anotomy above with: # * a **SELECT** clause that let you select the variables you want to retrieve # * a **WHERE** clause defining a set of constraints that the variables should satisfy to be retrieved # * **LIMIT** and **OFFSET** clauses to enable pagination # * the constraints are usually graph patterns in the form of **triple** (?s for subject, ?p for property and ?o for ?object) # Multiple triples can be provided as graph pattern to match but each triple should end with a period. As an example, let retrieve 5 movies (?movie) along with their titles (?title). # In[ ]: movie_with_title = """ PREFIX vocab: PREFIX nxv: Select ?movie ?title WHERE { ?movie a vocab:Movie. ?movie vocab:title ?title. } LIMIT 5 """%(org,project) nexus_results = query_sparql(movie_with_title,sparqlview_wrapper) nexus_df =sparql2dataframe(nexus_results) nexus_df.head() # Note PREFIX clauses. It is way to shorten URIS within a SPARQL query. Without them we would have to use full URI for all properties. # # The ?movie variable is bound to a URI (the internal Nexus id). Let retrieve the movieId just like in the MovieLens csv files for simplicity. # In[ ]: movie_with_title = """ PREFIX vocab: PREFIX nxv: Select ?movieId ?title WHERE { # Select movies ?movie a vocab:Movie. # Select their movieId value ?movie vocab:movieId ?movieId. # ?movie vocab:title ?title. } LIMIT 5 """%(org,project) nexus_results = query_sparql(movie_with_title,sparqlview_wrapper) nexus_df =sparql2dataframe(nexus_results) nexus_df.head() # In the above query movies are things (or entities) of type vocab:Movie. # This is a typical instance query where entities are filtered by their type(s) and then some of their properties are retrieved (here ?title). # # Let retrieve everything that is linked (outgoing) to the movies. # The * character in the SELECT clause indicates to retreve all variables: ?movie, ?p, ?o # In[ ]: movie_with_properties = """ PREFIX vocab: PREFIX nxv: Select * WHERE { ?movie a vocab:Movie. ?movie ?p ?o. } LIMIT 20 """%(org,project) nexus_results = query_sparql(movie_with_properties,sparqlview_wrapper) nexus_df =sparql2dataframe(nexus_results) nexus_df.head(20) # As a little exercise, write a query retrieving incoming entities to movies. You can copy past the query above and modify it. # # Hints: ?s ?p ?o can be read as: ?o is linked to ?s with an outgoing link. # # Do you have results ? # In[ ]: #Your query here # Let retrieve the movie ratings # In[ ]: movie_with_properties = """ PREFIX vocab: PREFIX nxv: Select ?userId ?movieId ?rating ?timestamp WHERE { ?movie a vocab:Movie. ?movie vocab:movieId ?movieId. ?ratingNode vocab:movieId ?ratingmovieId. ?ratingNode vocab:rating ?rating. ?ratingNode vocab:userId ?userId. ?ratingNode vocab:timestamp ?timestamp. # Somehow pandas is movieId as double for rating FILTER(xsd:integer(?ratingmovieId) = ?movieId) } LIMIT 20 """%(org,project) nexus_results = query_sparql(movie_with_properties,sparqlview_wrapper) nexus_df =sparql2dataframe(nexus_results) nexus_df.head(20) # As a little exercise, write a query retrieving the movie tags along with the user id and timestamp. You can copy and past the query above and modify it. # # In[ ]: #Your query here # ### Aggregate queries # [Aggregates](https://www.w3.org/TR/sparql11-query/#aggregates) apply some operations over a group of solutions. # Available aggregates are: COUNT, SUM, MIN, MAX, AVG, GROUP_CONCAT, and SAMPLE. # # We will not see them all but we'll look at some examples. # The next query will compute the average rating score for 'funny' movies. # In[ ]: tag_value = "funny" movie_avg_ratings = """ PREFIX vocab: PREFIX nxv: Select ( AVG(?ratingvalue) AS ?score) WHERE { # Select movies ?movie a vocab:Movie. # Select their movieId value ?movie vocab:movieId ?movieId. ?tag vocab:movieId ?movieId. ?tag vocab:tag ?tagvalue. FILTER(?tagvalue = "%s"). # Keep movies with ratings ?rating vocab:movieId ?ratingmovidId. FILTER(xsd:integer(?ratingmovidId) = xsd:integer(?movieId)) ?rating vocab:rating ?ratingvalue. } """ %(org,project,tag_value) nexus_results = query_sparql(movie_avg_ratings,sparqlview_wrapper) nexus_df =sparql2dataframe(nexus_results) display(nexus_df.head(20)) nexus_df=nexus_df.astype(float) # Retrieve the number of tags per movie. Can be a little bit slow depending on the size of your data. # In[ ]: nbr_tags_per_movie = """ PREFIX vocab: PREFIX nxv: Select ?title (COUNT(?tagvalue) as ?tagnumber) WHERE { # Select movies ?movie a vocab:Movie. # Select their movieId value ?movie vocab:movieId ?movieId. ?tag a vocab:Tag. ?tag vocab:movieId ?tagmovieId. FILTER(?tagmovieId = ?movieId) ?movie vocab:title ?title. ?tag vocab:tag ?tagvalue. } GROUP BY ?title ORDER BY DESC(?tagnumber) LIMIT 10 """ %(org,project) nexus_results = query_sparql(nbr_tags_per_movie,sparqlview_wrapper) nexus_df =sparql2dataframe(nexus_results) display(nexus_df.head(20)) # In[ ]: #Let plot the result nexus_df.tagnumber = pd.to_numeric(nexus_df.tagnumber) nexus_df.plot(x="title",y="tagnumber",kind="bar") # The next query will retrieve movies along with users that tagged them separated by a comma # In[ ]: # Group Concat movie_tag_users = """ PREFIX vocab: PREFIX nxv: Select ?movieId (group_concat(DISTINCT ?userId;separator=",") as ?users) WHERE { # Select movies ?movie a vocab:Movie. # Select their movieId value ?movie vocab:movieId ?movieId. ?tag vocab:movieId ?movieId. ?tag vocab:userId ?userId. } GROUP BY ?movieId LIMIT 10 """%(org,project) nexus_results = query_sparql(movie_tag_users,sparqlview_wrapper) nexus_df =sparql2dataframe(nexus_results) nexus_df.head(20) # In[ ]: # In[ ]: