#!/usr/bin/env python
# coding: utf-8
# ## Querying Nexus knowledge graph using SPARQL
#
# The goal of this notebook is to learn the basics of SPARQL. Only the READ part of SPARQL will be exposed.
#
# ## Prerequisites
#
# This notebook assumes you've created a project within the AWS deployment of Nexus. If not follow the Blue Brain Nexus [Quick Start tutorial](https://bluebrain.github.io/nexus/docs/tutorial/getting-started/quick-start/index.html).
# ## Overview
#
# You'll work through the following steps:
#
# 1. Create a sparql wrapper around your project's SparqlView
# 2. Explore and navigate data using the SPARQL query language
#
# ## Step 1: Create a sparql wrapper around your project's SparqlView
# Every project in Blue Brain Nexus comes with a SparqlView enabling to navigate the data as a graph and to query it using the W3C SPARQL Language. The address of such SparqlView is https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/\$PROJECTLABEL/graph/sparql for a project withe label \$PROJECTLABEL. The address of a SparqlView is also called a **SPARQL endpoint**.
# In[ ]:
#Configuration for the Nexus deployment
nexus_deployment = "https://sandbox.bluebrainnexus.io/v1"
token= "your token here"
org ="tutorialnexus"
project ="$PROJECTLABEL"
headers = {}
# In[ ]:
#Let install sparqlwrapper which a python wrapper around sparql client
get_ipython().system('pip install git+https://github.com/RDFLib/sparqlwrapper')
# In[ ]:
# Utility functions to create sparql wrapper around a sparql endpoint
from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV
import requests
def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format= JSON, token=None):
sparql_client = SPARQLWrapper(sparql_endpoint)
#sparql_client.addCustomHttpHeader("Content-Type", "application/sparql-query")
if token:
sparql_client.addCustomHttpHeader("Authorization","Bearer {}".format(token))
sparql_client.setMethod(http_query_method)
sparql_client.setReturnFormat(result_format)
if http_query_method == POST:
sparql_client.setRequestMethod(POSTDIRECTLY)
return sparql_client
# In[ ]:
# Utility functions
import pandas as pd
pd.set_option('display.max_colwidth', -1)
# Convert SPARQL results into a Pandas data frame
def sparql2dataframe(json_sparql_results):
cols = json_sparql_results['head']['vars']
out = []
for row in json_sparql_results['results']['bindings']:
item = []
for c in cols:
item.append(row.get(c, {}).get('value'))
out.append(item)
return pd.DataFrame(out, columns=cols)
# Send a query using a sparql wrapper
def query_sparql(query, sparql_client):
sparql_client.setQuery(query)
result_object = sparql_client.query()
if sparql_client.returnFormat == JSON:
return result_object._convertJSON()
return result_object.convert()
# In[ ]:
# Let create a sparql wrapper around the project sparql view
sparqlview_endpoint = nexus_deployment+"/views/"+org+"/"+project+"/graph/sparql"
sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token,http_query_method= POST, result_format=JSON)
# ## Step 2: Explore and navigate data using the SPARQL query language
#
# Let write our first query.
# In[ ]:
select_all_query = """
SELECT ?s ?p ?o
WHERE
{
?s ?p ?o
}
OFFSET 0
LIMIT 5
"""
nexus_results = query_sparql(select_all_query,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()
# Most SPARQL queries you'll see will have the anotomy above with:
# * a **SELECT** clause that let you select the variables you want to retrieve
# * a **WHERE** clause defining a set of constraints that the variables should satisfy to be retrieved
# * **LIMIT** and **OFFSET** clauses to enable pagination
# * the constraints are usually graph patterns in the form of **triple** (?s for subject, ?p for property and ?o for ?object)
# Multiple triples can be provided as graph pattern to match but each triple should end with a period. As an example, let retrieve 5 movies (?movie) along with their titles (?title).
# In[ ]:
movie_with_title = """
PREFIX vocab:
PREFIX nxv:
Select ?movie ?title
WHERE {
?movie a vocab:Movie.
?movie vocab:title ?title.
} LIMIT 5
"""%(org,project)
nexus_results = query_sparql(movie_with_title,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()
# Note PREFIX clauses. It is way to shorten URIS within a SPARQL query. Without them we would have to use full URI for all properties.
#
# The ?movie variable is bound to a URI (the internal Nexus id). Let retrieve the movieId just like in the MovieLens csv files for simplicity.
# In[ ]:
movie_with_title = """
PREFIX vocab:
PREFIX nxv:
Select ?movieId ?title
WHERE {
# Select movies
?movie a vocab:Movie.
# Select their movieId value
?movie vocab:movieId ?movieId.
#
?movie vocab:title ?title.
} LIMIT 5
"""%(org,project)
nexus_results = query_sparql(movie_with_title,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()
# In the above query movies are things (or entities) of type vocab:Movie.
# This is a typical instance query where entities are filtered by their type(s) and then some of their properties are retrieved (here ?title).
#
# Let retrieve everything that is linked (outgoing) to the movies.
# The * character in the SELECT clause indicates to retreve all variables: ?movie, ?p, ?o
# In[ ]:
movie_with_properties = """
PREFIX vocab:
PREFIX nxv:
Select *
WHERE {
?movie a vocab:Movie.
?movie ?p ?o.
} LIMIT 20
"""%(org,project)
nexus_results = query_sparql(movie_with_properties,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)
# As a little exercise, write a query retrieving incoming entities to movies. You can copy past the query above and modify it.
#
# Hints: ?s ?p ?o can be read as: ?o is linked to ?s with an outgoing link.
#
# Do you have results ?
# In[ ]:
#Your query here
# Let retrieve the movie ratings
# In[ ]:
movie_with_properties = """
PREFIX vocab:
PREFIX nxv:
Select ?userId ?movieId ?rating ?timestamp
WHERE {
?movie a vocab:Movie.
?movie vocab:movieId ?movieId.
?ratingNode vocab:movieId ?ratingmovieId.
?ratingNode vocab:rating ?rating.
?ratingNode vocab:userId ?userId.
?ratingNode vocab:timestamp ?timestamp.
# Somehow pandas is movieId as double for rating
FILTER(xsd:integer(?ratingmovieId) = ?movieId)
} LIMIT 20
"""%(org,project)
nexus_results = query_sparql(movie_with_properties,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)
# As a little exercise, write a query retrieving the movie tags along with the user id and timestamp. You can copy and past the query above and modify it.
#
# In[ ]:
#Your query here
# ### Aggregate queries
# [Aggregates](https://www.w3.org/TR/sparql11-query/#aggregates) apply some operations over a group of solutions.
# Available aggregates are: COUNT, SUM, MIN, MAX, AVG, GROUP_CONCAT, and SAMPLE.
#
# We will not see them all but we'll look at some examples.
# The next query will compute the average rating score for 'funny' movies.
# In[ ]:
tag_value = "funny"
movie_avg_ratings = """
PREFIX vocab:
PREFIX nxv:
Select ( AVG(?ratingvalue) AS ?score)
WHERE {
# Select movies
?movie a vocab:Movie.
# Select their movieId value
?movie vocab:movieId ?movieId.
?tag vocab:movieId ?movieId.
?tag vocab:tag ?tagvalue.
FILTER(?tagvalue = "%s").
# Keep movies with ratings
?rating vocab:movieId ?ratingmovidId.
FILTER(xsd:integer(?ratingmovidId) = xsd:integer(?movieId))
?rating vocab:rating ?ratingvalue.
}
""" %(org,project,tag_value)
nexus_results = query_sparql(movie_avg_ratings,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
display(nexus_df.head(20))
nexus_df=nexus_df.astype(float)
# Retrieve the number of tags per movie. Can be a little bit slow depending on the size of your data.
# In[ ]:
nbr_tags_per_movie = """
PREFIX vocab:
PREFIX nxv:
Select ?title (COUNT(?tagvalue) as ?tagnumber)
WHERE {
# Select movies
?movie a vocab:Movie.
# Select their movieId value
?movie vocab:movieId ?movieId.
?tag a vocab:Tag.
?tag vocab:movieId ?tagmovieId.
FILTER(?tagmovieId = ?movieId)
?movie vocab:title ?title.
?tag vocab:tag ?tagvalue.
}
GROUP BY ?title
ORDER BY DESC(?tagnumber)
LIMIT 10
""" %(org,project)
nexus_results = query_sparql(nbr_tags_per_movie,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
display(nexus_df.head(20))
# In[ ]:
#Let plot the result
nexus_df.tagnumber = pd.to_numeric(nexus_df.tagnumber)
nexus_df.plot(x="title",y="tagnumber",kind="bar")
# The next query will retrieve movies along with users that tagged them separated by a comma
# In[ ]:
# Group Concat
movie_tag_users = """
PREFIX vocab:
PREFIX nxv:
Select ?movieId (group_concat(DISTINCT ?userId;separator=",") as ?users)
WHERE {
# Select movies
?movie a vocab:Movie.
# Select their movieId value
?movie vocab:movieId ?movieId.
?tag vocab:movieId ?movieId.
?tag vocab:userId ?userId.
}
GROUP BY ?movieId
LIMIT 10
"""%(org,project)
nexus_results = query_sparql(movie_tag_users,sparqlview_wrapper)
nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)
# In[ ]:
# In[ ]: