#!/usr/bin/env python
# coding: utf-8

# ## Querying Nexus knowledge graph using SPARQL
# 
# The goal of this notebook is to learn the basics of SPARQL. Only the READ part of SPARQL will be exposed.
# 

# ## Prerequisites
# 
# This notebook assumes you've created a project within the AWS deployment of Nexus. If not follow the Blue Brain Nexus [Quick Start tutorial](https://bluebrain.github.io/nexus/docs/tutorial/getting-started/quick-start/index.html).

# ## Overview
# 
# You'll work through the following steps:
# 
# 1. Create a sparql wrapper around your project's SparqlView
# 2. Explore and navigate data using the SPARQL query language
# 

# ## Step 1: Create a sparql wrapper around your project's SparqlView

# Every project in Blue Brain Nexus comes with a SparqlView enabling to navigate the data as a graph and to query it using the W3C SPARQL Language. The address of such SparqlView is https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/\$PROJECTLABEL/graph/sparql for a project withe label \$PROJECTLABEL. The address of a SparqlView is also called a **SPARQL endpoint**.

# In[ ]:


#Configuration for the Nexus deployment
nexus_deployment = "https://sandbox.bluebrainnexus.io/v1"

token= "your token here"

org ="tutorialnexus"
project ="$PROJECTLABEL"

headers = {}


# In[ ]:


#Let install sparqlwrapper which a python wrapper around sparql client
get_ipython().system('pip install git+https://github.com/RDFLib/sparqlwrapper')


# In[ ]:


# Utility functions to create sparql wrapper around a sparql endpoint

from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV
import requests


def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format= JSON, token=None):
    sparql_client = SPARQLWrapper(sparql_endpoint)
    #sparql_client.addCustomHttpHeader("Content-Type", "application/sparql-query")
    if token:
        sparql_client.addCustomHttpHeader("Authorization","Bearer {}".format(token))
    sparql_client.setMethod(http_query_method)
    sparql_client.setReturnFormat(result_format)
    if http_query_method == POST:
        sparql_client.setRequestMethod(POSTDIRECTLY)
    
    return sparql_client


# In[ ]:


# Utility functions
import pandas as pd

pd.set_option('display.max_colwidth', -1)

# Convert SPARQL results into a Pandas data frame
def sparql2dataframe(json_sparql_results):
    cols = json_sparql_results['head']['vars']
    out = []
    for row in json_sparql_results['results']['bindings']:
        item = []
        for c in cols:
            item.append(row.get(c, {}).get('value'))
        out.append(item)
    return pd.DataFrame(out, columns=cols)

# Send a query using a sparql wrapper 
def query_sparql(query, sparql_client):
    sparql_client.setQuery(query)
    

    result_object = sparql_client.query()
    if sparql_client.returnFormat == JSON:
        return result_object._convertJSON()
    return result_object.convert()


# In[ ]:


# Let create a sparql wrapper around the project sparql view
sparqlview_endpoint = nexus_deployment+"/views/"+org+"/"+project+"/graph/sparql"
sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token,http_query_method= POST, result_format=JSON)


# ## Step 2: Explore and navigate data using the SPARQL query language
# 

# Let write our first query.

# In[ ]:


select_all_query = """
SELECT ?s ?p ?o
WHERE
{
  ?s ?p ?o
}
OFFSET 0
LIMIT 5
"""

nexus_results = query_sparql(select_all_query,sparqlview_wrapper)

nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()


# Most SPARQL queries you'll see will have the anotomy above with:
# * a **SELECT** clause that let you select the variables you want to retrieve
# * a **WHERE** clause defining a set of constraints that the variables should satisfy to be retrieved
# * **LIMIT** and **OFFSET** clauses to enable pagination
# * the constraints are usually graph patterns in the form of **triple** (?s for subject, ?p for property and ?o for ?object)

# Multiple triples can be provided as graph pattern to match but each triple should end with a period. As an example, let retrieve 5 movies (?movie) along with their titles (?title).

# In[ ]:


movie_with_title = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movie ?title
 WHERE  {
    ?movie a vocab:Movie.
    ?movie vocab:title ?title.
} LIMIT 5
"""%(org,project)

nexus_results = query_sparql(movie_with_title,sparqlview_wrapper)

nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()


# Note PREFIX clauses. It is way to shorten URIS within a SPARQL query. Without them we would have to use full URI for all properties.
# 
# The ?movie variable is bound to a URI (the internal Nexus id). Let retrieve the movieId just like in the MovieLens csv files for simplicity.

# In[ ]:


movie_with_title = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title
 WHERE  {
    
    # Select movies
    ?movie a vocab:Movie.

    # Select their movieId value
    ?movie vocab:movieId ?movieId.
    
    #
    ?movie vocab:title ?title.
    
} LIMIT 5
"""%(org,project)

nexus_results = query_sparql(movie_with_title,sparqlview_wrapper)

nexus_df =sparql2dataframe(nexus_results)
nexus_df.head()


# In the above query movies are things (or entities) of type vocab:Movie. 
# This is a typical instance query where entities are filtered by their type(s) and then some of their properties are retrieved (here ?title). 
# 
# Let retrieve everything that is linked (outgoing) to the movies. 
# The * character in the SELECT clause indicates to retreve all variables: ?movie, ?p, ?o

# In[ ]:


movie_with_properties = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select *
 WHERE  {
    ?movie a vocab:Movie.
    ?movie ?p ?o.
} LIMIT 20
"""%(org,project)

nexus_results = query_sparql(movie_with_properties,sparqlview_wrapper)

nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)


# As a little exercise, write a query retrieving incoming entities to movies. You can copy past the query above and modify it.
# 
# Hints: ?s ?p ?o can be read as: ?o is linked to ?s with an outgoing link.
# 
# Do you have results ?

# In[ ]:


#Your query here


# Let retrieve the movie ratings

# In[ ]:


movie_with_properties = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?userId ?movieId ?rating ?timestamp
 WHERE  {
    ?movie a vocab:Movie.
    ?movie vocab:movieId ?movieId.
    
    
    ?ratingNode vocab:movieId ?ratingmovieId.
    ?ratingNode vocab:rating ?rating.
    ?ratingNode vocab:userId ?userId.
    ?ratingNode vocab:timestamp ?timestamp.
    
    # Somehow pandas is movieId as double for rating 
    FILTER(xsd:integer(?ratingmovieId) = ?movieId)
    
} LIMIT 20
"""%(org,project)

nexus_results = query_sparql(movie_with_properties,sparqlview_wrapper)

nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)


# As a little exercise, write a query retrieving the movie tags along with the user id and timestamp. You can copy and past the query above and modify it.
# 

# In[ ]:


#Your query here


# ### Aggregate queries

# [Aggregates](https://www.w3.org/TR/sparql11-query/#aggregates) apply some operations over a group of solutions.
# Available aggregates are: COUNT, SUM, MIN, MAX, AVG, GROUP_CONCAT, and SAMPLE.
# 
# We will not see them all but we'll look at some examples.

# The next query will compute the average rating score for 'funny' movies.

# In[ ]:


tag_value = "funny"
movie_avg_ratings = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>

Select ( AVG(?ratingvalue) AS ?score)
 WHERE  {
    # Select movies
    ?movie a vocab:Movie.

    # Select their movieId value
    ?movie vocab:movieId ?movieId.

    ?tag vocab:movieId ?movieId.
    ?tag vocab:tag ?tagvalue.
    FILTER(?tagvalue = "%s").

    # Keep movies with ratings
    ?rating vocab:movieId ?ratingmovidId.
    FILTER(xsd:integer(?ratingmovidId) = xsd:integer(?movieId))
    ?rating vocab:rating ?ratingvalue.

}
""" %(org,project,tag_value)

nexus_results = query_sparql(movie_avg_ratings,sparqlview_wrapper)

nexus_df =sparql2dataframe(nexus_results)
display(nexus_df.head(20))
nexus_df=nexus_df.astype(float)


# Retrieve the number of tags per movie. Can be a little bit slow depending on the size of your data.

# In[ ]:


nbr_tags_per_movie = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>

Select ?title (COUNT(?tagvalue) as ?tagnumber)
 WHERE  {
    # Select movies
    ?movie a vocab:Movie.
    # Select their movieId value
    ?movie vocab:movieId ?movieId.
    
    ?tag a vocab:Tag.
    ?tag vocab:movieId ?tagmovieId.
    FILTER(?tagmovieId = ?movieId)
    ?movie vocab:title ?title.
    ?tag vocab:tag ?tagvalue.
}

GROUP BY ?title 
ORDER BY DESC(?tagnumber)
LIMIT 10
""" %(org,project)

nexus_results = query_sparql(nbr_tags_per_movie,sparqlview_wrapper)

nexus_df =sparql2dataframe(nexus_results)
display(nexus_df.head(20))


# In[ ]:


#Let plot the result
nexus_df.tagnumber = pd.to_numeric(nexus_df.tagnumber)
nexus_df.plot(x="title",y="tagnumber",kind="bar")


# The next query will retrieve movies along with users that tagged them separated by a comma

# In[ ]:


# Group Concat

movie_tag_users = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>

Select ?movieId (group_concat(DISTINCT ?userId;separator=",") as ?users)
 WHERE  {
    # Select movies
    ?movie a vocab:Movie.

    # Select their movieId value
    ?movie vocab:movieId ?movieId.

    ?tag vocab:movieId ?movieId.
    ?tag vocab:userId ?userId.

  
}
GROUP BY ?movieId
LIMIT 10
"""%(org,project)

nexus_results = query_sparql(movie_tag_users,sparqlview_wrapper)

nexus_df =sparql2dataframe(nexus_results)
nexus_df.head(20)


# In[ ]:


# In[ ]: