#!/usr/bin/env python
# coding: utf-8

# # Linking data on the web
#
# In this notebook you'll experience how to consume structured data available on
# Wikidata to extend and enrich data managed within Nexus.

# ## Prerequisites
#
# This notebook assumes you've created a project within the AWS deployment of
# Blue Brain Nexus. If not, follow the Blue Brain Nexus
# [Quick Start tutorial](https://bluebrain.github.io/nexus/docs/tutorial/getting-started/quick-start/index.html).

# ## Overview
#
# You'll work through the following steps:
#
# 1. Install and configure the Blue Brain Nexus python sdk
# 2. Create a sparql wrapper around your project's SparqlView
# 3. Create a sparql wrapper around the Wikidata Sparql Service
# 4. Query more metadata from the Wikidata Sparql Service for movies stored in
#    Nexus using their tmdbId
# 5. Save the extended movies metadata back to Nexus and perform new queries

# ## Step 1: Install and configure the Blue Brain Nexus python sdk

# In[ ]:

get_ipython().system('pip install git+https://github.com/BlueBrain/nexus-cli')

# In[1]:

import nexussdk as nexus

# Target deployment and credentials for the Nexus python SDK.
nexus_deployment = "https://sandbox.bluebrainnexus.io/v1"
token = "your token here"
nexus.config.set_environment(nexus_deployment)
nexus.config.set_token(token)

# Organisation and project this notebook reads from and writes to.
org = "tutorialnexus"
# Provide your project name here.
project = "your_project_here"

# ## Step 2: Create a sparql wrapper around your project's SparqlView

# Every project in Blue Brain Nexus comes with a SparqlView enabling to navigate
# the data as a graph and to query it using the
# [W3C SPARQL Language](https://www.w3.org/TR/sparql11-query/).
# The address of such a SparqlView is
# https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/$PROJECTLABEL/graph/sparql
# for a project with label $PROJECTLABEL. The address of a SparqlView is also
# called a sparql endpoint.
# In[ ]:

get_ipython().system('pip install git+https://github.com/RDFLib/sparqlwrapper')

# In[ ]:

# Utility functions to create a sparql wrapper around a sparql endpoint
from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV
import requests


def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format=JSON, token=None):
    """Build a SPARQLWrapper client around `sparql_endpoint`.

    Parameters
    ----------
    sparql_endpoint : str
        URL of the sparql endpoint to wrap.
    http_query_method : POST or GET, default POST
        HTTP method used to send queries.
    result_format : default JSON
        Result serialisation requested from the endpoint.
    token : str, optional
        Bearer token added as an Authorization header. Needed for Nexus
        SparqlViews; leave None for public endpoints such as Wikidata.
    """
    sparql_client = SPARQLWrapper(sparql_endpoint)
    if token:
        sparql_client.addCustomHttpHeader("Authorization", "Bearer {}".format(token))
    sparql_client.setMethod(http_query_method)
    sparql_client.setReturnFormat(result_format)
    if http_query_method == POST:
        # Send the query verbatim as the POST body instead of URL-encoding it.
        sparql_client.setRequestMethod(POSTDIRECTLY)
    return sparql_client


# In[ ]:

# Utility functions
import pandas as pd

# Disable column truncation when displaying dataframes. The old value -1 is
# deprecated (and rejected by recent pandas); None is the supported spelling.
pd.set_option('display.max_colwidth', None)


def sparql2dataframe(json_sparql_results):
    """Convert SPARQL JSON results into a pandas DataFrame.

    One column per projected variable; unbound variables become None.
    """
    cols = json_sparql_results['head']['vars']
    out = []
    for row in json_sparql_results['results']['bindings']:
        out.append([row.get(c, {}).get('value') for c in cols])
    return pd.DataFrame(out, columns=cols)


def query_sparql(query, sparql_client):
    """Send `query` through `sparql_client` and return the converted result."""
    sparql_client.setQuery(query)
    result_object = sparql_client.query()
    if sparql_client.returnFormat == JSON:
        return result_object._convertJSON()
    return result_object.convert()


# In[ ]:

# Let's create a sparql wrapper around the project sparql view
sparqlview_endpoint = nexus_deployment + "/views/" + org + "/" + project + "/graph/sparql"
sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token,
                                          http_query_method=POST, result_format=JSON)

# Let's test that the SparqlView wrapper works by running a simple Sparql query
# to get 5 movies along with their titles.

# In[ ]:

# NOTE(review): the PREFIX IRIs were lost when this notebook was exported,
# which also broke the %-interpolation of (org, project). Reconstructed from
# the Nexus MovieLens tutorial (project vocab + Nexus vocabulary) — confirm
# against your deployment.
five_movie_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movie_nexus_Id ?movieId ?title ?genres ?imdbId ?tmdbId ?revision
WHERE {
    ?movie_nexus_Id a vocab:Movie.
    ?movie_nexus_Id nxv:rev ?revision.
    ?movie_nexus_Id vocab:movieId ?movieId.
    ?movie_nexus_Id vocab:title ?title.
    ?movie_nexus_Id vocab:imdbId ?imdbId.
    ?movie_nexus_Id vocab:genres ?genres.
    OPTIONAL {
        ?movie_nexus_Id vocab:tmdbId ?tmdbId.
    }
} LIMIT 5
""" % (org, project)

nexus_results = query_sparql(five_movie_query, sparqlview_wrapper)
nexus_df = sparql2dataframe(nexus_results)
nexus_df.head()

# ## Step 3: Create a Sparql wrapper around the Wikidata Sparql Service

# In[ ]:

wikidata_sparql_endpoint = "https://query.wikidata.org/sparql"
# Wikidata is public: no token, and queries are sent via GET.
wikidata_sparql_wrapper = create_sparql_client(sparql_endpoint=wikidata_sparql_endpoint,
                                               http_query_method=GET, result_format=JSON)

# Let's test that the wrapper works by running a query that fetches the logo
# url for a given movie tmdbId 862 (Toy Story).
# You can play the following query in the Wikidata playground
# [Try It](https://query.wikidata.org/#SELECT%20%2a%0A%20%20%20%20WHERE%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP4947%20%22862%22.%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP154%20%3Flogo.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP364%20%3Flang.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP577%20%3FreleaseDate.%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP291%20wdt%3AQ30.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%7D).
#
# In the query below:
#
# * [wdt:P4947](https://www.wikidata.org/wiki/Property:P4947) is the wikidata property for tmdbId
# * [wdt:P154](https://www.wikidata.org/wiki/Property:P154) is the wikidata property for logo image url

# In[ ]:

# wdt:P4947 is the wikidata property for tmdbId
movie_logo_query = """
SELECT *
WHERE {
    ?movie wdt:P4947 "%s".
    OPTIONAL {
        ?movie wdt:P154 ?logo.
    }
}
""" % (862)

wiki_results = query_sparql(movie_logo_query, wikidata_sparql_wrapper)
wiki_df = sparql2dataframe(wiki_results)
wiki_df.head()

# Let's display the logo of the Toy Story movie. This part might take some
# time but you can skip it.

# In[ ]:

from IPython.display import SVG, display

movie_logo_url = wiki_df.at[0, 'logo']
display(SVG(movie_logo_url))

# In[ ]:

# ## Step 4: Query more metadata from the Wikidata Sparql Service

# For every movie retrieved from the Nexus SparqlView, we will get:
#
# * its cast members or voice actors
# * its publication date in the United States of America if any
#
# [Try it](https://query.wikidata.org/#SELECT%20%3FtmdbId%20%3Fmovie%20%3Flogo%20%3Fnativelanguage%20%3Fpublication_date%20%3Fcast%20%3FgivenName%20%3FfamilyName%0A%20%20%20%20WHERE%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP4947%20%3FtmdbId.%0A%20%20%20%20%20%20%20%20FILTER%20%28%3FtmdbId%20%3D%20%22862%22%29.%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP154%20%3Flogo.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP725%7Cwdt%3AP161%20%3Fcast.%0A%20%20%20%20%20%20%20%20%20%20%3Fcast%20wdt%3AP735%2Fwdt%3AP1705%20%3FgivenName.%0A%20%20%20%20%20%20%20%20%20%20%3Fcast%20wdt%3AP734%2Fwdt%3AP1705%20%3FfamilyName.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP364%2Fwdt%3AP1705%20%3Fnativelanguage.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%0A%20%20%20%20%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%3Fmovie%20p%3AP577%20%3Fpublication_date_node.%0A%20%20%20%20%20%3Fpublication_date_node%20ps%3AP577%20%3Fpublication_date.%20%23%20publication%20date%20statement%0A%20%20%20%20%20%3Fpublication_date_node%20pq%3AP291%20wd%3AQ30.%20%20%20%20%20%20%20%20%20%20%20%20%23%20qualifier%20on%20the%20release%20date%0A%20%20%7D%0A%20%20%20%20%7D)
# for the movie Toy Story.
# In[ ]:

from functools import reduce
import json


def panda_merge(df, df2, on):
    """Outer-merge on the index, taking from `df2` only the columns `df` lacks.

    `on` is currently unused; it is kept for interface compatibility.
    """
    cols_to_use = df2.columns.difference(df.columns)
    return pd.merge(df, df2[cols_to_use], left_index=True, right_index=True, how='outer')


def panda_concatenate(dfs):
    """Concatenate a list of DataFrames row-wise."""
    return pd.concat(dfs)


wiki_dataframes = []
for index, row in nexus_df.iterrows():
    # The wdt/p/ps/pq/wd prefixes are predefined on the Wikidata endpoint,
    # so this query needs no PREFIX declarations.
    movie_logo_query = """
SELECT ?tmdbId ?movie ?logo ?nativelanguage ?publication_date ?cast ?givenName ?familyName
WHERE {
    ?movie wdt:P4947 ?tmdbId.
    FILTER (?tmdbId = "%s").
    OPTIONAL {
        ?movie wdt:P154 ?logo.
    }
    OPTIONAL {
        ?movie wdt:P725|wdt:P161 ?cast.
        ?cast wdt:P735/wdt:P1705 ?givenName.
        ?cast wdt:P734/wdt:P1705 ?familyName.
    }
    OPTIONAL {
        ?movie wdt:P364/wdt:P1705 ?nativelanguage.
    }
    OPTIONAL {
        ?movie p:P577 ?publication_date_node.
        ?publication_date_node ps:P577 ?publication_date. # publication date statement
        ?publication_date_node pq:P291 wd:Q30.            # qualifier on the release date
    }
}
""" % (row['tmdbId'])
    wiki_results = query_sparql(movie_logo_query, wikidata_sparql_wrapper)
    wiki_df = sparql2dataframe(wiki_results)
    print("""Display metadata (from wikidata) for %s""" % (nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'movie_nexus_Id'].iloc[0]))
    display(wiki_df.head())
    wiki_dataframes.append(wiki_df)

# Let's concatenate all dataframes from wikidata
result_wiki_dataframes = panda_concatenate(wiki_dataframes)
#display(result_wiki_dataframes.head())

merge_wiki_dataframes = panda_merge(result_wiki_dataframes, nexus_df, "tmdbId")
#display(merge_wiki_dataframes.head())

# Fold each movie's cast rows into a nested "casting" list of
# {cast, givenName, familyName} dicts. Note: 'records' replaces the 'r'
# abbreviation for to_dict, which was removed in pandas 1.0.
wiki_dataframes_tojson = (merge_wiki_dataframes.apply(lambda x: x.dropna(), 1)
                          .groupby(['tmdbId', 'movie', 'nativelanguage'], as_index=False)
                          .apply(lambda x: x[['cast', 'givenName', 'familyName']].to_dict('records'))
                          .reset_index()
                          .rename(columns={0: 'casting'})
                          .to_json(orient='records'))
updated_movies_json = json.loads(wiki_dataframes_tojson)

# ## Step 5: Save the extended movies metadata back to Nexus

# In[ ]:

# We obtained a json array to be loaded.
# NOTE(review): the export glued the `from` keyword onto the comment above,
# leaving a syntax-broken import; restored here.
from urllib.parse import urlencode, quote_plus


def update_in_nexus(row):
    """Merge the Nexus metadata for this movie into `row` (a dict built from
    the wikidata results) and push the enriched resource back to Nexus.

    Looks the movie up in `nexus_df` via its tmdbId, fetches the resource's
    current revision (updates are rejected unless they target it), then calls
    `nexus.resources.update`.
    """
    row["@type"] = "Movie"
    nexus_row = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId']]
    _id = nexus_row['movie_nexus_Id'].iloc[0]
    row["title"] = nexus_row['title'].iloc[0]
    row["genres"] = nexus_row['genres'].iloc[0]
    row["movieId"] = nexus_row['movieId'].iloc[0]
    row["imdbId"] = nexus_row['imdbId'].iloc[0]

    # Fetch the latest revision from Nexus rather than trusting the (possibly
    # stale) revision captured when nexus_df was queried.
    data = nexus.resources.fetch(org_label=org, project_label=project, resource_id=_id, schema_id="_")
    current_revision = data["_rev"]

    # The sdk uses the resource's _self address to know where to PUT it.
    row["_self"] = nexus_deployment + "/resources/" + org + "/" + project + "/" + "_/" + quote_plus(_id)
    nexus.resources.update(resource=row, rev=current_revision)


for item in updated_movies_json:
    update_in_nexus(item)

# The data can now be listed with the casting metadata obtained from wikidata.

# In[ ]:

# List movies with their casting names separated by a comma.
# NOTE(review): PREFIX IRIs reconstructed (lost in export) so the
# %-interpolation of (org, project) works again.
movie_acting_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId (group_concat(DISTINCT ?castname;separator=", ") as ?casting)
WHERE {
    ?movie_nexus_Id a vocab:Movie.
    ?movie_nexus_Id vocab:casting ?cast.
    ?cast vocab:givenName ?givenName.
    ?cast vocab:familyName ?familyName.
    BIND (CONCAT(?givenName, " ", ?familyName) AS ?castname).
    ?movie_nexus_Id vocab:movieId ?movieId.
    ?movie_nexus_Id vocab:title ?title.
    ?movie_nexus_Id vocab:imdbId ?imdbId.
    ?movie_nexus_Id vocab:genres ?genres.
    OPTIONAL {
        ?movie_nexus_Id vocab:tmdbId ?tmdbId.
    }
}
Group By ?movieId ?title ?genres ?imdbId ?tmdbId
LIMIT 100
""" % (org, project)

nexus_updated_results = query_sparql(movie_acting_query, sparqlview_wrapper)
nexus_df_updated = sparql2dataframe(nexus_updated_results)
nexus_df_updated.head(100)

# In[ ]:

# Pick an actor and find movies within which they acted
actor = updated_movies_json[0]["casting"][0]
given_name = actor["givenName"]
family_name = actor["familyName"]

# Four interpolated values: (org, project) for the vocab prefix, then the
# actor's family and given names as string literals in the pattern.
query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId
WHERE {
    ?movie_nexus_Id a vocab:Movie.
    ?movie_nexus_Id vocab:casting ?cast.
    ?cast vocab:familyName "%s".
    ?cast vocab:givenName "%s".
    ?movie_nexus_Id vocab:movieId ?movieId.
    ?movie_nexus_Id vocab:title ?title.
    ?movie_nexus_Id vocab:imdbId ?imdbId.
    ?movie_nexus_Id vocab:genres ?genres.
    OPTIONAL {
        ?movie_nexus_Id vocab:tmdbId ?tmdbId.
    }
} LIMIT 5
""" % (org, project, family_name, given_name)

nexus_results = query_sparql(query, sparqlview_wrapper)
nexus_df = sparql2dataframe(nexus_results)
nexus_df.head()

# In[ ]: