#!/usr/bin/env python
# coding: utf-8
# # Linking data on the web
#
# In this notebook you'll experience how to consume structured data available on Wikidata to extend and enrich data managed within Nexus.
# ## Prerequisites
#
# This notebook assumes you've created a project within the AWS deployment of Blue Brain Nexus. If not follow the Blue Brain Nexus [Quick Start tutorial](https://bluebrain.github.io/nexus/docs/tutorial/getting-started/quick-start/index.html).
#
# ## Overview
#
# You'll work through the following steps:
#
# 1. Install and configure the Blue Brain Nexus python sdk
# 2. Create a sparql wrapper around your project's SparqlView
# 3. Create a sparql wrapper around the Wikidata Sparql Service
# 4. Query more metadata from the Wikidata Sparql Service for movies stored in Nexus using their tmdbId
# 5. Save the extended movies metadata back to Nexus and perform new queries
# ## Step 1: Install and configure the Blue Brain Nexus python sdk
# In[ ]:
# Install the Nexus python sdk straight from GitHub (notebook shell escape).
get_ipython().system('pip install git+https://github.com/BlueBrain/nexus-cli')
# In[1]:
import nexussdk as nexus
# Target deployment: the public Blue Brain Nexus sandbox.
nexus_deployment = "https://sandbox.bluebrainnexus.io/v1"
token = "your token here"  # paste your Nexus access token here
nexus.config.set_environment(nexus_deployment)
nexus.config.set_token(token)
# Organization and project labels identifying where the movie data lives.
org ="tutorialnexus"
#Provide your project name here.
project ="your_project_here"
# ## Step 2: Create a sparql wrapper around your project's SparqlView
# Every project in Blue Brain Nexus comes with a SparqlView enabling to navigate the data as a graph and to query it using the [W3C SPARQL Language](https://www.w3.org/TR/sparql11-query/).
# The address of such a SparqlView is https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/\$PROJECTLABEL/graph/sparql for a project with the label \$PROJECTLABEL. The address of a SparqlView is also called a sparql endpoint.
# In[ ]:
get_ipython().system('pip install git+https://github.com/RDFLib/sparqlwrapper')
# In[ ]:
# Utility functions to create sparql wrapper around a sparql endpoint
from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV
import requests
def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format= JSON, token=None):
    """Build a SPARQLWrapper client for *sparql_endpoint*.

    Parameters
    ----------
    sparql_endpoint : str, address of the SPARQL service.
    http_query_method : POST (default) or GET.
    result_format : desired response format, JSON by default.
    token : optional bearer token added as an Authorization header.
    """
    client = SPARQLWrapper(sparql_endpoint)
    #client.addCustomHttpHeader("Content-Type", "application/sparql-query")
    if token:
        client.addCustomHttpHeader("Authorization", "Bearer {}".format(token))
    client.setMethod(http_query_method)
    client.setReturnFormat(result_format)
    # For POST, send the query directly in the request body.
    if http_query_method == POST:
        client.setRequestMethod(POSTDIRECTLY)
    return client
# In[ ]:
# Utility functions
import pandas as pd
# Disable column-width truncation when displaying dataframes.
# FIX: -1 was a deprecated sentinel (rejected by pandas >= 1.0 in strict mode
# and removed in 2.x); None is the supported way to mean "no limit".
pd.set_option('display.max_colwidth', None)
# Convert SPARQL results into a Pandas data frame
def sparql2dataframe(json_sparql_results):
    """Convert a SPARQL JSON result set into a pandas DataFrame.

    Column order follows the query's projected variables; a variable with no
    binding in a given row yields None in that cell.
    """
    columns = json_sparql_results['head']['vars']
    rows = [
        [binding.get(var, {}).get('value') for var in columns]
        for binding in json_sparql_results['results']['bindings']
    ]
    return pd.DataFrame(rows, columns=columns)
# Send a query using a sparql wrapper
def query_sparql(query, sparql_client):
    """Run *query* through the given SPARQLWrapper client and return the parsed result."""
    sparql_client.setQuery(query)
    response = sparql_client.query()
    # Non-JSON formats go through the generic converter; JSON is converted
    # explicitly (the wrapper's private helper, as the original code did).
    if sparql_client.returnFormat != JSON:
        return response.convert()
    return response._convertJSON()
# In[ ]:
# Let's create a sparql wrapper around the project's SparqlView.
sparqlview_endpoint = nexus_deployment+"/views/"+org+"/"+project+"/graph/sparql"
sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token, http_query_method=POST, result_format=JSON)
# Let's test that the SparqlView wrapper works by running a simple Sparql query
# to get 5 movies along with their titles.
# In[ ]:
# FIX: the PREFIX IRIs were lost in a previous export (the template ended at
# "PREFIX vocab:" with no %s placeholders, so the "% (org, project)"
# interpolation below raised TypeError). Restored using the standard Nexus
# project vocabulary IRIs — confirm they match your deployment.
five_movie_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movie_nexus_Id ?movieId ?title ?genres ?imdbId ?tmdbId ?revision
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id nxv:rev ?revision.
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
} LIMIT 5
""" % (org, project)
nexus_results = query_sparql(five_movie_query, sparqlview_wrapper)
nexus_df = sparql2dataframe(nexus_results)
nexus_df.head()
# ## Step 3: Create a Sparql wrapper around the Wikidata Sparql Service
# In[ ]:
# Public Wikidata SPARQL endpoint: no auth token needed, queries sent via GET.
wikidata_sparql_endpoint = "https://query.wikidata.org/sparql"
wikidata_sparql_wrapper = create_sparql_client(sparql_endpoint=wikidata_sparql_endpoint,http_query_method= GET, result_format=JSON)
# Let's test that the wrapper works by running a query that fetches the logo url for a given movie tmdbId 862 (Toy Story).
# You can play the following query in the Wikidata playground [Try It](https://query.wikidata.org/#SELECT%20%2a%0A%20%20%20%20WHERE%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP4947%20%22862%22.%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP154%20%3Flogo.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP364%20%3Flang.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP577%20%3FreleaseDate.%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP291%20wdt%3AQ30.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%7D).
#
# In the query below:
#
# * [wdt:P4947](https://www.wikidata.org/wiki/Property:P4947) is the wikidata property for tmdbId
# * [wdt:P154](https://www.wikidata.org/wiki/Property:P154) is the wiki data property for logo image url
#
# In[ ]:
# wdt:P4947 is the wikidata property for tmdbId
# Fetch the movie with TMDB id 862 (Toy Story) and, if present, its logo (P154).
movie_logo_query = """
SELECT *
WHERE
{
?movie wdt:P4947 "%s".
OPTIONAL{
?movie wdt:P154 ?logo.
}
}
""" % (862)  # tmdbId 862 = Toy Story
wiki_results = query_sparql(movie_logo_query,wikidata_sparql_wrapper)
wiki_df =sparql2dataframe(wiki_results)
wiki_df.head()
# Let's display the logo of the Toy Story movie. This part might take some time, but you can skip it.
# In[ ]:
from IPython.display import SVG, display
# Render the logo URL from the first result row inline as SVG.
# NOTE(review): assumes the first binding has a non-null 'logo' — if the
# OPTIONAL did not match, this cell will fail; verify before running.
movie_logo_url = wiki_df.at[0,'logo']
display(SVG(movie_logo_url))
# In[ ]:
# ## Step 4: Query more metadata from the Wikidata Sparql Service
# For every movie retrieved from the Nexus SparqlView, we will get:
#
# * its cast members or voice actors
# * its publication date in the United States of America if any
#
# [Try it](https://query.wikidata.org/#SELECT%20%3FtmdbId%20%3Fmovie%20%3Flogo%20%3Fnativelanguage%20%3Fpublication_date%20%3Fcast%20%3FgivenName%20%3FfamilyName%0A%20%20%20%20WHERE%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP4947%20%3FtmdbId.%0A%20%20%20%20%20%20%20%20FILTER%20%28%3FtmdbId%20%3D%20%22862%22%29.%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP154%20%3Flogo.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP725%7Cwdt%3AP161%20%3Fcast.%0A%20%20%20%20%20%20%20%20%20%20%3Fcast%20wdt%3AP735%2Fwdt%3AP1705%20%3FgivenName.%0A%20%20%20%20%20%20%20%20%20%20%3Fcast%20wdt%3AP734%2Fwdt%3AP1705%20%3FfamilyName.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP364%2Fwdt%3AP1705%20%3Fnativelanguage.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%0A%20%20%20%20%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%3Fmovie%20p%3AP577%20%3Fpublication_date_node.%0A%20%20%20%20%20%3Fpublication_date_node%20ps%3AP577%20%3Fpublication_date.%20%23%20publication%20date%20statement%0A%20%20%20%20%20%3Fpublication_date_node%20pq%3AP291%20wd%3AQ30.%20%20%20%20%20%20%20%20%20%20%20%20%23%20qualifier%20on%20the%20release%20date%0A%20%20%7D%0A%20%20%20%20%7D) for the movie Toy Story.
# In[ ]:
from functools import reduce
import json
def panda_merge(df,df2, on):
    """Outer-merge df2's extra columns into df, aligned by row index.

    Only columns of df2 that are absent from df are brought in.
    NOTE(review): the `on` argument is accepted but ignored — the merge is
    performed on the index, not on that column; confirm against callers.
    """
    extra_columns = df2.columns.difference(df.columns)
    merged = pd.merge(df, df2[extra_columns], left_index=True, right_index=True, how='outer')
    return merged
def panda_concatenate(dfs):
    """Stack a sequence of DataFrames vertically into a single DataFrame."""
    return pd.concat(dfs)
wiki_dataframes = []
# For each Nexus movie, pull its cast, logo, native language and US publication
# date from Wikidata, keyed by TMDB id (wdt:P4947).
for index, row in nexus_df.iterrows():
    # FIX: dropped the unused, misspelled `imdbdId` variable that shadowed
    # nothing and was never read.
    movie_logo_query = """
SELECT ?tmdbId ?movie ?logo ?nativelanguage ?publication_date ?cast ?givenName ?familyName
WHERE
{
?movie wdt:P4947 ?tmdbId.
FILTER (?tmdbId = "%s").
OPTIONAL{
?movie wdt:P154 ?logo.
}
OPTIONAL{
?movie wdt:P725|wdt:P161 ?cast.
?cast wdt:P735/wdt:P1705 ?givenName.
?cast wdt:P734/wdt:P1705 ?familyName.
}
OPTIONAL{
?movie wdt:P364/wdt:P1705 ?nativelanguage.
}
OPTIONAL {
?movie p:P577 ?publication_date_node.
?publication_date_node ps:P577 ?publication_date. # publication date statement
?publication_date_node pq:P291 wd:Q30. # qualifier on the release date
}
}
""" % (row['tmdbId'])
    wiki_results = query_sparql(movie_logo_query, wikidata_sparql_wrapper)
    wiki_df = sparql2dataframe(wiki_results)
    print("""Display metadata (from wikidata) for %s""" % (nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'movie_nexus_Id'].iloc[0]))
    display(wiki_df.head())
    wiki_dataframes.append(wiki_df)
# Concatenate all per-movie dataframes from wikidata
result_wiki_dataframes = panda_concatenate(wiki_dataframes)
#display(result_wiki_dataframes.head())
merge_wiki_dataframes = panda_merge(result_wiki_dataframes, nexus_df, "tmdbId")
#display(merge_wiki_dataframes.head())
# Group rows per movie and fold the cast rows into a nested "casting" list,
# then serialize to JSON records ready to be written back to Nexus.
# FIX: to_dict('r') was a deprecated alias removed in modern pandas;
# 'records' is the supported spelling.
wiki_dataframes_tojson = (merge_wiki_dataframes.apply(lambda x: x.dropna(), 1)
    .groupby(['tmdbId', 'movie', 'nativelanguage'], as_index=False)
    .apply(lambda x: x[['cast', 'givenName', 'familyName']].to_dict('records'))
    .reset_index()
    .rename(columns={0: 'casting'})
    .to_json(orient='records'))
updated_movies_json = json.loads(wiki_dataframes_tojson)
# ## Step 5: Save the extended movies metadata back to Nexus
# In[ ]:
# We obtained a json array to be loaded
from urllib.parse import urlencode, quote_plus
def update_in_nexus(row):
    """Write one wikidata-enriched movie record back to its Nexus resource.

    `row` is one record from `updated_movies_json`; the matching Nexus
    resource id, revision and the original movie fields are looked up in
    `nexus_df` via the shared tmdbId key.
    """
    row["@type"]= "Movie"
    # Recover the Nexus resource id and the revision observed at query time.
    _id = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'movie_nexus_Id'].iloc[0]
    rev = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'revision'].iloc[0]
    # Carry over the original attributes so the update does not drop them.
    row["title"] = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'title'].iloc[0]
    row["genres"] = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'genres'].iloc[0]
    row["movieId"] = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'movieId'].iloc[0]
    row["imdbId"] = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'imdbId'].iloc[0]
    # Re-fetch the resource to get its *current* revision (may differ from `rev`).
    data = nexus.resources.fetch(org_label=org,project_label=project,resource_id=_id,schema_id="_")
    current_revision = data["_rev"]
    # NOTE(review): presumably nexus.resources.update reads the target address
    # from the resource's _self field — confirm against the sdk.
    url = nexus_deployment+"/resources/"+org+"/"+project+"/"+"_/"+quote_plus(_id)
    row["_self"] = url
    nexus.resources.update(resource=row, rev=current_revision)
# Push every enriched movie record back into Nexus.
for item in updated_movies_json:
    update_in_nexus(item)
# The data can now be listed with the casting metadata obtained from wikidata.
#
# In[ ]:
# List movies with their casting names separated by a comma.
# FIX: the PREFIX IRIs had been stripped (template had no %s placeholders, so
# "%(org,project)" raised TypeError). Restored using the standard Nexus project
# vocabulary IRIs — confirm they match your deployment.
movie_acting_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId (group_concat(DISTINCT ?castname;separator=", ") as ?casting)
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id vocab:casting ?cast.
?cast vocab:givenName ?givenName.
?cast vocab:familyName ?familyName.
BIND (CONCAT(?givenName, " ", ?familyName) AS ?castname).
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
}
Group By ?movieId ?title ?genres ?imdbId ?tmdbId
LIMIT 100
""" % (org, project)
nexus_updated_results = query_sparql(movie_acting_query, sparqlview_wrapper)
nexus_df_updated = sparql2dataframe(nexus_updated_results)
nexus_df_updated.head(100)
# In[ ]:
# Pick an actor and find movies within which they acted.
actor = updated_movies_json[0]["casting"][0]
given_name = actor["givenName"]
family_name = actor["familyName"]
# FIX: the interpolation passed 4 arguments but the template only had 2 %s
# placeholders (the PREFIX IRIs had been stripped) -> TypeError. Restored the
# two vocab placeholders so (org, project, family_name, given_name) line up.
query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId
WHERE {
?movie_nexus_Id a vocab:Movie.
?movie_nexus_Id vocab:casting ?cast.
?cast vocab:familyName "%s".
?cast vocab:givenName "%s".
?movie_nexus_Id vocab:movieId ?movieId.
?movie_nexus_Id vocab:title ?title.
?movie_nexus_Id vocab:imdbId ?imdbId.
?movie_nexus_Id vocab:genres ?genres.
OPTIONAL {
?movie_nexus_Id vocab:tmdbId ?tmdbId.
}
} LIMIT 5
""" % (org, project, family_name, given_name)
nexus_results = query_sparql(query, sparqlview_wrapper)
nexus_df = sparql2dataframe(nexus_results)
nexus_df.head()
# In[ ]: