#!/usr/bin/env python
# coding: utf-8

# # Linking data on the web
#
# In this notebook you'll experience how to consume structured data available on
# Wikidata to extend and enrich data managed within Nexus.

# ## Prerequisites
#
# This notebook assumes you've created a project within the AWS deployment of
# Blue Brain Nexus. If not, follow the Blue Brain Nexus
# [Quick Start tutorial](https://bluebrain.github.io/nexus/docs/tutorial/getting-started/quick-start/index.html).

# ## Overview
#
# You'll work through the following steps:
#
# 1. Install and configure the Blue Brain Nexus python sdk
# 2. Create a sparql wrapper around your project's SparqlView
# 3. Create a sparql wrapper around the Wikidata Sparql Service
# 4. Query more metadata from the Wikidata Sparql Service for movies stored in
#    Nexus using their tmdbId
# 5. Save the extended movies metadata back to Nexus and perform new queries

# ## Step 1: Install and configure the Blue Brain Nexus python sdk

# In[ ]:

get_ipython().system('pip install git+https://github.com/BlueBrain/nexus-cli')

# In[1]:

import nexussdk as nexus

# Target deployment and credentials for the Nexus python SDK.
nexus_deployment = "https://sandbox.bluebrainnexus.io/v1"
token = "your token here"
nexus.config.set_environment(nexus_deployment)
nexus.config.set_token(token)

# Organisation and project this notebook reads from and writes to.
org = "tutorialnexus"
# Provide your project name here.
project = "your_project_here"

# ## Step 2: Create a sparql wrapper around your project's SparqlView

# Every project in Blue Brain Nexus comes with a SparqlView enabling to navigate
# the data as a graph and to query it using the
# [W3C SPARQL Language](https://www.w3.org/TR/sparql11-query/).
# The address of such a SparqlView is
# https://sandbox.bluebrainnexus.io/v1/views/tutorialnexus/$PROJECTLABEL/graph/sparql
# for a project with label $PROJECTLABEL. The address of a SparqlView is also
# called a sparql endpoint.
# In[ ]:

get_ipython().system('pip install git+https://github.com/RDFLib/sparqlwrapper')

# In[ ]:

# Utility functions to create a sparql wrapper around a sparql endpoint
from SPARQLWrapper import SPARQLWrapper, JSON, POST, GET, POSTDIRECTLY, CSV
import requests


def create_sparql_client(sparql_endpoint, http_query_method=POST, result_format=JSON, token=None):
    """Build a SPARQLWrapper client around `sparql_endpoint`.

    Parameters
    ----------
    sparql_endpoint : str
        URL of the sparql endpoint to wrap.
    http_query_method : POST or GET, default POST
        HTTP method used to send queries.
    result_format : default JSON
        Result serialisation requested from the endpoint.
    token : str, optional
        Bearer token added as an Authorization header. Needed for Nexus
        SparqlViews; leave None for public endpoints such as Wikidata.
    """
    sparql_client = SPARQLWrapper(sparql_endpoint)
    if token:
        sparql_client.addCustomHttpHeader("Authorization", "Bearer {}".format(token))
    sparql_client.setMethod(http_query_method)
    sparql_client.setReturnFormat(result_format)
    if http_query_method == POST:
        # Send the query verbatim as the POST body instead of URL-encoding it.
        sparql_client.setRequestMethod(POSTDIRECTLY)
    return sparql_client


# In[ ]:

# Utility functions
import pandas as pd

# Disable column truncation when displaying dataframes. The old value -1 is
# deprecated (and rejected by recent pandas); None is the supported spelling.
pd.set_option('display.max_colwidth', None)


def sparql2dataframe(json_sparql_results):
    """Convert SPARQL JSON results into a pandas DataFrame.

    One column per projected variable; unbound variables become None.
    """
    cols = json_sparql_results['head']['vars']
    out = []
    for row in json_sparql_results['results']['bindings']:
        out.append([row.get(c, {}).get('value') for c in cols])
    return pd.DataFrame(out, columns=cols)


def query_sparql(query, sparql_client):
    """Send `query` through `sparql_client` and return the converted result."""
    sparql_client.setQuery(query)
    result_object = sparql_client.query()
    if sparql_client.returnFormat == JSON:
        return result_object._convertJSON()
    return result_object.convert()


# In[ ]:

# Let's create a sparql wrapper around the project sparql view
sparqlview_endpoint = nexus_deployment + "/views/" + org + "/" + project + "/graph/sparql"
sparqlview_wrapper = create_sparql_client(sparql_endpoint=sparqlview_endpoint, token=token,
                                          http_query_method=POST, result_format=JSON)

# Let's test that the SparqlView wrapper works by running a simple Sparql query
# to get 5 movies along with their titles.

# In[ ]:

# NOTE(review): the PREFIX IRIs were lost when this notebook was exported,
# which also broke the %-interpolation of (org, project). Reconstructed from
# the Nexus MovieLens tutorial (project vocab + Nexus vocabulary) — confirm
# against your deployment.
five_movie_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movie_nexus_Id ?movieId ?title ?genres ?imdbId ?tmdbId ?revision
WHERE {
    ?movie_nexus_Id a vocab:Movie.
    ?movie_nexus_Id nxv:rev ?revision.
    ?movie_nexus_Id vocab:movieId ?movieId.
    ?movie_nexus_Id vocab:title ?title.
    ?movie_nexus_Id vocab:imdbId ?imdbId.
    ?movie_nexus_Id vocab:genres ?genres.
    OPTIONAL {
        ?movie_nexus_Id vocab:tmdbId ?tmdbId.
    }
} LIMIT 5
""" % (org, project)

nexus_results = query_sparql(five_movie_query, sparqlview_wrapper)
nexus_df = sparql2dataframe(nexus_results)
nexus_df.head()

# ## Step 3: Create a Sparql wrapper around the Wikidata Sparql Service

# In[ ]:

wikidata_sparql_endpoint = "https://query.wikidata.org/sparql"
# Wikidata is public: no token, and queries are sent via GET.
wikidata_sparql_wrapper = create_sparql_client(sparql_endpoint=wikidata_sparql_endpoint,
                                               http_query_method=GET, result_format=JSON)

# Let's test that the wrapper works by running a query that fetches the logo
# url for a given movie tmdbId 862 (Toy Story).
# You can play the following query in the Wikidata playground
# [Try It](https://query.wikidata.org/#SELECT%20%2a%0A%20%20%20%20WHERE%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP4947%20%22862%22.%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP154%20%3Flogo.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP364%20%3Flang.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP577%20%3FreleaseDate.%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP291%20wdt%3AQ30.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%7D).
#
# In the query below:
#
# * [wdt:P4947](https://www.wikidata.org/wiki/Property:P4947) is the wikidata property for tmdbId
# * [wdt:P154](https://www.wikidata.org/wiki/Property:P154) is the wikidata property for logo image url

# In[ ]:

# wdt:P4947 is the wikidata property for tmdbId
movie_logo_query = """
SELECT *
WHERE {
    ?movie wdt:P4947 "%s".
    OPTIONAL {
        ?movie wdt:P154 ?logo.
    }
}
""" % (862)

wiki_results = query_sparql(movie_logo_query, wikidata_sparql_wrapper)
wiki_df = sparql2dataframe(wiki_results)
wiki_df.head()

# Let's display the logo of the Toy Story movie. This part might take some
# time but you can skip it.

# In[ ]:

from IPython.display import SVG, display

movie_logo_url = wiki_df.at[0, 'logo']
display(SVG(movie_logo_url))

# In[ ]:

# ## Step 4: Query more metadata from the Wikidata Sparql Service

# For every movie retrieved from the Nexus SparqlView, we will get:
#
# * its cast members or voice actors
# * its publication date in the United States of America if any
#
# [Try it](https://query.wikidata.org/#SELECT%20%3FtmdbId%20%3Fmovie%20%3Flogo%20%3Fnativelanguage%20%3Fpublication_date%20%3Fcast%20%3FgivenName%20%3FfamilyName%0A%20%20%20%20WHERE%0A%20%20%20%20%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP4947%20%3FtmdbId.%0A%20%20%20%20%20%20%20%20FILTER%20%28%3FtmdbId%20%3D%20%22862%22%29.%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP154%20%3Flogo.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP725%7Cwdt%3AP161%20%3Fcast.%0A%20%20%20%20%20%20%20%20%20%20%3Fcast%20wdt%3AP735%2Fwdt%3AP1705%20%3FgivenName.%0A%20%20%20%20%20%20%20%20%20%20%3Fcast%20wdt%3AP734%2Fwdt%3AP1705%20%3FfamilyName.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%20OPTIONAL%7B%0A%20%20%20%20%20%20%20%20%3Fmovie%20wdt%3AP364%2Fwdt%3AP1705%20%3Fnativelanguage.%0A%20%20%20%20%20%20%20%20%7D%0A%20%20%20%20%20%20%20%0A%20%20%20%20%20%20OPTIONAL%20%7B%0A%20%20%20%20%20%3Fmovie%20p%3AP577%20%3Fpublication_date_node.%0A%20%20%20%20%20%3Fpublication_date_node%20ps%3AP577%20%3Fpublication_date.%20%23%20publication%20date%20statement%0A%20%20%20%20%20%3Fpublication_date_node%20pq%3AP291%20wd%3AQ30.%20%20%20%20%20%20%20%20%20%20%20%20%23%20qualifier%20on%20the%20release%20date%0A%20%20%7D%0A%20%20%20%20%7D)
# for the movie Toy Story.
# In[ ]:

from functools import reduce
import json


def panda_merge(df, df2, on):
    """Outer-merge on the index, taking from `df2` only the columns `df` lacks.

    `on` is currently unused; it is kept for interface compatibility.
    """
    cols_to_use = df2.columns.difference(df.columns)
    return pd.merge(df, df2[cols_to_use], left_index=True, right_index=True, how='outer')


def panda_concatenate(dfs):
    """Concatenate a list of DataFrames row-wise."""
    return pd.concat(dfs)


wiki_dataframes = []
for index, row in nexus_df.iterrows():
    # The wdt/p/ps/pq/wd prefixes are predefined on the Wikidata endpoint,
    # so this query needs no PREFIX declarations.
    movie_logo_query = """
SELECT ?tmdbId ?movie ?logo ?nativelanguage ?publication_date ?cast ?givenName ?familyName
WHERE {
    ?movie wdt:P4947 ?tmdbId.
    FILTER (?tmdbId = "%s").
    OPTIONAL {
        ?movie wdt:P154 ?logo.
    }
    OPTIONAL {
        ?movie wdt:P725|wdt:P161 ?cast.
        ?cast wdt:P735/wdt:P1705 ?givenName.
        ?cast wdt:P734/wdt:P1705 ?familyName.
    }
    OPTIONAL {
        ?movie wdt:P364/wdt:P1705 ?nativelanguage.
    }
    OPTIONAL {
        ?movie p:P577 ?publication_date_node.
        ?publication_date_node ps:P577 ?publication_date. # publication date statement
        ?publication_date_node pq:P291 wd:Q30.            # qualifier on the release date
    }
}
""" % (row['tmdbId'])
    wiki_results = query_sparql(movie_logo_query, wikidata_sparql_wrapper)
    wiki_df = sparql2dataframe(wiki_results)
    print("""Display metadata (from wikidata) for %s""" % (nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId'], 'movie_nexus_Id'].iloc[0]))
    display(wiki_df.head())
    wiki_dataframes.append(wiki_df)

# Let's concatenate all dataframes from wikidata
result_wiki_dataframes = panda_concatenate(wiki_dataframes)
#display(result_wiki_dataframes.head())

merge_wiki_dataframes = panda_merge(result_wiki_dataframes, nexus_df, "tmdbId")
#display(merge_wiki_dataframes.head())

# Fold each movie's cast rows into a nested "casting" list of
# {cast, givenName, familyName} dicts. Note: 'records' replaces the 'r'
# abbreviation for to_dict, which was removed in pandas 1.0.
wiki_dataframes_tojson = (merge_wiki_dataframes.apply(lambda x: x.dropna(), 1)
                          .groupby(['tmdbId', 'movie', 'nativelanguage'], as_index=False)
                          .apply(lambda x: x[['cast', 'givenName', 'familyName']].to_dict('records'))
                          .reset_index()
                          .rename(columns={0: 'casting'})
                          .to_json(orient='records'))
updated_movies_json = json.loads(wiki_dataframes_tojson)

# ## Step 5: Save the extended movies metadata back to Nexus

# In[ ]:

# We obtained a json array to be loaded.
# NOTE(review): the export glued the `from` keyword onto the comment above,
# leaving a syntax-broken import; restored here.
from urllib.parse import urlencode, quote_plus


def update_in_nexus(row):
    """Merge the Nexus metadata for this movie into `row` (a dict built from
    the wikidata results) and push the enriched resource back to Nexus.

    Looks the movie up in `nexus_df` via its tmdbId, fetches the resource's
    current revision (updates are rejected unless they target it), then calls
    `nexus.resources.update`.
    """
    row["@type"] = "Movie"
    nexus_row = nexus_df.loc[nexus_df['tmdbId'] == row['tmdbId']]
    _id = nexus_row['movie_nexus_Id'].iloc[0]
    row["title"] = nexus_row['title'].iloc[0]
    row["genres"] = nexus_row['genres'].iloc[0]
    row["movieId"] = nexus_row['movieId'].iloc[0]
    row["imdbId"] = nexus_row['imdbId'].iloc[0]

    # Fetch the latest revision from Nexus rather than trusting the (possibly
    # stale) revision captured when nexus_df was queried.
    data = nexus.resources.fetch(org_label=org, project_label=project, resource_id=_id, schema_id="_")
    current_revision = data["_rev"]

    # The sdk uses the resource's _self address to know where to PUT it.
    row["_self"] = nexus_deployment + "/resources/" + org + "/" + project + "/" + "_/" + quote_plus(_id)
    nexus.resources.update(resource=row, rev=current_revision)


for item in updated_movies_json:
    update_in_nexus(item)

# The data can now be listed with the casting metadata obtained from wikidata.

# In[ ]:

# List movies with their casting names separated by a comma.
# NOTE(review): PREFIX IRIs reconstructed (lost in export) so the
# %-interpolation of (org, project) works again.
movie_acting_query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId (group_concat(DISTINCT ?castname;separator=", ") as ?casting)
WHERE {
    ?movie_nexus_Id a vocab:Movie.
    ?movie_nexus_Id vocab:casting ?cast.
    ?cast vocab:givenName ?givenName.
    ?cast vocab:familyName ?familyName.
    BIND (CONCAT(?givenName, " ", ?familyName) AS ?castname).
    ?movie_nexus_Id vocab:movieId ?movieId.
    ?movie_nexus_Id vocab:title ?title.
    ?movie_nexus_Id vocab:imdbId ?imdbId.
    ?movie_nexus_Id vocab:genres ?genres.
    OPTIONAL {
        ?movie_nexus_Id vocab:tmdbId ?tmdbId.
    }
}
Group By ?movieId ?title ?genres ?imdbId ?tmdbId
LIMIT 100
""" % (org, project)

nexus_updated_results = query_sparql(movie_acting_query, sparqlview_wrapper)
nexus_df_updated = sparql2dataframe(nexus_updated_results)
nexus_df_updated.head(100)

# In[ ]:

# Pick an actor and find movies within which they acted
actor = updated_movies_json[0]["casting"][0]
given_name = actor["givenName"]
family_name = actor["familyName"]

# Four interpolated values: (org, project) for the vocab prefix, then the
# actor's family and given names as string literals in the pattern.
query = """
PREFIX vocab: <https://sandbox.bluebrainnexus.io/v1/vocabs/%s/%s/>
PREFIX nxv: <https://bluebrain.github.io/nexus/vocabulary/>
Select ?movieId ?title ?genres ?imdbId ?tmdbId
WHERE {
    ?movie_nexus_Id a vocab:Movie.
    ?movie_nexus_Id vocab:casting ?cast.
    ?cast vocab:familyName "%s".
    ?cast vocab:givenName "%s".
    ?movie_nexus_Id vocab:movieId ?movieId.
    ?movie_nexus_Id vocab:title ?title.
    ?movie_nexus_Id vocab:imdbId ?imdbId.
    ?movie_nexus_Id vocab:genres ?genres.
    OPTIONAL {
        ?movie_nexus_Id vocab:tmdbId ?tmdbId.
    }
} LIMIT 5
""" % (org, project, family_name, given_name)

nexus_results = query_sparql(query, sparqlview_wrapper)
nexus_df = sparql2dataframe(nexus_results)
nexus_df.head()

# In[ ]: