# Install a library to help us run some SPARQL queries if we haven't already
# installed it: http://rdflib.github.io/sparqlwrapper/
# The two commands below are IPython shell magics; run them in a notebook cell
# (or drop the leading "!" and run them in a shell), not as plain Python.
# !pip3 uninstall -y sparqlwrapper
# !pip3 install sparqlwrapper

# NOTE: if you find the SPARQL queries slowing down, or throwing an error
# message, try the following:
#   1) Save your notebook.
#   2) Close it.
#   3) Shut it down. This should reset sparqlwrapper.
#   4) Restart the notebook.
# You will need to run the cells again to load packages, reset state etc,
# because you will have started a new IPython process.

# Import the necessary packages
from SPARQLWrapper import SPARQLWrapper, JSON

# Declare the BNB endpoint
endpoint = "http://bnb.data.bl.uk/sparql"
sparql = SPARQLWrapper(endpoint)

# My experience of SPARQL is that things work then they don't and you have no
# idea which bit is broken.
# This test should work. It really should. It has before. And it shouldn't
# take too long.
# It comes from http://bnb.data.bl.uk/getting-started
#
# NOTE(review): every PREFIX declaration in this file had lost its <IRI> part
# (which is invalid SPARQL). The IRIs below were restored from the standard BNB
# prefix list -- confirm them against the getting-started page above.
q = '''PREFIX bibo: <http://purl.org/ontology/bibo/>
PREFIX bio: <http://purl.org/vocab/bio/0.1/>
PREFIX blt: <http://www.bl.uk/schemas/bibliographic/blterms#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX event: <http://purl.org/NET/c4dm/event.owl#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX isbd: <http://iflastandards.info/ns/isbd/elements/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rda: <http://RDVocab.info/ElementsGr2/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?book ?bnb ?title WHERE {
    #Match the book by ISBN
    ?book bibo:isbn13 "9780729408745";
        #bind some variables to its other attributes
        blt:bnb ?bnb;
        dct:title ?title.
}'''

sparql.setQuery(q)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
results

# Declare a standard, if exhaustive, list of prefixes we can apply to each query
# Don't leave white space on the left hand side...
# NOTE(review): IRIs restored as described above -- confirm.
prefix = '''
PREFIX bibo: <http://purl.org/ontology/bibo/>
PREFIX bio: <http://purl.org/vocab/bio/0.1/>
PREFIX blt: <http://www.bl.uk/schemas/bibliographic/blterms#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX event: <http://purl.org/NET/c4dm/event.owl#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX isbd: <http://iflastandards.info/ns/isbd/elements/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rda: <http://RDVocab.info/ElementsGr2/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
'''

# Let's just test a simple query
# Search for books by author name
q = '''
SELECT DISTINCT ?book ?title WHERE {
    ?book dct:creator ?author ;
        dct:title ?title.
    ?author foaf:name "Iain Banks".
} LIMIT 5
'''

# Run the query, parse the response as JSON, and get them into a variable
sparql.setQuery(prefix + q)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()

# Here's what the response looks like
results

# Let's specify the response columns we want to display
answerCols = ['book', 'title']

# We can then iterate through these
for result in results["results"]["bindings"]:
    for ans in answerCols:
        print(result[ans]['value'], end=" ")
    print()


# Let's make a function to handle that a little more tidily
def printResults(results, ansCols):
    '''
    Print the required results column values from the SPARQL query.

    results: converted JSON response, read as results["results"]["bindings"]
    ansCols: names of the result variables to print, in display order

    Prints one line per binding, with the selected values separated by spaces.
    '''
    for result in results["results"]["bindings"]:
        # Use the columns the caller asked for (the original read the global
        # answerCols here, silently ignoring the ansCols argument)
        for ans in ansCols:
            print(result[ans]['value'], end=" ")
        print()


printResults(results, answerCols)


# Let's do a little more wrapping
def runQuery(endpoint, prefix, q):
    '''
    Run a SPARQL query with a declared prefix over a specified endpoint.

    endpoint: URL of the SPARQL endpoint
    prefix: PREFIX declarations prepended to the query text
    q: the query body

    Returns the converted response; JSON is requested as the return format.
    '''
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(prefix + q)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


def queryResults(endpoint, prefix, q, ansCols):
    '''
    Run a SPARQL query with a declared prefix over a specified endpoint
    and print the required results columns (ansCols).
    '''
    results = runQuery(endpoint, prefix, q)
    printResults(results, ansCols)


queryResults(endpoint, prefix, q, answerCols)

# Let's see what the results look like
results

# Some endpoints will return data in other formats, for example flattened as a
# CSV data table.
# We can flatten the data ourselves in an ad hoc way and get it into a pandas
# datatable
import pandas as pd

# pandas may have a better way of doing this?!
# Flatten each result binding into a {variable name: value} record
data = [
    {el: result[el]['value'] for el in result}
    for result in results["results"]["bindings"]
]

# Note that we lose the type information which we could have used to type the
# columns in the final dataframe
df = pd.DataFrame(data)
df


# Let's wrap everything up
def dict2df(results):
    '''
    Hack a function to flatten the SPARQL query results and return the
    column values.

    results: converted JSON response, read as results["results"]["bindings"]

    Returns a pandas DataFrame with one row per binding and one column per
    bound variable; only each binding's 'value' field is kept.
    '''
    data = [
        {el: result[el]['value'] for el in result}
        for result in results["results"]["bindings"]
    ]
    return pd.DataFrame(data)


def dfResults(endpoint, prefix, q):
    '''
    Generate a data frame containing the results of running a SPARQL query
    with a declared prefix over a specified endpoint.
    '''
    return dict2df(runQuery(endpoint, prefix, q))


dfResults(endpoint, prefix, q)


def _toNTriples(graph):
    '''
    Serialize a graph (anything with a serialize(format=...) method) as
    N-Triples text.
    '''
    serialized = graph.serialize(format="nt")
    # Depending on the rdflib version, serialize() hands back bytes or str;
    # only decode when we actually got bytes.
    if isinstance(serialized, bytes):
        serialized = serialized.decode("utf-8")
    return serialized


q = 'DESCRIBE ?book WHERE { ?book bibo:isbn10 "1857232356" }'
ans = runQuery(endpoint, prefix, q)
ans

# Message recorded in the notebook at this point (output, not code --
# presumably a warning emitted while running the DESCRIBE query above):
#   Format requested was JSON, but RDF/XML (application/rdf+xml;charset=UTF-8) has been returned by the endpoint

ans.serialize(format="nt")

print(_toNTriples(ans))


# For convenience, let's just bundle that up in case we need to call it again
def printDesc(endpoint, prefix, q):
    '''
    Run a query (e.g. a DESCRIBE) with a declared prefix over a specified
    endpoint and print the returned graph serialized as N-Triples.
    '''
    ans = runQuery(endpoint, prefix, q)
    print(_toNTriples(ans))


q = 'DESCRIBE ?book WHERE { ?book bibo:isbn10 "1857232356" }'
printDesc(endpoint, prefix, q)

q = '''
SELECT ?book ?bnb ?publicationEvent ?title ?creator WHERE {
    #Match the book by ISBN
    ?book bibo:isbn10 "1857232356";
        #bind some variables to other attributes of the work
        #Get the British National Bibliography number
        blt:bnb ?bnb;
        #Identify the publication event associated with this work
        blt:publication ?publicationEvent;
        #Identify the title of the work
        dct:title ?title;
        #Identify the creator of the work
        dct:creator ?creator.
}
'''
runQuery(endpoint, prefix, q)

q = '''
SELECT DISTINCT ?property where {
    ?book bibo:isbn10 "1857232356";
        dct:creator ?creator.
    ?creator ?property ?x
}
'''
runQuery(endpoint, prefix, q)

q = '''
SELECT ?book ?isbn10 ?bnb ?title ?author WHERE {
    #Match the book by ISBN
    ?book bibo:isbn10 "1857232356";
        #bind some variables to its other attributes
        blt:bnb ?bnb;
        dct:title ?title;
        bibo:isbn10 ?isbn10;
        dct:creator ?creator.
    ?creator foaf:name ?author.
}
'''
dfResults(endpoint, prefix, q)

# YOUR INVESTIGATION HERE
# NOTE(review): the two queries below each contain a two-term pattern
# ("?a ?b."), which is not a valid triple pattern -- a term (probably an
# <IRI>) appears to have been lost from the text. The query strings are left
# untouched; fill in the missing term before running them.
q = '''
SELECT DISTINCT ?a ?b WHERE { ?a ?b. }
'''
dfResults(endpoint, prefix, q)

q = '''
SELECT DISTINCT ?a ?b WHERE { ?a ?b. }
'''
dfResults(endpoint, prefix, q)

q = '''
SELECT DISTINCT ?book ?title ?date WHERE {
    #Find books by 'Iain Banks':
    ?book dct:creator ?author ;
        dct:title ?title.
    ?author foaf:name "Iain Banks".

    #Find when they were published:
    ?book blt:publication ?publicationEvent.
    ?publicationEvent event:time ?eventTime.
    ?eventTime rdfs:label ?date.

    #Look for books published between 1985 and 1990
    FILTER (?date>="1985" && ?date<"1990")
} ORDER BY ?date
'''
dfResults(endpoint, prefix, q)


def getBooksByAuthorBetweenDates(author, fromDate, toDate):
    '''
    Return a data frame of books (book, title, date) by the named author
    whose publication date label lies between fromDate and toDate.

    author: exact foaf:name string to match
    fromDate, toDate: bounds substituted into the FILTER as quoted strings;
        both ends are inclusive (>= fromDate, <= toDate)

    Uses the module-level endpoint and prefix. The arguments are inserted
    into the query text with str.format without any escaping.
    '''
    # Doubled braces are literal SPARQL braces; {0}, {1}, {2} are filled in
    # by format() below.
    q = '''
SELECT DISTINCT ?book ?title ?date WHERE {{
    #Find books by name:
    ?book dct:creator ?author ;
        dct:title ?title.
    ?author foaf:name "{0}".

    #Find when they were published:
    ?book blt:publication ?publicationEvent.
    ?publicationEvent event:time ?eventTime.
    ?eventTime rdfs:label ?date.

    #Look for books published between dates
    FILTER (?date>="{1}" && ?date<="{2}")
}} ORDER BY ?date
'''.format(author, fromDate, toDate)
    return dfResults(endpoint, prefix, q)


getBooksByAuthorBetweenDates("Terry Pratchett", "1985", "1987")