#(Re)install SPARQLWrapper, a library to help us run some SPARQL queries
#http://rdflib.github.io/sparqlwrapper/
#Any copy that is already installed is removed first (-y skips pip's confirmation prompt), then a fresh copy is installed
!pip3 uninstall -y sparqlwrapper
!pip3 install sparqlwrapper
#NOTE: if you find the SPARQL queries slowing down, or throwing an error message, try the following:
## 1) Save your notebook.
## 2) Close it.
## 3) Shut it down.
#This should reset sparqlwrapper
## 4) Restart the notebook.
# You will need to run the cells again to load packages, reset state etc., because you will have started a new IPython process.
#Import the necessary packages
from SPARQLWrapper import SPARQLWrapper, JSON
#Declare the BNB endpoint
endpoint="http://bnb.data.bl.uk/sparql"
sparql = SPARQLWrapper(endpoint)
#My experience of SPARQL is that things work then they don't and you have no idea which bit is broken
#This test should work. It really should. It has before. And it shouldn't take too long.
#It comes from http://bnb.data.bl.uk/getting-started
#NOTE: each PREFIX declaration must be followed by its namespace IRI in angle brackets;
# a bare "PREFIX bibo:" is not valid SPARQL, so the IRIs are spelled out in full here
q='''PREFIX bibo: <http://purl.org/ontology/bibo/>
PREFIX bio: <http://purl.org/vocab/bio/0.1/>
PREFIX blt: <http://www.bl.uk/schemas/bibliographic/blterms#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX event: <http://purl.org/NET/c4dm/event.owl#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX isbd: <http://iflastandards.info/ns/isbd/elements/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rda: <http://rdvocab.info/ElementsGr2/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?book ?bnb ?title WHERE {
#Match the book by ISBN
?book bibo:isbn13 "9780729408745";
#bind some variables to its other attributes
blt:bnb ?bnb;
dct:title ?title. }'''
#Send the query, ask for a JSON response and parse it into a Python dict
sparql.setQuery(q)
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
#Display the parsed response
results
#Declare a standard, if exhaustive, list of prefixes we can apply to each query
#Don't leave white space on the left hand side...
#Each PREFIX needs its namespace IRI in angle brackets, otherwise every query that
# has this block prepended is rejected as invalid SPARQL
prefix='''
PREFIX bibo: <http://purl.org/ontology/bibo/>
PREFIX bio: <http://purl.org/vocab/bio/0.1/>
PREFIX blt: <http://www.bl.uk/schemas/bibliographic/blterms#>
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX event: <http://purl.org/NET/c4dm/event.owl#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX geo: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX isbd: <http://iflastandards.info/ns/isbd/elements/>
PREFIX org: <http://www.w3.org/ns/org#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rda: <http://rdvocab.info/ElementsGr2/>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
'''
#A first simple query against the endpoint:
#look up (at most five) books whose creator is named "Iain Banks"
q='''
SELECT DISTINCT ?book ?title WHERE {
?book dct:creator ?author ;
dct:title ?title.
?author foaf:name "Iain Banks".
} LIMIT 5
'''
#Ask for JSON, send the prefixed query and keep the parsed reply in a variable
sparql.setReturnFormat(JSON)
sparql.setQuery(prefix+q)
results = sparql.query().convert()
#Show the raw shape of the response
results
#The response columns we want to display, in display order
answerCols=['book','title']
#Write one line per result row, each selected value followed by a single space
for result in results["results"]["bindings"]:
    print("".join(f"{result[ans]['value']} " for ans in answerCols))
#Let's make a function to handle that a little more tidily
def printResults(results,ansCols):
    ''' Print the required results column values from the SPARQL query.

    results -- parsed SPARQL JSON response; rows are read from results["results"]["bindings"]
    ansCols -- names of the result variables to print, in the order they should appear

    Each row goes on its own line, with every value followed by a single space.
    Raises KeyError if a row has no binding for one of the requested columns.
    '''
    for result in results["results"]["bindings"]:
        #Use the columns passed in by the caller rather than a notebook-level global
        for ans in ansCols:
            print(result[ans]['value'], end=" ")
        #Finish the current row
        print()
#Try the function out on the response from the last query, showing the columns chosen above
printResults(results,answerCols)
#Let's do a little more wrapping
def runQuery(endpoint,prefix,q):
    ''' Send prefix+q as a single SPARQL query to the given endpoint,
    asking for a JSON response, and return the converted result '''
    full_query = prefix + q
    #A fresh wrapper is built for every call
    client = SPARQLWrapper(endpoint)
    client.setReturnFormat(JSON)
    client.setQuery(full_query)
    response = client.query()
    return response.convert()
def queryResults(endpoint,prefix,q,ansCols):
    ''' Execute a prefixed SPARQL query against an endpoint, then print
    the values of the requested result columns, one row per line '''
    printResults(runQuery(endpoint, prefix, q), ansCols)
#Run the query and print the chosen columns in one go
queryResults(endpoint,prefix,q,answerCols)
#Remind ourselves what the parsed response looks like
results
#Some endpoints can return flat formats such as a CSV data table;
#here we flatten the JSON bindings ourselves and load them into a pandas data frame
import pandas as pd
#One plain dict per result row, keeping only each binding's value
#(the type information is dropped, although it could have been used to type the columns)
data = [
    {name: cell['value'] for name, cell in row.items()}
    for row in results["results"]["bindings"]
]
df = pd.DataFrame(data)
df
#Let's wrap everything up
def dict2df(results):
    ''' Flatten parsed SPARQL JSON query results into a pandas data frame.

    results -- parsed SPARQL JSON response; rows are read from
               results["results"]["bindings"] and only each binding's
               'value' is kept (type information is discarded)

    Returns a data frame with one row per result and one column per variable.
    When the response carries a head/vars list it is used for the columns, so
    they follow the order of the SELECT clause and an empty result set still
    has its columns; variables unbound in a row show up as missing values.
    Without a head/vars list the columns are inferred from the rows.
    '''
    rows = [
        {var: cell['value'] for var, cell in binding.items()}
        for binding in results["results"]["bindings"]
    ]
    #An absent or empty variable list falls back to inferring columns from the rows
    columns = results.get("head", {}).get("vars") or None
    return pd.DataFrame(rows, columns=columns)
def dfResults(endpoint,prefix,q):
    ''' Run a prefixed SPARQL query against the given endpoint and
    hand back the flattened results as a data frame '''
    response = runQuery(endpoint, prefix, q)
    frame = dict2df(response)
    return frame
#Run the last query again, this time getting the results back as a data frame
dfResults(endpoint,prefix,q)
#DESCRIBE asks the endpoint for everything it holds about the matched resource
q='DESCRIBE ?book WHERE { ?book bibo:isbn10 "1857232356" }'
ans=runQuery(endpoint,prefix,q)
ans
#Running the query produces this warning (notebook output, kept here as a comment):
#Format requested was JSON, but RDF/XML (application/rdf+xml;charset=UTF-8) has been returned by the endpoint
ans.serialize(format="nt")
#serialize() gives bytes on older rdflib releases and str on newer ones, so only decode when needed
nt=ans.serialize(format="nt")
print(nt.decode("utf-8") if isinstance(nt,bytes) else nt)
#For convenience, let's just bundle that up in case we need to call it again
def printDesc(endpoint,prefix,q):
    ''' Run a (DESCRIBE-style) SPARQL query and print the result as N-Triples.

    endpoint -- URL of the SPARQL endpoint
    prefix   -- PREFIX declarations prepended to the query
    q        -- the query text

    Assumes the converted result offers serialize(format=...), as an rdflib
    graph does - TODO confirm for endpoints that honour the JSON request.
    '''
    ans=runQuery(endpoint,prefix,q)
    nt=ans.serialize(format="nt")
    #Older rdflib releases return bytes here, newer ones return str;
    #calling .decode() on a str would raise AttributeError
    if isinstance(nt,bytes):
        nt=nt.decode("utf-8")
    print(nt)
#Describe the book with this ISBN-10 and print the description as N-Triples
q='DESCRIBE ?book WHERE { ?book bibo:isbn10 "1857232356" }'
printDesc(endpoint,prefix,q)
#Select the BNB number, publication event, title and creator of the same book
q='''
SELECT ?book ?bnb ?publicationEvent ?title ?creator WHERE {
#Match the book by ISBN
?book bibo:isbn10 "1857232356";
#bind some variables to other attributes of the work
#Get the British National Bibliography number
blt:bnb ?bnb;
#Identify the publication event associated with this work
blt:publication ?publicationEvent;
#Identify the title of the work
dct:title ?title;
#Identify the creator of the work
dct:creator ?creator.
}
'''
runQuery(endpoint,prefix,q)
#List the distinct properties that are attached to the creator of the book
q='''
SELECT DISTINCT ?property
where {
?book bibo:isbn10 "1857232356";
dct:creator ?creator.
?creator ?property ?x
}
'''
runQuery(endpoint,prefix,q)
#Follow dct:creator to the creator resource and read its foaf:name as ?author,
#returning the results as a data frame
q='''
SELECT ?book ?isbn10 ?bnb ?title ?author WHERE {
#Match the book by ISBN
?book bibo:isbn10 "1857232356";
#bind some variables to its other attributes
blt:bnb ?bnb;
dct:title ?title;
bibo:isbn10 ?isbn10;
dct:creator ?creator.
?creator foaf:name ?author.
}
'''
dfResults(endpoint,prefix,q)
#YOUR INVESTIGATION HERE
#NOTE(review): the pattern "?a ?b." below has only two terms, but a SPARQL triple pattern needs a
# subject, a predicate and an object - presumably an <IRI> term was lost from this query; TODO confirm and restore it
q='''
SELECT DISTINCT ?a ?b WHERE {
?a ?b.
}
'''
dfResults(endpoint,prefix,q)
#NOTE(review): same two-term pattern as the query above - presumably a different <IRI> term was lost here; TODO confirm
q='''
SELECT DISTINCT ?a ?b WHERE {
?a ?b.
}
'''
dfResults(endpoint,prefix,q)
#Books by "Iain Banks" together with the label of their publication time, ordered by that label
#The FILTER compares ?date against the string literals "1985" (inclusive) and "1990" (exclusive)
#NOTE(review): this presumably relies on the labels being plain year strings - confirm against the data
q='''
SELECT DISTINCT ?book ?title ?date WHERE {
#Find books by 'Iain Banks':
?book dct:creator ?author ;
dct:title ?title.
?author foaf:name "Iain Banks".
#Find when they were published:
?book blt:publication ?publicationEvent.
?publicationEvent event:time ?eventTime.
?eventTime rdfs:label ?date.
#Look for books published between 1985 and 1990
FILTER (?date>="1985" && ?date<"1990")
} ORDER BY ?date
'''
dfResults(endpoint,prefix,q)
def _sparqlString(value):
    ''' Escape a value so it can sit inside a double-quoted SPARQL string literal '''
    escaped = str(value)
    #Backslash must be handled first so the escapes added afterwards are not doubled
    for raw, safe in (('\\', '\\\\'), ('"', '\\"'), ('\n', '\\n'), ('\r', '\\r')):
        escaped = escaped.replace(raw, safe)
    return escaped

def getBooksByAuthorBetweenDates(author,fromDate,toDate):
    ''' Find books by a named author whose publication date label lies between two values.

    author   -- creator name, matched exactly against foaf:name
    fromDate -- lower bound for the date label (inclusive)
    toDate   -- upper bound for the date label (inclusive)

    The bounds are compared with the date label as string literals.
    All three values are escaped before being placed in the query, so a quote or
    backslash in an argument cannot break or alter the query.
    Uses the notebook-level endpoint and prefix; returns a data frame of
    book, title and date ordered by date.
    '''
    q='''
SELECT DISTINCT ?book ?title ?date WHERE {{
#Find books by name:
?book dct:creator ?author ;
dct:title ?title.
?author foaf:name "{0}".
#Find when they were published:
?book blt:publication ?publicationEvent.
?publicationEvent event:time ?eventTime.
?eventTime rdfs:label ?date.
#Look for books published between dates
FILTER (?date>="{1}" && ?date<="{2}")
}} ORDER BY ?date
'''.format(_sparqlString(author),_sparqlString(fromDate),_sparqlString(toDate))
    return dfResults(endpoint,prefix,q)
#Example: books by Terry Pratchett whose publication date label lies between "1985" and "1987" (both bounds inclusive)
getBooksByAuthorBetweenDates("Terry Pratchett","1985","1987")