#!/usr/bin/env python # coding: utf-8 # # Sketches around ONS PublishMyData Endpoint # # Quick look at a test endpoint... # In[60]: #Utility routines from SPARQLWrapper import SPARQLWrapper, JSON #Add some helper functions def runQuery(endpoint,prefix,q): ''' Run a SPARQL query with a declared prefix over a specified endpoint ''' sparql = SPARQLWrapper(endpoint) sparql.setQuery(prefix+q) sparql.setReturnFormat(JSON) return sparql.query().convert() import pandas as pd pd.options.display.max_colwidth=150 def dict2df(results): ''' Hack a function to flatten the SPARQL query results and return the column values ''' data=[] for result in results["results"]["bindings"]: tmp={} for el in result: tmp[el]=result[el]['value'] data.append(tmp) df = pd.DataFrame(data) return df def dfResults(endpoint,prefix,q): ''' Generate a data frame containing the results of running a SPARQL query with a declared prefix over a specified endpoint ''' return dict2df( runQuery( endpoint, prefix, q ) ) def printQuery(results,limit=''): ''' Print the results from the SPARQL query ''' resdata=results["results"]["bindings"] if limit!='': resdata=results["results"]["bindings"][:limit] for result in resdata: for ans in result: print('{0}: {1}'.format(ans,result[ans]['value'])) print() def printRunQuery(endpoint,prefix,q,limit=''): ''' Print the results from the SPARQL query ''' results=runQuery(endpoint,prefix,q) printQuery(results,limit) # In[40]: ENDPOINT='http://ons-pilot.publishmydata.com/sparql' # In[42]: #Prefix from the example PREFIX=''' PREFIX dcat: PREFIX dcterms: PREFIX owl: PREFIX qb: PREFIX rdf: PREFIX rdfs: PREFIX sdmx: PREFIX skos: PREFIX void: PREFIX xsd: ''' # Go fish... # In[84]: q=''' SELECT * { ?a ?b ?c } LIMIT 20 ''' df=dfResults(endpoint,prefix,q) df # Hmm... Looks the data could be partitioned into spearate graphs (maybe this is a thing publishmydata does?) # # Graphs can make it easier to to frame specific topical "graph related" queries, perhaps? # # See if there are any specific graphs: # In[83]: q=''' SELECT * { ?a ?c } LIMIT 50 ''' df=dfResults(endpoint,prefix,q) df # To make life easier, we can can (*sic*) different queries into a particular graph... # In[135]: #Set up a way of querying the employment graph q_employment= ''' SELECT {s} WHERE {{ {t} GRAPH {{ {q} }} }} LIMIT {n} ''' def sqe(endpoint=ENDPOINT,prefix=PREFIX,s='*',t='',q='?a ?b ?c',n=100): return dfResults(endpoint,prefix,q_employment.format(s=s,t=t,q=q,n=n)) # In[136]: sqe()[:20] # Fish a bit more... # In[179]: sqe(s='DISTINCT ?a',n=20) # So there are lots of areas in there. Let's pick one and see what's around it... # In[63]: q=''' ?b ?c. ''' sqe(q=q) # How about a different expression, taking the object value for the same area and seeing what related subjects we can find? If there are matches, we can start to get a feel for the graph stucture... # In[64]: q=''' ?a ?b ''' sqe(s='DISTINCT ?a ?b',q=q) # So we can get from an area identifier to a count identifier. # In[65]: q=''' ?a . ''' sqe(q=q) # Where do we go from there? # In[85]: q=''' ?b ?c. ''' sqe(q=q) # [Could be useful adding something alond the lines of the following to the default prefix list? (Is there a standard abbreviation?) # # `PREFIX sdmxd: http://purl.org/linked-data/sdmx/2009/dimension#` # ] # # Okay - so what sort of report might we be able to generate from a count identifier? # In[89]: l='' q=''' {l} ?area. {l} ?period. {l} ?count. '''.format(l=l) sqe(q=q) # Are there any labels for things like the reference year, or geography? # In[91]: q=''' SELECT * { { ?b ?c.} UNION { ?q ?r.} } LIMIT 10 ''' dfResults(endpoint,prefix,q) # So we can tidy up the report a bit more: # In[105]: q=''' {l} ?_area. {l} ?_period. {l} ?count. {l} ?_typ. '''.format(l=l) t=''' ?_area rdfs:label ?area. ?_period rdfs:label ?period. ?_typ rdfs:label ?typ. '''.format(l=l) sqe(s='?area ?period ?count ?typ',q=q,t=t) # So presumably that then generalises further? With a bit more jiggery pokery we can get to: # In[121]: code='E33000012' q=''' ?l ?_area. ?1 ?_period. ?l ?count. ?1 ?_typ. ''' t=''' ?_area rdfs:label '{}'. ?_area rdfs:label ?area. ?_period rdfs:label ?period. ?_typ rdfs:label ?typ. ?l ?dim. ?dim rdfs:label ?d. '''.format(code) sqe(s='DISTINCT ?area ?period ?count ?typ ?d',q=q,t=t) # ## Pay # # How about one of the other graphs? # In[145]: #Set up a way of querying the pay graph q_pay= ''' SELECT {s} WHERE {{ {t} GRAPH {{ {q} }} }} LIMIT {n} ''' def sqp(endpoint=ENDPOINT,prefix=PREFIX,s='*',t='',q='?a ?b ?c',n=50): return dfResults(endpoint,prefix,q_employment.format(s=s,t=t,q=q,n=n)) # In[144]: sqp(n=15) # In[146]: q=''' ?a ?b ''' sqp(s='DISTINCT ?a ?b',q=q) # Okay - not sure I understand that; looks like the same stuff as from the other graph? # # Quitting for now... # PS I don't recognise the statistical geography codes - are they made up? # # Old ONS endpoint at `http://statistics.data.gov.uk/sparql` doesn't seem to exist any more, and whilst OS endpoint is still there: # In[168]: endpoint_os='http://data.ordnancesurvey.co.uk/datasets/boundary-line/apis/sparql' p=''' PREFIX rdfs: PREFIX admingeo: PREFIX ossr: ''' q=''' SELECT ?districtname WHERE { ?district rdfs:label ?districtname. ?district . } ''' runQuery(endpoint_os,p,q) # it doesn't work with the codes in the demo db? # In[175]: q=''' SELECT ?districtname WHERE { ?district rdfs:label ?districtname. ?district . } ''' runQuery(endpoint_os,p,q) # If it did, we could compound the queries using s/thing like the following to pull back info about the area from the Ordnance Surevy endpoint: # In[178]: q=''' ?a ?b . SERVICE { ?district rdfs:label ?districtname. ?district . } ''' sqp(s='DISTINCT *',q=q) # In[ ]: