#!/usr/bin/env python
# coding: utf-8

# # Sketches around ONS PublishMyData Endpoint
# 
# Quick look at a test endpoint... 

# In[60]:


#Utility routines
from SPARQLWrapper import SPARQLWrapper, JSON

#Add some helper functions
def runQuery(endpoint,prefix,q):
    ''' Run a SPARQL query with a declared prefix over a specified endpoint '''
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(prefix+q)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()

import pandas as pd
pd.options.display.max_colwidth=150


def dict2df(results):
    ''' Hack a function to flatten the SPARQL query results and return the column values '''
    data=[]
    for result in results["results"]["bindings"]:
        tmp={}
        for el in result:
            tmp[el]=result[el]['value']
        data.append(tmp)

    df = pd.DataFrame(data)
    return df

def dfResults(endpoint,prefix,q):
    ''' Generate a data frame containing the results of running
        a SPARQL query with a declared prefix over a specified endpoint '''
    return dict2df( runQuery( endpoint, prefix, q ) )
        
def printQuery(results,limit=''):
    ''' Print the results from the SPARQL query '''
    resdata=results["results"]["bindings"]
    if limit!='': resdata=results["results"]["bindings"][:limit]
    for result in resdata:
        for ans in result:
            print('{0}: {1}'.format(ans,result[ans]['value']))
        print()

def printRunQuery(endpoint,prefix,q,limit=''):
    ''' Print the results from the SPARQL query '''
    results=runQuery(endpoint,prefix,q)
    printQuery(results,limit)


# In[40]:


ENDPOINT='http://ons-pilot.publishmydata.com/sparql'


# In[42]:


#Prefix from the example
PREFIX='''
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX qb: <http://purl.org/linked-data/cube#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX sdmx: <http://purl.org/linked-data/sdmx/2009/concept#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX void: <http://rdfs.org/ns/void#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
'''


# Go fish...

# In[84]:


q='''
SELECT * {
    ?a ?b ?c
} LIMIT 20
'''
df=dfResults(endpoint,prefix,q)
df


# Hmm... Looks the data could be partitioned into spearate graphs (maybe this is a thing publishmydata does?)
# 
# Graphs can make it easier to to frame specific topical "graph related" queries, perhaps?
# 
# See if there are any specific graphs:

# In[83]:


q='''
SELECT * {
    ?a <http://publishmydata.com/def/dataset#graph> ?c
} LIMIT 50
'''
df=dfResults(endpoint,prefix,q)
df


# To make life easier, we can can (*sic*) different queries into a particular graph...

# In[135]:


#Set up a way of querying the employment graph
q_employment= '''
SELECT {s}
WHERE {{
  {t}
  GRAPH <http://statistics.data.gov.uk/graph/employment> {{ 
      {q} 
  }} 
}} LIMIT {n}
'''

def sqe(endpoint=ENDPOINT,prefix=PREFIX,s='*',t='',q='?a ?b ?c',n=100):
    return dfResults(endpoint,prefix,q_employment.format(s=s,t=t,q=q,n=n))


# In[136]:


sqe()[:20]


# Fish a bit more...

# In[179]:


sqe(s='DISTINCT ?a',n=20)


# So there are lots of areas in there. Let's pick one and see what's around it...

# In[63]:


q='''
<http://statistics.data.gov.uk/data/employment/year/2014/geography/E33000001/labour-force/employment/employees/count> ?b ?c.
'''
sqe(q=q)


# How about a different expression, taking the object value for the same area and seeing what related subjects we can find? If there are matches, we can start to get a feel for the graph stucture...

# In[64]:


q='''
?a ?b <http://statistics.data.gov.uk/id/statistical-geography/E33000001>
'''
sqe(s='DISTINCT ?a ?b',q=q)


# So we can get from an area identifier to a count identifier.

# In[65]:


q='''
?a <http://purl.org/linked-data/sdmx/2009/dimension#refArea> <http://statistics.data.gov.uk/id/statistical-geography/E33000001>.
'''

sqe(q=q)


# Where do we go from there?

# In[85]:


q='''
<http://statistics.data.gov.uk/data/employment/year/2014/geography/E33000001/labour-force/employees/employees/count> ?b ?c.
'''
sqe(q=q)


# [Could be useful adding something alond the lines of the following to the default prefix list? (Is there a standard abbreviation?)
# 
# `PREFIX sdmxd: http://purl.org/linked-data/sdmx/2009/dimension#`
# ]
# 
# Okay - so what sort of report might we be able to generate from a count identifier?

# In[89]:


l='<http://statistics.data.gov.uk/data/employment/year/2014/geography/E33000001/labour-force/employees/employees/count>'
q='''
{l} <http://purl.org/linked-data/sdmx/2009/dimension#refArea> ?area.
{l} <http://purl.org/linked-data/sdmx/2009/dimension#refPeriod> ?period.
{l} <http://statistics.data.gov.uk/def/measure-properties/count> ?count.
'''.format(l=l)
sqe(q=q)


# Are there any labels for things like the reference year, or geography?

# In[91]:


q='''
SELECT * {
{<http://reference.data.gov.uk/id/year/2014> ?b ?c.}
UNION
{<http://statistics.data.gov.uk/id/statistical-geography/E33000001> ?q ?r.}
} LIMIT 10
'''
dfResults(endpoint,prefix,q)


# So we can tidy up the report a bit more:

# In[105]:


q='''
{l} <http://purl.org/linked-data/sdmx/2009/dimension#refArea> ?_area.

{l} <http://purl.org/linked-data/sdmx/2009/dimension#refPeriod> ?_period.

{l} <http://statistics.data.gov.uk/def/measure-properties/count> ?count.

{l} <http://purl.org/linked-data/sdmx/2009/attribute#unitMeasure> ?_typ.
'''.format(l=l)

t='''
?_area rdfs:label ?area.
?_period rdfs:label ?period.
?_typ  rdfs:label ?typ.
'''.format(l=l)
sqe(s='?area ?period ?count ?typ',q=q,t=t)


# So presumably that then generalises further? With a bit more jiggery pokery we can get to:

# In[121]:


code='E33000012'

q='''

?l <http://purl.org/linked-data/sdmx/2009/dimension#refArea> ?_area.
?1 <http://purl.org/linked-data/sdmx/2009/dimension#refPeriod> ?_period.
?l <http://statistics.data.gov.uk/def/measure-properties/count> ?count.
?1 <http://purl.org/linked-data/sdmx/2009/attribute#unitMeasure> ?_typ.
'''

t='''
?_area rdfs:label '{}'.
?_area rdfs:label ?area.
?_period rdfs:label ?period.
?_typ rdfs:label ?typ.
?l <http://statistics.data.gov.uk/def/dimension/labourForce> ?dim.
?dim rdfs:label ?d.

'''.format(code)
sqe(s='DISTINCT ?area ?period ?count ?typ ?d',q=q,t=t)


# ## Pay
# 
# How about one of the other graphs?

# In[145]:


#Set up a way of querying the pay graph
q_pay= '''
SELECT {s}
WHERE {{
  {t}
  GRAPH <http://statistics.data.gov.uk/data/annual-survey-of-hours-and-earnings-pay> {{ 
      {q} 
  }} 
}} LIMIT {n}
'''

def sqp(endpoint=ENDPOINT,prefix=PREFIX,s='*',t='',q='?a ?b ?c',n=50):
    return dfResults(endpoint,prefix,q_employment.format(s=s,t=t,q=q,n=n))


# In[144]:


sqp(n=15)


# In[146]:


q='''
?a ?b <http://statistics.data.gov.uk/id/statistical-geography/E33000001>
'''
sqp(s='DISTINCT ?a ?b',q=q)


# Okay - not sure I understand that; looks like the same stuff as from the other graph?
# 
# Quitting for now...

# PS I don't recognise the statistical geography codes - are they made up?
# 
# Old ONS endpoint at `http://statistics.data.gov.uk/sparql` doesn't seem to exist any more, and whilst OS endpoint is still there:

# In[168]:


endpoint_os='http://data.ordnancesurvey.co.uk/datasets/boundary-line/apis/sparql'
p='''
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX admingeo: <http://statistics.data.gov.uk/def/administrative-geography/>
PREFIX ossr: <http://data.ordnancesurvey.co.uk/ontology/spatialrelations/>
'''
q='''
SELECT ?districtname
WHERE {
    ?district rdfs:label ?districtname.
    ?district <http://www.w3.org/2002/07/owl#sameAs> <http://statistics.data.gov.uk/id/statistical-geography/E04001302> .
}
'''

runQuery(endpoint_os,p,q)


# it doesn't work with the codes in the demo db?

# In[175]:


q='''
SELECT ?districtname
WHERE {
    ?district rdfs:label ?districtname.
    ?district <http://www.w3.org/2002/07/owl#sameAs> <http://statistics.data.gov.uk/id/statistical-geography/E33000001> .
}
'''

runQuery(endpoint_os,p,q)


# If it did, we could compound the queries using s/thing like the following to pull back info about the area from the Ordnance Surevy endpoint:

# In[178]:


q='''
    ?a ?b <http://statistics.data.gov.uk/id/statistical-geography/E33000001>.
    SERVICE <http://data.ordnancesurvey.co.uk/datasets/boundary-line/apis/sparql> {
        ?district rdfs:label ?districtname.
        ?district <http://www.w3.org/2002/07/owl#sameAs> <http://statistics.data.gov.uk/id/statistical-geography/E33000001> .
    }
'''
sqp(s='DISTINCT *',q=q)


# In[ ]: