"""
We use following lines because we are running on Google Colab
If you are running notebook on a local computer, you don't need these
"""
# Mount Google Drive so its contents are reachable under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Change the working directory; the relative data path opened later in this
# script ('./wn18/train.txt') is resolved against this directory.
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/knowledge_graph_completion/wn18/data')
# IPython shell escape (only valid inside a notebook cell, not in plain
# Python): installs rdflib into the current runtime before it is imported.
!pip install rdflib
import rdflib
import time
import pprint
from tqdm import tqdm
# Namespace prepended to every identifier to turn it into a URI.
URI_PREFIX = 'http://example.org/'

# Each non-blank line of the training file is one tab-separated triple
# (subject, predicate, object). Blank lines are skipped so that a stray empty
# line cannot break the 3-way unpacking in the loop below. Triples are kept
# as lists because later code compares them against lists.
with open('./wn18/train.txt', encoding='utf-8') as f:
    triples = [line.rstrip().split('\t') for line in f if line.strip()]

graph = rdflib.Graph()

# Subjects, predicates and objects all share one identifier -> URI table.
resources = {r for triple in triples for r in triple}
resource2uri = {r: URI_PREFIX + r for r in resources}

# Insert every triple into the rdflib graph, showing a progress bar.
for (s, p, o) in tqdm(triples, total=len(triples), ncols=70):
    s_uri = rdflib.URIRef(resource2uri[s])
    p_uri = rdflib.URIRef(resource2uri[p])
    o_uri = rdflib.URIRef(resource2uri[o])
    graph.add((s_uri, p_uri, o_uri))
"""
Suppose we want to find a subject under relation <?, _hyponym, 04371774>
"""
# Known predicate and object of the pattern; the subject is the unknown.
GIVEN_P, GIVEN_O = '_hyponym', '04371774'

# The empty prefix ":" is declared as URI_PREFIX, so ":name" stands for
# URI_PREFIX + name, matching how the URIs in resource2uri were built.
SPARQL = """
PREFIX : <%s>
SELECT DISTINCT ?which_subject WHERE {
?which_subject :%s :%s .
}
""" % (URI_PREFIX, GIVEN_P, GIVEN_O)

# Reverse lookup: URI string -> original identifier.
uri2resource = {resource2uri[r]: r for r in resource2uri}

t0 = time.time()
# The result rows are consumed inside the timed region; each bound subject
# URI is mapped back to its identifier and stored as a completed triple.
res = [
    (uri2resource[row.which_subject.toPython()], GIVEN_P, GIVEN_O)
    for row in graph.query(SPARQL)
]
count = len(res)
print("Query Spent %.2f sec" % (time.time() - t0))
print("Get %d / %d Results" % (count, len(triples)))
pprint.pprint(res)
"""
This example is more complex with two paths
"""
# Two chained patterns: ?which_subject_1 completes <?, GIVEN_P, GIVEN_O>, and
# ?which_subject_2 completes <?, GIVEN_P, ?which_subject_1>.
SPARQL = (
"""
PREFIX : <%s>
SELECT DISTINCT ?which_subject_1 ?which_subject_2 WHERE {
?which_subject_1 :%s :%s .
?which_subject_2 :%s ?which_subject_1.
}
""" % (URI_PREFIX, GIVEN_P, GIVEN_O, GIVEN_P)
)

t0 = time.time()
res = []
count = 0
for row in graph.query(SPARQL):
    # Map the bound URIs back to the original identifiers.
    found_subject_1 = uri2resource[row.which_subject_1.toPython()]
    found_subject_2 = uri2resource[row.which_subject_2.toPython()]
    # Each result is a chain of the two triples matched by the two patterns.
    res.append([[found_subject_1, GIVEN_P, GIVEN_O], [found_subject_2, GIVEN_P, found_subject_1]])
    count += 1
print("Query Spent %.2f sec" % (time.time() - t0))
print("Get %d / %d Results" % (count, len(triples)))
pprint.pprint(res)

# check whether what we found lives in the graph
# Build a set of tuples once so each membership test is a hash lookup rather
# than a linear scan over the whole list of training triples.
known_triples = {tuple(triple) for triple in triples}
print([tuple(triple) in known_triples for chain in res for triple in chain])