In [1]:
"""
The following lines are only needed because this notebook runs on Google Colab.
If you are running the notebook on a local computer, you don't need them.
"""
# Colab-only setup: mount Google Drive, then make the dataset folder the
# working directory so the relative data paths used later resolve.
import os
from google.colab import drive

drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/finch/tensorflow2/knowledge_graph_completion/wn18/data')
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
In [2]:
# Install rdflib, the RDF/SPARQL library used by the rest of this notebook.
# The version is not pinned; the recorded run below resolved to rdflib 4.2.2.
!pip install rdflib
Requirement already satisfied: rdflib in /usr/local/lib/python3.6/dist-packages (4.2.2)
Requirement already satisfied: isodate in /usr/local/lib/python3.6/dist-packages (from rdflib) (0.6.0)
Requirement already satisfied: pyparsing in /usr/local/lib/python3.6/dist-packages (from rdflib) (2.4.2)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from isodate->rdflib) (1.12.0)
In [0]:
import rdflib
import time
import pprint

from tqdm import tqdm
In [0]:
# Base namespace: prepended to every entity/relation name to form its URI,
# and bound to the empty prefix (`PREFIX :`) in the SPARQL queries.
URI_PREFIX = 'http://example.org/'
In [0]:
# Load the training triples: each non-blank line is split on tabs into a list
# of fields, which the graph-building cell unpacks as (subject, predicate, object).
# Blank lines (e.g. a trailing newline at end of file) are skipped, because
# they would otherwise produce [''] and break that three-way unpacking.
# The encoding is stated explicitly so the read does not depend on the locale.
with open('./wn18/train.txt', encoding='utf-8') as f:
  triples = [line.rstrip().split('\t') for line in f if line.strip()]
In [6]:
# Build the in-memory RDF graph from the loaded triples.
graph = rdflib.Graph()

# Every distinct name appearing in any position of any triple gets one URI.
resources = {name for triple in triples for name in triple}
resource2uri = {name: URI_PREFIX + name for name in resources}

# Add one (subject, predicate, object) statement per triple, with a progress bar.
for s, p, o in tqdm(triples, total=len(triples), ncols=70):
  graph.add(tuple(rdflib.URIRef(resource2uri[name]) for name in (s, p, o)))
100%|██████████████████████| 141442/141442 [00:05<00:00, 25140.61it/s]
In [0]:
"""
Suppose we want to find the subjects that complete the triple <?, _hyponym, 04371774>
"""

# Fixed predicate and object of the pattern; the subject is the unknown.
GIVEN_P = '_hyponym'
GIVEN_O = '04371774'

# The empty prefix is bound to URI_PREFIX, so `:name` expands to the same
# URI form that URI_PREFIX + name produces for the graph's resources.
SPARQL = """
    PREFIX : <%s>
    
    SELECT DISTINCT ?which_subject WHERE {
        ?which_subject :%s :%s .
    }
    """ % (URI_PREFIX, GIVEN_P, GIVEN_O)
In [8]:
# Reverse lookup: full URI string -> original resource name.
uri2resource = {uri: name for name, uri in resource2uri.items()}

# Time the query together with the conversion of each result row back
# into a (subject, predicate, object) tuple of plain names.
t0 = time.time()
res = [
    (uri2resource[row.which_subject.toPython()], GIVEN_P, GIVEN_O)
    for row in graph.query(SPARQL)
]
count = len(res)

print("Query Spent %.2f sec" % (time.time() - t0))
print("Get %d / %d Results" % (count, len(triples)))
pprint.pprint(res)
Query Spent 1.93 sec
Get 2 / 141442 Results
[('03964744', '_hyponym', '04371774'), ('03736970', '_hyponym', '04371774')]
In [0]:
"""
This example is more complex: it chains two triple patterns (a two-hop path)
"""

# Two chained patterns: ?which_subject_1 is related to the given object, and
# ?which_subject_2 is related to ?which_subject_1 by the same predicate.
SPARQL = """
    PREFIX : <%s>
    
    SELECT DISTINCT ?which_subject_1 ?which_subject_2 WHERE {
        ?which_subject_1 :%s :%s .
        ?which_subject_2 :%s ?which_subject_1.
    }
    """ % (URI_PREFIX, GIVEN_P, GIVEN_O, GIVEN_P)
In [10]:
# Run the two-hop query; each result row becomes a chain of two triples
# expressed with the original resource names.
t0 = time.time()
res = []
for row in graph.query(SPARQL):
    first_hop = uri2resource[row.which_subject_1.toPython()]
    second_hop = uri2resource[row.which_subject_2.toPython()]
    res.append([
        [first_hop, GIVEN_P, GIVEN_O],
        [second_hop, GIVEN_P, first_hop],
    ])
count = len(res)

print("Query Spent %.2f sec" % (time.time() - t0))
print("Get %d / %d Results" % (count, len(triples)))
pprint.pprint(res)

# Sanity check: every triple in every returned chain must appear in the
# loaded training triples (both sides are lists of strings, so `in` compares by value).
print([edge in triples for chain in res for edge in chain])
Query Spent 0.01 sec
Get 2 / 141442 Results
[[['03964744', '_hyponym', '04371774'], ['00021939', '_hyponym', '03964744']],
 [['03736970', '_hyponym', '04371774'], ['03738472', '_hyponym', '03736970']]]
[True, True, True, True]