%load_ext autoreload
%autoreload 2
We (down)load the corpus.
from tf.app import use
A = use("annotation/banks", hoist=globals())
Locating corpus resources ...
Name | # of nodes | # slots / node | % coverage
--- | --- | --- | ---
book | 1 | 99.00 | 100
chapter | 2 | 49.50 | 100
sentence | 3 | 33.00 | 100
line | 12 | 7.67 | 93
word | 99 | 1.00 | 100
Two quotes from Consider Phlebas by Iain M. Banks (version 0.2, DOI 10.5281/zenodo.2630416)
F.letters.freqList()
(('the', 8), ('of', 5), ('and', 4), ('in', 3), ('we', 3), ('everything', 2), ('know', 2), ('most', 2), ('ones', 2), ('patterns', 2), ('us', 2), ('Besides', 1), ('Culture', 1), ('Everything', 1), ('So', 1), ('a', 1), ('about', 1), ('aid', 1), ('any', 1), ('around', 1), ('as', 1), ('barbarian', 1), ('bottom', 1), ('can', 1), ('care', 1), ('climbing', 1), ('composed', 1), ('control', 1), ('dead', 1), ('elegant', 1), ('enjoyable', 1), ('final', 1), ('find', 1), ('free', 1), ('games', 1), ('good', 1), ('harness', 1), ('have', 1), ('high', 1), ('humans', 1), ('impossible', 1), ('is', 1), ('it', 1), ('languages', 1), ('left', 1), ('life', 1), ('line', 1), ('make', 1), ('mattered', 1), ('mountains', 1), ('not', 1), ('nothing', 1), ('our', 1), ('over', 1), ('own', 1), ('problems', 1), ('really', 1), ('romance', 1), ('safety', 1), ('societies', 1), ('sports', 1), ('studying', 1), ('such', 1), ('take', 1), ('terms', 1), ('that', 1), ('that’s', 1), ('things', 1), ('those', 1), ('to', 1), ('truth', 1), ('ultimately', 1), ('where', 1), ('why', 1), ('without', 1))
For the node types we can get summary information by calling this:
C.levels.data
(('book', 99.0, 100, 100), ('chapter', 49.5, 101, 102), ('sentence', 33.0, 115, 117), ('line', 7.666666666666667, 103, 114), ('word', 1, 1, 99))
It means that chapters are 49.5 words long on average, and that the chapter nodes are numbered 101 and 102 (the last two members of each tuple are the first and last node of that type).
And you see that we have 99 words.
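If you want this information in a more readable form, you can unpack the tuples yourself. A small sketch (the field names are our own glosses for the tuple positions):

# Each entry of C.levels.data is
# (node type, average slots per node, first node, last node):
for (nType, avgSlots, first, last) in C.levels.data:
    print(f"{nType:<10} avg {avgSlots:6.2f} words, nodes {first}-{last}")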
We are going to create a relationship between each pair of words, annotating each related pair with how similar the two words are.
We measure similarity by comparing the sets of distinct letters in the two words (lowercased): the size of their intersection as a percentage of the size of their union.
This will become a symmetric edge feature. Symmetric means that if a is similar to b, then b is similar to a, with the same similarity value.
We only store one copy of each symmetric pair of edges.
We can then use
E.sim.b(node)
to find all nodes that are similar to node.
If words do not have letters in common, their similarity is 0, and we do not make an edge.
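To make the measure concrete, here is a tiny worked example (with hypothetical words, not drawn from the corpus):

# Hypothetical example: "everything" vs "thing"
a = set("everything")  # {e, v, r, y, t, h, i, n, g}: 9 distinct letters
b = set("thing")       # {t, h, i, n, g}: 5 distinct letters
print(round(100 * len(a & b) / len(a | b)))  # 5 shared of 9 joint letters: 56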
We pre-compute all letter sets for all words.
def makeSet(w):
    # the set of distinct letters of word w, lowercased
    return set(F.letters.v(w).lower())


words = {}
for w in F.otype.s("word"):
    words[w] = makeSet(w)

nWords = len(words)
print(f"{nWords} words")
99 words
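A quick peek at the letter set of the first word node (a sanity check; output not shown):

print(1, sorted(words[1]))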
def sim(wSet, vSet):
    # Jaccard index as a percentage: shared letters relative to joint letters
    return int(round(100 * len(wSet & vSet) / len(wSet | vSet)))
def computeSim():
    similarity = {}
    wordNodes = sorted(words.keys())
    nWords = len(wordNodes)
    nComparisons = nWords * (nWords - 1) // 2
    print(f"{nComparisons} comparisons to make")

    TF.indent(reset=True)
    co = 0  # number of comparisons made
    si = 0  # number of (nonzero) similarities found
    for i in range(nWords):
        nodeI = wordNodes[i]
        wordI = words[nodeI]
        for j in range(i + 1, nWords):
            nodeJ = wordNodes[j]
            wordJ = words[nodeJ]
            s = sim(wordI, wordJ)
            co += 1
            if s:
                # store each unordered pair once, keyed as (smaller, larger)
                similarity[(nodeI, nodeJ)] = s
                si += 1
    TF.info(f"{co:>4} comparisons and {si:>4} similarities")
    return similarity
similarity = computeSim()
4851 comparisons to make
0.01s 4851 comparisons and 3332 similarities
print(min(similarity.values()))
print(max(similarity.values()))
7
100
eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 50]
print(eq[0])
print(neq[0])
((1, 4), 100)
((1, 2), 8)
print(len(eq))
print(len(neq))
58
3247
print(eq[0][0][0], F.letters.v(eq[0][0][0]))
print(eq[0][0][1], F.letters.v(eq[0][0][1]))
1 Everything
4 everything
print(neq[0][0][0], F.letters.v(neq[0][0][0]))
print(neq[0][0][1], F.letters.v(neq[0][0][1]))
1 Everything
2 about
We now add this information to the Banks dataset as an edge feature.
import os
GH_BASE = os.path.expanduser("~/github")
path = f"{A.context.org}/{A.context.repo}/sim/tf"
location = f"{GH_BASE}/{path}"
module = A.context.version
metaData = {
    "": {
        "name": "Banks (similar words)",
        "converters": "Dirk Roorda",
        "sourceUrl": "https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/text-fabric/use.ipynb",
        "version": "0.2",
    },
    "sim": {
        "valueType": "int",
        "edgeValues": True,
        "description": "similarity between words, as a percentage of the common material wrt the combined material",
    },
}
simData = {}
for ((f, t), d) in similarity.items():
    # group the edges by their from-node: {fromNode: {toNode: value}}
    simData.setdefault(f, {})[t] = d

A.api.TF.save(
    edgeFeatures=dict(sim=simData), metaData=metaData, location=location, module=module
)
0.00s Exporting 0 node and 1 edge and 0 configuration features to ~/github/annotation/banks/sim/tf/0.2:
| 0.00s T sim to ~/github/annotation/banks/sim/tf/0.2
0.00s Exported 0 node features and 1 edge features and 0 config features to ~/github/annotation/banks/sim/tf/0.2
True
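Once the corpus is reloaded together with this module (for instance via the mod parameter of use), the new feature is available as E.sim. A hypothetical peek, assuming the feature is loaded:

# Hypothetical usage, assuming the sim module has been loaded with the corpus.
# E.sim.b(n) walks the (symmetric) edge in both directions and yields
# (node, value) pairs for all words that share letters with n.
for (m, v) in E.sim.b(1):
    if v == 100:
        print(m, F.letters.v(m), v)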