#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:


import os
import pickle
import gzip

from tf.app import use
from tf.fabric import Fabric


# In[3]:


ghBase = os.path.expanduser("~/github")
org = "etcbc"
repo = "dss"
subdir = "parallels"
mainpath = f"{org}/{repo}/tf"
path = f"{org}/{repo}/{subdir}/tf"
location = f"{ghBase}/{path}"
mainlocation = f"{ghBase}/{mainpath}"
version = "1.6"
module = version
tempdir = f"{ghBase}/{org}/{repo}/_temp"


# In[4]:


TF = Fabric(locations=mainlocation, modules=module)


# In[5]:


api = TF.load("lex type")
docs = api.makeAvailableIn(globals())


# # Parallels
#
# We make edges between similar lines.
#
# When are lines similar?
#
# When a certain similarity metric is above a certain threshold.
#
# We choose this metric:
#
# * we reduce a line to the set of lexemes in it;
# * the similarity between two lines is the size of the intersection of their sets divided by the size of their union, times 100.

# # Preparation
#
# We pre-compute the lexeme sets of all lines.
#
# But because not all lines are filled with definite material, we exclude lines with fewer than 5 consonants.

# In[6]:


CONS = "cons"
valid = set()
allLines = F.otype.s("line")

TF.indent(reset=True)
for ln in F.otype.s("line"):
    if ln in valid:
        continue
    if sum(1 for s in L.d(ln, otype="sign") if F.type.v(s) == CONS) >= 5:
        valid.add(ln)
TF.info(f"{len(valid)} contentful lines out of {len(allLines)}")


# In[7]:


def makeSet(ln):
    lineSet = set()
    for s in L.d(ln, otype="word"):
        r = F.lex.v(s)
        if r:
            lineSet.add(r)
    return lineSet


# In[8]:


lines = {}

TF.indent(reset=True)
for ln in valid:
    lineSet = makeSet(ln)
    if lineSet:
        lines[ln] = lineSet
nLines = len(lines)
TF.info(f"{nLines} lines")


# # Measure

# In[9]:


def sim(lSet, mSet):
    return int(round(100 * len(lSet & mSet) / len(lSet | mSet)))


# # Compute all similarities
#
# We are going to perform more than half a billion comparisons, each of which is more than an elementary operation.
#
# Let's measure time.

# In[10]:


THRESHOLD = 60


def computeSim(limit=None):
    similarity = {}
    lineNodes = sorted(lines.keys())
    nLines = len(lineNodes)
    nComparisons = nLines * (nLines - 1) // 2
    print(f"{nComparisons} comparisons to make")

    chunkSize = nComparisons // 1000
    co = 0
    b = 0
    si = 0
    p = 0
    TF.indent(reset=True)
    stop = False
    for i in range(nLines):
        nodeI = lineNodes[i]
        lineI = lines[nodeI]
        for j in range(i + 1, nLines):
            nodeJ = lineNodes[j]
            lineJ = lines[nodeJ]
            s = sim(lineI, lineJ)
            co += 1
            b += 1
            if b == chunkSize:
                p += 1
                TF.info(f"{p:>3}‰ - {co:>12} comparisons and {si:>10} similarities")
                b = 0
                if limit is not None and p >= limit:
                    stop = True
                    break
            if s < THRESHOLD:
                continue
            similarity[(nodeI, nodeJ)] = s
            si += 1
        if stop:
            break

    TF.info(f"{p:>3}‰ - {co:>12} comparisons and {si:>10} similarities")
    return similarity


# We first run it for a few ‰ of the comparisons and then do some checks.

# In[11]:


similarity = computeSim(limit=3)


# We check the sanity of the results.

# In[12]:


print(min(similarity.values()))
print(max(similarity.values()))


# In[13]:


eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 70]


# In[14]:


print(len(eq))
print(len(neq))


# In[15]:


print(eq[0])
print(neq[0])


# In[16]:


print(T.text(eq[0][0][0]))
print(T.text(eq[0][0][1]))


# Looks good.
#
# Now the whole computation.
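# Before kicking off the full run, we can get a rough sense of how long it will take by timing a small
# sample and extrapolating (a minimal sketch, not part of the pipeline; `limit` counts ‰ of all comparisons,
# so the full run takes roughly 1000 / limit times as long as the sample):

# In[ ]:


import time

t0 = time.perf_counter()
computeSim(limit=3)  # redo 3‰ of the comparisons, just to time them
sample = time.perf_counter() - t0
print(f"sample took {sample:.1f}s; full run estimated at {sample * 1000 / 3 / 60:.0f} minutes")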
# But if we have done this before, and nothing has changed, we load previous results from disk.
#
# If we do not find previous results, we compute them and save the results to disk.

# In[17]:


PARA_DIR = f"{tempdir}/parallels"


def writeResults(data, location, name):
    if not os.path.exists(location):
        os.makedirs(location, exist_ok=True)
    path = f"{location}/{name}"
    with gzip.open(path, "wb") as f:
        pickle.dump(data, f)
    TF.info(f"Data written to {path}")


def readResults(location, name):
    TF.indent(reset=True)
    path = f"{location}/{name}"
    if not os.path.exists(path):
        print(f"File not found: {path}")
        return None
    with gzip.open(path, "rb") as f:
        data = pickle.load(f)
    TF.info(f"Data read from {path}")
    return data


# In[18]:


similarity = readResults(PARA_DIR, f"sim-{version}.zip")
if not similarity:
    similarity = computeSim()
    writeResults(similarity, PARA_DIR, f"sim-{version}.zip")


# In[19]:


len(similarity)


# So, just over 50,000 pairs of similar lines.

# # Add parallels to the TF dataset
#
# We can add this information to the DSS dataset as an *edge feature*.
#
# An edge feature links two nodes and may annotate that link with a value.
#
# For parallels, we link each line to each of its parallel lines and we annotate that link with the similarity between
# the two lines. The similarity is a percentage, and we round it to integer values.
#
# If `n1` is similar to `n2`, then `n2` is similar to `n1`.
# In order to save space, we only add such links once.
#
# We can then use
# [`E.sim.b(node)`](https://annotation.github.io/text-fabric/tf/core/edgefeature.html#tf.core.edgefeature)
# to find all nodes that are parallel to a given node.

# In[21]:


metaData = {
    "": {
        "acronym": "dss",
        "description": "parallel lines in the DSS (computed)",
        "createdBy": "Dirk Roorda",
        "createdDate": "2022-09-29",
        "sourceCreatedDate": "2015",
        "sourceCreatedBy": "Martin G. Abegg, Jr., James E. Bowley, and Edward M. Cook",
        "convertedBy": "Jarod Jacobs, Martijn Naaijer and Dirk Roorda",
        "source": "Martin Abegg's data files, personal communication",
        "license": "Creative Commons Attribution-NonCommercial 4.0 International License",
        "licenseUrl": "http://creativecommons.org/licenses/by-nc/4.0/",
        "sourceDescription": "Dead Sea Scrolls: biblical and non-biblical scrolls",
    },
    "sim": {
        "valueType": "int",
        "edgeValues": True,
        "description": "similarity between lines, as a percentage of the common material wrt the combined material",
    },
}


# In[22]:


simData = {}
for ((f, t), d) in similarity.items():
    simData.setdefault(f, {})[t] = d


# In[23]:


TF.save(
    edgeFeatures=dict(sim=simData),
    metaData=metaData,
    module=module,
)


# # Turn the parallels feature into a module
#
# Here we show how to turn the new feature `sim` into a module, so that users can easily load it in a Jupyter notebook or in the TF browser.

# In[15]:


#%%bash
get_ipython().system('text-fabric-zip etcbc/dss/parallels/tf')


# In[16]:


get_ipython().system('text-fabric-zip etcbc/dss/tf')


# I have added this file to a new release of the DSS GitHub repo.

# # Use the parallels module
#
# We load the DSS corpus again, but now with the parallels module.

# In[25]:


A = use("ETCBC/dss:clone", checkout="clone", hoist=globals())


# Lo and behold: you see the parallels module listed with one feature: `sim`. It is in *italics*, which indicates
# that it is an edge feature.
#
# We just do a quick check here; in another notebook we study the parallels a bit more, using the feature `sim`.
#
# We count how many similar pairs there are, and how many 100% similar pairs there are.
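# As a quick low-level cross-check before we do the counting with search queries (a minimal sketch; it assumes
# the `similarity` dictionary computed above is still in memory):

# In[ ]:


nPairs = len(similarity)
nIdentical = sum(1 for v in similarity.values() if v == 100)
print(f"{nPairs} similar pairs, of which {nIdentical} are 100% similar")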
# In[31]:


query = """
line
-sim> line
"""
results = A.search(query)
refNode = results[20000][0]
refNode


# In[32]:


query = """
line
-sim=100> line
"""
results = A.search(query)


# Let's show a few of the pairs that are 100 percent similar.

# In[33]:


A.table(results, start=1, end=10, withNodes=True)


# There is also a lower level way to work with edge features.
#
# We can list all edges going out from a reference node.
# What we see is a tuple of pairs: the target node and the similarity between the reference node and that target node.

# In[34]:


E.sim.f(refNode)


# Likewise, we can observe the nodes that target the reference node:

# In[35]:


E.sim.t(refNode)


# Both sets of nodes are similar to the reference node, and it is inconvenient to have to use both `.f()` and `.t()`
# to get all the similar lines.
#
# But there is another way:

# In[36]:


E.sim.b(refNode)


# Let's make sure that `.b()` gives the combination of `.f()` and `.t()`.

# In[37]:


f = {x[0] for x in E.sim.f(refNode)}
b = {x[0] for x in E.sim.b(refNode)}
t = {x[0] for x in E.sim.t(refNode)}

# are f and t disjoint?
print(f"the intersection of f and t is {f & t}")

# is b the union of f and t?
print(f"t | f = b ? {f | t == b}")
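# As a last illustration (a small sketch, not part of the checks above), we can print the reference line together
# with its parallels, ordered by similarity, using `E.sim.b()` and `T.text()`:

# In[ ]:


print(f"reference: {T.text(refNode)}")
for (target, s) in sorted(E.sim.b(refNode), key=lambda x: -x[1]):
    print(f"{s:>3}% similar: {T.text(target)}")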