#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:


import os
import pickle
import gzip

from tf.app import use
from tf.fabric import Fabric


# In[3]:


ghBase = os.path.expanduser("~/github")
org = "etcbc"
repo = "dss"
subdir = "parallels"
mainpath = f"{org}/{repo}/tf"
path = f"{org}/{repo}/{subdir}/tf"
location = f"{ghBase}/{path}"
mainlocation = f"{ghBase}/{mainpath}"
version = "1.6"
module = version
tempdir = f"{ghBase}/{org}/{repo}/_temp"


# In[4]:


TF = Fabric(locations=mainlocation, modules=module)


# In[5]:


api = TF.load("lex type")
docs = api.makeAvailableIn(globals())


# # Parallels
#
# We make edges between similar lines.
#
# When are lines similar?
#
# When a certain similarity metric is above a certain threshold.
#
# We choose this metric:
#
# * we reduce a line to the set of lexemes in it;
# * the similarity between two lines is the size of the intersection of their sets divided by the size of their union, times 100.

# # Preparation
#
# We pre-compute the lexeme sets of all lines.
#
# But because not all lines are filled with definite material, we exclude lines with fewer than 5 consonants.

# In[6]:


CONS = "cons"
valid = set()
allLines = F.otype.s("line")

TF.indent(reset=True)
for ln in F.otype.s("line"):
    if ln in valid:
        continue
    if sum(1 for s in L.d(ln, otype="sign") if F.type.v(s) == CONS) >= 5:
        valid.add(ln)
TF.info(f"{len(valid)} contentful lines out of {len(allLines)}")


# In[7]:


def makeSet(ln):
    lineSet = set()
    for s in L.d(ln, otype="word"):
        r = F.lex.v(s)
        if r:
            lineSet.add(r)
    return lineSet


# In[8]:


lines = {}

TF.indent(reset=True)
for ln in valid:
    lineSet = makeSet(ln)
    if lineSet:
        lines[ln] = lineSet
nLines = len(lines)
TF.info(f"{nLines} lines")


# # Measure

# In[9]:


def sim(lSet, mSet):
    return int(round(100 * len(lSet & mSet) / len(lSet | mSet)))


# # Compute all similarities
#
# We are going to perform more than half a billion comparisons, each of which is more than an elementary operation.
#
# Let's measure time.

# In[10]:


THRESHOLD = 60


def computeSim(limit=None):
    similarity = {}
    lineNodes = sorted(lines.keys())
    nLines = len(lineNodes)
    nComparisons = nLines * (nLines - 1) // 2
    print(f"{nComparisons} comparisons to make")

    chunkSize = nComparisons // 1000
    co = 0
    b = 0
    si = 0
    p = 0
    TF.indent(reset=True)
    stop = False
    for i in range(nLines):
        nodeI = lineNodes[i]
        lineI = lines[nodeI]
        for j in range(i + 1, nLines):
            nodeJ = lineNodes[j]
            lineJ = lines[nodeJ]
            s = sim(lineI, lineJ)
            co += 1
            b += 1
            if b == chunkSize:
                p += 1
                TF.info(f"{p:>3}‰ - {co:>12} comparisons and {si:>10} similarities")
                b = 0
                if limit is not None and p >= limit:
                    stop = True
                    break
            if s < THRESHOLD:
                continue
            similarity[(nodeI, nodeJ)] = s
            si += 1
        if stop:
            break

    TF.info(f"{p:>3}‰ - {co:>12} comparisons and {si:>10} similarities")
    return similarity


# We first run it for a few ‰ of the comparisons and then do some checks.

# In[11]:


similarity = computeSim(limit=3)


# We check the sanity of the results.

# In[12]:


print(min(similarity.values()))
print(max(similarity.values()))


# In[13]:


eq = [x for x in similarity.items() if x[1] >= 100]
neq = [x for x in similarity.items() if x[1] <= 70]


# In[14]:


print(len(eq))
print(len(neq))


# In[15]:


print(eq[0])
print(neq[0])


# In[16]:


print(T.text(eq[0][0][0]))
print(T.text(eq[0][0][1]))


# Looks good.
#
# Now the whole computation.
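# Before kicking off the full run, we can get a rough sense of how long it will take by timing a small
# sample and extrapolating (a minimal sketch, not part of the pipeline; `limit` counts ‰ of all comparisons,
# so the full run takes roughly 1000 / limit times as long as the sample):

# In[ ]:


import time

t0 = time.perf_counter()
computeSim(limit=3)  # redo 3‰ of the comparisons, just to time them
sample = time.perf_counter() - t0
print(f"sample took {sample:.1f}s; full run estimated at {sample * 1000 / 3 / 60:.0f} minutes")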
# But if we have done this before, and nothing has changed, we load previous results from disk.
#
# If we do not find previous results, we compute them and save the results to disk.

# In[17]:


PARA_DIR = f"{tempdir}/parallels"


def writeResults(data, location, name):
    if not os.path.exists(location):
        os.makedirs(location, exist_ok=True)
    path = f"{location}/{name}"
    with gzip.open(path, "wb") as f:
        pickle.dump(data, f)
    TF.info(f"Data written to {path}")


def readResults(location, name):
    TF.indent(reset=True)
    path = f"{location}/{name}"
    if not os.path.exists(path):
        print(f"File not found: {path}")
        return None
    with gzip.open(path, "rb") as f:
        data = pickle.load(f)
    TF.info(f"Data read from {path}")
    return data


# In[18]:


similarity = readResults(PARA_DIR, f"sim-{version}.zip")
if not similarity:
    similarity = computeSim()
    writeResults(similarity, PARA_DIR, f"sim-{version}.zip")


# In[19]:


len(similarity)


# So, just over 50,000 pairs of similar lines.

# # Add parallels to the TF dataset
#
# We can add this information to the DSS dataset as an *edge feature*.
#
# An edge feature links two nodes and may annotate that link with a value.
#
# For parallels, we link each line to each of its parallel lines and we annotate that link with the similarity between
# the two lines. The similarity is a percentage, and we round it to integer values.
#
# If `n1` is similar to `n2`, then `n2` is similar to `n1`.
# In order to save space, we only add such links once.
#
# We can then use
# [`E.sim.b(node)`](https://annotation.github.io/text-fabric/tf/core/edgefeature.html#tf.core.edgefeature)
# to find all nodes that are parallel to a given node.

# In[21]:


metaData = {
    "": {
        "acronym": "dss",
        "description": "parallel lines in the DSS (computed)",
        "createdBy": "Dirk Roorda",
        "createdDate": "2022-09-29",
        "sourceCreatedDate": "2015",
        "sourceCreatedBy": "Martin G. Abegg, Jr., James E. Bowley, and Edward M. Cook",
        "convertedBy": "Jarod Jacobs, Martijn Naaijer and Dirk Roorda",
        "source": "Martin Abegg's data files, personal communication",
        "license": "Creative Commons Attribution-NonCommercial 4.0 International License",
        "licenseUrl": "http://creativecommons.org/licenses/by-nc/4.0/",
        "sourceDescription": "Dead Sea Scrolls: biblical and non-biblical scrolls",
    },
    "sim": {
        "valueType": "int",
        "edgeValues": True,
        "description": "similarity between lines, as a percentage of the common material wrt the combined material",
    },
}


# In[22]:


simData = {}
for ((f, t), d) in similarity.items():
    simData.setdefault(f, {})[t] = d


# In[23]:


TF.save(
    edgeFeatures=dict(sim=simData),
    metaData=metaData,
    module=module,
)


# # Turn the parallels feature into a module
#
# Here we show how to turn the new feature `sim` into a module, so that users can easily load it in a Jupyter notebook or in the TF browser.

# In[15]:


#%%bash
get_ipython().system('text-fabric-zip etcbc/dss/parallels/tf')


# In[16]:


get_ipython().system('text-fabric-zip etcbc/dss/tf')


# I have added this file to a new release of the DSS GitHub repo.

# # Use the parallels module
#
# We load the DSS corpus again, but now with the parallels module.

# In[25]:


A = use("ETCBC/dss:clone", checkout="clone", hoist=globals())


# Lo and behold: you see the parallels module listed with one feature: `sim`. It is in *italics*, which indicates
# that it is an edge feature.
#
# We just do a quick check here; in another notebook we study the parallels a bit more, using the feature `sim`.
#
# We count how many similar pairs there are, and how many 100% similar pairs there are.
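# As a quick low-level cross-check before we do the counting with search queries (a minimal sketch; it assumes
# the `similarity` dictionary computed above is still in memory):

# In[ ]:


nPairs = len(similarity)
nIdentical = sum(1 for v in similarity.values() if v == 100)
print(f"{nPairs} similar pairs, of which {nIdentical} are 100% similar")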
# In[31]:


query = """
line
-sim> line
"""
results = A.search(query)
refNode = results[20000][0]
refNode


# In[32]:


query = """
line
-sim=100> line
"""
results = A.search(query)


# Let's show a few of the pairs that are 100 percent similar.

# In[33]:


A.table(results, start=1, end=10, withNodes=True)


# There is also a lower level way to work with edge features.
#
# We can list all edges going out from a reference node.
# What we see is a tuple of pairs: the target node and the similarity between the reference node and that target node.

# In[34]:


E.sim.f(refNode)


# Likewise, we can observe the nodes that target the reference node:

# In[35]:


E.sim.t(refNode)


# Both sets of nodes are similar to the reference node, and it is inconvenient to have to use both `.f()` and `.t()`
# to get all the similar lines.
#
# But there is another way:

# In[36]:


E.sim.b(refNode)


# Let's make sure that `.b()` gives the combination of `.f()` and `.t()`.

# In[37]:


f = {x[0] for x in E.sim.f(refNode)}
b = {x[0] for x in E.sim.b(refNode)}
t = {x[0] for x in E.sim.t(refNode)}

# are f and t disjoint?
print(f"the intersection of f and t is {f & t}")

# is b the union of f and t?
print(f"t | f = b ? {f | t == b}")
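# As a last illustration (a small sketch, not part of the checks above), we can print the reference line together
# with its parallels, ordered by similarity, using `E.sim.b()` and `T.text()`:

# In[ ]:


print(f"reference: {T.text(refNode)}")
for (target, s) in sorted(E.sim.b(refNode), key=lambda x: -x[1]):
    print(f"{s:>3}% similar: {T.text(target)}")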