#!/usr/bin/env python
# coding: utf-8
#
#
#
#
#
# ---
#
# To get started: consult [start](start.ipynb)
#
# ---
#
# # Similar lines
#
# We spot the many similarities between lines in the corpus.
#
# There are ca 50,000 lines in the corpus of which ca 35,000 with real content.
# To compare these requires more than half a billion comparisons.
# That is a costly operation.
# [On this laptop it took 21 whole minutes](https://nbviewer.jupyter.org/github/etcbc/dss/blob/master/programs/parallels.ipynb).
#
# The good news is that we have stored the outcome in an extra feature.
#
# This feature is packaged in a TF data module,
# that will be loaded automatically with the DSS.
# In[1]:
# Notebook convenience: re-import edited modules automatically before each run.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# In[2]:
import collections
from tf.app import use
# In[3]:
A = use("etcbc/dss", hoist=globals())
# The new feature is **sim** and it is an edge feature.
# It annotates pairs of lines $(l, m)$ where $l$ and $m$ have similar content.
# The degree of similarity is a percentage (between 60 and 100), and this value
# is annotated onto the edges.
#
# Here is an example:
# In[4]:
# All line nodes in the corpus; the first one serves as a worked example.
allLines = F.otype.s("line")
nLines = len(allLines)
exampleLine = allLines[0]

# Every line similar to the example, as (line, similarity) pairs.
sisters = E.sim.b(exampleLine)
print(f"{len(sisters)} similar lines")
print("\n".join(f"{s[0]} with similarity {s[1]}" for s in sisters[:10]))

# Display the example line itself, followed by its sisters.
rows = [(exampleLine,)]
rows.extend((pair[0],) for pair in sisters)
A.table(tuple(rows), end=10)
# # All similarities
#
# Let's first find out the range of similarities:
# In[5]:
# One pass over all sim edges: record every (line, otherLine) -> similarity
# and keep track of the global extremes along the way.
minSim = None
maxSim = None
similarity = dict()
for ln in F.otype.s("line"):
    edges = E.sim.f(ln)
    if not edges:
        continue
    for (m, s) in edges:
        similarity[(ln, m)] = s
        if minSim is None or s < minSim:
            minSim = s
        if maxSim is None or s > maxSim:
            maxSim = s
print(f"minimum similarity is {minSim:>3}")
print(f"maximum similarity is {maxSim:>3}")
# # The bottom lines
#
# We give a few examples of the least similar lines.
#
# We can use a search template to get the 60% lines.
# In[6]:
# Search template: pairs of lines joined by a sim-edge with value 60,
# the lowest similarity occurring in the corpus.
query = """
line
-sim=60> line
"""
# In words: find a line connected via a sim-edge with value 60 to another line.
# In[7]:
# Run the template; one result tuple per match (here: pairs of lines).
results = A.search(query)
# In[8]:
A.table(results, start=1, end=10, withPassage="1 2")
# Or in full layout:
# In[9]:
A.table(results, start=1, end=10, fmt="layout-orig-full", withPassage="1 2")
# # More research
#
# Let's find out which lines have the most correspondences.
# In[10]:
# Map each line to the set of its partners; sim edges count in both directions.
parallels = {}
for (a, b) in similarity:
    parallels.setdefault(a, set()).add(b)
    parallels.setdefault(b, set()).add(a)
print(f"{len(parallels)} out of {nLines} lines have at least one similar line")
# In[11]:
# Rank by number of partners (most first), ties broken by node number.
rankedParallels = sorted(
    parallels.items(), key=lambda item: (-len(item[1]), item[0])
)
# In[12]:
# The ten best-connected lines: node number plus two transcriptions.
for ln, paras in rankedParallels[:10]:
    print(
        f'{len(paras):>4} siblings of {ln} = {T.text(ln)} = {T.text(ln, fmt="text-source-full", descend=True)}'
    )
# In[13]:
# A slice from the middle of the ranking.
for ln, paras in rankedParallels[100:110]:
    print(
        f'{len(paras):>4} siblings of {T.text(ln)} = {T.text(ln, fmt="text-source-full", descend=True)}'
    )
# In[14]:
# And one from further down the tail.
for ln, paras in rankedParallels[500:510]:
    print(
        f'{len(paras):>4} siblings of {T.text(ln)} = {T.text(ln, fmt="text-source-full", descend=True)}'
    )
# And how many lines have just one correspondence?
#
# We look at the tail of `rankedParallels`.
# In[15]:
# Lines with exactly one partner form exclusively parallel pairs.
pairs = [(ln, next(iter(ps))) for (ln, ps) in rankedParallels if len(ps) == 1]
print(f"There are {len(pairs)} exclusively parallel pairs of lines")
# In[16]:
# Display the first ten exclusive pairs in full layout.
for (x, y) in pairs[:10]:
    A.dm("---\n")
    print(f"similarity {similarity[(x,y)]}")
    for member in (x, y):
        A.plain(member, fmt="layout-orig-full")
# Why not make an overview of exactly how wide-spread parallel lines are?
#
# We count how many lines have how many parallels.
# In[17]:
parallelCount = collections.Counter()
buckets = (2, 10, 20, 50, 100)
bucketRep = {}
prevBucket = None
for bucket in buckets:
if prevBucket is None:
bucketRep[bucket] = f" n <= {bucket:>3}"
elif bucket == buckets[-1]:
bucketRep[bucket] = f" n > {bucket:>3}"
else:
bucketRep[bucket] = f"{prevBucket:>3} < n <= {bucket:>3}"
prevBucket = bucket
for (ln, paras) in rankedParallels:
clusterSize = len(paras) + 1
if clusterSize > buckets[-1]:
theBucket = buckets[-1]
else:
for bucket in buckets:
if clusterSize <= bucket:
theBucket = bucket
break
parallelCount[theBucket] += 1
for (bucket, amount) in sorted(
parallelCount.items(),
key=lambda x: (-x[0], x[1]),
):
print(f"{amount:>4} lines have {bucketRep[bucket]} sisters")
# # Cluster the lines
#
# Before we try to find them, let's see if we can cluster the similar lines in similar clusters.
# From now on we forget about the level of similarity, and focus on whether two lines are just "similar", meaning that they have
# a high degree of similarity.
# In[18]:
# Similarities in this corpus are integer percentages between 60 and 100
# (see the min/max scan above and the sim=60 query), so the cut-off for
# "really similar" must be on that scale. BUG FIX: the old value 0.8 was
# below every possible similarity, making the filter a no-op.
SIMILARITY_THRESHOLD = 80
# A line joins an existing cluster when it is similar to more than this
# fraction of the cluster's members.
CLUSTER_THRESHOLD = 0.4
def makeClusters():
    """Greedily cluster lines that have sufficiently similar partners.

    A line enters the first existing cluster with which it shares more than
    CLUSTER_THRESHOLD of that cluster's members (measured via sim edges
    stronger than SIMILARITY_THRESHOLD); otherwise it starts a new cluster.

    Returns:
        list of set: each set holds the line nodes of one cluster.
    """
    # Determine the domain: every line on either end of a sim edge that
    # exceeds the threshold.
    # BUG FIX: the original added the similarity value `s` to the domain
    # instead of the line `ln`, and consulted an `added` flag that was
    # never initialized (NameError on the first weak edge) and never
    # reset, so every later `m` was added regardless of its similarity.
    domain = set()
    for ln in allLines:
        for (m, s) in E.sim.f(ln):
            if s > SIMILARITY_THRESHOLD:
                domain.add(ln)
                domain.add(m)

    A.indent(reset=True)
    chunkSize = 1000  # progress message cadence
    b = 0
    j = 0
    clusters = []
    for ln in domain:
        j += 1
        b += 1
        if b == chunkSize:
            b = 0
            A.info(f"{j:>5} lines and {len(clusters):>5} clusters")
        # The strong sisters of this line.
        lSisters = {x[0] for x in E.sim.b(ln) if x[1] > SIMILARITY_THRESHOLD}
        lAdded = False
        for cl in clusters:
            # Join the first cluster that overlaps enough with the sisters.
            if len(cl & lSisters) > CLUSTER_THRESHOLD * len(cl):
                cl.add(ln)
                lAdded = True
                break
        if not lAdded:
            clusters.append({ln})
    A.info(f"{j:>5} lines and {len(clusters)} clusters")
    return clusters
# In[19]:
clusters = makeClusters()
# What is the distribution of the clusters, in terms of how many similar lines they contain?
# We count them.
# In[20]:
clusterSizes = collections.Counter(len(cl) for cl in clusters)
for (size, amount) in sorted(
    clusterSizes.items(), key=lambda item: (-item[0], item[1])
):
    print(f"clusters of size {size:>4}: {amount:>5}")
# # Interesting groups
#
# Exercise: investigate some interesting groups, that lie in some sweet spots.
#
# * the biggest clusters: more than 13 members
# * the medium clusters: between 4 and 13 members
# * the small clusters: between 2 and 4 members
# ---
#
# All chapters:
#
# * **[start](start.ipynb)** become an expert in creating pretty displays of your text structures
# * **[display](display.ipynb)** become an expert in creating pretty displays of your text structures
# * **[search](search.ipynb)** turbo charge your hand-coding with search templates
# * **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results
# * **[share](share.ipynb)** draw in other people's data and let them use yours
# * **similar Lines** spot the similarities between lines
#
# ---
#
# See the [cookbook](cookbook) for recipes for small, concrete tasks.
#
# CC-BY Dirk Roorda