#!/usr/bin/env python
# coding: utf-8

# ---
#
# To get started: consult [start](start.ipynb)
#
# ---
#
# # Similar lines
#
# We spot the many similarities between lines in the corpus.
#
# There are ca 50,000 lines in the corpus, of which ca 35,000 have real content.
# Comparing all of them pairwise requires more than half a billion comparisons.
# That is a costly operation.
# [On this laptop it took 21 whole minutes](https://nbviewer.jupyter.org/github/etcbc/dss/blob/master/programs/parallels.ipynb).
#
# The good news is that we have stored the outcome in an extra feature.
#
# This feature is packaged in a TF data module
# that is loaded automatically with the DSS.
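# To see why precomputing pays off, here is a quick back-of-the-envelope check
# of the cost claim above (this cell is an addition, not part of the original
# computation): comparing $n$ lines pairwise takes $n(n-1)/2$ comparisons.

# In[ ]:

n = 35_000  # ca number of lines with real content
print(f"{n * (n - 1) // 2:,} pairwise comparisons")
# 612,482,500: well over half a billion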
# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[2]:

import collections

from tf.app import use

# In[3]:

A = use("etcbc/dss", hoist=globals())

# The new feature is **sim** and it is an edge feature.
# It annotates pairs of lines $(l, m)$ where $l$ and $m$ have similar content.
# The degree of similarity is a percentage (between 60 and 100), and this value
# is annotated onto the edges.
#
# Here is an example:

# In[4]:

allLines = F.otype.s("line")
nLines = len(allLines)
exampleLine = allLines[0]
sisters = E.sim.b(exampleLine)
print(f"{len(sisters)} similar lines")
print("\n".join(f"{s[0]} with similarity {s[1]}" for s in sisters[0:10]))
A.table(tuple((s[0],) for s in ((exampleLine,), *sisters)), end=10)

# # All similarities
#
# Let's first find out the range of similarities:

# In[5]:

minSim = None
maxSim = None
similarity = dict()

for ln in F.otype.s("line"):
    sisters = E.sim.f(ln)
    if not sisters:
        continue
    for (m, s) in sisters:
        similarity[(ln, m)] = s
    thisMin = min(s[1] for s in sisters)
    thisMax = max(s[1] for s in sisters)
    if minSim is None or thisMin < minSim:
        minSim = thisMin
    if maxSim is None or thisMax > maxSim:
        maxSim = thisMax

print(f"minimum similarity is {minSim:>3}")
print(f"maximum similarity is {maxSim:>3}")

# # The bottom lines
#
# We give a few examples of the least similar lines.
#
# We can use a search template to get the 60% lines.

# In[6]:

query = """
line
-sim=60> line
"""

# In words: find a line connected via a sim-edge with value 60 to another line.

# In[7]:

results = A.search(query)

# In[8]:

A.table(results, start=1, end=10, withPassage="1 2")

# Or in full layout:

# In[9]:

A.table(results, start=1, end=10, fmt="layout-orig-full", withPassage="1 2")

# # More research
#
# Let's find out which lines have the most correspondences.

# In[10]:

parallels = {}

for (ln, m) in similarity:
    parallels.setdefault(ln, set()).add(m)
    parallels.setdefault(m, set()).add(ln)

print(f"{len(parallels)} out of {nLines} lines have at least one similar line")

# In[11]:

rankedParallels = sorted(
    parallels.items(),
    key=lambda x: (-len(x[1]), x[0]),
)

# In[12]:

for (ln, paras) in rankedParallels[0:10]:
    print(
        f'{len(paras):>4} siblings of {ln} = {T.text(ln)} = {T.text(ln, fmt="text-source-full", descend=True)}'
    )

# In[13]:

for (ln, paras) in rankedParallels[100:110]:
    print(
        f'{len(paras):>4} siblings of {T.text(ln)} = {T.text(ln, fmt="text-source-full", descend=True)}'
    )

# In[14]:

for (ln, paras) in rankedParallels[500:510]:
    print(
        f'{len(paras):>4} siblings of {T.text(ln)} = {T.text(ln, fmt="text-source-full", descend=True)}'
    )

# And how many lines have just one correspondence?
#
# We look at the tail of `rankedParallels`.

# In[15]:

pairs = [(x, list(paras)[0]) for (x, paras) in rankedParallels if len(paras) == 1]
print(f"There are {len(pairs)} exclusively parallel pairs of lines")

# In[16]:

for (x, y) in pairs[0:10]:
    A.dm("---\n")
    # similarity only stores edges in the forward direction,
    # so look the pair up in both orders
    sim = similarity[(x, y)] if (x, y) in similarity else similarity[(y, x)]
    print(f"similarity {sim}")
    A.plain(x, fmt="layout-orig-full")
    A.plain(y, fmt="layout-orig-full")

# Why not make an overview of exactly how widespread parallel lines are?
#
# We count how many lines have how many parallels.

# In[17]:

parallelCount = collections.Counter()

buckets = (2, 10, 20, 50, 100)
bucketRep = {}
prevBucket = None
for bucket in buckets:
    if prevBucket is None:
        bucketRep[bucket] = f"      n <= {bucket:>3}"
    elif bucket == buckets[-1]:
        # everything bigger than the previous boundary ends up in this bucket,
        # including cluster sizes above the last boundary
        bucketRep[bucket] = f"{prevBucket:>3} < n       "
    else:
        bucketRep[bucket] = f"{prevBucket:>3} < n <= {bucket:>3}"
    prevBucket = bucket

for (ln, paras) in rankedParallels:
    clusterSize = len(paras) + 1
    if clusterSize > buckets[-1]:
        theBucket = buckets[-1]
    else:
        for bucket in buckets:
            if clusterSize <= bucket:
                theBucket = bucket
                break
    parallelCount[theBucket] += 1

for (bucket, amount) in sorted(
    parallelCount.items(),
    key=lambda x: (-x[0], x[1]),
):
    print(f"{amount:>4} lines have {bucketRep[bucket]} sisters")

# # Cluster the lines
#
# Before we zoom in on interesting groups, let's see if we can organize the similar lines into clusters.
# From now on we forget about the exact level of similarity and focus on whether two lines are just "similar",
# meaning that they have a high degree of similarity.

# In[18]:

SIMILARITY_THRESHOLD = 80  # sim values are percentages between 60 and 100
CLUSTER_THRESHOLD = 0.4


def makeClusters():
    # determine the domain: all lines involved in a similarity above the threshold
    domain = set()
    for ln in allLines:
        ms = E.sim.f(ln)
        added = False
        for (m, s) in ms:
            if s > SIMILARITY_THRESHOLD:
                domain.add(m)
                added = True
        if added:
            domain.add(ln)

    A.indent(reset=True)

    chunkSize = 1000
    b = 0
    j = 0
    clusters = []

    for ln in domain:
        j += 1
        b += 1
        if b == chunkSize:
            b = 0
            A.info(f"{j:>5} lines and {len(clusters):>5} clusters")
        lSisters = {x[0] for x in E.sim.b(ln) if x[1] > SIMILARITY_THRESHOLD}
        lAdded = False
        for cl in clusters:
            # join a cluster if the line is similar to enough of its members
            if len(cl & lSisters) > CLUSTER_THRESHOLD * len(cl):
                cl.add(ln)
                lAdded = True
                break
        if not lAdded:
            clusters.append({ln})

    A.info(f"{j:>5} lines and {len(clusters)} clusters")
    return clusters

# In[19]:

clusters = makeClusters()

# What is the distribution of the clusters, in terms of how many similar lines they contain?
# We count them.

# In[20]:

clusterSizes = collections.Counter()
for cl in clusters:
    clusterSizes[len(cl)] += 1

for (size, amount) in sorted(
    clusterSizes.items(),
    key=lambda x: (-x[0], x[1]),
):
    print(f"clusters of size {size:>4}: {amount:>5}")

# # Interesting groups
#
# Exercise: investigate some interesting groups that lie in some sweet spots
# (a sketch to get you started follows after the chapter list below):
#
# * the biggest clusters: more than 13 members
# * the medium clusters: between 4 and 13 members
# * the small clusters: between 2 and 4 members

# ---
#
# All chapters:
#
# * **[start](start.ipynb)** get started with this corpus and its tools
# * **[display](display.ipynb)** become an expert in creating pretty displays of your text structures
# * **[search](search.ipynb)** turbo charge your hand-coding with search templates
# * **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results
# * **[share](share.ipynb)** draw in other people's data and let them use yours
# * **similarLines** spot the similarities between lines
#
# ---
#
# See the [cookbook](cookbook) for recipes for small, concrete tasks.
#
# CC-BY Dirk Roorda
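# ---
#
# Appendix: a minimal sketch to get started on the "Interesting groups" exercise above.
# This cell is an addition, not part of the original notebook; it assumes `clusters`
# as produced by `makeClusters()` earlier, and it resolves the overlap in the stated
# ranges by putting clusters of exactly 4 members in the medium band.

# In[ ]:

bands = (
    ("big", lambda size: size > 13),
    ("medium", lambda size: 4 <= size <= 13),
    ("small", lambda size: 2 <= size < 4),
)

for (name, inBand) in bands:
    group = [cl for cl in clusters if inBand(len(cl))]
    print(f"{name:<6}: {len(group):>4} clusters")
    if group:
        # display the first few lines of one specimen cluster per band
        specimen = sorted(group[0])
        A.table(tuple((ln,) for ln in specimen[0:5]))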