#!/usr/bin/env python
# coding: utf-8
#
#
#
#
# You might want to consider the [start](search.ipynb) of this tutorial.
#
# Short introductions to other TF datasets:
#
# * [Dead Sea Scrolls](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/dss.ipynb),
# * [Old Babylonian Letters](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/oldbabylonian.ipynb),
# or the
# * [Quran](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/quran.ipynb)
#
# # Annotation outside TF
#
# Task:
#
# * prepare a text file based on TF data.
# * annotate the text file by assigning values to pieces of text
# * generate TF features based on these annotations
#
# We use a device in Text-Fabric that has been developed for this kind of round-trip:
# the [Recorder](https://annotation.github.io/text-fabric/tf/convert/recorder.html).
# In[1]:
# Notebook-only convenience: reload edited local modules automatically
# (mode '2' reloads all modules on every cell execution).
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# # Incantation
#
# The ins and outs of installing Text-Fabric, getting the corpus, and initializing a notebook are
# explained in the [start tutorial](start.ipynb).
# In[2]:
from tf.app import use
from tf.convert.recorder import Recorder
# In[3]:
A = use("ETCBC/bhsa", hoist=globals())
# We work with Genesis 1 (in fact, only the first 10 clauses).
# In[4]:
gen1 = T.nodeFromSection(("Genesis", 1))
# We prepare our portion of text for annotation outside TF.
#
# We need to produce a text file and remember the positions of the relevant
# nodes in that text file.
# The Recorder is a new thing in TF (in development) that lets you create a string from nodes,
# where the positions of the nodes in that string are remembered.
# You may add all kinds of material in between the texts of the nodes.
# And it is up to you how you represent the nodes.
# We start a recorder.
# In[5]:
rec = Recorder()
# We can add strings to the recorder, and we can tell nodes to start and to stop.
#
# We add clause atoms and phrase atoms to the recorder.
# In[6]:
LIMIT = 10

for (seq, clauseAtom) in enumerate(L.d(gen1, otype="clause_atom")):
    # restrict ourselves to the first LIMIT clause atoms
    if seq >= LIMIT:
        break

    # put a "book chapter:verse@index" label in front of each clause atom;
    # this text is added before the node starts, so it belongs to no node
    (book, chapter, verse) = T.sectionFromNode(clauseAtom)
    rec.add(f"{book} {chapter}:{verse}@{seq} ")

    # between start() and end() every added string is credited to this node
    rec.start(clauseAtom)
    for phraseAtom in L.d(clauseAtom, otype="phrase_atom"):
        rec.start(phraseAtom)
        # this text counts for both the current clause atom and phrase atom
        rec.add(T.text(phraseAtom, fmt="text-trans-plain"))
        rec.end(phraseAtom)
    rec.end(clauseAtom)

    # every clause atom on its own line;
    # the newline character does not belong to any node
    rec.add("\n")
# We can print the recorded text.
# In[7]:
print(rec.text())
# We can print the recorded node positions.
# Each entry i shows the nodes active at character position i;
# positions without active nodes are suppressed by the `if p` filter.
# In[8]:
print("\n".join(f"pos {i}: {p}" for (i, p) in enumerate(rec.positions()) if p))
# We can write the recorded text and the positions to two files:
# the text goes to gen1.txt, the positions to gen1.txt.pos.
# In[9]:
rec.write("data/gen1.txt")
# In[10]:
get_ipython().system('head -n 10 data/gen1.txt')
# In[11]:
get_ipython().system('head -n 30 data/gen1.txt.pos')
# Now we produce a (fake) annotation file, based on the text.
#
# The file is tab delimited, the columns are:
#
# * start character position
# * end character position
# * feature 1 value
# * feature 2 value
# * etc
# We annotate as follows:
#
# * every word that starts with a `B` gets `bword=1`
# * every word that ends with a `T` gets `tword=1`
#
# Then we want every phrase with a b-word to get `bword=1` and likewise
# every clause with a b-word to get `bword=1`,
# and the same for `tword`.
# In[12]:
def annotate(fileName):
    """Annotate a recorded text file with (fake) bword/tword features.

    Reads the plain-text file `fileName` (as written by the Recorder),
    walks over its words while tracking character positions, and writes a
    tab-separated annotation file `fileName + ".ann"` with columns:

    * start character position
    * end character position (inclusive)
    * `bword` value: 1 if the word starts with `B`, else empty
    * `tword` value: 1 if the word ends with `T`, else empty

    The first two space-separated tokens of each line form the
    "book chapter:verse@index" label; they are skipped, but their length
    still advances the character position.
    """
    annotations = {}
    with open(fileName) as fh:
        pos = 0
        for line in fh:
            words = line.split(" ")
            # skip the two label tokens, but advance the position past them;
            # strip a possible trailing newline first, so that lines with
            # fewer than three tokens do not advance the position one
            # character too far (the newline is counted by the +1 separator)
            for word in words[0:2]:
                lWord = len(word.rstrip("\n"))
                pos += lWord + 1
            for word in words[2:]:
                word = word.rstrip("\n")
                lWord = len(word)
                start = pos
                end = pos + lWord - 1
                # +1 accounts for the separator (a space, or the newline
                # after the last word on the line)
                pos += lWord + 1
                if lWord:
                    if word[0] == "B":
                        annotations.setdefault((start, end), {})["bword"] = 1
                    if word[-1] == "T":
                        annotations.setdefault((start, end), {})["tword"] = 1
    with open(f"{fileName}.ann", "w") as fh:
        fh.write("start\tend\tbword\ttword\n")
        for ((start, end), features) in annotations.items():
            row = "\t".join(
                str(a)
                for a in (
                    start,
                    end,
                    features.get("bword", ""),
                    features.get("tword", ""),
                )
            )
            fh.write(f"{row}\n")
# In[13]:
annotate("data/gen1.txt")
# Here is the annotation file.
# In[14]:
get_ipython().system('cat data/gen1.txt.ann')
# Now we want to feed back these annotations as TF features on `phrase_atom` and `clause_atom` nodes.
#
# Our recorder knows how to do that: makeFeatures() reads the annotation
# file and maps each (start, end) character range back to the nodes that
# were recorded at those positions.
# In[15]:
features = rec.makeFeatures("data/gen1.txt.ann")
# Let's see.
# In[16]:
features["bword"]
# In[17]:
features["tword"]
# Let's check: show type, node number, and text of every annotated node.
# In[18]:
for feat in ("bword", "tword"):
    for n in features[feat]:
        print(f'{feat} {F.otype.v(n)} {n}: {T.text(n, fmt="text-trans-plain")}')
# What if we want to transform the annotations to word features instead to features on phrase and clause atoms?
#
# Then we should record the text differently.
#
# We only add slots to the mix.
# In[19]:
rec = Recorder()

LIMIT = 10
for (seq, clauseAtom) in enumerate(L.d(gen1, otype="clause_atom")):
    # restrict ourselves to the first LIMIT clause atoms
    if seq >= LIMIT:
        break

    # same "book chapter:verse@index" label as before
    (book, chapter, verse) = T.sectionFromNode(clauseAtom)
    rec.add(f"{book} {chapter}:{verse}@{seq} ")

    # this time we record word nodes (slots) instead of
    # clause atoms and phrase atoms
    for word in L.d(clauseAtom, otype="word"):
        rec.start(word)
        rec.add(T.text(word, fmt="text-trans-plain"))
        rec.end(word)

    rec.add("\n")
# It gives the same text:
# In[20]:
print(rec.text())
# but the node positions are different:
# each character position now points to a word node instead of
# clause_atom/phrase_atom nodes.
# In[21]:
print("\n".join(f"pos {i}: {p}" for (i, p) in enumerate(rec.positions()) if p))
# We have produced the same text,
# so we can use the earlier annotation file to create word features.
# In[22]:
features = rec.makeFeatures("data/gen1.txt.ann")
# In[23]:
features["bword"]
# In[24]:
features["tword"]
# Let's check: show type, node number, and text of every annotated node.
# In[25]:
for feat in ("bword", "tword"):
    for n in features[feat]:
        print(f'{feat} {F.otype.v(n)} {n}: {T.text(n, fmt="text-trans-plain")}')
# ## Explanation:
#
# The annotator just looked at the string `BR>CJT` without knowing that it is two words.
# In[26]:
get_ipython().system('cat data/gen1.txt.ann')
# So it has annotated pos 14-19 as a `bword` and as a `tword`.
#
# But TF knows that 14-19 are slots 1 and 2, so when the annotations are applied,
# slots 1 and 2 both get `bword=1` and `tword=1`.
#
# We can remedy the situation by producing another text for the annotator, one where
# slots are always separated by a space.
#
# Let's do that by always adding a space, so real words are separated by two spaces.
# In[27]:
rec = Recorder()

LIMIT = 10
for (seq, clauseAtom) in enumerate(L.d(gen1, otype="clause_atom")):
    # restrict ourselves to the first LIMIT clause atoms
    if seq >= LIMIT:
        break

    # same "book chapter:verse@index" label as before
    (book, chapter, verse) = T.sectionFromNode(clauseAtom)
    rec.add(f"{book} {chapter}:{verse}@{seq} ")

    for word in L.d(clauseAtom, otype="word"):
        rec.start(word)
        # append an extra space so that adjacent slots never glue together
        rec.add(T.text(word, fmt="text-trans-plain") + " ")
        rec.end(word)

    rec.add("\n")
# Here is the text
# In[28]:
print(rec.text())
# We write the text to file.
# In[29]:
rec.write("data/gen1wx.txt")
# We run our annotator again, because we have a different text:
# the extra spaces shift all character positions.
# In[30]:
annotate("data/gen1wx.txt")
# Here is the new annotation file.
# In[31]:
get_ipython().system('cat data/gen1wx.txt.ann')
# The features are no surprise:
# In[32]:
features = rec.makeFeatures("data/gen1wx.txt.ann")
# In[33]:
features["bword"]
# In[34]:
features["tword"]
# Let's check: show type, node number, and text of every annotated node.
# In[35]:
for feat in ("bword", "tword"):
    for n in features[feat]:
        print(f'{feat} {F.otype.v(n)} {n}: {T.text(n, fmt="text-trans-plain")}')
# # All steps
#
# * **[start](start.ipynb)** your first step in mastering the bible computationally
# * **[display](display.ipynb)** become an expert in creating pretty displays of your text structures
# * **[search](search.ipynb)** turbo charge your hand-coding with search templates
# * **[export Excel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results
# * **[share](share.ipynb)** draw in other people's data and let them use yours
# * **[export](export.ipynb)** export your dataset as an Emdros database
# * **annotate** annotate plain text by means of other tools and import the annotations as TF features
# * **[volumes](volumes.ipynb)** work with selected books only
# * **[trees](trees.ipynb)** work with the BHSA data as syntax trees
#
# CC-BY Dirk Roorda