#!/usr/bin/env python # coding: utf-8 # # # # # You might want to consider the [start](search.ipynb) of this tutorial. # # Short introductions to other TF datasets: # # * [Dead Sea Scrolls](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/dss.ipynb), # * [Old Babylonian Letters](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/oldbabylonian.ipynb), # or the # * [Quran](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/quran.ipynb) # # # Annotation outside TF # # Task: # # * prepare a text file based on TF data. # * annotate the text file by assigning values to pieces of text # * generate TF features based on these annotations # # We use a device in Text-Fabric that has been developed for this kind of round-trip: # the [Recorder](https://annotation.github.io/text-fabric/tf/convert/recorder.html). # In[1]: get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') # # Incantation # # The ins and outs of installing Text-Fabric, getting the corpus, and initializing a notebook are # explained in the [start tutorial](start.ipynb). # In[2]: from tf.app import use from tf.convert.recorder import Recorder # In[3]: A = use("ETCBC/bhsa", hoist=globals()) # We work with Genesis 1 (in fact, only the first 10 clauses). # In[4]: gen1 = T.nodeFromSection(("Genesis", 1)) # We prepare our portion of text for annotation outside TF. # # What needs to happen is, that we produce a text file and that we remember the positions of the relevant # nodes in that text file. # The Recorder is a new thing in TF (in development) that lets you create a string from nodes, # where the positions of the nodes in that string are remembered. # You may add all kinds of material in between the texts of the nodes. # And it is up to you how you represent the nodes. # We start a recorder. 
# In[5]:

rec = Recorder()

# We can add strings to the recorder, and we can tell nodes to start and to stop.
#
# We add clause atoms and phrase atoms to the recorder.

# In[6]:

LIMIT = 10  # restrict the demo to the first ten clause atoms

for (seq, clauseAtom) in enumerate(L.d(gen1, otype="clause_atom")):
    if seq >= LIMIT:
        # only the first ten clause atoms
        break

    # prefix every clause atom with a section label
    (book, chapter, verse) = T.sectionFromNode(clauseAtom)
    label = f"{book} {chapter}:{verse}"
    rec.add(f"{label}@{seq} ")

    # open the clause atom node: until the matching end(), all added text
    # is recorded as material of this clause atom
    rec.start(clauseAtom)

    for phraseAtom in L.d(clauseAtom, otype="phrase_atom"):
        # open the phrase atom node: added text now also counts for it
        rec.start(phraseAtom)
        rec.add(T.text(phraseAtom, fmt="text-trans-plain"))
        rec.end(phraseAtom)

    rec.end(clauseAtom)

    # every clause atom on its own line;
    # this newline does not belong to any node
    rec.add("\n")

# We can print the recorded text.

# In[7]:

print(rec.text())

# We can print the recorded node positions.

# In[8]:

print("\n".join(f"pos {charPos}: {nodes}" for (charPos, nodes) in enumerate(rec.positions()) if nodes))

# We can write the recorded text and the positions to two files:

# In[9]:

rec.write("data/gen1.txt")

# In[10]:

get_ipython().system('head -n 10 data/gen1.txt')

# In[11]:

get_ipython().system('head -n 30 data/gen1.txt.pos')

# Now we produce a (fake) annotation file, based on the text.
#
# The file is tab delimited, the columns are:
#
# * start character position
# * end character position
# * feature 1 value
# * feature 2 value
# * etc

# We annotate as follows:
#
# * every word that starts with a `B` gets `bword=1`
# * every word that ends with a `T` gets `tword=1`
#
# Then we want every phrase with a b-word to get `bword=1` and likewise
# every clause with a b-word to get `bword=1`,
# and the same for `tword`.
# In[12]:

def annotate(fileName):
    """Mock annotator: scan fileName and emit character-span annotations.

    Writes fileName.ann, a tab-delimited file with header
    start / end / bword / tword, one row per annotated span:

    * a token starting with `B` gets bword=1
    * a token ending with `T` gets tword=1

    The first two space-separated tokens of every line form the section
    label; they are skipped, but their widths still advance the position.
    """
    annotations = {}
    pos = 0
    with open(fileName) as fh:
        for line in fh:
            tokens = line.split(" ")
            # step over the two label tokens (+1 each for the separating space)
            for token in tokens[0:2]:
                pos += len(token) + 1
            for token in tokens[2:]:
                token = token.rstrip("\n")
                width = len(token)
                span = (pos, pos + width - 1)
                # +1 covers the following space (or the final newline)
                pos += width + 1
                if not width:
                    continue
                if token[0] == "B":
                    annotations.setdefault(span, {})["bword"] = 1
                if token[-1] == "T":
                    annotations.setdefault(span, {})["tword"] = 1
    with open(f"{fileName}.ann", "w") as fh:
        fh.write("start\tend\tbword\ttword\n")
        for ((start, end), features) in annotations.items():
            bword = features.get("bword", "")
            tword = features.get("tword", "")
            fh.write(f"{start}\t{end}\t{bword}\t{tword}\n")

# In[13]:

annotate("data/gen1.txt")

# Here is the annotation file.

# In[14]:

get_ipython().system('cat data/gen1.txt.ann')

# Now we want to feed back these annotations as TF features on `phrase_atom` and `clause_atom` nodes.
#
# Our recorder knows how to do that.

# In[15]:

features = rec.makeFeatures("data/gen1.txt.ann")

# Let's see.

# In[16]:

features["bword"]

# In[17]:

features["tword"]

# Let's check:

# In[18]:

for feat in ("bword", "tword"):
    for node in features[feat]:
        print(f'{feat} {F.otype.v(node)} {node}: {T.text(node, fmt="text-trans-plain")}')

# What if we want to transform the annotations to word features instead of features on phrase and clause atoms?
#
# Then we should record the text differently.
#
# We only add slots to the mix.
# In[19]:

rec = Recorder()

LIMIT = 10

for (seq, clauseAtom) in enumerate(L.d(gen1, otype="clause_atom")):
    if seq >= LIMIT:
        break
    (book, chapter, verse) = T.sectionFromNode(clauseAtom)
    rec.add(f"{book} {chapter}:{verse}@{seq} ")
    # this time we record word (slot) nodes only, no phrase/clause atoms
    for word in L.d(clauseAtom, otype="word"):
        rec.start(word)
        rec.add(T.text(word, fmt="text-trans-plain"))
        rec.end(word)
    rec.add("\n")

# It gives the same text:

# In[20]:

print(rec.text())

# but the node positions are different:

# In[21]:

print("\n".join(f"pos {charPos}: {nodes}" for (charPos, nodes) in enumerate(rec.positions()) if nodes))

# We have produced the same text,
# so we can use the earlier annotation file to create word features.

# In[22]:

features = rec.makeFeatures("data/gen1.txt.ann")

# In[23]:

features["bword"]

# In[24]:

features["tword"]

# Let's check:

# In[25]:

for feat in ("bword", "tword"):
    for node in features[feat]:
        print(f'{feat} {F.otype.v(node)} {node}: {T.text(node, fmt="text-trans-plain")}')

# ## Explanation:
#
# The annotator just looked at the string `BR>CJT` without knowing that it is two words.

# In[26]:

get_ipython().system('cat data/gen1.txt.ann')

# So it has annotated pos 14-19 as a `bword` and as a `tword`.
#
# But TF knows that 14-19 are slots 1 and 2, so when the annotations are applied,
# slots 1 and 2 are both set to `bwords` and `twords`.
#
# We can remedy the situation by producing another text for the annotator, one where
# slots are always separated by a space.
#
# Let's do that by always adding a space, so real words are separated by two spaces.

# In[27]:

rec = Recorder()

LIMIT = 10

for (seq, clauseAtom) in enumerate(L.d(gen1, otype="clause_atom")):
    if seq >= LIMIT:
        break
    (book, chapter, verse) = T.sectionFromNode(clauseAtom)
    rec.add(f"{book} {chapter}:{verse}@{seq} ")
    for word in L.d(clauseAtom, otype="word"):
        rec.start(word)
        # a trailing space per slot, so adjacent slots never touch
        rec.add(T.text(word, fmt="text-trans-plain") + " ")
        rec.end(word)
    rec.add("\n")

# Here is the text

# In[28]:

print(rec.text())

# We write the text to file.
# In[29]:

rec.write("data/gen1wx.txt")

# We run our annotator again, because we have a different text:

# In[30]:

annotate("data/gen1wx.txt")

# Here is the new annotation file.

# In[31]:

get_ipython().system('cat data/gen1wx.txt.ann')

# The features are no surprise:

# In[32]:

features = rec.makeFeatures("data/gen1wx.txt.ann")

# In[33]:

features["bword"]

# In[34]:

features["tword"]

# Let's check:

# In[35]:

for feat in ("bword", "tword"):
    for node in features[feat]:
        print(f'{feat} {F.otype.v(node)} {node}: {T.text(node, fmt="text-trans-plain")}')

# # All steps
#
# * **[start](start.ipynb)** your first step in mastering the bible computationally
# * **[display](display.ipynb)** become an expert in creating pretty displays of your text structures
# * **[search](search.ipynb)** turbo charge your hand-coding with search templates
# * **[export Excel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results
# * **[share](share.ipynb)** draw in other people's data and let them use yours
# * **[export](export.ipynb)** export your dataset as an Emdros database
# * **annotate** annotate plain text by means of other tools and import the annotations as TF features
# * **[volumes](volumes.ipynb)** work with selected books only
# * **[trees](trees.ipynb)** work with the BHSA data as syntax trees
#
# CC-BY Dirk Roorda