#!/usr/bin/env python
# coding: utf-8

# You might want to consider the [start](search.ipynb) of this tutorial.
#
# Short introductions to other TF datasets:
#
# * [Dead Sea Scrolls](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/dss.ipynb),
# * [Old Babylonian Letters](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/oldbabylonian.ipynb),
# or the
# * [Q'uran](https://nbviewer.jupyter.org/github/annotation/tutorials/blob/master/lorentz2020/quran.ipynb)
#
# # Trees
#
# The textual objects of the BHSA text are syntactic, but they are not syntax trees.
#
# The BHSA is the result of a data-driven parsing strategy with occasional human decisions.
# It results in functional objects such as sentences, clauses, and phrases,
# which are built from chunks called sentence-atoms, clause-atoms, and phrase-atoms.
#
# There is no deeper nesting of clauses within phrases, or even clauses within clauses or phrases within phrases.
# Instead, whenever objects are linguistically nested, there is an edge called `mother` between the
# objects in question.
#
# For people who prefer to think in trees, we have unwrapped the `mother` relationship between clauses
# and made tree structures out of the data.
#
# The whole generation process of trees, including the quirks along the way, is documented
# in the notebook
# [trees.ipynb](https://nbviewer.jupyter.org/github/etcbc/trees/blob/master/programs/trees.ipynb).
# You see it done there for version 2017.
# We have used an ordinary Python program to generate trees for all versions of the BHSA:
# [alltrees.py](https://github.com/etcbc/trees/blob/master/programs/alltrees.py)
#
# Those trees are available as a feature on sentence nodes, and you can load those features
# alongside the BHSA data.
#
# Here we show some examples of what you can do with it.
# In[1]:

# Auto-reload edited helper modules (e.g. `utils`) while working in the notebook.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# # Incantation
#
# The ins and outs of installing Text-Fabric, getting the corpus, and initializing a notebook are
# explained in the [start tutorial](start.ipynb).

# In[2]:

from utils import structure, layout
from tf.app import use

# Note that we load the trees module.
#
# We also load the morphology of Open Scriptures for example usage later on.

# In[3]:

# `hoist=globals()` injects the TF API handles (F, E, L, T, ...) into this namespace.
A = use("ETCBC/bhsa", mod="ETCBC/trees/tf,ETCBC/bridging/tf", hoist=globals())

# We first inspect the nature of these features; let's pick the first, last and middle sentence of
# the Hebrew Bible

# In[4]:

sentences = F.otype.s("sentence")
# First, middle, and last sentence of the corpus as a quick sample.
examples = (
    sentences[0],
    sentences[len(sentences) // 2],
    sentences[-1],
)

# We examine feature `tree`:

# In[5]:

for snt in examples:
    print(F.tree.v(snt))

# Now `treen`:

# In[6]:

for snt in examples:
    print(F.treen.v(snt))

# The structure of the trees is the same, but `treen` has numbers between braces in the tags of the nodes.
# These numbers are the Text-Fabric nodes of the sentences, clauses and phrases that the nodes of the tree
# correspond to.
#
# ## Using trees
#
# These strings are not very pleasant to the eye.
# For one thing, we see numbers instead of words.
# They also seem a bit unwieldy to integrate with the usual text-fabric business.
# But nothing is farther from the truth.
# We show how to
#
# * produce a multiline view
# * see the words (in several representations)
# * add a gloss
# * add morphological data from another project (**Open Scriptures**)
#
# Honesty compels us to note that we make use of a bunch of auxiliary functions in an
# accompanying `utils` package:

# In[7]:

# Locate Job 3:16, its first sentence, that sentence's first word slot,
# and the string representation of its tree.
passage = ("Job", 3, 16)
passageStr = "{} {}:{}".format(*passage)
verse = T.nodeFromSection(passage)
sentence = L.d(verse, otype="sentence")[0]
firstSlot = L.d(sentence, otype="word")[0]
stringTree = F.tree.v(sentence)
print(f"{passageStr} - first word = {firstSlot}\n\ntree =\n{stringTree}")

# ## Parsing
#
# Key to effective manipulation of tree strings is to parse them into tree structures: lists of lists.
#
# Here we use the generic utility `structure()`:

# In[8]:

tree = structure(stringTree)
tree

# ## Apply layout
#
# Having the real tree structure in hand, we can lay it out in all kinds of ways.
# We use the generic utility `layout()` to
# display it a bit more friendly and to replace the numbers by real Text-Fabric slot numbers:

# In[9]:

print(layout(tree, firstSlot, str))

# That opens up the way to get the words in.
# The third argument of `layout()` above is `str`, which is a function that is applied to the slot numbers.
# It returns those numbers as string, and this is what ends up in the layout.
#
# ## Filling in the words
#
# We can pass any function, why not the function that looks up the word?
#
# Remember that `F.g_word_utf8.v` is a function that returns the full Hebrew word given a slot node.

# In[10]:

print(layout(tree, firstSlot, F.g_word_utf8.v))

# ## Add a gloss

# In[11]:


def gloss(n):
    """Render word node `n` as its Hebrew form followed by the lexeme gloss."""
    lex = L.u(n, otype="lex")[0]
    return f'{F.g_word_utf8.v(n)} "{F.gloss.v(lex)}"'


print(layout(tree, firstSlot, gloss))

# ## Morphology
#
# In 2018 I compared the morphology of Open Scriptures with that of the BHSA.
# See [bridging](https://nbviewer.jupyter.org/github/ETCBC/bridging/blob/master/programs/BHSAbridgeOSM.ipynb).
# As a by-product I saved their morphology as a Text-Fabric feature on words.
# So we can add it to our trees.
#
# We also show the nesting depth in the resulting tree.

# In[12]:


def osmPhonoGloss(n):
    """Render word node `n` as OSM morphology tag, Hebrew form, phonetic form, and gloss."""
    lex = L.u(n, otype="lex")[0]
    osm = F.osm.v(n)
    word = F.g_word_utf8.v(n)
    phono = F.phono.v(n)
    return f'({osm}) {word} [{phono}] "{F.gloss.v(lex)}"'


print(layout(tree, firstSlot, osmPhonoGloss, withLevel=True))

# ## Taking it further
#
# We saw how the fact that we have slot numbers in our tree structures opens up all kinds of
# possibilities for further processing.
#
# However, so far, we have only made use of slot nodes.
#
# What if we want to draw in side information for the non-terminal nodes?
#
# That is where the feature `treen` comes in.
# It has node information for all non-terminals between braces, so it is fairly easy to write
# new `structure()` and `layout()` functions that exploit them.
#
# # All steps
#
# * **[start](start.ipynb)** your first step in mastering the bible computationally
# * **[display](display.ipynb)** become an expert in creating pretty displays of your text structures
# * **[search](search.ipynb)** turbo charge your hand-coding with search templates
# * **[exportExcel](exportExcel.ipynb)** make tailor-made spreadsheets out of your results
# * **[share](share.ipynb)** draw in other people's data and let them use yours
# * **[export](export.ipynb)** export your dataset as an Emdros database
# * **[annotate](annotate.ipynb)** annotate plain text by means of other tools and import the annotations as TF features
# * **[map](map.ipynb)** map somebody else's annotations to a new version of the corpus
# * **[volumes](volumes.ipynb)** work with selected books only
# * **trees** work with the BHSA data as syntax trees
#
# CC-BY Dirk Roorda