#!/usr/bin/env python
# coding: utf-8

# # How to query the `nametype` feature?
#
# ## The case
#
# Victor Isaak reported a strange case on Slack, a SHEBANQ query
#
# ```
# select all objects
# where
# [lex focus
#   nametype = 'pers'
#   OR
#   nametype = 'gens'
# ]
# ```
#
# whose results were not shown properly in SHEBANQ.
#
# In particular, in this verse there seem to be 3 hits, but only one hit (`Riphath`) is highlighted:
#
# ![nr](images/nametype.png)
#
# ## Locating
#
# Let's drill down by means of Text-Fabric.
#
# First we need to find where this case is, and in what version of the BHSA it occurs.
#
# We start with loading version `c` and locating the case.
# We will load the versions `4b`, `2017` and `c`.

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:


from tf.app import use


# In[4]:


# A = use("ETCBC/bhsa", hoist=globals())
A = use("ETCBC/bhsa:clone", checkout="clone", hoist=globals())


# Make sure which version we have:

# In[5]:


A.version


# Right. Let's start with looking for `RIJPA73T` in the `g_word` feature:

# In[6]:


results = A.search(
    """
word g_word=RIJPA73T
"""
)


# Good, that's clear. Where is it?

# In[7]:


# The first node of the first result tuple is the word we searched for.
w = results[0][0]
A.webLink(w)


# When we click this link, the verse opens in SHEBANQ.

# ## Reproducing
#
# Now let's do the original query.

# In[8]:


query = """
lex nametype=pers|gens
"""


# In[9]:


results = A.search(query)


# Let's find the occurrences of the results in this verse:

# In[10]:


query = """
lex nametype=pers|gens
  w:word
verse book=Genesis chapter=10 verse=3
  w
"""


# In[11]:


results = A.search(query)


# We show these words and their name types

# In[12]:


A.show(results, condensed=True, withNodes=True)


# This looks perfectly alright.

# # Other versions
#
# Let's repeat this exercise in two other versions: `2017` and `4b`.
#
# We write a function that produces the result right away.
#
# The TF-API of the data source is passed as parameter.
# In[13]:


def gensPers(A):
    """Display the `pers`/`gens` lexemes that have an occurrence in Genesis 10:3.

    Parameters
    ----------
    A
        The TF-API object of one loaded version of the data source.
        Its `version` is printed first, then the search is run on it
        and the results are shown condensed, with node numbers.

    Returns
    -------
    None
        The outcome is displayed via `A.dm` and `A.show`.
    """
    A.dm(f"Version `{A.version}`\n")
    # The word `w` must belong both to a matching lexeme and to the verse.
    template = """
lex nametype=pers|gens
  w:word
verse book=Genesis chapter=10 verse=3
  w
"""
    A.show(A.search(template), condensed=True, withNodes=True)


# We load version 2017, but without hoisting the API to the global namespace.
# Instead, we retain the API in a mapping from version name to TF-API.
# We make sure that we do not lose the API for version `c` which we have just loaded.

# In[14]:


A = {"c": A}


# ## 2017

# In[16]:


# A['2017'] = use('ETCBC/bhsa', version='2017')
A["2017"] = use("ETCBC/bhsa:clone", checkout="clone", version="2017")


# In[17]:


gensPers(A["2017"])


# Observation: the same words are highlighted, but the `nametype` feature is not shown. Why? Probably because in version `2017`
# the `nametype` feature is only available for `lex` nodes and not for `word` nodes. Let's find out for sure.

# In[18]:


A2017 = A["2017"]
F2017 = A2017.api.F
L2017 = A2017.api.L
w = 4572
# Go up from the word to the lexeme node that contains it.
lx = L2017.u(w, otype="lex")[0]
A2017.dm(f"*word* {w} has nametype `{F2017.nametype.v(w)}`\n")
A2017.dm(f"*lexeme* {lx} has nametype `{F2017.nametype.v(lx)}`\n")


# Indeed!

# ## 4b

# In[19]:


# A['4b'] = use('ETCBC/bhsa', version='4b')
A["4b"] = use("ETCBC/bhsa:clone", checkout="clone", version="4b")


# Ok, version `4b` is rather old. We go to GitHub to look at the
# [release notes of the BHSA data](https://github.com/ETCBC/bhsa/releases).
#
# There we see that the latest release of the data does not include the older versions anymore.
# So we have to go back to an earlier release, `v1.5`:

# In[23]:


# A['4b'] = use('ETCBC/bhsa', checkout='1.5', version='4b')
A["4b"] = use("ETCBC/bhsa:clone", checkout="1.5", version="4b")


# Now we have the core data, but TF wants to get additional data (`parallels` and `phono`), which is not available for this version in this release.
# We tell TF to not fetch additional modules:

# In[24]:


# A['4b'] = use('ETCBC/bhsa', checkout='1.5', version='4b', provenanceSpec=dict(moduleSpecs=()))
# The app is specified as `ETCBC/bhsa`, like in the other `use()` calls of this notebook.
A["4b"] = use(
    "ETCBC/bhsa:clone",
    checkout="1.5",
    version="4b",
    provenanceSpec=dict(moduleSpecs=()),
)


# In[25]:


gensPers(A["4b"])


# Again: all three highlighted.

# In[26]:


# A['4'] = use('ETCBC/bhsa', checkout='1.5', version='4', provenanceSpec=dict(moduleSpecs=()))
A["4"] = use(
    "ETCBC/bhsa:clone",
    checkout="1.5",
    version="4",
    provenanceSpec=dict(moduleSpecs=()),
)


# In[27]:


gensPers(A["4"])


# # Conclusion 1
#
# Versions `4`, `4b`, `2017`, and `c` of the BHSA all have the `nametype` feature on `lex` nodes with values `pers`, `gens`, `gens` for the three words of Genesis 10:3.
#
# Version `c` also has the `nametype` on `word` nodes.

# # Conclusion 2
#
# I have run this
# query on [SHEBANQ](https://shebanq.ancient-data.org/hebrew/query?id=3921)
# on versions `2017` and `c`, and both produced the expected results.
#
# For versions `4` and `4b` I had to modify the query, because these versions do not have the `lex` node type.
#
# The data on GitHub, however, has the `lex` node type, see [`otype` in version 4](https://github.com/ETCBC/bhsa/blob/master/tf/4/otype.tf).
#
# Probably I have added `lex` later to `4` and `4b`, without bringing it over to SHEBANQ.
# Without more information I cannot reproduce the screenshot at the start of the notebook.

# # Reproduced!
#
# Viktor has shared the full [query](https://shebanq.ancient-data.org/hebrew/query?version=c&id=3919).
#
# Observations:
#
# The text of the query is
#
# ```
# select all objects
# in {4539-4965}
# where
# [lex focus
#   nametype = 'pers'
#   OR
#   nametype = 'gens'
# ]
# ```
#
# There are only three results, one of which is in Genesis 10:3, the word `RIJPA73T` only.

# # Explanation
#
# How does this make sense?
# The meaning of the query is:
#
# * restrict the search to the portion of the corpus from slot (word) 4539 till slot 4965 (inclusive);
# * in that portion find lexeme nodes with certain properties
#
# What does it mean, lexeme nodes inside a portion of the corpus?
#
# A lexeme node occupies the slots of its occurrences, so we are interested in lexemes that have
# all of their occurrences in the indicated portion.
#
# This rules out many lexemes.
#
# Let's verify by manual coding that the other two `pers` and `gens` words in Genesis 10:3
# have occurrences outside this region.

# We do this in version `c` and continue to work in this version only. We still have the globals `N E F L T S TF` tied to the `c` version
# of the data, we only have to restore `A` to the `c` version:

# In[28]:


A = A["c"]


# We repeat the query

# In[29]:


query = """
lex nametype=pers|gens
  w:word
verse book=Genesis chapter=10 verse=3
  w
"""


# In[30]:


results = A.search(query)


# Of each result (a tuple of nodes), we pretty-display the lex node, which is the first of the tuple
# since `lex` is the first node mentioned in the search template.
#
# The pretty display of a lexeme shows the first and last occurrence of it.

# In[31]:


for result in results:
    lx = result[0]
    A.pretty(lx)


# Indeed, only `RIJPA73T` occurs in the narrow portion that the SHEBANQ query was looking in.

# # Tips
#
# How to query for words with certain lexeme properties in a portion of the corpus?
#
# If the lexeme properties are present on the occurrences of the lexeme (the word nodes),
# this query will do:
#
# ```
# select all objects
# in {4539-4965}
# where
# [word focus
#   nametype = 'pers'
#   OR
#   nametype = 'gens'
# ]
# ```

# But, as we saw, in version `2017` the `nametype` property only exists on the `lex` nodes.
#
# How do we go about this then?
#
# The clue is on p. 21 of Ulrik's MQL query guide.
# We can search for words that are contained in a lex by using monad set relation clauses:
#
# ```
# select all objects
# in {4539-4965}
# where
# [word focus
#   [lex overlap(substrate) nametype = 'pers' OR nametype = 'gens']
# ]
# ```
#
# So, we start by selecting all words 4539 - 4965, and for each word we require that there is a lex with some
# properties that overlaps with it.
# See [`nametype` x](https://shebanq.ancient-data.org/hebrew/query?version=c&id=3922)
# on SHEBANQ.