import os
from itertools import chain
from tf.app import use
from tf.core.files import dirMake
A = use("ETCBC/bhsa:clone", checkout="clone", hoist=globals())
Locating corpus resources ...
Name | # of nodes | # slots / node | % coverage
---|---|---|---
book | 39 | 10938.21 | 100
chapter | 929 | 459.19 | 100
lex | 9230 | 46.22 | 100
verse | 23213 | 18.38 | 100
half_verse | 45179 | 9.44 | 100
sentence | 63717 | 6.70 | 100
sentence_atom | 64514 | 6.61 | 100
clause | 88131 | 4.84 | 100
clause_atom | 90704 | 4.70 | 100
phrase | 253203 | 1.68 | 100
phrase_atom | 267532 | 1.59 | 100
subphrase | 113850 | 1.42 | 38
word | 426590 | 1.00 | 100
query = """
phrase
=: fi:word
# la:word
:=
fi .nu. la
"""
resultsQ = A.search(query)
0.90s 15522 results
A.show(resultsQ, end=3, condenseType="clause")
(the rendered displays of the first three results are omitted here)
# the same task, hand-coded against the TF API
resultsH = []

for p in F.otype.s("phrase"):
    ws = L.d(p, otype="word")
    if len(ws) < 2:
        continue
    fi = ws[0]
    la = ws[-1]
    if F.nu.v(fi) != F.nu.v(la):
        continue
    resultsH.append((p, fi, la))

len(resultsH)
15522
set(resultsQ) == set(resultsH)
True
Challenges:

Given a phrase, we need to find its words. A phrase is an annotation with key `otype` and value `phrase`, whose targets are word annotations. These targets are easy to get, by means of the `annotations()` method on annotations. But then we have to find the first and the last word among these targets, and that is currently difficult!

You need a concept of order between annotations. One possibility is to put sequence numbers in the data, as annotations. But that is very cumbersome, because you need to refer to yet another level of annotation, and it will inflate the data.

The other possibility is "canonical ordering". Annotations that target the text can be ordered by their targets. A target is a subset of textual positions, and two such subsets can be ordered as follows:

* the annotation whose target starts at the earlier position comes first;
* if both targets start at the same position, the annotation whose target ends at the later position comes first, so that embedding annotations precede the annotations they embed.

As part of the index building, you could compute the rank of each annotation in this ordering. Annotations that target annotations that are already canonically ordered can themselves be canonically ordered with respect to their targets. Without this, the user has to implement sorting in ad-hoc ways, as in the sketch below.
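To make this concrete, here is a minimal sketch of such a canonical ordering, assuming annotations with a single contiguous text target; canonKey and canonRanks are hypothetical helpers, not part of the STAM API, built only on the textselections() accessor that we also use further down.

def canonKey(anno):
    # the begin and end of the annotation's single, contiguous text target
    t = anno.textselections()[0]
    # earlier start first; on equal starts, the larger span first,
    # so that embedding annotations precede the annotations they embed
    return (t.begin(), -t.end())

def canonRanks(annos):
    # the rank of each annotation in the canonical ordering,
    # as it could be computed once during index building
    return {anno.id(): i for (i, anno) in enumerate(sorted(annos, key=canonKey))}

With such ranks available, finding the first and the last word of a phrase reduces to taking the minimum and maximum rank among its targets.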
Given the annotations for the first and the last word in a phrase, we have to find the annotations with key `nu` that target these words, and read off their values. That too is currently difficult! A way out is this.

As preparation, before looping through the phrases, make a dict that associates word annotation identifiers with `nu`-values:

* get all annotations with key `nu`;
* for each such annotation: read off its value and the identifier of its target word, and store the value under that identifier.

Then, for each phrase with at least two words:

* look up the `nu`-value for its first word;
* look up the `nu`-value for its last word;
* compare the two values.

This can be improved if the API offers an efficient function to look up values. That could be a precomputation of all those dictionaries. Even better: those dictionaries could be the primary data! A sketch of such a precomputed lookup follows below.
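A minimal sketch of that preparation step, using only stam-python calls that also occur in the code below; featureValues is a hypothetical helper, not part of the STAM API.

def featureValues(dataSet, keyName):
    # precompute, for one data key, a dict from target annotation id to value
    values = {}
    for anno in dataSet.key(keyName).annotations():
        value = anno.data()[0].value()
        target = list(anno.annotations())[0]
        values[target.id()] = value
    return values

# usage: build the dict once, after which every lookup is a plain dict access
# nuValue = featureValues(aDataSet, "nu")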
import stam
from memutil import memUsage
memUsage()
workDir = f"{A.tempDir}/stam"
storeC = stam.AnnotationStore(file=f"{workDir}/bhsa.store.stam.csv")
Current: 3.01 GB Delta: 3.01 GB
aDataSet = list(storeC.annotationsets())[0]

# all annotations of a given object type,
# i.e. the targets of the data item with key otype and the given value
def stamOtype(otype):
    otypeData = aDataSet.find_data("otype", otype)
    otypeAnnos = otypeData[0].annotations()
    return otypeAnnos

# the set of identifiers of a bunch of annotations
def idsOf(annos):
    return {a.id() for a in annos}
# get the word annotations, sorted, and the phrase annotations
def getPos(wordAnno):
    t = wordAnno.textselections()[0]
    return (t.begin(), t.end())

wordAnnos = stamOtype("word")
wordIds = idsOf(wordAnnos)
phraseAnnos = stamOtype("phrase")
wordAnnos = sorted(wordAnnos, key=getPos)

# make a rank of the word annos
wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}
# get the phrase annotations together with their first and last word
phrases = []

for pAnno in phraseAnnos:
    words = list(pAnno.annotations())
    if len(words) < 2:
        continue
    sortedWords = sorted(words, key=lambda x: wordRank[x.id()])
    phrases.append((pAnno, sortedWords[0], sortedWords[-1]))
len(phrases)
78754
# intermediate check with TF
query = """
phrase
=: word
# word
:=
"""
results = A.search(query)
0.90s 78754 results
# get the `nu` information ready
# we collect a dict keyed by word id with values the grammatical number of the word
nuKey = aDataSet.key("nu")
nuAnnos = nuKey.annotations()
nuValue = {}

for nuAnno in nuAnnos:
    value = nuAnno.data()[0].value()
    word = list(nuAnno.annotations())[0]
    nuValue[word.id()] = value
# check some values
for wordAnno in wordAnnos[0:11]:
    print(f"{wordAnno} {nuValue[wordAnno.id()]}")
בְּ NA
רֵאשִׁ֖ית sg
בָּרָ֣א sg
אֱלֹהִ֑ים pl
אֵ֥ת NA
הַ NA
שָּׁמַ֖יִם pl
וְ NA
אֵ֥ת NA
הָ NA
אָֽרֶץ00 sg
So far so good!
# now compute the final result
resultsSTAM = [x for x in phrases if nuValue[x[1].id()] == nuValue[x[2].id()]]
len(resultsSTAM)
15522
# The complete task in one go
def getNicePhrases():
    aDataSet = list(storeC.annotationsets())[0]
    wordAnnos = sorted(stamOtype("word"), key=getPos)
    wordIds = idsOf(wordAnnos)
    wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}
    phraseAnnos = stamOtype("phrase")

    phrases = []
    for pAnno in phraseAnnos:
        words = list(pAnno.annotations())
        if len(words) < 2:
            continue
        sortedWords = sorted(words, key=lambda x: wordRank[x.id()])
        phrases.append((pAnno, sortedWords[0], sortedWords[-1]))

    nuKey = aDataSet.key("nu")
    nuAnnos = nuKey.annotations()
    nuValue = {}
    for nuAnno in nuAnnos:
        value = nuAnno.data()[0].value()
        word = list(nuAnno.annotations())[0]
        nuValue[word.id()] = value

    results = [x for x in phrases if nuValue[x[1].id()] == nuValue[x[2].id()]]
    print(len(results))
    return results
resultsSTAM = getNicePhrases()
15522
The execution times for this task were:

TF query | TF hand coding | STAM
---|---|---
0.9 s | 0.2 s | 2.0 s
But in STAM we can move quite a bit of the work out of the task itself:

* the lookup of the `nu` values should be optimized, e.g. by precomputing the dictionary once (could save 0.9 sec); see the sketch below.
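As a thought experiment, here is a minimal sketch of that split, reusing stamOtype, getPos and aDataSet from above and the hypothetical featureValues helper sketched earlier; none of this is an existing STAM API. The preparation runs once, e.g. at index-building time, and the task proper only walks the phrases.

def prepare():
    # done once: order and rank the words, and precompute the nu values
    wordAnnos = sorted(stamOtype("word"), key=getPos)
    wordRank = {anno.id(): i for (i, anno) in enumerate(wordAnnos)}
    nuValue = featureValues(aDataSet, "nu")
    return (wordRank, nuValue)

def task(wordRank, nuValue):
    # the task proper: walk the phrases and compare first and last word
    results = []
    for pAnno in stamOtype("phrase"):
        words = list(pAnno.annotations())
        if len(words) < 2:
            continue
        sortedWords = sorted(words, key=lambda x: wordRank[x.id()])
        (fi, la) = (sortedWords[0], sortedWords[-1])
        if nuValue[fi.id()] == nuValue[la.id()]:
            results.append((pAnno, fi, la))
    return results

# usage:
# (wordRank, nuValue) = prepare()
# resultsSTAM = task(wordRank, nuValue)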