# Auto-reload edited modules on every cell execution (IPython magic).
%load_ext autoreload
%autoreload 2
# Text-Fabric: load the BHSA corpus from a local clone; hoist=globals()
# injects the standard API names (F, Fs, Fall, L, T, ...) into this notebook.
from tf.app import use
A = use("ETCBC/bhsa:clone", hoist=globals())
Locating corpus resources ...
Name | # of nodes | # slots / node | % coverage |
---|---|---|---|
book | 39 | 10938.21 | 100 |
chapter | 929 | 459.19 | 100 |
lex | 9230 | 46.22 | 100 |
verse | 23213 | 18.38 | 100 |
half_verse | 45179 | 9.44 | 100 |
sentence | 63717 | 6.70 | 100 |
sentence_atom | 64514 | 6.61 | 100 |
clause | 88131 | 4.84 | 100 |
clause_atom | 90704 | 4.70 | 100 |
phrase | 253203 | 1.68 | 100 |
phrase_atom | 267532 | 1.59 | 100 |
subphrase | 113850 | 1.42 | 38 |
word | 426590 | 1.00 | 100 |
3
ETCBC/bhsa
/Users/me/github/ETCBC/bhsa/app
''
<code>Genesis 1:1</code> (use <a href="https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf" target="_blank">English book names</a>)
g_uvf_utf8
g_vbs
kq_hybrid
languageISO
g_nme
lex0
is_root
g_vbs_utf8
g_uvf
dist
root
suffix_person
g_vbe
dist_unit
suffix_number
distributional_parent
kq_hybrid_utf8
crossrefSET
instruction
g_prs
lexeme_count
rank_occ
g_pfm_utf8
freq_occ
crossrefLCS
functional_parent
g_pfm
g_nme_utf8
g_vbe_utf8
kind
g_prs_utf8
suffix_gender
mother_object_type
absent
n/a
none
unknown
NA
{docRoot}/{repo}
''
''
https://{org}.github.io
0_home
{}
True
clone
/Users/me/github/ETCBC/bhsa/_temp
BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis
10.5281/zenodo.1007624
ner
Phonetic Transcriptions
https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb
10.5281/zenodo.1007636
ETCBC
/tf
phono
Parallel Passages
https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb
10.5281/zenodo.1007642
ETCBC
/tf
parallels
ETCBC
/tf
bhsa
2021
https://shebanq.ancient-data.org/hebrew
Show this on SHEBANQ
la
True
{webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt
{webBase}/word?version={version}&id=<lid>
{typ} {rela}
''
True
{code}
1
''
True
{label}
''
True
gloss
{voc_lex_utf8}
word
orig
{voc_lex_utf8}
{typ} {function}
''
True
{typ} {rela}
1
''
{number}
''
True
{number}
1
''
True
{number}
''
pdp vs vt
lex:gloss
hbo
# Show the corpus overview again, now including the full metadata of every feature.
A.header(allMeta=True)
Name | # of nodes | # slots / node | % coverage |
---|---|---|---|
book | 39 | 10938.21 | 100 |
chapter | 929 | 459.19 | 100 |
lex | 9230 | 46.22 | 100 |
verse | 23213 | 18.38 | 100 |
half_verse | 45179 | 9.44 | 100 |
sentence | 63717 | 6.70 | 100 |
sentence_atom | 64514 | 6.61 | 100 |
clause | 88131 | 4.84 | 100 |
clause_atom | 90704 | 4.70 | 100 |
phrase | 253203 | 1.68 | 100 |
phrase_atom | 267532 | 1.59 | 100 |
subphrase | 113850 | 1.42 | 38 |
word | 426590 | 1.00 | 100 |
3
ETCBC/bhsa
/Users/me/github/ETCBC/bhsa/app
''
<code>Genesis 1:1</code> (use <a href="https://github.com/{org}/{repo}/blob/master/tf/{version}/book%40en.tf" target="_blank">English book names</a>)
g_uvf_utf8
g_vbs
kq_hybrid
languageISO
g_nme
lex0
is_root
g_vbs_utf8
g_uvf
dist
root
suffix_person
g_vbe
dist_unit
suffix_number
distributional_parent
kq_hybrid_utf8
crossrefSET
instruction
g_prs
lexeme_count
rank_occ
g_pfm_utf8
freq_occ
crossrefLCS
functional_parent
g_pfm
g_nme_utf8
g_vbe_utf8
kind
g_prs_utf8
suffix_gender
mother_object_type
absent
n/a
none
unknown
NA
{docRoot}/{repo}
''
''
https://{org}.github.io
0_home
{}
True
clone
/Users/me/github/ETCBC/bhsa/_temp
BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis
10.5281/zenodo.1007624
ner
Phonetic Transcriptions
https://nbviewer.jupyter.org/github/etcbc/phono/blob/master/programs/phono.ipynb
10.5281/zenodo.1007636
ETCBC
/tf
phono
Parallel Passages
https://nbviewer.jupyter.org/github/ETCBC/parallels/blob/master/programs/parallels.ipynb
10.5281/zenodo.1007642
ETCBC
/tf
parallels
ETCBC
/tf
bhsa
2021
https://shebanq.ancient-data.org/hebrew
Show this on SHEBANQ
la
True
{webBase}/text?book=<1>&chapter=<2>&verse=<3>&version={version}&mr=m&qw=q&tp=txt_p&tr=hb&wget=v&qget=v&nget=vt
{webBase}/word?version={version}&id=<lid>
{typ} {rela}
''
True
{code}
1
''
True
{label}
''
True
gloss
{voc_lex_utf8}
word
orig
{voc_lex_utf8}
{typ} {function}
''
True
{typ} {rela}
1
''
{number}
''
True
{number}
1
''
True
{number}
''
pdp vs vt
lex:gloss
hbo
import collections
The following snippet is thanks to Marek Polášek:
# Variant 1: for every feature, probe each node type with freqList.
# A feature "applies" to a type when its value frequency list, restricted
# to that type, is non-empty.
A.indent(reset=True)
data = collections.defaultdict(list)
allTypes = F.otype.all
for prop in Fall():
    print(prop)
    if prop == "otype":
        # otype is defined on every node; skip it.
        continue
    for t in allTypes:
        # Non-empty frequency list == the feature has a value on this type.
        if Fs(prop).freqList({t}):
            data[prop].append(t)
    print(f"\t{', '.join(data[prop])}")
A.info("done")
book book, chapter, verse book@am book book@ar book book@bn book book@da book book@de book book@el book book@en book book@es book book@fa book book@fr book book@he book book@hi book book@id book book@ja book book@ko book book@la book book@nl book book@pa book book@pt book book@ru book book@sw book book@syc book book@tr book book@ur book book@yo book book@zh book chapter chapter, verse code clause_atom det phrase, phrase_atom domain clause freq_lex lex, word function phrase g_cons word g_cons_utf8 word g_lex word g_lex_utf8 word g_word word g_word_utf8 word gloss lex, word gn word label verse, half_verse language lex, word lex lex, word lex_utf8 lex, word ls lex, word nametype lex, word nme word nu word number sentence, sentence_atom, clause, clause_atom, phrase, phrase_atom, word otype pargr clause_atom pdp word pfm word phono word phono_trailer word prs word prs_gn word prs_nu word prs_ps word ps word qere word qere_trailer word qere_trailer_utf8 word qere_utf8 word rank_lex lex, word rela clause, phrase, phrase_atom, subphrase sp lex, word st word tab clause_atom trailer word trailer_utf8 word txt clause typ clause, clause_atom, phrase, phrase_atom uvf word vbe word vbs word verse verse voc_lex lex, word voc_lex_utf8 lex, word vs word vt word 21s done
Here I explore whether it can be done a bit faster.
# Variant 2: iterate the nodes of each type once and ask every feature
# whether any node of that type carries a value.
A.indent(reset=True)
data1 = collections.defaultdict(list)
allProps = [p for p in Fall() if p != "otype"]
for t in F.otype.all:
    print(t)
    type_nodes = F.otype.s(t)
    for prop in allProps:
        # The feature applies to this type if at least one node has a value.
        if any(Fs(prop).v(node) is not None for node in type_nodes):
            data1[prop].append(t)
for prop in allProps:
    print(prop)
    print(f"\t{', '.join(data1[prop])}")
A.info("done")
book chapter lex verse half_verse sentence sentence_atom clause clause_atom phrase phrase_atom subphrase word book book, chapter, verse book@am book book@ar book book@bn book book@da book book@de book book@el book book@en book book@es book book@fa book book@fr book book@he book book@hi book book@id book book@ja book book@ko book book@la book book@nl book book@pa book book@pt book book@ru book book@sw book book@syc book book@tr book book@ur book book@yo book book@zh book chapter chapter, verse code clause_atom det phrase, phrase_atom domain clause freq_lex lex, word function phrase g_cons word g_cons_utf8 word g_lex word g_lex_utf8 word g_word word g_word_utf8 word gloss lex, word gn word label verse, half_verse language lex, word lex lex, word lex_utf8 lex, word ls lex, word nametype lex, word nme word nu word number sentence, sentence_atom, clause, clause_atom, phrase, phrase_atom, word pargr clause_atom pdp word pfm word phono word phono_trailer word prs word prs_gn word prs_nu word prs_ps word ps word qere word qere_trailer word qere_trailer_utf8 word qere_utf8 word rank_lex lex, word rela clause, phrase, phrase_atom, subphrase sp lex, word st word tab clause_atom trailer word trailer_utf8 word txt clause typ clause, clause_atom, phrase, phrase_atom uvf word vbe word vbs word verse verse voc_lex lex, word voc_lex_utf8 lex, word vs word vt word 14s done
data == data1
True
Marek responded with this algorithm:
# Marek's algorithm: collect, per feature, the set of nodes that carry it,
# then intersect that set with the nodes of each type.
A.indent(reset=True)
data3 = collections.defaultdict(list)
nodes_type = collections.defaultdict(list)
allProps = [p for p in Fall() if p != "otype"]
for t in F.otype.all:
    print(t)
    nodes_type[t] = F.otype.s(t)
for prop in allProps:
    # All nodes on which this feature has a value.
    nodes_with_prop = {item[0] for item in Fs(prop).items()}
    for t in F.otype.all:
        # Non-empty intersection: the feature occurs on this node type.
        if nodes_with_prop.intersection(nodes_type[t]):
            data3[prop].append(t)
for prop in allProps:
    print(prop)
    print(f"\t{', '.join(data3[prop])}")
A.info("done")
book chapter lex verse half_verse sentence sentence_atom clause clause_atom phrase phrase_atom subphrase word book book, chapter, verse book@am book book@ar book book@bn book book@da book book@de book book@el book book@en book book@es book book@fa book book@fr book book@he book book@hi book book@id book book@ja book book@ko book book@la book book@nl book book@pa book book@pt book book@ru book book@sw book book@syc book book@tr book book@ur book book@yo book book@zh book chapter chapter, verse code clause_atom det phrase, phrase_atom domain clause freq_lex lex, word function phrase g_cons word g_cons_utf8 word g_lex word g_lex_utf8 word g_word word g_word_utf8 word gloss lex, word gn word label verse, half_verse language lex, word lex lex, word lex_utf8 lex, word ls lex, word nametype lex, word nme word nu word number sentence, sentence_atom, clause, clause_atom, phrase, phrase_atom, word pargr clause_atom pdp word pfm word phono word phono_trailer word prs word prs_gn word prs_nu word prs_ps word ps word qere word qere_trailer word qere_trailer_utf8 word qere_utf8 word rank_lex lex, word rela clause, phrase, phrase_atom, subphrase sp lex, word st word tab clause_atom trailer word trailer_utf8 word txt clause typ clause, clause_atom, phrase, phrase_atom uvf word vbe word vbs word verse verse voc_lex lex, word voc_lex_utf8 lex, word vs word vt word 2.24s done
data3 == data1
True
Marvellous!
Now I make that code a tad more pythonic.
# Same algorithm, made a bit more pythonic: pre-materialize the per-type
# node sets and use the & operator for the intersection.
A.indent(reset=True)
data4 = collections.defaultdict(list)
nodes_type = collections.defaultdict(list)
allProps = [p for p in Fall() if p != "otype"]
allTypes = F.otype.all
for t in allTypes:
    print(t)
    nodes_type[t] = set(F.otype.s(t))
for prop in allProps:
    # Nodes on which this feature has a value.
    nodes_with_prop = {item[0] for item in Fs(prop).items()}
    for t in allTypes:
        # Truthiness of the intersection replaces the len(...) > 0 test.
        if nodes_with_prop & nodes_type[t]:
            data4[prop].append(t)
for prop in allProps:
    print(prop)
    print(f"\t{', '.join(data4[prop])}")
A.info("done")
book chapter lex verse half_verse sentence sentence_atom clause clause_atom phrase phrase_atom subphrase word book book, chapter, verse book@am book book@ar book book@bn book book@da book book@de book book@el book book@en book book@es book book@fa book book@fr book book@he book book@hi book book@id book book@ja book book@ko book book@la book book@nl book book@pa book book@pt book book@ru book book@sw book book@syc book book@tr book book@ur book book@yo book book@zh book chapter chapter, verse code clause_atom det phrase, phrase_atom domain clause freq_lex lex, word function phrase g_cons word g_cons_utf8 word g_lex word g_lex_utf8 word g_word word g_word_utf8 word gloss lex, word gn word label verse, half_verse language lex, word lex lex, word lex_utf8 lex, word ls lex, word nametype lex, word nme word nu word number sentence, sentence_atom, clause, clause_atom, phrase, phrase_atom, word pargr clause_atom pdp word pfm word phono word phono_trailer word prs word prs_gn word prs_nu word prs_ps word ps word qere word qere_trailer word qere_trailer_utf8 word qere_utf8 word rank_lex lex, word rela clause, phrase, phrase_atom, subphrase sp lex, word st word tab clause_atom trailer word trailer_utf8 word txt clause typ clause, clause_atom, phrase, phrase_atom uvf word vbe word vbs word verse verse voc_lex lex, word voc_lex_utf8 lex, word vs word vt word 1.73s done
data4 == data3
True
This takes 25% off the execution time.
The crux of Marek's optimization is to start with the items in the feature, which for most features is smaller than the number of nodes of a given type.
Knowing that, I wonder whether I can use the iterator `any` again instead of set construction.
Will that improve the speed? No set has to be constructed. But set construction is very fast. Let's see.
# Variant 5: same starting point (the nodes that carry the feature), but
# membership tests via any() instead of building a set intersection.
A.indent(reset=True)
data5 = collections.defaultdict(list)
nodes_type = collections.defaultdict(list)
allProps = [p for p in Fall() if p != "otype"]
allTypes = F.otype.all
for t in allTypes:
    print(t)
    nodes_type[t] = set(F.otype.s(t))
for prop in allProps:
    nodes_with_prop = [item[0] for item in Fs(prop).items()]
    for t in allTypes:
        # Hoist the set lookup out of the generator expression.
        this_type_nodes = nodes_type[t]
        if any(node in this_type_nodes for node in nodes_with_prop):
            data5[prop].append(t)
for prop in allProps:
    print(prop)
    print(f"\t{', '.join(data5[prop])}")
A.info("done")
book chapter lex verse half_verse sentence sentence_atom clause clause_atom phrase phrase_atom subphrase word book book, chapter, verse book@am book book@ar book book@bn book book@da book book@de book book@el book book@en book book@es book book@fa book book@fr book book@he book book@hi book book@id book book@ja book book@ko book book@la book book@nl book book@pa book book@pt book book@ru book book@sw book book@syc book book@tr book book@ur book book@yo book book@zh book chapter chapter, verse code clause_atom det phrase, phrase_atom domain clause freq_lex lex, word function phrase g_cons word g_cons_utf8 word g_lex word g_lex_utf8 word g_word word g_word_utf8 word gloss lex, word gn word label verse, half_verse language lex, word lex lex, word lex_utf8 lex, word ls lex, word nametype lex, word nme word nu word number sentence, sentence_atom, clause, clause_atom, phrase, phrase_atom, word pargr clause_atom pdp word pfm word phono word phono_trailer word prs word prs_gn word prs_nu word prs_ps word ps word qere word qere_trailer word qere_trailer_utf8 word qere_utf8 word rank_lex lex, word rela clause, phrase, phrase_atom, subphrase sp lex, word st word tab clause_atom trailer word trailer_utf8 word txt clause typ clause, clause_atom, phrase, phrase_atom uvf word vbe word vbs word verse verse voc_lex lex, word voc_lex_utf8 lex, word vs word vt word 8.12s done
data5 == data4
True
Important lesson: the overhead of the set construction is far less than using `any`.
In the set intersection there is no loop with python code involved, so it proceeds with C-speed.
In the `any` iterator there is a Python expression (`n in these_nodes_type`), and that will be executed at Python speed.
That is where we lose the time.
So the second crux of Marek's method is to use set intersection instead of an iterator.
showFeatureTypes()
A.featureTypes()
feature | node types |
---|---|
book | book, chapter, verse |
book@am | book |
book@ar | book |
book@bn | book |
book@da | book |
book@de | book |
book@el | book |
book@en | book |
book@es | book |
book@fa | book |
book@fr | book |
book@he | book |
book@hi | book |
book@id | book |
book@ja | book |
book@ko | book |
book@la | book |
book@nl | book |
book@pa | book |
book@pt | book |
book@ru | book |
book@sw | book |
book@syc | book |
book@tr | book |
book@ur | book |
book@yo | book |
book@zh | book |
chapter | chapter, verse |
code | clause_atom |
det | phrase, phrase_atom |
domain | clause |
freq_lex | lex, word |
function | phrase |
g_cons | word |
g_cons_utf8 | word |
g_lex | word |
g_lex_utf8 | word |
g_word | word |
g_word_utf8 | word |
gloss | lex, word |
gn | word |
label | verse, half_verse |
language | lex, word |
lex | lex, word |
lex_utf8 | lex, word |
ls | lex, word |
nametype | lex, word |
nme | word |
nu | word |
number | sentence, sentence_atom, clause, clause_atom, phrase, phrase_atom, word |
pargr | clause_atom |
pdp | word |
pfm | word |
phono | word |
phono_trailer | word |
prs | word |
prs_gn | word |
prs_nu | word |
prs_ps | word |
ps | word |
qere | word |
qere_trailer | word |
qere_trailer_utf8 | word |
qere_utf8 | word |
rank_lex | lex, word |
rela | clause, phrase, phrase_atom, subphrase |
sp | lex, word |
st | word |
tab | clause_atom |
trailer | word |
trailer_utf8 | word |
txt | clause |
typ | clause, clause_atom, phrase, phrase_atom |
uvf | word |
vbe | word |
vbs | word |
verse | verse |
voc_lex | lex, word |
voc_lex_utf8 | lex, word |
vs | word |
vt | word |
To get this overview in a dict, call A.featureTypes(show=False)
# The same feature -> node-types overview, returned as a dict instead of rendered.
A.featureTypes(show=False)
{'book': ['book', 'chapter', 'verse'], 'book@am': ['book'], 'book@ar': ['book'], 'book@bn': ['book'], 'book@da': ['book'], 'book@de': ['book'], 'book@el': ['book'], 'book@en': ['book'], 'book@es': ['book'], 'book@fa': ['book'], 'book@fr': ['book'], 'book@he': ['book'], 'book@hi': ['book'], 'book@id': ['book'], 'book@ja': ['book'], 'book@ko': ['book'], 'book@la': ['book'], 'book@nl': ['book'], 'book@pa': ['book'], 'book@pt': ['book'], 'book@ru': ['book'], 'book@sw': ['book'], 'book@syc': ['book'], 'book@tr': ['book'], 'book@ur': ['book'], 'book@yo': ['book'], 'book@zh': ['book'], 'chapter': ['chapter', 'verse'], 'code': ['clause_atom'], 'det': ['phrase', 'phrase_atom'], 'domain': ['clause'], 'freq_lex': ['lex', 'word'], 'function': ['phrase'], 'g_cons': ['word'], 'g_cons_utf8': ['word'], 'g_lex': ['word'], 'g_lex_utf8': ['word'], 'g_word': ['word'], 'g_word_utf8': ['word'], 'gloss': ['lex', 'word'], 'gn': ['word'], 'label': ['verse', 'half_verse'], 'language': ['lex', 'word'], 'lex': ['lex', 'word'], 'lex_utf8': ['lex', 'word'], 'ls': ['lex', 'word'], 'nametype': ['lex', 'word'], 'nme': ['word'], 'nu': ['word'], 'number': ['sentence', 'sentence_atom', 'clause', 'clause_atom', 'phrase', 'phrase_atom', 'word'], 'pargr': ['clause_atom'], 'pdp': ['word'], 'pfm': ['word'], 'phono': ['word'], 'phono_trailer': ['word'], 'prs': ['word'], 'prs_gn': ['word'], 'prs_nu': ['word'], 'prs_ps': ['word'], 'ps': ['word'], 'qere': ['word'], 'qere_trailer': ['word'], 'qere_trailer_utf8': ['word'], 'qere_utf8': ['word'], 'rank_lex': ['lex', 'word'], 'rela': ['clause', 'phrase', 'phrase_atom', 'subphrase'], 'sp': ['lex', 'word'], 'st': ['word'], 'tab': ['clause_atom'], 'trailer': ['word'], 'trailer_utf8': ['word'], 'txt': ['clause'], 'typ': ['clause', 'clause_atom', 'phrase', 'phrase_atom'], 'uvf': ['word'], 'vbe': ['word'], 'vbs': ['word'], 'verse': ['verse'], 'voc_lex': ['lex', 'word'], 'voc_lex_utf8': ['lex', 'word'], 'vs': ['word'], 'vt': ['word']}
# List every lexeme whose transliteration starts with "NHR", with its
# language: the same consonantal lexeme can occur in both Hebrew and Aramaic.
for lx in F.otype.s("lex"):
    lex = F.lex.v(lx)
    lan = F.language.v(lx)
    # str.startswith is the idiomatic (and slice-free) form of lex[0:3] == "NHR".
    if lex.startswith("NHR"):
        print(f"{lx=} {lex=} {lan=}")
lx=1437743 lex='NHR/' lan='Hebrew' lx=1442740 lex='NHR[' lan='Hebrew' lx=1443420 lex='NHR=[' lan='Hebrew' lx=1444637 lex='NHRH/' lan='Hebrew' lx=1445749 lex='NHR/' lan='Aramaic'
The lexemes in question belong to a different language!
The `freq` and `rank` features

from collections import Counter
import pandas as pd
# Recompute lexeme frequencies from the word nodes, separately per language.
lex_count = {}
for lang in ("Hebrew", "Aramaic"):
    words_of_lang = (n for n in F.otype.s("word") if F.language.v(n) == lang)
    lex_count[lang] = Counter(F.lex.v(n) for n in words_of_lang)
# Build a per-lexeme table comparing the stored freq_lex / rank_lex features
# with the frequency recomputed above (lex_count), then sort by rank.
data = []
for i in F.otype.s("lex"):
    lang = F.language.v(i)
    # Hoisted: F.lex.v(i) was computed twice per row (column value + counter key).
    lex = F.lex.v(i)
    data.append(
        {
            "id": i,
            "lex": lex,
            "lang": lang,
            "freq_lex": F.freq_lex.v(i),
            "rank_lex": F.rank_lex.v(i),
            # Frequency recomputed from the word nodes, per language.
            "freq_cnt": lex_count[lang][lex],
        }
    )
df = pd.DataFrame(data)
df = df.set_index(["id"])
df.sort_values("rank_lex")
lex | lang | freq_lex | rank_lex | freq_cnt | |
---|---|---|---|---|---|
id | |||||
1443538 | W | Aramaic | 731 | 0 | 731 |
1437609 | W | Hebrew | 50272 | 0 | 50272 |
1443534 | L | Aramaic | 378 | 1 | 378 |
1437607 | H | Hebrew | 30386 | 1 | 30386 |
1437629 | L | Hebrew | 20069 | 2 | 20069 |
... | ... | ... | ... | ... | ... |
1444624 | >LP=[ | Hebrew | 1 | 5713 | 1 |
1444625 | JWY>T/ | Hebrew | 1 | 5713 | 1 |
1441980 | >MH====/ | Hebrew | 1 | 5713 | 1 |
1444616 | MDXPH/ | Hebrew | 1 | 5713 | 1 |
1446831 | JCC/ | Hebrew | 1 | 5713 | 1 |
9230 rows × 5 columns
# The ten most frequent lexemes among the word nodes.
top_ten = F.lex.freqList(nodeTypes={"word"})[0:10]
for value, frequency in top_ten:
    print(f"{value} {frequency}")
W 51003 H 30392 L 20447 B 15768 >T 10987 MN 7681 JHWH/ 6828 <L 5870 >L 5521 >CR 5500
# Node ids of the two "W" lexemes found above; wa/wh are reused by later cells.
wa = 1443538  # Aramaic
wh = 1437609  # Hebrew
for lx in (wa, wh):
    lexeme = F.lex.v(lx)
    language = F.language.v(lx)
    print(f"{lexeme} {language}")
W Aramaic W Hebrew
# Stored frequency of the Aramaic W lexeme.
F.freq_lex.v(wa)
731
# Stored frequency of the Hebrew W lexeme.
F.freq_lex.v(wh)
50272
# Stored rank of the Aramaic W lexeme.
F.rank_lex.v(wa)
0
# Stored rank of the Hebrew W lexeme — both rank 0, each within its own language.
F.rank_lex.v(wh)
0