We experiment with numpy arrays for storing a lot of data.
We load the BHSA in the normal way, and then we write code to represent the levUp data, which is a list of lists of numbers.
Can we represent this as a numpy array, and what is the performance gain in terms of memory, and is there a performance penalty in terms of speed?
%load_ext autoreload
%autoreload 2
from tf.app import use
import functools
from timeit import timeit
import numpy
from pack import deepSize
def testPerformance(data):
testMember = 100000
times = 10000000
xTime = timeit("data[testMember]", globals=locals(), number=times)
return xTime
# Load the BHSA corpus from a local clone; hoist=globals() injects the
# TF API members (e.g. C, T used below) into the notebook's global scope.
A = use("ETCBC/bhsa:clone", checkout="clone", hoist=globals(), silent="verbose")
Locating corpus resources ...
This is Text-Fabric 11.4.6 122 features found and 0 ignored | 0.80s T otype from ~/github/ETCBC/bhsa/tf/2021 | 11s T oslots from ~/github/ETCBC/bhsa/tf/2021 12s Dataset without structure sections in otext:no structure functions in the T-API | 0.00s T book@zh from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@pa from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ja from ~/github/ETCBC/bhsa/tf/2021 | 0.99s T g_cons_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@el from ~/github/ETCBC/bhsa/tf/2021 | 1.05s T g_word from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ur from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@la from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@es from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@id from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ko from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@fr from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@pt from ~/github/ETCBC/bhsa/tf/2021 | 0.95s T lex from ~/github/ETCBC/bhsa/tf/2021 | 0.96s T g_cons from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ar from ~/github/ETCBC/bhsa/tf/2021 | 0.82s T trailer from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@sw from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@en from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@am from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ru from ~/github/ETCBC/bhsa/tf/2021 | 0.97s T lex_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@de from ~/github/ETCBC/bhsa/tf/2021 | 0.04s T chapter from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@hi from ~/github/ETCBC/bhsa/tf/2021 | 0.83s T trailer_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@tr from ~/github/ETCBC/bhsa/tf/2021 | 0.04s T verse from ~/github/ETCBC/bhsa/tf/2021 | 0.01s T qere_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T qere_trailer_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 1.12s T g_word_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@nl from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@bn from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@yo from ~/github/ETCBC/bhsa/tf/2021 | 
0.01s T qere from ~/github/ETCBC/bhsa/tf/2021 | 1.00s T g_lex_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@fa from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@da from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@he from ~/github/ETCBC/bhsa/tf/2021 | 1.03s T g_lex from ~/github/ETCBC/bhsa/tf/2021 | 0.05s T book from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@syc from ~/github/ETCBC/bhsa/tf/2021 | 1.01s T voc_lex_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T qere_trailer from ~/github/ETCBC/bhsa/tf/2021 | | 0.26s C __levels__ from otype, oslots, otext | | 6.62s C __order__ from otype, oslots, __levels__ | | 0.30s C __rank__ from otype, __order__ | | 17s C __levUp__ from otype, oslots, __rank__ | | 11s C __levDown__ from otype, __levUp__, __rank__ | | 0.73s C __characters__ from otext | | 3.20s C __boundary__ from otype, oslots, __rank__ | | 0.06s C __sections__ from otype, oslots, otext, __levUp__, __levels__, book, chapter, verse 1m 02s All features loaded/computed - for details use TF.isLoaded() | 0.17s T code from ~/github/ETCBC/bhsa/tf/2021 | 1.09s T det from ~/github/ETCBC/bhsa/tf/2021 | 0.18s T domain from ~/github/ETCBC/bhsa/tf/2021 | 0.84s T freq_lex from ~/github/ETCBC/bhsa/tf/2021 | 0.53s T function from ~/github/ETCBC/bhsa/tf/2021 | 0.96s T gloss from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T gn from ~/github/ETCBC/bhsa/tf/2021 | 0.15s T label from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T language from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T ls from ~/github/ETCBC/bhsa/tf/2021 | 0.85s T mother from ~/github/ETCBC/bhsa/tf/2021 | 0.09s T nametype from ~/github/ETCBC/bhsa/tf/2021 | 0.84s T nme from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T nu from ~/github/ETCBC/bhsa/tf/2021 | 2.32s T number from ~/github/ETCBC/bhsa/tf/2021 | 0.19s T pargr from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T pdp from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T pfm from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T prs from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T prs_gn from ~/github/ETCBC/bhsa/tf/2021 | 
0.87s T prs_nu from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T prs_ps from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T ps from ~/github/ETCBC/bhsa/tf/2021 | 0.82s T rank_lex from ~/github/ETCBC/bhsa/tf/2021 | 1.49s T rela from ~/github/ETCBC/bhsa/tf/2021 | 0.90s T sp from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T st from ~/github/ETCBC/bhsa/tf/2021 | 0.16s T tab from ~/github/ETCBC/bhsa/tf/2021 | 0.18s T txt from ~/github/ETCBC/bhsa/tf/2021 | 1.46s T typ from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T uvf from ~/github/ETCBC/bhsa/tf/2021 | 0.86s T vbe from ~/github/ETCBC/bhsa/tf/2021 | 0.89s T vbs from ~/github/ETCBC/bhsa/tf/2021 | 1.00s T voc_lex from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T vs from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T vt from ~/github/ETCBC/bhsa/tf/2021 29s All additional features loaded - for details use TF.isLoaded()
Name | # of nodes | # slots/node | % coverage |
---|---|---|---|
book | 39 | 10938.21 | 100 |
chapter | 929 | 459.19 | 100 |
lex | 9230 | 46.22 | 100 |
verse | 23213 | 18.38 | 100 |
half_verse | 45179 | 9.44 | 100 |
sentence | 63717 | 6.70 | 100 |
sentence_atom | 64514 | 6.61 | 100 |
clause | 88131 | 4.84 | 100 |
clause_atom | 90704 | 4.70 | 100 |
phrase | 253203 | 1.68 | 100 |
phrase_atom | 267532 | 1.59 | 100 |
subphrase | 113850 | 1.42 | 38 |
word | 426590 | 1.00 | 100 |
# Sanity check: indexing a uint32 numpy array yields numpy.uint32 scalars,
# not plain Python ints (the two outputs below confirm this).
x = numpy.array([1, 2, 3], dtype=numpy.uint32)
type(x[0])
numpy.uint32
type(x[0]) is numpy.uint32
True
# Sample query: words with sp=verb inside Pred phrases inside clauses.
results = A.search("""
clause
phrase function=Pred
word sp=verb
""")
0.47s 57070 results
results[0:3]
[(427559, 651574, 3), (427560, 651579, 15), (427563, 651589, 33)]
# Plain text of the clause node of the first result.
T.text(results[0][0])
'בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ '
# Render the first three results in rich display form.
A.show(results, end=3)
result 1
result 2
result 3
Now we compute an alternative representation for the levUp data.
# Convenience handles on the loaded Text-Fabric objects.
info = A.info  # progress logger callback
error = A.error  # error logger callback
otype = A.TF.features["otype"].data  # tuple; unpacked below as (otype, maxSlot, maxNode, slotType)
oslots = A.TF.features["oslots"].data  # oslots[0]: slot tuples per non-slot node
rank = C.rank.data  # canonical rank per node; rank[n - 1] belongs to node n
def levUp(info, error, otype, oslots, rank):
    """Compute, per node, a numpy array of its embedder nodes.

    A node *m* embeds node *n* when *m*'s slot set contains all of *n*'s
    slots. Each node's embedders are returned sorted by descending rank.
    This is a numpy-backed alternative to the precomputed C.levUp data.

    Parameters
    ----------
    info, error: function
        Logging callbacks (only ``info`` is used here).
    otype: tuple
        ``(otypeData, maxSlot, maxNode, slotType)`` as stored in the
        TF ``otype`` feature.
    oslots: tuple
        TF ``oslots`` feature data; ``oslots[0][i]`` is the tuple of
        slots of node ``maxSlot + 1 + i``.
    rank: sequence of int
        ``rank[n - 1]`` is the canonical rank of node ``n``.

    Returns
    -------
    numpy.ndarray (dtype=object)
        Item ``n - 1`` holds a uint32 array with the embedders of node
        ``n``, sorted by descending rank.
    """
    (otype, maxSlot, maxNode, slotType) = otype
    oslots = oslots[0]

    info("making inverse of edge feature oslots")
    # oslotsInv: slot -> set of non-slot nodes whose slot set contains it.
    oslotsInv = {}
    for (k, mList) in enumerate(oslots):
        for m in mList:
            oslotsInv.setdefault(m, set()).add(k + 1 + maxSlot)

    info("listing embedders of all nodes")
    embedders = []
    # Slot nodes: their embedders are exactly the nodes containing that slot.
    for n in range(1, maxSlot + 1):
        contentEmbedders = oslotsInv.get(n, tuple())
        embedders.append(
            numpy.array(
                sorted(
                    (m for m in contentEmbedders if m != n),
                    key=lambda k: -rank[k - 1],
                ),
                dtype="uint32",
            )
        )

    # Non-slot nodes: intersect the embedder sets of all their slots.
    # Results are cached per slot tuple, so nodes with identical slot sets
    # share one array (memory win).
    # NOTE(review): when two distinct nodes share a slot tuple, the cached
    # array was computed with the *first* node's self-exclusion, so the
    # second node may appear in its own embedder list — confirm whether
    # that matches the precomputed C.levUp semantics.
    seen = {}
    for n in range(maxSlot + 1, maxNode + 1):
        mList = tuple(oslots[n - maxSlot - 1])
        if mList in seen:
            theseEmbedders = seen[mList]
        else:
            if len(mList) == 0:
                # BUG FIX: numpy.array() takes at least one argument and
                # raised TypeError here; the intent is an empty uint32 array.
                theseEmbedders = numpy.array([], dtype="uint32")
            else:
                contentEmbedders = functools.reduce(
                    lambda x, y: x & oslotsInv[y],
                    mList[1:],
                    oslotsInv[mList[0]],
                )
                theseEmbedders = numpy.array(
                    sorted(
                        (m for m in contentEmbedders if m != n),
                        key=lambda k: -rank[k - 1],
                    ),
                    dtype="uint32",
                )
            seen[mList] = theseEmbedders
        embedders.append(theseEmbedders)

    # dtype=object: a ragged array of per-node uint32 arrays.
    return numpy.array(embedders, dtype=object)
# Build the numpy-based levUp representation and compare it with the
# precomputed C.levUp data on memory and lookup speed.
levUpN = levUp(info, error, otype, oslots, rank)
36m 31s making inverse of edge feature oslots 36m 32s listing embedders of all nodes
# Memory: ~310 MB (tuples of ints) vs ~11.6 MB (numpy) — roughly 27x smaller.
deepSize(C.levUp.data)
310102620
deepSize(levUpN)
11574760
# Speed: indexing the numpy object array is about 2x slower than the tuple.
testPerformance(C.levUp.data)
0.15462375000061002
testPerformance(levUpN)
0.30584074999933364