We experiment with numpy arrays for storing a lot of data.
We load the BHSA in the normal way, and then we write code to represent the levUp data, which is a list of lists of numbers.
Can we represent this as a numpy array, and what is the performance gain in terms of memory, and is there a performance penalty in terms of speed?
%load_ext autoreload
%autoreload 2
from tf.app import use
import functools
from timeit import timeit
import numpy
from pack import deepSize
def testPerformance(data):
testMember = 100000
times = 10000000
xTime = timeit("data[testMember]", globals=locals(), number=times)
return xTime
# Load the BHSA corpus from a local clone; hoist=globals() injects the
# TF API members (e.g. C, T used below) into the notebook's global scope.
A = use("ETCBC/bhsa:clone", checkout="clone", hoist=globals(), silent="verbose")
Locating corpus resources ...
This is Text-Fabric 11.4.6 122 features found and 0 ignored | 0.80s T otype from ~/github/ETCBC/bhsa/tf/2021 | 11s T oslots from ~/github/ETCBC/bhsa/tf/2021 12s Dataset without structure sections in otext:no structure functions in the T-API | 0.00s T book@zh from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@pa from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ja from ~/github/ETCBC/bhsa/tf/2021 | 0.99s T g_cons_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@el from ~/github/ETCBC/bhsa/tf/2021 | 1.05s T g_word from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ur from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@la from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@es from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@id from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ko from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@fr from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@pt from ~/github/ETCBC/bhsa/tf/2021 | 0.95s T lex from ~/github/ETCBC/bhsa/tf/2021 | 0.96s T g_cons from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ar from ~/github/ETCBC/bhsa/tf/2021 | 0.82s T trailer from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@sw from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@en from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@am from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@ru from ~/github/ETCBC/bhsa/tf/2021 | 0.97s T lex_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@de from ~/github/ETCBC/bhsa/tf/2021 | 0.04s T chapter from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@hi from ~/github/ETCBC/bhsa/tf/2021 | 0.83s T trailer_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@tr from ~/github/ETCBC/bhsa/tf/2021 | 0.04s T verse from ~/github/ETCBC/bhsa/tf/2021 | 0.01s T qere_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T qere_trailer_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 1.12s T g_word_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@nl from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@bn from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@yo from ~/github/ETCBC/bhsa/tf/2021 | 
0.01s T qere from ~/github/ETCBC/bhsa/tf/2021 | 1.00s T g_lex_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@fa from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@da from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@he from ~/github/ETCBC/bhsa/tf/2021 | 1.03s T g_lex from ~/github/ETCBC/bhsa/tf/2021 | 0.05s T book from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T book@syc from ~/github/ETCBC/bhsa/tf/2021 | 1.01s T voc_lex_utf8 from ~/github/ETCBC/bhsa/tf/2021 | 0.00s T qere_trailer from ~/github/ETCBC/bhsa/tf/2021 | | 0.26s C __levels__ from otype, oslots, otext | | 6.62s C __order__ from otype, oslots, __levels__ | | 0.30s C __rank__ from otype, __order__ | | 17s C __levUp__ from otype, oslots, __rank__ | | 11s C __levDown__ from otype, __levUp__, __rank__ | | 0.73s C __characters__ from otext | | 3.20s C __boundary__ from otype, oslots, __rank__ | | 0.06s C __sections__ from otype, oslots, otext, __levUp__, __levels__, book, chapter, verse 1m 02s All features loaded/computed - for details use TF.isLoaded() | 0.17s T code from ~/github/ETCBC/bhsa/tf/2021 | 1.09s T det from ~/github/ETCBC/bhsa/tf/2021 | 0.18s T domain from ~/github/ETCBC/bhsa/tf/2021 | 0.84s T freq_lex from ~/github/ETCBC/bhsa/tf/2021 | 0.53s T function from ~/github/ETCBC/bhsa/tf/2021 | 0.96s T gloss from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T gn from ~/github/ETCBC/bhsa/tf/2021 | 0.15s T label from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T language from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T ls from ~/github/ETCBC/bhsa/tf/2021 | 0.85s T mother from ~/github/ETCBC/bhsa/tf/2021 | 0.09s T nametype from ~/github/ETCBC/bhsa/tf/2021 | 0.84s T nme from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T nu from ~/github/ETCBC/bhsa/tf/2021 | 2.32s T number from ~/github/ETCBC/bhsa/tf/2021 | 0.19s T pargr from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T pdp from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T pfm from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T prs from ~/github/ETCBC/bhsa/tf/2021 | 0.87s T prs_gn from ~/github/ETCBC/bhsa/tf/2021 | 
0.87s T prs_nu from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T prs_ps from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T ps from ~/github/ETCBC/bhsa/tf/2021 | 0.82s T rank_lex from ~/github/ETCBC/bhsa/tf/2021 | 1.49s T rela from ~/github/ETCBC/bhsa/tf/2021 | 0.90s T sp from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T st from ~/github/ETCBC/bhsa/tf/2021 | 0.16s T tab from ~/github/ETCBC/bhsa/tf/2021 | 0.18s T txt from ~/github/ETCBC/bhsa/tf/2021 | 1.46s T typ from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T uvf from ~/github/ETCBC/bhsa/tf/2021 | 0.86s T vbe from ~/github/ETCBC/bhsa/tf/2021 | 0.89s T vbs from ~/github/ETCBC/bhsa/tf/2021 | 1.00s T voc_lex from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T vs from ~/github/ETCBC/bhsa/tf/2021 | 0.88s T vt from ~/github/ETCBC/bhsa/tf/2021 29s All additional features loaded - for details use TF.isLoaded()
Name | # of nodes | # slots/node | % coverage |
---|---|---|---|
book | 39 | 10938.21 | 100 |
chapter | 929 | 459.19 | 100 |
lex | 9230 | 46.22 | 100 |
verse | 23213 | 18.38 | 100 |
half_verse | 45179 | 9.44 | 100 |
sentence | 63717 | 6.70 | 100 |
sentence_atom | 64514 | 6.61 | 100 |
clause | 88131 | 4.84 | 100 |
clause_atom | 90704 | 4.70 | 100 |
phrase | 253203 | 1.68 | 100 |
phrase_atom | 267532 | 1.59 | 100 |
subphrase | 113850 | 1.42 | 38 |
word | 426590 | 1.00 | 100 |
# Sanity check: indexing a uint32 numpy array yields numpy.uint32 scalars,
# not plain Python ints (the two outputs below confirm this).
x = numpy.array([1, 2, 3], dtype=numpy.uint32)
type(x[0])
numpy.uint32
type(x[0]) is numpy.uint32
True
# Sample query: words with sp=verb inside Pred phrases inside clauses.
results = A.search("""
clause
phrase function=Pred
word sp=verb
""")
0.47s 57070 results
results[0:3]
[(427559, 651574, 3), (427560, 651579, 15), (427563, 651589, 33)]
# Plain text of the clause node of the first result.
T.text(results[0][0])
'בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ '
# Render the first three results in rich display form.
A.show(results, end=3)
result 1
result 2
result 3
Now we compute an alternative representation for the levUp data.
# Convenience handles on the loaded Text-Fabric objects.
info = A.info  # progress logger callback
error = A.error  # error logger callback
otype = A.TF.features["otype"].data  # tuple; unpacked below as (otype, maxSlot, maxNode, slotType)
oslots = A.TF.features["oslots"].data  # oslots[0]: slot tuples per non-slot node
rank = C.rank.data  # canonical rank per node; rank[n - 1] belongs to node n
def levUp(info, error, otype, oslots, rank):
    """Compute, per node, a numpy array of its embedder nodes.

    A node *m* embeds node *n* when *m*'s slot set contains all of *n*'s
    slots. Each node's embedders are returned sorted by descending rank.
    This is a numpy-backed alternative to the precomputed C.levUp data.

    Parameters
    ----------
    info, error: function
        Logging callbacks (only ``info`` is used here).
    otype: tuple
        ``(otypeData, maxSlot, maxNode, slotType)`` as stored in the
        TF ``otype`` feature.
    oslots: tuple
        TF ``oslots`` feature data; ``oslots[0][i]`` is the tuple of
        slots of node ``maxSlot + 1 + i``.
    rank: sequence of int
        ``rank[n - 1]`` is the canonical rank of node ``n``.

    Returns
    -------
    numpy.ndarray (dtype=object)
        Item ``n - 1`` holds a uint32 array with the embedders of node
        ``n``, sorted by descending rank.
    """
    (otype, maxSlot, maxNode, slotType) = otype
    oslots = oslots[0]

    info("making inverse of edge feature oslots")
    # oslotsInv: slot -> set of non-slot nodes whose slot set contains it.
    oslotsInv = {}
    for (k, mList) in enumerate(oslots):
        for m in mList:
            oslotsInv.setdefault(m, set()).add(k + 1 + maxSlot)

    info("listing embedders of all nodes")
    embedders = []
    # Slot nodes: their embedders are exactly the nodes containing that slot.
    for n in range(1, maxSlot + 1):
        contentEmbedders = oslotsInv.get(n, tuple())
        embedders.append(
            numpy.array(
                sorted(
                    (m for m in contentEmbedders if m != n),
                    key=lambda k: -rank[k - 1],
                ),
                dtype="uint32",
            )
        )

    # Non-slot nodes: intersect the embedder sets of all their slots.
    # Results are cached per slot tuple, so nodes with identical slot sets
    # share one array (memory win).
    # NOTE(review): when two distinct nodes share a slot tuple, the cached
    # array was computed with the *first* node's self-exclusion, so the
    # second node may appear in its own embedder list — confirm whether
    # that matches the precomputed C.levUp semantics.
    seen = {}
    for n in range(maxSlot + 1, maxNode + 1):
        mList = tuple(oslots[n - maxSlot - 1])
        if mList in seen:
            theseEmbedders = seen[mList]
        else:
            if len(mList) == 0:
                # BUG FIX: numpy.array() takes at least one argument and
                # raised TypeError here; the intent is an empty uint32 array.
                theseEmbedders = numpy.array([], dtype="uint32")
            else:
                contentEmbedders = functools.reduce(
                    lambda x, y: x & oslotsInv[y],
                    mList[1:],
                    oslotsInv[mList[0]],
                )
                theseEmbedders = numpy.array(
                    sorted(
                        (m for m in contentEmbedders if m != n),
                        key=lambda k: -rank[k - 1],
                    ),
                    dtype="uint32",
                )
            seen[mList] = theseEmbedders
        embedders.append(theseEmbedders)

    # dtype=object: a ragged array of per-node uint32 arrays.
    return numpy.array(embedders, dtype=object)
# Build the numpy-based levUp representation and compare it with the
# precomputed C.levUp data on memory and lookup speed.
levUpN = levUp(info, error, otype, oslots, rank)
36m 31s making inverse of edge feature oslots 36m 32s listing embedders of all nodes
# Memory: ~310 MB (tuples of ints) vs ~11.6 MB (numpy) — roughly 27x smaller.
deepSize(C.levUp.data)
310102620
deepSize(levUpN)
11574760
# Speed: indexing the numpy object array is about 2x slower than the tuple.
testPerformance(C.levUp.data)
0.15462375000061002
testPerformance(levUpN)
0.30584074999933364