We measure the memory size and access times for features of Text-Fabric.
How much space does a loaded feature occupy in RAM?
How fast can we look up the value of a feature for a given node?
It turns out that nothing beats the Python dictionary.
import gzip
from timeit import timeit
import numpy as np
import pandas as pd
from tf.core.data import Data
from tf.core.timestamp import Timestamp
from tf.advanced.helpers import dm
from pack import deepSize
We load some features from the BHSA.
They have different sparsity characteristics, as we shall see.
We also compile this data as Pandas data.
The closest data structure is a Pandas Series.
We also test it with a sparse array as data for the series.
We start with a Series object based on the data of the feature.
# Directory where the compiled .tf feature files reside.
TEMP = "_temp"

# BHSA features under test; they have different sparsity characteristics.
FEATURES = """
vs
g_word_utf8
rela
""".strip().split()

# Markdown table header for the memory report.
# Plain strings: there are no placeholders, so no f-prefix is needed.
HEAD = """\
feature | length | start | end | NaN | %NaN | python dict | numpy | pandas | pandas-sparse
--- | --- | --- | --- | --- | --- | --- | --- | --- | ---
"""

# Markdown table header for the timing report.
# The separator row must have as many columns as the header row (5).
HEAD_TIME = """\
feature | python dict | numpy | pandas | pandas-sparse
--- | --- | --- | --- | ---
"""

# Feature values that count as "not available".
NONES = {None, "NA"}

TIMES = 1

# Node numbers used as lookup keys in the timing measurements.
KEYS = (100001, 1000001)

T = Timestamp()
We define a class of feature test objects where we store data in various representations.
We measure memory usage and add methods to measure the access times.
class FeatureTest:
    """Holds one TF feature in several representations and measures them.

    The feature is loaded from its .tf file; memory footprints (in MB)
    and lookup times (in s) are collected in instance attributes for
    later reporting as markdown table rows.
    """

    def __init__(self, feat):
        # Load the feature data from its .tf file in the TEMP directory.
        dataObj = Data(f"{TEMP}/{feat}.tf", T)
        dataObj.load()
        T.indent(level=0)
        data = dataObj.data
        start = min(data)
        end = max(data)
        self.start = start
        self.end = end
        # Count nodes in [start, end] whose value is absent or "NA".
        self.nan = sum(1 for n in range(start, end + 1) if data.get(n, None) in NONES)
        self.feat = feat
        self.data = data
        self.ln = len(data)
        self.mem = deepSize(data) // (1024 * 1024)

    def adjust(self, totalMax):
        # totalMax is the highest node number over all loaded features;
        # the NaN percentage is deliberately taken relative to that
        # common maximum so the features are comparable.
        self.totalMax = totalMax
        self.nanPerc = self.nan * 100 / totalMax

    def numpy(self):
        """Materialize the feature as a dense NumPy unicode array."""
        data = self.data
        totalMax = self.totalMax
        array = [data.get(i, "") for i in range(totalMax + 1)]
        # BUGFIX: np.str was a deprecated alias for builtin str and has
        # been removed in NumPy >= 1.24; pass str itself (same behavior).
        dataN = np.array(array, str)
        self.dataN = dataN
        self.memN = deepSize(dataN) // (1024 * 1024)

    def pandas(self):
        """Materialize the feature as a pandas Series and a sparse Series."""
        data = self.data
        dataP = pd.Series(data, dtype="string")
        self.dataP = dataP
        self.memP = dataP.memory_usage(index=True, deep=True) // (1024 * 1024)
        totalMax = self.totalMax
        # Map missing and falsy values to the "NA" fill value of the sparse array.
        array = [data.get(i, "NA") or "NA" for i in range(totalMax + 1)]
        dataSP = pd.Series(pd.arrays.SparseArray(array, fill_value="NA", dtype="string"))
        self.dataSP = dataSP
        self.memSP = dataSP.memory_usage(index=True, deep=False) // (1024 * 1024)

    def accessTime(self, times):
        """Time `times` lookups per key in the plain dict representation."""
        data = self.data
        locs = locals()
        self.access = sum(timeit(f"data.get({key}, None)", globals=locs, number=times) for key in KEYS)

    def accessTimeN(self, times):
        """Time lookups in the NumPy representation (with a bounds check)."""
        data = self.dataN
        locs = locals()
        self.accessN = sum(
            timeit(f"data[{key}] if {key} < data.size else None", globals=locs, number=times)
            for key in KEYS
        )

    def accessTimeP(self, times):
        """Time lookups in the pandas Series representation."""
        dataP = self.dataP
        locs = locals()
        self.accessP = sum(timeit(f"dataP.get({key})", globals=locs, number=times) for key in KEYS)

    def accessTimeSP(self, times):
        """Time lookups in the sparse pandas Series representation."""
        dataSP = self.dataSP
        locs = locals()
        self.accessSP = sum(timeit(f"dataSP.get({key})", globals=locs, number=times) for key in KEYS)

    def report(self):
        """Return this feature's memory-report row as markdown."""
        return (
            f"{self.feat} | "
            f"{self.ln} | "
            f"{self.start} | "
            f"{self.end} | "
            f"{self.nan} | "
            f"{self.nanPerc} | "
            f"{self.mem} MB | "
            f"{self.memN} MB | "
            f"{self.memP} MB | "
            f"{self.memSP} MB\n"
        )

    def reportTime(self):
        """Return this feature's timing-report row as markdown."""
        return (
            f"{self.feat} | "
            f"{self.access} s | "
            f"{self.accessN} s | "
            f"{self.accessP} s | "
            f"{self.accessSP} s\n"
        )
We collect the feature test objects in a general test object.
class DataTest:
    """Bundles a FeatureTest object per feature and drives the measurements."""

    def __init__(self):
        # Stage 1: load every feature and track the overall maximum node.
        T.indent(reset=True)
        collected = {}
        highestNode = 0
        for feat in FEATURES:
            T.info(f"stage1 {feat}")
            fObj = FeatureTest(feat)
            highestNode = max(highestNode, fObj.end)
            collected[feat] = fObj
        T.info("done")
        self.features = collected
        # Stage 2: build the alternative representations per feature.
        for (feat, fObj) in collected.items():
            T.info(f"stage2 {feat}")
            fObj.adjust(highestNode)
            fObj.numpy()
            fObj.pandas()
        T.info("done")

    def accessTime(self, times):
        # Run all four timing measurements for every feature.
        for (feat, fObj) in self.features.items():
            T.info(f"timing {feat}")
            fObj.accessTime(times)
            fObj.accessTimeN(times)
            fObj.accessTimeP(times)
            fObj.accessTimeSP(times)

    def report(self):
        # Assemble the memory report and render it as markdown.
        rows = [HEAD]
        rows.extend(fObj.report() for fObj in self.features.values())
        dm("".join(rows))

    def reportTime(self):
        # Assemble the timing report and render it as markdown.
        rows = [HEAD_TIME]
        rows.extend(fObj.reportTime() for fObj in self.features.values())
        dm("".join(rows))

    def test(self, feat, nodes):
        # Show the value of `feat` for each node in all four representations.
        fObj = self.features[feat]
        rows = [
            "tf | numpy | pandas | pandas-sparse",
            "---|---|---|---",
        ]
        for i in nodes:
            rows.append(f"{fObj.data[i]} | {fObj.dataN[i]} | {fObj.dataP[i]} | {fObj.dataSP[i]}")
        dm("\n".join(rows) + "\n")
We load the features and measure the sizes.
# Load the features, build all representations, and show the memory report.
DT = DataTest()
DT.report()
0.00s stage1 vs 0.74s stage1 g_word_utf8 1.63s stage1 rela 2.89s done 2.89s stage2 vs 3.46s stage2 g_word_utf8 4.13s stage2 rela 4.82s done
feature | length | start | end | NaN | %NaN | python dict | numpy | pandas | pandas-sparse |
---|---|---|---|---|---|---|---|---|---|
vs | 426584 | 1 | 426584 | 352874 | 24.949499877329067 | 31 MB | 21 MB | 27 MB | 0 MB |
g_word_utf8 | 426584 | 1 | 426584 | 0 | 0.0 | 39 MB | 140 MB | 41 MB | 4 MB |
rela | 722681 | 427553 | 1414353 | 894226 | 63.225093028402384 | 59 MB | 21 MB | 46 MB | 1 MB |
A few checks whether the data representations give back the right data:
# Sanity check: the first 10 nodes of the sparse `vs` feature agree across representations.
DT.test("vs", range(1, 11))
tf | numpy | pandas | pandas-sparse |
---|---|---|---|
NA | NA | NA | NA |
NA | NA | NA | NA |
qal | qal | qal | qal |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
# Sanity check: the first 10 nodes of the dense `g_word_utf8` feature agree across representations.
DT.test("g_word_utf8", range(1, 11))
tf | numpy | pandas | pandas-sparse |
---|---|---|---|
בְּ | בְּ | בְּ | בְּ |
רֵאשִׁ֖ית | רֵאשִׁ֖ית | רֵאשִׁ֖ית | רֵאשִׁ֖ית |
בָּרָ֣א | בָּרָ֣א | בָּרָ֣א | בָּרָ֣א |
אֱלֹהִ֑ים | אֱלֹהִ֑ים | אֱלֹהִ֑ים | אֱלֹהִ֑ים |
אֵ֥ת | אֵ֥ת | אֵ֥ת | אֵ֥ת |
הַ | הַ | הַ | הַ |
שָּׁמַ֖יִם | שָּׁמַ֖יִם | שָּׁמַ֖יִם | שָּׁמַ֖יִם |
וְ | וְ | וְ | וְ |
אֵ֥ת | אֵ֥ת | אֵ֥ת | אֵ֥ת |
הָ | הָ | הָ | הָ |
# Sanity check: nodes in the range where `rela` has values agree across representations.
DT.test("rela", range(427608, 427619))
tf | numpy | pandas | pandas-sparse |
---|---|---|---|
Adju | Adju | Adju | Adju |
NA | NA | NA | NA |
NA | NA | NA | NA |
NA | NA | NA | NA |
Adju | Adju | Adju | Adju |
Coor | Coor | Coor | Coor |
Coor | Coor | Coor | Coor |
NA | NA | NA | NA |
Objc | Objc | Objc | Objc |
NA | NA | NA | NA |
NA | NA | NA | NA |
# Time 10000 lookups per key per representation, then show the timing report.
DT.accessTime(10000)
DT.reportTime()
10s timing vs 11s timing g_word_utf8 15s timing rela
feature | python dict | numpy | pandas | pandas-sparse |
---|---|---|---|---|
vs | 0.0018969129999995005 s | 0.009688880000002342 s | 0.09362182199999936 s | 0.6480762519999956 s |
g_word_utf8 | 0.0013883559999996464 s | 0.00787365400000084 s | 0.07825148999999954 s | 4.277200847000003 s |
rela | 0.0013625329999982227 s | 0.008385869000001378 s | 0.08113798900000546 s | 0.723207390999999 s |
Storage in NumPy is worse than in a Python dict for the dense string feature, although it is smaller for the sparse features. The access time is worse across the board, in the order of 5 times.
Storage in a Pandas series is slightly better space-wise than in a Python dict. However, the access time is 50 times worse.
In a Pandas sparse series, the storage is much smaller, but the access time is 300-3000 times worse.
For Text-Fabric, no performance gains are to be expected when turning to Pandas or Numpy as workhorses for storing and accessing features.