We read the BHSA feature g_word_utf8, which maps nearly half a million integers to Hebrew word occurrences in the Hebrew Bible.
We measure the execution time of a second run of the last cell, so that we do not count warm-up effects.
import os
import sys
from typing import Dict
base = '~/text-fabric-data/github/ETCBC/bhsa/tf/2021'
feature = 'g_word_utf8'
featurePath = f'{os.path.expanduser(base)}/{feature}.tf'
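The path assumes that the BHSA data is already present under ~/text-fabric-data. If it is not, it can be fetched with the Text-Fabric app API (a sketch; the exact call may differ per Text-Fabric version):
# optional setup step: download the BHSA data on first use
# (assumes the tf.app.use API of recent Text-Fabric versions)
from tf.app import use
A = use('ETCBC/bhsa')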
def error(msg):
sys.stderr.write(f'{msg}\n')
def showResults(errors, data):
if errors == 0:
maxNode = max(data.keys()) if type(data) is dict else len(data)
print(f'{len(data)} results, last node {maxNode}')
print(data[1])
print(data[2])
print(data[maxNode if type(data) is dict else maxNode - 1])
else:
print(f'{errors} errors')
def valueFromTf(tf):
return '\\'.join(x.replace('\\t', '\t').replace('\\n', '\n') for x in tf.split('\\\\'))
def setFromSpec(spec):
covered = set()
for r_str in spec.split(','):
bounds = r_str.split('-')
if len(bounds) == 1:
covered.add(int(r_str))
else:
b = int(bounds[0])
e = int(bounds[1])
if (e < b):
(b, e) = (e, b)
for n in range(b, e + 1):
covered.add(n)
return covered
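To see what these helpers do (an illustrative sketch, not part of the original code): setFromSpec expands a comma-separated node specification with ranges into a set of node numbers, and valueFromTf undoes the escaping of tabs, newlines, and backslashes in a stored value.
# illustrative calls to the two helpers above
print(setFromSpec('1-3,7'))           # {1, 2, 3, 7}
print(repr(valueFromTf('a\\tb')))     # 'a\tb': the escaped tab becomes a real tab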
Just read a TF feature from disk: work through the metadata and deliver all lines in memory, plus the index of the first data line.
The whole file gets slurped.
def readFile(path):
if not os.path.exists(path):
error('TF reading: feature file "{}" does not exist'.format(path))
return False
with open(path, encoding='utf8') as fh:
contents = fh.read()
lines = contents.split('\n')
if lines[-1] == '':
lines.pop()
i = 0
for line in lines:
i += 1
if line.startswith('@'):
continue
else:
if line != '':
error('Line {}: missing blank line after metadata'.format(i))
return False
else:
break
return (lines, i)
The readTf function as it is implemented in Text-Fabric.
def readTf(path):
if not os.path.exists(path):
error('TF reading: feature file "{}" does not exist'.format(path))
return False
fh = open(path, encoding='utf8')
i = 0
for line in fh:
i += 1
text = line.rstrip()
if text.startswith('@'):
continue
else:
if text != '':
error('Line {}: missing blank line after metadata'.format(i))
fh.close()
return False
else:
break
result = readDataTf(fh, i)
fh.close()
return result
Reading the data part of a feature and storing it in a dict.
def readDataTf(fh, firstI):
i = firstI
implicit_node = 1
data = {}
normFields = 2
isNum = False
errors = 0
for line in fh:
i += 1
fields = line.rstrip('\n').split('\t')
lfields = len(fields)
if lfields > normFields:
error(f'{i}: wrongFields')
errors += 1
continue
if lfields == normFields:
nodes = setFromSpec(fields[0])
valTf = fields[-1]
else:
nodes = {implicit_node}
if lfields == 1:
valTf = fields[0]
else:
valTf = ''
implicit_node = max(nodes) + 1
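# decode the value: int for numeric features (None when empty),
# '' for an empty string value, otherwise unescape \t, \n and \\ via valueFromTf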
value = (
int(valTf) if isNum and valTf != '' else None if isNum else ''
if valTf == '' else valueFromTf(valTf)
)
for n in nodes:
if value is not None:
data[n] = value
return (errors, data)
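A quick check on a made-up miniature feature file (hypothetical content and path, just to show how node specs, implicit nodes, and values end up in the dict; the line without a node spec lands on the next implicit node, 7):
# write a tiny, made-up feature file and parse it with readTf
sampleTf = (
    '@node\n'
    '@valueType=str\n'
    '\n'
    '1\tin\n'
    '2\tthe\n'
    '4-6\tbeginning\n'
    'implicit\n'
)
samplePath = '/tmp/sample.tf'
with open(samplePath, 'w', encoding='utf8') as f:
    f.write(sampleTf)
print(readTf(samplePath))
# expected: (0, {1: 'in', 2: 'the', 4: 'beginning', 5: 'beginning', 6: 'beginning', 7: 'implicit'})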
A variant: read a TF feature and store it in a list.
def readTfList(path):
if not os.path.exists(path):
error('TF reading: feature file "{}" does not exist'.format(path))
return False
fh = open(path, encoding='utf8')
i = 0
for line in fh:
i += 1
text = line.rstrip()
if text.startswith('@'):
continue
else:
if text != '':
error('Line {}: missing blank line after metadata'.format(i))
fh.close()
return False
else:
break
result = readDataTfList(fh, i)
fh.close()
return result
def readDataTfList(fh, firstI):
i = firstI
implicit_node = 1
data = []
normFields = 2
isNum = False
errors = 0
for line in fh:
i += 1
fields = line.rstrip('\n').split('\t')
lfields = len(fields)
if lfields > normFields:
error(f'{i}: wrongFields')
errors += 1
continue
if lfields == normFields:
nodes = setFromSpec(fields[0])
valTf = fields[-1]
else:
nodes = {implicit_node}
if lfields == 1:
valTf = fields[0]
else:
valTf = ''
implicit_node = max(nodes) + 1
value = (
int(valTf) if isNum and valTf != '' else None if isNum else ''
if valTf == '' else valueFromTf(valTf)
)
for n in nodes:
if value is not None:
data.append(value)
return (errors, data)
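In the list variant the node number is only implicit in the position: node n ends up at index n - 1, provided every node from 1 onwards receives a value. A small equivalence check between the two variants (a sketch; it just prints whether that correspondence holds for this feature):
# compare the dict and list variants node by node (assumes both reads succeed)
(errorsD, dataDict) = readTf(featurePath)
(errorsL, dataList) = readTfList(featurePath)
print(all(dataDict.get(n) == dataList[n - 1] for n in range(1, len(dataList) + 1)))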
Read a TF feature by slurping.
def readTfSlurp(path):
if not os.path.exists(path):
error('TF reading: feature file "{}" does not exist'.format(path))
return False
with open(path, encoding='utf8') as fh:
contents = fh.read()
lines = contents.split('\n')
if lines[-1] == '':
lines.pop()
i = 0
for line in lines:
i += 1
if line.startswith('@'):
continue
else:
if line != '':
error('Line {}: missing blank line after metadata'.format(i))
return False
else:
break
result = readDataTfSlurp(lines, i)
return result
def readDataTfSlurp(lines, firstI):
i = firstI - 1
implicit_node = 1
data = {}
normFields = 2
isNum = False
errors = 0
for line in lines[firstI:]:
i += 1
fields = line.split('\t')
lfields = len(fields)
if lfields > normFields:
error(f'{i}: wrongFields')
errors += 1
continue
if lfields == normFields:
nodes = setFromSpec(fields[0])
valTf = fields[-1]
else:
nodes = {implicit_node}
if lfields == 1:
valTf = fields[0]
else:
valTf = ''
implicit_node = max(nodes) + 1
value = (
int(valTf) if isNum and valTf != '' else None if isNum else ''
if valTf == '' else valueFromTf(valTf)
)
for n in nodes:
if value is not None:
data[n] = value
return (errors, data)
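The same slurp-based reader once more; the only difference is the explicit type annotation on the data dict.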
def readDataTfSlurpOpt(lines, firstI):
i = firstI - 1
implicit_node = 1
data: Dict[int, str] = dict()
normFields = 2
isNum = False
errors = 0
for line in lines[firstI:]:
i += 1
fields = line.split('\t')
lfields = len(fields)
if lfields > normFields:
error(f'{i}: wrongFields')
errors += 1
continue
if lfields == normFields:
nodes = setFromSpec(fields[0])
valTf = fields[-1]
else:
nodes = {implicit_node}
if lfields == 1:
valTf = fields[0]
else:
valTf = ''
implicit_node = max(nodes) + 1
value = (
int(valTf) if isNum and valTf != '' else None if isNum else ''
if valTf == '' else valueFromTf(valTf)
)
for n in nodes:
if value is not None:
data[n] = value
return (errors, data)
%%timeit
(errors, data) = readTf(featurePath)
312 ms ± 883 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Execution time: around 1.2s
showResults(errors, data)
426590 results, last node 426590
בְּ
רֵאשִׁ֖ית
יָֽעַל
%%timeit
(errors, data) = readTfList(featurePath)
302 ms ± 675 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Execution time: around 1.2s
showResults(errors, data)
426590 results, last node 426590
בְּ
רֵאשִׁ֖ית
יָֽעַל
%%timeit
(errors, data) = readTfSlurp(featurePath)
297 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
showResults(errors, data)
426584 results, last node 426584
בְּ
רֵאשִׁ֖ית
יָֽעַל
Execution time: around 1.1s
%%timeit
(lines, first) = readFile(featurePath)
24.7 ms ± 393 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Execution time: around 0.1s
(errors, data) = readDataTfSlurpOpt(lines, first)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[48], line 1
----> 1 (errors, data) = readDataTfSlurpOpt(lines, first)

NameError: name 'lines' is not defined
Execution time: around 1.0s
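The NameError occurs because variables assigned inside a %%timeit cell are not kept in the notebook namespace. A way around it (a sketch) is to time a single run with %time, which does keep the assignments:
%time (lines, first) = readFile(featurePath)
%time (errors, data) = readDataTfSlurpOpt(lines, first)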
showResults(errors, data)
426584 results, last node 426584
בְּ
רֵאשִׁ֖ית
יָֽעַל