We read the BHSA feature g_word_utf8, which maps nearly half a million integers to Hebrew word occurrences in the Hebrew Bible.
We measure the execution time of a second run of the last cell, so that we do not count warm-up effects.
import os
import sys
from typing import Dict
base = '~/text-fabric-data/github/ETCBC/bhsa/tf/2021'
feature = 'g_word_utf8'
featurePath = f'{os.path.expanduser(base)}/{feature}.tf'
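The path assumes that the BHSA data is already present under ~/text-fabric-data. If it is not, it can be fetched with the Text-Fabric app API (a sketch; the exact call may differ per Text-Fabric version):
# optional setup step: download the BHSA data on first use
# (assumes the tf.app.use API of recent Text-Fabric versions)
from tf.app import use
A = use('ETCBC/bhsa')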
def error(msg):
sys.stderr.write(f'{msg}\n')
def showResults(errors, data):
if errors == 0:
maxNode = max(data.keys()) if type(data) is dict else len(data)
print(f'{len(data)} results, last node {maxNode}')
print(data[1])
print(data[2])
print(data[maxNode if type(data) is dict else maxNode - 1])
else:
print(f'{errors} errors')
def valueFromTf(tf):
return '\\'.join(x.replace('\\t', '\t').replace('\\n', '\n') for x in tf.split('\\\\'))
def setFromSpec(spec):
covered = set()
for r_str in spec.split(','):
bounds = r_str.split('-')
if len(bounds) == 1:
covered.add(int(r_str))
else:
b = int(bounds[0])
e = int(bounds[1])
if (e < b):
(b, e) = (e, b)
for n in range(b, e + 1):
covered.add(n)
return covered
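To see what these helpers do (an illustrative sketch, not part of the original code): setFromSpec expands a comma-separated node specification with ranges into a set of node numbers, and valueFromTf undoes the escaping of tabs, newlines, and backslashes in a stored value.
# illustrative calls to the two helpers above
print(setFromSpec('1-3,7'))           # {1, 2, 3, 7}
print(repr(valueFromTf('a\\tb')))     # 'a\tb': the escaped tab becomes a real tab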
Just read a TF feature from disk: work through the metadata and deliver all lines in memory, plus the index of the first data line.
The whole file gets slurped.
def readFile(path):
if not os.path.exists(path):
error('TF reading: feature file "{}" does not exist'.format(path))
return False
with open(path, encoding='utf8') as fh:
contents = fh.read()
lines = contents.split('\n')
if lines[-1] == '':
lines.pop()
i = 0
for line in lines:
i += 1
if line.startswith('@'):
continue
else:
if line != '':
error('Line {}: missing blank line after metadata'.format(i))
return False
else:
break
return (lines, i)
The readTf function as it is implemented in Text-Fabric.
def readTf(path):
if not os.path.exists(path):
error('TF reading: feature file "{}" does not exist'.format(path))
return False
fh = open(path, encoding='utf8')
i = 0
for line in fh:
i += 1
text = line.rstrip()
if text.startswith('@'):
continue
else:
if text != '':
error('Line {}: missing blank line after metadata'.format(i))
fh.close()
return False
else:
break
result = readDataTf(fh, i)
fh.close()
return result
Reading the data part of a feature and storing it in a dict.
def readDataTf(fh, firstI):
i = firstI
implicit_node = 1
data = {}
normFields = 2
isNum = False
errors = 0
for line in fh:
i += 1
fields = line.rstrip('\n').split('\t')
lfields = len(fields)
if lfields > normFields:
error(f'{i}: wrongFields')
errors += 1
continue
if lfields == normFields:
nodes = setFromSpec(fields[0])
valTf = fields[-1]
else:
nodes = {implicit_node}
if lfields == 1:
valTf = fields[0]
else:
valTf = ''
implicit_node = max(nodes) + 1
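# decode the value: int for numeric features (None when empty),
# '' for an empty string value, otherwise unescape \t, \n and \\ via valueFromTf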
value = (
int(valTf) if isNum and valTf != '' else None if isNum else ''
if valTf == '' else valueFromTf(valTf)
)
for n in nodes:
if value is not None:
data[n] = value
return (errors, data)
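A quick check on a made-up miniature feature file (hypothetical content and path, just to show how node specs, implicit nodes, and values end up in the dict; the line without a node spec lands on the next implicit node, 7):
# write a tiny, made-up feature file and parse it with readTf
sampleTf = (
    '@node\n'
    '@valueType=str\n'
    '\n'
    '1\tin\n'
    '2\tthe\n'
    '4-6\tbeginning\n'
    'implicit\n'
)
samplePath = '/tmp/sample.tf'
with open(samplePath, 'w', encoding='utf8') as f:
    f.write(sampleTf)
print(readTf(samplePath))
# expected: (0, {1: 'in', 2: 'the', 4: 'beginning', 5: 'beginning', 6: 'beginning', 7: 'implicit'})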
A variant: read a TF feature and store it in a list.
def readTfList(path):
if not os.path.exists(path):
error('TF reading: feature file "{}" does not exist'.format(path))
return False
fh = open(path, encoding='utf8')
i = 0
for line in fh:
i += 1
text = line.rstrip()
if text.startswith('@'):
continue
else:
if text != '':
error('Line {}: missing blank line after metadata'.format(i))
fh.close()
return False
else:
break
result = readDataTfList(fh, i)
fh.close()
return result
def readDataTfList(fh, firstI):
i = firstI
implicit_node = 1
data = []
normFields = 2
isNum = False
errors = 0
for line in fh:
i += 1
fields = line.rstrip('\n').split('\t')
lfields = len(fields)
if lfields > normFields:
error(f'{i}: wrongFields')
errors += 1
continue
if lfields == normFields:
nodes = setFromSpec(fields[0])
valTf = fields[-1]
else:
nodes = {implicit_node}
if lfields == 1:
valTf = fields[0]
else:
valTf = ''
implicit_node = max(nodes) + 1
value = (
int(valTf) if isNum and valTf != '' else None if isNum else ''
if valTf == '' else valueFromTf(valTf)
)
for n in nodes:
if value is not None:
data.append(value)
return (errors, data)
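In the list variant the node number is only implicit in the position: node n ends up at index n - 1, provided every node from 1 onwards receives a value. A small equivalence check between the two variants (a sketch; it just prints whether that correspondence holds for this feature):
# compare the dict and list variants node by node (assumes both reads succeed)
(errorsD, dataDict) = readTf(featurePath)
(errorsL, dataList) = readTfList(featurePath)
print(all(dataDict.get(n) == dataList[n - 1] for n in range(1, len(dataList) + 1)))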
Read a TF feature by slurping.
def readTfSlurp(path):
if not os.path.exists(path):
error('TF reading: feature file "{}" does not exist'.format(path))
return False
with open(path, encoding='utf8') as fh:
contents = fh.read()
lines = contents.split('\n')
if lines[-1] == '':
lines.pop()
i = 0
for line in lines:
i += 1
if line.startswith('@'):
continue
else:
if line != '':
error('Line {}: missing blank line after metadata'.format(i))
return False
else:
break
result = readDataTfSlurp(lines, i)
return result
def readDataTfSlurp(lines, firstI):
i = firstI - 1
implicit_node = 1
data = {}
normFields = 2
isNum = False
errors = 0
for line in lines[firstI:]:
i += 1
fields = line.split('\t')
lfields = len(fields)
if lfields > normFields:
error(f'{i}: wrongFields')
errors += 1
continue
if lfields == normFields:
nodes = setFromSpec(fields[0])
valTf = fields[-1]
else:
nodes = {implicit_node}
if lfields == 1:
valTf = fields[0]
else:
valTf = ''
implicit_node = max(nodes) + 1
value = (
int(valTf) if isNum and valTf != '' else None if isNum else ''
if valTf == '' else valueFromTf(valTf)
)
for n in nodes:
if value is not None:
data[n] = value
return (errors, data)
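The same slurp-based reader once more; the only difference is the explicit type annotation on the data dict.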
def readDataTfSlurpOpt(lines, firstI):
i = firstI - 1
implicit_node = 1
data: Dict[int, str] = dict()
normFields = 2
isNum = False
errors = 0
for line in lines[firstI:]:
i += 1
fields = line.split('\t')
lfields = len(fields)
if lfields > normFields:
error(f'{i}: wrongFields')
errors += 1
continue
if lfields == normFields:
nodes = setFromSpec(fields[0])
valTf = fields[-1]
else:
nodes = {implicit_node}
if lfields == 1:
valTf = fields[0]
else:
valTf = ''
implicit_node = max(nodes) + 1
value = (
int(valTf) if isNum and valTf != '' else None if isNum else ''
if valTf == '' else valueFromTf(valTf)
)
for n in nodes:
if value is not None:
data[n] = value
return (errors, data)
%%timeit
(errors, data) = readTf(featurePath)
312 ms ± 883 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Execution time: around 1.2s
showResults(errors, data)
426590 results, last node 426590
בְּ
רֵאשִׁ֖ית
יָֽעַל
%%timeit
(errors, data) = readTfList(featurePath)
302 ms ± 675 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Execution time: around 1.2s
showResults(errors, data)
426590 results, last node 426590
בְּ
רֵאשִׁ֖ית
יָֽעַל
%%timeit
(errors, data) = readTfSlurp(featurePath)
297 ms ± 1.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
showResults(errors, data)
426584 results, last node 426584
בְּ
רֵאשִׁ֖ית
יָֽעַל
Execution time: around 1.1s
%%timeit
(lines, first) = readFile(featurePath)
24.7 ms ± 393 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Execution time: around 0.1s
(errors, data) = readDataTfSlurpOpt(lines, first)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[48], line 1
----> 1 (errors, data) = readDataTfSlurpOpt(lines, first)

NameError: name 'lines' is not defined
Execution time: around 1.0s
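The NameError occurs because variables assigned inside a %%timeit cell are not kept in the notebook namespace. A way around it (a sketch) is to time a single run with %time, which does keep the assignments:
%time (lines, first) = readFile(featurePath)
%time (errors, data) = readDataTfSlurpOpt(lines, first)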
showResults(errors, data)
426584 results, last node 426584
בְּ
רֵאשִׁ֖ית
יָֽעַל