Reading node feature in Julia¶

We read the BHSA feature g_word_utf8, which maps nearly half a million integers to Hebrew word occurrences in the Hebrew Bible.

We measure the execution time of a second run of the last cell, so that warm-up effects are not counted.

Choice of test feature¶

In [1]:

# Directory under the user's home directory that holds the TF feature files.
base = string(homedir(), "/text-fabric-data/etcbc/bhsa/tf/c")
# Name of the feature that all tests below read.
feature = "g_word_utf8"
# Full path of that feature's .tf file.
featurePath = string(base, "/", feature, ".tf")

Out[1]:

"/Users/dirk/text-fabric-data/etcbc/bhsa/tf/c/g_word_utf8.tf"

Auxiliary functions for reading a TF feature¶

In [2]:

"""
    error(msg)

Write `msg` followed by a newline to the standard error stream and return the
number of bytes written.

This definition does not throw; callers report a problem through it and then
decide themselves how to continue (typically by returning `false`).
"""
function error(msg)
    # `STDERR` was removed in Julia 1.0; the stream is now called `stderr`.
    return write(stderr, "$msg\n")
end

Out[2]:

error (generic function with 1 method)

In [3]:

"""
    showResults(errors, data)

Print a short summary of a feature read.

When `errors` is zero, print the number of entries in `data`, the largest key
of `data`, and the values stored under keys 1, 2 and that largest key, each on
its own line. Otherwise print only the error count.
"""
function showResults(errors, data)
    if errors != 0
        print("$errors errors")
        return nothing
    end
    lastNode = maximum(keys(data))
    println("$(length(data)) results, last node $lastNode")
    # Show the first two entries and the last one as a quick sanity check.
    for node in (1, 2, lastNode)
        println(data[node])
    end
    return nothing
end

Out[3]:

showResults (generic function with 1 method)

In [4]:

"""
    valueFromTf(tf)

Turn the escaped value text `tf` into the actual value.

The text is split on doubled backslashes; in every piece an escaped tab
(backslash followed by `t`) becomes a tab character and an escaped newline
(backslash followed by `n`) becomes a newline; the pieces are then joined with
a single backslash. Splitting first keeps a doubled backslash followed by `t`
or `n` from being mistaken for an escape.
"""
function valueFromTf(tf)
    pieces = split(tf, "\\\\")
    # The three-argument form `replace(s, pat, r)` was removed in Julia 1.0;
    # the pattern and its replacement are now given as a pair.
    unescaped = [replace(replace(p, "\\t" => "\t"), "\\n" => "\n") for p in pieces]
    return join(unescaped, "\\")
end

Out[4]:

valueFromTf (generic function with 1 method)

In [5]:

"""
    setFromSpec(spec)

Expand a node specification into the set of node numbers it covers.

`spec` is a comma-separated list of items; each item is either a single
number or two numbers separated by a hyphen, denoting an inclusive range.
The two bounds of a range may be given in either order.
"""
function setFromSpec(spec)::Set{UInt32}
    covered = Set{UInt32}()
    for part in split(spec, ",")
        ends = split(part, "-")
        if length(ends) == 1
            push!(covered, parse(UInt32, part))
            continue
        end
        # Normalise the bounds so that a reversed range is still covered.
        lo, hi = minmax(parse(UInt32, ends[1]), parse(UInt32, ends[2]))
        union!(covered, lo:hi)
    end
    return covered
end

Out[5]:

setFromSpec (generic function with 1 method)

Read a TF feature file from disk, skip past the metadata, and deliver all lines in memory, plus the index of the first data line.

The whole file gets slurped.

In [6]:

"""
    readFile(path)

Read the whole file at `path` into memory and locate the start of its data.

Return `(lines, start)`: `lines` holds the file split on newlines, without a
trailing empty line, and `start` is the `UInt32` index of the line after the
blank line that ends the leading block of `@`-prefixed lines.

If the file does not exist, or the leading `@` lines are followed by a
non-blank line, the problem is reported through `error` and `false` is
returned.
"""
function readFile(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    lines = split(read(path, String), "\n")
    # A final newline in the file yields one empty trailing element; drop it.
    if isempty(lines[end])
        pop!(lines)
    end
    lineNo::UInt32 = 0
    for line in lines
        lineNo += 1
        startswith(line, "@") && continue
        if !isempty(line)
            error("Line $lineNo: missing blank line after metadata")
            return false
        end
        break
    end
    # Step past the blank separator line to the first data line.
    lineNo += 1
    return (lines, lineNo)
end

Out[6]:

readFile (generic function with 1 method)

The readTf function as done in Text-Fabric.

In [7]:

"""
    readTf(path)

Read the TF feature file at `path` line by line.

The leading `@`-prefixed lines are skipped up to the blank line that follows
them; the rest of the open stream is handed to `readDataTf`, whose result is
returned.

If the file does not exist, or the `@` lines are followed by a non-blank
line, the problem is reported through `error` and `false` is returned.
"""
function readTf(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    fh = open(path)
    # try/finally closes the handle on every exit path, including an
    # exception thrown while the data lines are being parsed.
    try
        i = 0
        for line in eachline(fh)
            i += 1
            text = rstrip(line)
            startswith(text, "@") && continue
            if text != ""
                error("Line $i: missing blank line after metadata")
                return false
            end
            break
        end
        return readDataTf(fh, i)
    finally
        close(fh)
    end
end

Out[7]:

readTf (generic function with 1 method)

Reading the data part of a feature and storing it in a dict.

In [8]:

"""
    readDataTf(fh, firstI)

Read the data lines from the stream `fh` into a `Dict{Integer, String}` that
maps node numbers to values, and return `(errors, data)`.

`firstI` is the number of lines already consumed from the stream; it is only
used to number the lines in error reports.

A line has at most two tab-separated fields. With two fields the first is a
node specification (expanded by `setFromSpec`) and the second the value. With
one field the value belongs to the node following the highest node of the
previous line, starting at node 1. Lines with more fields are reported
through `error`, counted and skipped. Non-empty values are unescaped by
`valueFromTf`; only string values are produced.
"""
function readDataTf(fh, firstI)
    data = Dict{Integer, String}()
    errors = 0
    lineNo = firstI
    nextNode = 1
    for raw in eachline(fh)
        lineNo += 1
        fields = split(rstrip(raw, '\n'), "\t")
        nfields = length(fields)
        if nfields > 2
            error("$(lineNo) : wrongFields")
            errors += 1
            continue
        end
        if nfields == 2
            nodes = setFromSpec(fields[1])
            valTf = fields[2]
        else
            # No explicit node: the value belongs to the next node in sequence.
            nodes = Set([nextNode])
            valTf = fields[1]
        end
        nextNode = maximum(nodes) + 1
        value = valTf == "" ? "" : valueFromTf(valTf)
        for n in nodes
            data[n] = value
        end
    end
    return (errors, data)
end

Out[8]:

readDataTf (generic function with 1 method)

A variant: read a TF feature and store it in a list.

In [9]:

"""
    readTfList(path)

Read the TF feature file at `path` line by line, collecting values in a list.

The leading `@`-prefixed lines are skipped up to the blank line that follows
them; the rest of the open stream is handed to `readDataTfList`, whose result
is returned.

If the file does not exist, or the `@` lines are followed by a non-blank
line, the problem is reported through `error` and `false` is returned.
"""
function readTfList(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    fh = open(path)
    # try/finally closes the handle on every exit path, including an
    # exception thrown while the data lines are being parsed.
    try
        i = 0
        for line in eachline(fh)
            i += 1
            text = rstrip(line)
            startswith(text, "@") && continue
            if text != ""
                error("Line $i: missing blank line after metadata")
                return false
            end
            break
        end
        return readDataTfList(fh, i)
    finally
        close(fh)
    end
end

Out[9]:

readTfList (generic function with 1 method)

In [10]:

"""
    readDataTfList(fh, firstI)

Read the data lines from the stream `fh` into a `Vector{String}` of values,
and return `(errors, data)`.

`firstI` is the number of lines already consumed from the stream; it is only
used to number the lines in error reports.

A line has at most two tab-separated fields. With two fields the first is a
node specification (expanded by `setFromSpec`) and the second the value. With
one field the line stands for a single node. The value is appended once for
every node the line covers. Lines with more fields are reported through
`error`, counted and skipped. Non-empty values are unescaped by `valueFromTf`.

NOTE(review): node numbers are not used to place values; they are appended in
file order. List positions presumably match node numbers only when the file
covers nodes 1..n in ascending order without gaps -- confirm for other
features.
"""
function readDataTfList(fh, firstI)
    data = Array{String, 1}()
    errors = 0
    lineNo = firstI
    nextNode = 1
    for raw in eachline(fh)
        lineNo += 1
        fields = split(rstrip(raw, '\n'), "\t")
        nfields = length(fields)
        if nfields > 2
            error("$(lineNo) : wrongFields")
            errors += 1
            continue
        end
        if nfields == 2
            nodes = setFromSpec(fields[1])
            valTf = fields[2]
        else
            nodes = Set([nextNode])
            valTf = fields[1]
        end
        nextNode = maximum(nodes) + 1
        value = valTf == "" ? "" : valueFromTf(valTf)
        # One copy of the value per covered node.
        for _ in nodes
            push!(data, value)
        end
    end
    return (errors, data)
end

Out[10]:

readDataTfList (generic function with 1 method)

Read a TF feature by slurping.

In [11]:

"""
    readTfSlurp(path)

Read the TF feature file at `path` in one go and parse it from memory.

The file content is split on newlines (a trailing empty line is dropped), the
leading `@`-prefixed lines are skipped up to the blank line that follows
them, and the lines together with the index of the line after that blank
line are handed to `readDataTfSlurp`, whose result is returned.

If the file does not exist, or the `@` lines are followed by a non-blank
line, the problem is reported through `error` and `false` is returned.
"""
function readTfSlurp(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    lines = split(read(path, String), "\n")
    if isempty(lines[end])
        pop!(lines)
    end
    lineNo = 0
    for line in lines
        lineNo += 1
        startswith(line, "@") && continue
        if !isempty(line)
            error("Line $lineNo: missing blank line after metadata")
            return false
        end
        break
    end
    return readDataTfSlurp(lines, lineNo + 1)
end

Out[11]:

readTfSlurp (generic function with 1 method)

In [12]:

"""
    readDataTfSlurp(lines, firstI)

Parse the data part of an in-memory TF feature into a `Dict{Integer, String}`
that maps node numbers to values, and return `(errors, data)`.

`lines` holds all lines of the file and `firstI` is the index of the first
data line. Error reports carry the index of the offending line in `lines`.

A line has at most two tab-separated fields. With two fields the first is a
node specification (expanded by `setFromSpec`) and the second the value. With
one field the value belongs to the node following the highest node of the
previous line, starting at node 1. Lines with more fields are reported
through `error`, counted and skipped. Non-empty values are unescaped by
`valueFromTf`; only string values are produced.
"""
function readDataTfSlurp(lines, firstI)
    data = Dict{Integer, String}()
    errors = 0
    implicit_node = 1
    # Iterate by index so that `i` is the true line number of the current
    # line; starting a counter at `firstI` and incrementing it before use
    # would report every line one too high.
    for i in firstI:lastindex(lines)
        fields = split(lines[i], "\t")
        nfields = length(fields)
        if nfields > 2
            error("$(i) : wrongFields")
            errors += 1
            continue
        end
        if nfields == 2
            nodes = setFromSpec(fields[1])
            valTf = fields[2]
        else
            # No explicit node: the value belongs to the next node in sequence.
            nodes = Set([implicit_node])
            valTf = fields[1]
        end
        implicit_node = maximum(nodes) + 1
        value = valTf == "" ? "" : valueFromTf(valTf)
        for n in nodes
            data[n] = value
        end
    end
    return (errors, data)
end

Out[12]:

readDataTfSlurp (generic function with 1 method)

A possibly optimized function to read a feature from already slurped data lines.

In [13]:

"""
    readDataTfSlurpOpt(lines, firstI::UInt32)

Parse the data part of an in-memory TF feature into a
`Dict{UInt32, SubString{String}}` that maps node numbers to values, and
return `(errors, data)` with `errors` a `UInt32` count.

`lines` holds all lines of the file and `firstI` is the index of the first
data line. Error reports carry the index of the offending line in `lines`.

A line has at most two tab-separated fields. With two fields the first is a
node specification (expanded by `setFromSpec`) and the second the value. With
one field the value belongs to the node following the highest node of the
previous line, starting at node 1. Lines with more fields are reported
through `error`, counted and skipped. Non-empty values are unescaped by
`valueFromTf`; only string values are produced.
"""
function readDataTfSlurpOpt(lines, firstI::UInt32)
    data = Dict{UInt32, SubString{String}}()
    errors::UInt32 = 0
    implicit_node::UInt32 = 1
    # Iterate by index so that `i` is the true line number of the current
    # line; starting a counter at `firstI` and incrementing it before use
    # would report every line one too high.
    for i in firstI:lastindex(lines)
        fields = split(lines[i], "\t")
        # Keep the field count as a plain Int: narrowing it to UInt8 would
        # throw InexactError on a line with more than 255 fields instead of
        # reporting it as a malformed line.
        nfields = length(fields)
        if nfields > 2
            error("$(i) : wrongFields")
            errors += 1
            continue
        end
        if nfields == 2
            nodes = setFromSpec(fields[1])
            valTf = fields[2]
        else
            # No explicit node: the value belongs to the next node in sequence.
            nodes = Set{UInt32}([implicit_node])
            valTf = fields[1]
        end
        implicit_node = maximum(nodes) + 1
        value = valTf == "" ? "" : valueFromTf(valTf)
        for n in nodes
            data[n] = value
        end
    end
    return (errors, data)
end

Out[13]:

readDataTfSlurpOpt (generic function with 1 method)

Test: straight TF reading¶

In [14]:

# Stream the feature file line by line; on success this yields the error
# count and a dict from node number to value.
(errors, data) = readTf(featurePath)

Out[14]:

(0, Dict{Integer,String}(Pair{Integer,String}(247825, "פְּנֵ֖י"),Pair{Integer,String}(43031, "אֲדָנִ֗ים"),Pair{Integer,String}(349542, "תְּבוּנָֽה"),Pair{Integer,String}(323003, "אֱ֭לֹהִים"),Pair{Integer,String}(355530, "לַֽ"),Pair{Integer,String}(372485, "דִֽי"),Pair{Integer,String}(375950, "רַחֲמִ֖ים"),Pair{Integer,String}(319122, "יִתָּצְךָ֪"),Pair{Integer,String}(61670, "יִטְמָ֖א"),Pair{Integer,String}(119601, "וַ")…))

Execution time: around 3.5s

In [15]:

# Print the result count and the values of node 1, node 2 and the last node.
showResults(errors, data)

426584 results, last node 426584
בְּ
רֵאשִׁ֖ית
יָֽעַל

Test: TF reading as list¶

In [16]:

# Stream the feature file line by line; on success this yields the error
# count and a list of values.
(errors, data) = readTfList(featurePath)

Out[16]:

(0, String["בְּ", "רֵאשִׁ֖ית", "בָּרָ֣א", "אֱלֹהִ֑ים", "אֵ֥ת", "הַ", "שָּׁמַ֖יִם", "וְ", "אֵ֥ת", "הָ"  …  "מִֽי", "בָכֶ֣ם", "מִ", "כָּל", "עַמֹּ֗ו", "יְהוָ֧ה", "אֱלֹהָ֛יו", "עִמֹּ֖ו", "וְ", "יָֽעַל"])

Execution time: around 2.5s

In [17]:

# Print the result count and the first, second and last list entries.
showResults(errors, data)

426584 results, last node 426584
בְּ
רֵאשִׁ֖ית
יָֽעַל

Test: TF slurping¶

In [18]:

# Read the whole file into memory first, then parse it; on success this
# yields the error count and a dict from node number to value.
(errors, data) = readTfSlurp(featurePath)

Out[18]:

(0, Dict{Integer,String}(Pair{Integer,String}(247825, "פְּנֵ֖י"),Pair{Integer,String}(43031, "אֲדָנִ֗ים"),Pair{Integer,String}(349542, "תְּבוּנָֽה"),Pair{Integer,String}(323003, "אֱ֭לֹהִים"),Pair{Integer,String}(355530, "לַֽ"),Pair{Integer,String}(372485, "דִֽי"),Pair{Integer,String}(375950, "רַחֲמִ֖ים"),Pair{Integer,String}(319122, "יִתָּצְךָ֪"),Pair{Integer,String}(61670, "יִטְמָ֖א"),Pair{Integer,String}(119601, "וַ")…))

Execution time: around 3.8s

In [19]:

# Print the result count and the values of node 1, node 2 and the last node.
showResults(errors, data)

426584 results, last node 426584
בְּ
רֵאשִׁ֖ית
יָֽעַל

Test: slurping and then optimized TF processing¶

In [20]:

# Slurp the file only: `lines` gets all lines, `first` the index of the first
# data line. NOTE(review): the global name `first` shadows `Base.first` in
# this session.
(lines, first) = readFile(featurePath)

Out[20]:

(SubString{String}["@node", "@author=Eep Talstra Centre for Bible and Computer", "@dataset=BHSA", "@datasetName=Biblia Hebraica Stuttgartensia Amstelodamensis", "@email=shebanq@ancient-data.org", "@encoders=Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)", "@valueType=str", "@version=_temp", "@website=https://shebanq.ancient-data.org", "@writtenBy=Text-Fabric"  …  "מִֽי", "בָכֶ֣ם", "מִ", "כָּל", "עַמֹּ֗ו", "יְהוָ֧ה", "אֱלֹהָ֛יו", "עִמֹּ֖ו", "וְ", "יָֽעַל"], 0x0000000d)

Execution time: around 0.12s

In [23]:

# Parse the already slurped lines, starting at the first data line.
(errors, data) = readDataTfSlurpOpt(lines, first)

Out[23]:

(0x00000000, Dict(0x0003c811=>"פְּנֵ֖י",0x0000a817=>"אֲדָנִ֗ים",0x00055566=>"תְּבוּנָֽה",0x0004edbb=>"אֱ֭לֹהִים",0x00056cca=>"לַֽ",0x0005af05=>"דִֽי",0x0005bc8e=>"רַחֲמִ֖ים",0x0004de92=>"יִתָּצְךָ֪",0x0000f0e6=>"יִטְמָ֖א",0x0001d331=>"וַ"…))

Execution time: around 2.2s

In [24]:

# Print the result count and the values of node 1, node 2 and the last node.
showResults(errors, data)

426584 results, last node 426584
בְּ
רֵאשִׁ֖ית
יָֽעַל