We read the BHSA feature `g_word_utf8`, which maps nearly half a million integers to Hebrew word occurrences in the Hebrew Bible.
We measure the execution time of a second run of the last cell, so that warm-up effects are not counted.
# Name of the feature to load and the directory that holds the feature files;
# the full path of the feature file is assembled from both.
feature = "g_word_utf8"
base = string(homedir(), "/text-fabric-data/etcbc/bhsa/tf/c")
featurePath = string(base, "/", feature, ".tf")
"/Users/dirk/text-fabric-data/etcbc/bhsa/tf/c/g_word_utf8.tf"
# Report a problem on the standard error stream without throwing.
#
# Writes `msg` followed by a newline to STDERR and returns what `write`
# returns. Unlike an exception this does not interrupt the caller, so the
# readers below can report a bad line and carry on.
# NOTE(review): this definition shares its name with Base's `error`; inside
# this notebook the readers rely on the non-throwing behaviour defined here.
function error(msg)
    text = string(msg, "\n")
    return write(STDERR, text)
end
error (generic function with 1 method)
"""
    showResults(errors, data)

Print a short summary of a feature read to standard output.

If `errors` is non-zero only the error count is printed. Otherwise the number
of entries and the highest key of `data` are printed, followed by the values
stored at keys 1, 2 and the highest key. `data` may be a dictionary keyed by
node number or a vector indexed by node number.

Empty `data`, or `data` that lacks key 1 or 2, is reported without raising:
missing sample entries are skipped. Returns `nothing`.
"""
function showResults(errors, data)
    if errors != 0
        print("$errors errors")
        return nothing
    end
    if isempty(data)
        # There is no highest node to report; `maximum` would throw here.
        print("$(length(data)) results\n")
        return nothing
    end
    maxNode = maximum(keys(data))
    print("$(length(data)) results, last node $maxNode\n")
    for n in (1, 2, maxNode)
        # Only show sample entries that actually exist.
        if n in keys(data)
            print("$(data[n])\n")
        end
    end
    return nothing
end
showResults (generic function with 1 method)
"""
    valueFromTf(tf)

Decode the escaped form of a feature value as found in a TF file.

Scanning left to right, a backslash followed by `t` becomes a tab, a backslash
followed by `n` becomes a newline, and a doubled backslash becomes a single
backslash. A backslash in front of any other character, or at the very end of
the input, is kept as it is. Returns a `String`.

The decoding is done in one pass over the characters, using only calls whose
form has not changed between Julia versions.
"""
function valueFromTf(tf)
    out = IOBuffer()
    pending = false            # true right after an unconsumed backslash
    for c in tf
        if pending
            if c == 't'
                write(out, '\t')
            elseif c == 'n'
                write(out, '\n')
            elseif c == '\\'
                write(out, '\\')
            else
                # Not an escape sequence: keep the backslash and the character.
                write(out, '\\')
                write(out, c)
            end
            pending = false
        elseif c == '\\'
            pending = true
        else
            write(out, c)
        end
    end
    if pending
        # A lone trailing backslash is passed through unchanged.
        write(out, '\\')
    end
    return String(take!(out))
end
valueFromTf (generic function with 1 method)
"""
    setFromSpec(spec)

Expand a node specification into the set of node numbers it denotes.

`spec` is a comma-separated list whose items are either a single number or a
range written as two numbers joined by a hyphen. The bounds of a range may be
given in either order; both ends are included. The result is a `Set{UInt32}`.
Items that cannot be parsed as `UInt32` make `parse` throw.
"""
function setFromSpec(spec)::Set{UInt32}
    nodes = Set{UInt32}()
    for part in split(spec, ",")
        ends = split(part, "-")
        if length(ends) == 1
            push!(nodes, parse(UInt32, part))
            continue
        end
        # Accept reversed ranges by ordering the two bounds first.
        lo, hi = minmax(parse(UInt32, ends[1]), parse(UInt32, ends[2]))
        union!(nodes, lo:hi)
    end
    return nodes
end
setFromSpec (generic function with 1 method)
Just read a TF feature from disk, skip past the metadata, and deliver all lines in memory, plus the index of the first data line.
The whole file gets slurped.
"""
    readFile(path)

Read the whole file at `path` into memory and locate the start of its data.

The content is split on newlines; one empty trailing element is dropped.
Leading lines that start with `@` are treated as metadata and must be followed
by an empty line. Returns `(lines, first)` where `lines` holds every line of
the file and `first` (a `UInt32`) is the index just past the line at which the
metadata scan stopped.

If the file does not exist, or the metadata is not followed by an empty line,
the problem is reported through `error` and `false` is returned.
"""
function readFile(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    contents = open(fh -> read(fh, String), path)
    lines = split(contents, "\n")
    # A file that ends in a newline yields one empty trailing element.
    lines[end] == "" && pop!(lines)
    i::UInt32 = 0
    for line in lines
        i += 1
        startswith(line, "@") && continue
        if line != ""
            error("Line $i: missing blank line after metadata")
            return false
        end
        break
    end
    # Step past the line at which the scan stopped.
    i += 1
    return (lines, i)
end
readFile (generic function with 1 method)
The readTf function as done in Text-Fabric.
"""
    readTf(path)

Read the feature file at `path` line by line and return what `readDataTf`
returns for its data section.

Leading lines that start with `@` are skipped as metadata; they must be
followed by an empty line, after which the remaining lines are handed to
`readDataTf` together with the number of lines consumed so far.

If the file does not exist, or the metadata is not followed by an empty line,
the problem is reported through `error` and `false` is returned.

The file is opened with the `do` form of `open`, so the handle is closed on
every exit path, including when reading the data section throws.
"""
function readTf(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    return open(path) do fh
        i = 0
        for line in eachline(fh)
            i += 1
            text = rstrip(line)
            startswith(text, "@") && continue
            if text != ""
                error("Line $i: missing blank line after metadata")
                # Leaves the do-block; `open` closes the handle and
                # `readTf` returns this value.
                return false
            end
            break
        end
        readDataTf(fh, i)
    end
end
readTf (generic function with 1 method)
Reading the data part of a feature and storing it in a dict.
"""
    readDataTf(fh, firstI)

Read the data lines of a feature from the open stream `fh` into a dictionary.

`firstI` is the number of lines already consumed from the file; it is only
used to number the lines mentioned in error reports. Each line has either one
field (a value for the next implicit node) or two tab-separated fields (a node
specification understood by `setFromSpec`, then a value). After every line the
implicit node becomes one more than the highest node of that line.

Lines with more than two fields are reported through `error`, counted, and
skipped. Returns `(errors, data)` with `data` a `Dict{Integer, String}` from
node number to decoded value.
"""
function readDataTf(fh, firstI)
    i = firstI
    implicit_node = 1
    data = Dict{Integer, String}()
    normFields = 2
    # Only string-valued features are handled: `data` stores strings.
    # NOTE(review): switching this on also requires a container that can hold
    # integers.
    isNum = false
    errors = 0
    for line in eachline(fh)
        i += 1
        fields = split(rstrip(line, '\n'), "\t")
        lfields = length(fields)
        if lfields > normFields
            error("$(i) : wrongFields")
            errors += 1
            continue
        end
        if lfields == normFields
            nodes = setFromSpec(fields[1])
            valTf = fields[end]
        else
            # `split` always yields at least one field, so this is the
            # single-field case: the value belongs to the implicit node.
            nodes = Set([implicit_node])
            valTf = fields[1]
        end
        implicit_node = maximum(nodes) + 1
        if isNum
            # Numeric feature: an empty value means "no value".
            value = valTf == "" ? nothing : parse(Int, valTf)
        else
            value = valTf == "" ? "" : valueFromTf(valTf)
        end
        value === nothing && continue
        for n in nodes
            data[n] = value
        end
    end
    return (errors, data)
end
readDataTf (generic function with 1 method)
A variant: read a TF feature and store it in a list.
"""
    readTfList(path)

Read the feature file at `path` line by line and return what `readDataTfList`
returns for its data section.

Leading lines that start with `@` are skipped as metadata; they must be
followed by an empty line, after which the remaining lines are handed to
`readDataTfList` together with the number of lines consumed so far.

If the file does not exist, or the metadata is not followed by an empty line,
the problem is reported through `error` and `false` is returned.

The file is opened with the `do` form of `open`, so the handle is closed on
every exit path, including when reading the data section throws.
"""
function readTfList(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    return open(path) do fh
        i = 0
        for line in eachline(fh)
            i += 1
            text = rstrip(line)
            startswith(text, "@") && continue
            if text != ""
                error("Line $i: missing blank line after metadata")
                # Leaves the do-block; `open` closes the handle and
                # `readTfList` returns this value.
                return false
            end
            break
        end
        readDataTfList(fh, i)
    end
end
readTfList (generic function with 1 method)
"""
    readDataTfList(fh, firstI)

Read the data lines of a feature from the open stream `fh` into a vector.

`firstI` is the number of lines already consumed from the file; it is only
used to number the lines mentioned in error reports. Each line has either one
field (a value for the next implicit node) or two tab-separated fields (a node
specification understood by `setFromSpec`, then a value).

The decoded value of a line is appended once for every node of that line, in
file order; node numbers themselves are not stored.
NOTE(review): a position in the result equals a node number only when the file
lists all nodes consecutively starting at 1 - confirm for other features.

Lines with more than two fields are reported through `error`, counted, and
skipped. Returns `(errors, data)` with `data` a `Vector{String}`.
"""
function readDataTfList(fh, firstI)
    i = firstI
    implicit_node = 1
    data = Array{String, 1}()
    normFields = 2
    # Only string-valued features are handled: `data` stores strings.
    # NOTE(review): switching this on also requires a container that can hold
    # integers.
    isNum = false
    errors = 0
    for line in eachline(fh)
        i += 1
        fields = split(rstrip(line, '\n'), "\t")
        lfields = length(fields)
        if lfields > normFields
            error("$(i) : wrongFields")
            errors += 1
            continue
        end
        if lfields == normFields
            nodes = setFromSpec(fields[1])
            valTf = fields[end]
        else
            # `split` always yields at least one field, so this is the
            # single-field case: the value belongs to the implicit node.
            nodes = Set([implicit_node])
            valTf = fields[1]
        end
        implicit_node = maximum(nodes) + 1
        if isNum
            # Numeric feature: an empty value means "no value".
            value = valTf == "" ? nothing : parse(Int, valTf)
        else
            value = valTf == "" ? "" : valueFromTf(valTf)
        end
        value === nothing && continue
        for n in nodes
            push!(data, value)
        end
    end
    return (errors, data)
end
readDataTfList (generic function with 1 method)
Read a TF feature by slurping.
"""
    readTfSlurp(path)

Read the whole feature file at `path` into memory and return what
`readDataTfSlurp` returns for its data section.

The content is split on newlines; one empty trailing element is dropped.
Leading lines that start with `@` are skipped as metadata and must be followed
by an empty line. All lines, together with the index just past the line at
which the metadata scan stopped, are handed to `readDataTfSlurp`.

If the file does not exist, or the metadata is not followed by an empty line,
the problem is reported through `error` and `false` is returned.
"""
function readTfSlurp(path)
    if !isfile(path)
        error("TF reading: feature file '$path' does not exist")
        return false
    end
    contents = open(fh -> read(fh, String), path)
    lines = split(contents, "\n")
    # A file that ends in a newline yields one empty trailing element.
    lines[end] == "" && pop!(lines)
    i = 0
    for line in lines
        i += 1
        startswith(line, "@") && continue
        if line != ""
            error("Line $i: missing blank line after metadata")
            return false
        end
        break
    end
    return readDataTfSlurp(lines, i + 1)
end
readTfSlurp (generic function with 1 method)
"""
    readDataTfSlurp(lines, firstI)

Read the data lines of a feature from the in-memory `lines` into a dictionary.

`firstI` is the index in `lines` of the first data line. Each data line has
either one field (a value for the next implicit node) or two tab-separated
fields (a node specification understood by `setFromSpec`, then a value). After
every line the implicit node becomes one more than the highest node of that
line.

Lines with more than two fields are reported through `error` with their index
in `lines`, counted, and skipped. Returns `(errors, data)` with `data` a
`Dict{Integer, String}` from node number to decoded value.
"""
function readDataTfSlurp(lines, firstI)
    # `i` is incremented before each line is handled, so start one below the
    # first data line to report that line's real index.
    i = firstI - 1
    implicit_node = 1
    data = Dict{Integer, String}()
    normFields = 2
    # Only string-valued features are handled: `data` stores strings.
    # NOTE(review): switching this on also requires a container that can hold
    # integers.
    isNum = false
    errors = 0
    for line in lines[firstI:end]
        i += 1
        fields = split(line, "\t")
        lfields = length(fields)
        if lfields > normFields
            error("$(i) : wrongFields")
            errors += 1
            continue
        end
        if lfields == normFields
            nodes = setFromSpec(fields[1])
            valTf = fields[end]
        else
            # `split` always yields at least one field, so this is the
            # single-field case: the value belongs to the implicit node.
            nodes = Set([implicit_node])
            valTf = fields[1]
        end
        implicit_node = maximum(nodes) + 1
        if isNum
            # Numeric feature: an empty value means "no value".
            value = valTf == "" ? nothing : parse(Int, valTf)
        else
            value = valTf == "" ? "" : valueFromTf(valTf)
        end
        value === nothing && continue
        for n in nodes
            data[n] = value
        end
    end
    return (errors, data)
end
readDataTfSlurp (generic function with 1 method)
A possibly optimized function to read a feature from already slurped data lines.
"""
    readDataTfSlurpOpt(lines, firstI::UInt32)

Read the data lines of a feature from the in-memory `lines` into a dictionary
with concretely typed keys and values.

`firstI` is the index in `lines` of the first data line. Each data line has
either one field (a value for the next implicit node) or two tab-separated
fields (a node specification understood by `setFromSpec`, then a value). After
every line the implicit node becomes one more than the highest node of that
line.

Lines with more than two fields are reported through `error` with their index
in `lines`, counted, and skipped, however many fields they have. Returns
`(errors, data)` with `errors` a `UInt32` and `data` a
`Dict{UInt32, SubString{String}}` from node number to decoded value.
"""
function readDataTfSlurpOpt(lines, firstI::UInt32)
    implicit_node::UInt32 = 1
    data = Dict{UInt32, SubString{String}}()
    normFields = 2
    # Only string-valued features are handled: `data` stores strings.
    # NOTE(review): switching this on also requires a container that can hold
    # integers.
    isNum = false
    errors::UInt32 = 0
    # Walk the data section by index: no copy of the slice is made, and `i`
    # is the real index of the line being handled.
    for i in firstI:length(lines)
        local nodes::Set{UInt32}
        fields = split(lines[i], "\t")
        # Left as a plain Int so that a line with very many fields is
        # counted as an error instead of failing a narrowing conversion.
        lfields = length(fields)
        if lfields > normFields
            error("$(i) : wrongFields")
            errors += 1
            continue
        end
        if lfields == normFields
            nodes = setFromSpec(fields[1])
            valTf = fields[end]
        else
            # `split` always yields at least one field, so this is the
            # single-field case: the value belongs to the implicit node.
            nodes = Set{UInt32}([implicit_node])
            valTf = fields[1]
        end
        implicit_node = maximum(nodes) + 1
        if isNum
            # Numeric feature: an empty value means "no value".
            value = valTf == "" ? nothing : parse(Int, valTf)
        else
            value = valTf == "" ? "" : valueFromTf(valTf)
        end
        value === nothing && continue
        for n in nodes
            data[n] = value
        end
    end
    return (errors, data)
end
readDataTfSlurpOpt (generic function with 1 method)
# Read the feature line by line into a dictionary.
errors, data = readTf(featurePath)
(0, Dict{Integer,String}(Pair{Integer,String}(247825, "פְּנֵ֖י"),Pair{Integer,String}(43031, "אֲדָנִ֗ים"),Pair{Integer,String}(349542, "תְּבוּנָֽה"),Pair{Integer,String}(323003, "אֱ֭לֹהִים"),Pair{Integer,String}(355530, "לַֽ"),Pair{Integer,String}(372485, "דִֽי"),Pair{Integer,String}(375950, "רַחֲמִ֖ים"),Pair{Integer,String}(319122, "יִתָּצְךָ֪"),Pair{Integer,String}(61670, "יִטְמָ֖א"),Pair{Integer,String}(119601, "וַ")…))
Execution time: around 3.5s
# Print a summary of the `errors` and `data` produced by the preceding read.
showResults(errors, data)
426584 results, last node 426584 בְּ רֵאשִׁ֖ית יָֽעַל
# Read the feature line by line into a vector.
errors, data = readTfList(featurePath)
(0, String["בְּ", "רֵאשִׁ֖ית", "בָּרָ֣א", "אֱלֹהִ֑ים", "אֵ֥ת", "הַ", "שָּׁמַ֖יִם", "וְ", "אֵ֥ת", "הָ" … "מִֽי", "בָכֶ֣ם", "מִ", "כָּל", "עַמֹּ֗ו", "יְהוָ֧ה", "אֱלֹהָ֛יו", "עִמֹּ֖ו", "וְ", "יָֽעַל"])
Execution time: around 2.5s
# Print a summary of the `errors` and `data` produced by the preceding read.
showResults(errors, data)
426584 results, last node 426584 בְּ רֵאשִׁ֖ית יָֽעַל
# Read the feature by slurping the whole file, then fill a dictionary.
errors, data = readTfSlurp(featurePath)
(0, Dict{Integer,String}(Pair{Integer,String}(247825, "פְּנֵ֖י"),Pair{Integer,String}(43031, "אֲדָנִ֗ים"),Pair{Integer,String}(349542, "תְּבוּנָֽה"),Pair{Integer,String}(323003, "אֱ֭לֹהִים"),Pair{Integer,String}(355530, "לַֽ"),Pair{Integer,String}(372485, "דִֽי"),Pair{Integer,String}(375950, "רַחֲמִ֖ים"),Pair{Integer,String}(319122, "יִתָּצְךָ֪"),Pair{Integer,String}(61670, "יִטְמָ֖א"),Pair{Integer,String}(119601, "וַ")…))
Execution time: around 3.8s
# Print a summary of the `errors` and `data` produced by the preceding read.
showResults(errors, data)
426584 results, last node 426584 בְּ רֵאשִׁ֖ית יָֽעַל
# Slurp the file only: all lines plus the index of the first data line.
# NOTE(review): `first` is also the name of a Base function; consider another
# name, changed together with the later cell that passes it on.
lines, first = readFile(featurePath)
(SubString{String}["@node", "@author=Eep Talstra Centre for Bible and Computer", "@dataset=BHSA", "@datasetName=Biblia Hebraica Stuttgartensia Amstelodamensis", "@email=shebanq@ancient-data.org", "@encoders=Constantijn Sikkel (QDF), Ulrik Petersen (MQL) and Dirk Roorda (TF)", "@valueType=str", "@version=_temp", "@website=https://shebanq.ancient-data.org", "@writtenBy=Text-Fabric" … "מִֽי", "בָכֶ֣ם", "מִ", "כָּל", "עַמֹּ֗ו", "יְהוָ֧ה", "אֱלֹהָ֛יו", "עִמֹּ֖ו", "וְ", "יָֽעַל"], 0x0000000d)
Execution time: around 0.12s
# Build the dictionary from the already slurped lines with the typed reader.
errors, data = readDataTfSlurpOpt(lines, first)
(0x00000000, Dict(0x0003c811=>"פְּנֵ֖י",0x0000a817=>"אֲדָנִ֗ים",0x00055566=>"תְּבוּנָֽה",0x0004edbb=>"אֱ֭לֹהִים",0x00056cca=>"לַֽ",0x0005af05=>"דִֽי",0x0005bc8e=>"רַחֲמִ֖ים",0x0004de92=>"יִתָּצְךָ֪",0x0000f0e6=>"יִטְמָ֖א",0x0001d331=>"וַ"…))
Execution time: around 2.2s
# Print a summary of the `errors` and `data` produced by the preceding read.
showResults(errors, data)
426584 results, last node 426584 בְּ רֵאשִׁ֖ית יָֽעַל