Request by Robert Voogdgeert.
Make a CSV of half verses in a representation that only shows accents and word boundaries.
import os
import re
from tf.app import use
# Load the ETCBC/bhsa dataset through Text-Fabric.
# `hoist=globals()` is what makes the API handles used throughout this file
# (F, L, T) available as plain global names; they are not defined anywhere else here.
# NOTE(review): the ":clone" suffix presumably selects a local clone of the data
# rather than a downloaded release - confirm against the Text-Fabric docs.
A = use("ETCBC/bhsa:clone", hoist=globals(), silent="deep")
You can configure a chunk to be `half_verse`, `clause`, or `clause_atom`.
If the chunk is `half_verse`, we use the feature `label` to identify it within the verse.
If the chunk is `clause`, we use the sentence number and the clause number to identify it.
If the chunk is `clause_atom`, we use its own number to identify it.
In `chunkTypes` we store a mapping from every chunk type we support to a function that provides a label for such chunks.
# Supported chunk types, each mapped to the function that produces
# the identifying label for one chunk node of that type.
chunkTypes = {
    # half verses are identified by their own `label` feature
    "half_verse": F.label.v,
    # clauses are identified as "<number of enclosing sentence>.<number of clause>"
    "clause": lambda n: f'{F.number.v(L.u(n, otype="sentence")[0])}.{F.number.v(n)}',
    # clause atoms are identified by their own `number` feature
    "clause_atom": F.number.v,
}
Here is a function that shows chunks.
def showChunks(chunks):
    """Print each chunk as a heading line followed by its transliterated text.

    The heading consists of the section of the chunk (as given by
    ``T.sectionFromNode``) and the chunk's label. The label comes from the
    function registered in ``chunkTypes`` for the node type of the chunk;
    chunks of a type that is not registered there get the label ``?``.

    Parameters
    ----------
    chunks: iterable of int
        The nodes to show.
    """
    for node in chunks:
        labeler = chunkTypes.get(F.otype.v(node))
        label = labeler(node) if labeler is not None else "?"
        section = T.sectionFromNode(node)
        heading = "{} {}:{} {}".format(*section, label)
        text = T.text(node, fmt="text-trans-full")
        # heading on its own line, the text tab-indented on the next line
        print(f"{heading}\n\t{text}")
Let's inspect a few half verses (the first and second ones and one which contains a word with an in-word space):
# Demo cell: pick three half verses and show them.
# h1 and h2 are the first two half verse nodes delivered by F.otype.s;
# h3 is the first half verse of 1 Chronicles 2:54.
# The names h1, h2, h3 are used again further down, so keep them.
chunkType = "half_verse"
(h1, h2) = F.otype.s(chunkType)[0:2]
v = T.nodeFromSection(("1_Chronicles", 2, 54))
h3 = L.d(v, otype=chunkType)[0]
showChunks((h1, h2, h3))
Genesis 1:1 A B.:-R;>CI73JT B.@R@74> >:ELOHI92JM Genesis 1:1 B >;71T HA-C.@MA73JIM W:->;71T H@->@75REY00 1_Chronicles 2:54 A B.:N;74J FAL:M@81> B.;71JT_LE33XEM03 W.-N:VO74WP@TI80J <AV:RO73WT_B.;74JT_JOW>@92B
Let's inspect a few clauses (the first ten).
# Demo cell: show the first ten clause nodes delivered by F.otype.s.
# `chunks` is used again further down, so keep the name.
chunkType = "clause"
chunks = F.otype.s(chunkType)[0:10]
showChunks(chunks)
Genesis 1:1 1.1 B.:-R;>CI73JT B.@R@74> >:ELOHI92JM >;71T HA-C.@MA73JIM W:->;71T H@->@75REY00 Genesis 1:2 2.1 W:-H@->@81REY H@J:T@71H TO33HW.03 W@-BO80HW. Genesis 1:2 3.1 W:-XO73CEK: <AL&P.:N;74J T:HO92WM Genesis 1:2 4.1 W:-R74W.XA >:ELOHI80JM M:RAXE73PET <AL&P.:N;71J HA-M.@75JIM00 Genesis 1:3 5.1 WA-J.O71>MER >:ELOHI73JM Genesis 1:3 6.1 J:HI74J >O92WR Genesis 1:3 7.1 WA45-J:HIJ&>O75WR00 Genesis 1:4 8.1 WA-J.A94R:> >:ELOHI91JM >ET&H@->O73WR Genesis 1:4 8.2 K.IJ&VO92WB Genesis 1:4 9.1 WA-J.AB:D.;74L >:ELOHI80JM B.;71JN H@->O73WR W.-B;71JN HA-XO75CEK:00
We define a function to get the accent pattern from a chunk.
The function works by stripping all non-digit-non-space material, then splitting on space, then dividing the numbers into pairs, and then joining everything together.
We exclude some marks, because they are not proper cantillation accents.
# Two-digit codes in the transliteration that must NOT be reported as accents.
excludedAccents = {
    "35", "45", "75", "95",  # meteg
    "52", "53",  # upper and lower dots
}

# Matches every character that is neither a digit nor a space;
# substituting it away leaves only accent digits and word separators.
stripPat = re.compile(r"[^0-9 ]")

# Matches one two-digit code; used to cut a run of digits into pairs.
accentPat = re.compile(r"[0-9]{2}")
def getAccents(chunk):
    """Return the accent pattern of a chunk as a string.

    The transliterated text of the chunk is reduced to its digits and spaces.
    Each space-separated run of digits is cut into two-digit codes, and the
    codes listed in ``excludedAccents`` are dropped. The codes of one word are
    joined with ``_``; the words are joined with a single space.

    Parameters
    ----------
    chunk: int
        The node whose text is analysed.

    Returns
    -------
    str
        For example ``"71 33_03 80"``.
    """
    # an underscore in the transliteration counts as a word boundary here
    plain = T.text(chunk, fmt="text-trans-full").replace("_", " ")
    digitsOnly = stripPat.sub("", plain)
    perWord = (
        "_".join(
            code for code in accentPat.findall(token) if code not in excludedAccents
        )
        for token in digitsOnly.split()
    )
    return " ".join(perWord)
# Demo cell: print the accent pattern of the three half verses (h1, h2, h3)
# and of the ten clauses (chunks) that were shown earlier.
for c in (h1, h2, h3, *chunks):
print(getAccents(c))
73 74 92 71 73 71 00 74 81 71 33_03 74_80 73 74 92 73 74 92 71 73 71 00 81 71 33_03 80 73 74 92 74 80 73 71 00 71 73 74 92 00 94 91 73 92 74 80 71 73 71 00
We define a function to process a given selection with a given chunk type.
The file is saved to the `destination` directory, by default your Downloads folder.
def process(selection, chunkType, destination="~/Downloads"):
    """Collect the accent pattern of every chunk and save the result as CSV.

    All verses are visited; for each chunk of type ``chunkType`` inside a
    verse one row ``(book, chapter, verse, head, accents)`` is produced, where
    ``head`` is the label given by the function registered in ``chunkTypes``
    and ``accents`` is the result of ``getAccents``.

    The rows are written to ``accents-{chunkType}.csv`` in ``destination``.

    Parameters
    ----------
    selection: collection of str or None
        English book names to restrict the work to; ``None`` means all books.
    chunkType: str
        One of the keys of ``chunkTypes``.
    destination: str
        Directory for the CSV file; a leading ``~`` is expanded.
        The directory is created if it does not exist yet.

    Returns
    -------
    list of tuple or None
        The rows that were written, or ``None`` (after reporting an error)
        when ``chunkType`` is not supported.
    """
    # Local import keeps this block self-contained; the path variable below is
    # deliberately not called `csv`, so it cannot shadow the module.
    import csv

    A.indent(reset=True)
    A.info(f"Gather all {chunkType}s ...")

    headFunc = chunkTypes.get(chunkType, None)
    if not headFunc:
        A.error(f"Chunk type {chunkType} not supported")
        return

    rows = []
    for v in F.otype.s("verse"):
        (book, chapter, verse) = T.sectionFromNode(v)
        if selection is not None and book not in selection:
            continue
        for chunk in L.d(v, otype=chunkType):
            rows.append((book, chapter, verse, headFunc(chunk), getAccents(chunk)))
    A.info(f"{len(rows)} {chunkType}s done")

    csvRaw = f"{destination}/accents-{chunkType}.csv"
    csvPath = os.path.expanduser(csvRaw)
    # Do not fail merely because the destination directory is missing.
    os.makedirs(os.path.dirname(csvPath), exist_ok=True)
    # Explicit encoding and newline handling make the file identical on every
    # platform; csv.writer takes care of quoting should a field ever contain
    # a comma or a quote.
    with open(csvPath, "w", encoding="utf-8", newline="") as fh:
        csv.writer(fh, lineterminator="\n").writerows(rows)
    A.info(f"Results written to {csvRaw}")
    return rows
You may choose to do all books or selected books only.
# Tweak this cell to choose which books are processed.
# `books` is passed to process() as its selection: either None (all books)
# or a set of English book names, as in the commented-out example below.
books = None
# books = {'Numbers', 'Ruth'}
# Export the accent patterns per half verse.
rows = process(books, "half_verse")
0.00s Gather all half_verses ... 2.84s 45180 half_verses done 2.93s Results written to ~/Downloads/accents-half_verse.csv
# Inspect the first ten rows returned by the preceding process() call.
rows[0:10]
[('Genesis', 1, 1, 'A', '73 74 92'), ('Genesis', 1, 1, 'B', '71 73 71 00'), ('Genesis', 1, 2, 'A', '81 71 33_03 80 73 74 92'), ('Genesis', 1, 2, 'B', '74 80 73 71 00'), ('Genesis', 1, 3, 'A', '71 73 74 92'), ('Genesis', 1, 3, 'B', '00'), ('Genesis', 1, 4, 'A', '94 91 73 92'), ('Genesis', 1, 4, 'B', '74 80 71 73 71 00'), ('Genesis', 1, 5, 'A', '63 70_05 03 80 73 74 92'), ('Genesis', 1, 5, 'B', '71 73 71 00')]
# Export the accent patterns per clause, for the same selection of books.
rows = process(books, "clause")
0.00s Gather all clauses ... 3.53s 88071 clauses done 3.68s Results written to ~/Downloads/accents-clause.csv
# Inspect the first ten rows returned by the preceding process() call.
rows[0:10]
[('Genesis', 1, 1, '1.1', '73 74 92 71 73 71 00'), ('Genesis', 1, 2, '2.1', '81 71 33_03 80'), ('Genesis', 1, 2, '3.1', '73 74 92'), ('Genesis', 1, 2, '4.1', '74 80 73 71 00'), ('Genesis', 1, 3, '5.1', '71 73'), ('Genesis', 1, 3, '6.1', '74 92'), ('Genesis', 1, 3, '7.1', '00'), ('Genesis', 1, 4, '8.1', '94 91 73'), ('Genesis', 1, 4, '8.2', '92'), ('Genesis', 1, 4, '9.1', '74 80 71 73 71 00')]
# Export the accent patterns per clause atom, for the same selection of books.
rows = process(books, "clause_atom")
0.00s Gather all clause_atoms ... 2.79s 90688 clause_atoms done 2.94s Results written to ~/Downloads/accents-clause_atom.csv
# Inspect the first ten rows returned by the preceding process() call.
rows[0:10]
[('Genesis', 1, 1, 1, '73 74 92 71 73 71 00'), ('Genesis', 1, 2, 2, '81 71 33_03 80'), ('Genesis', 1, 2, 3, '73 74 92'), ('Genesis', 1, 2, 4, '74 80 73 71 00'), ('Genesis', 1, 3, 5, '71 73'), ('Genesis', 1, 3, 6, '74 92'), ('Genesis', 1, 3, 7, '00'), ('Genesis', 1, 4, 8, '94 91 73'), ('Genesis', 1, 4, 9, '92'), ('Genesis', 1, 4, 10, '74 80 71 73 71 00')]