This notebook can read ketiv-qere info in files issued by the ETCBC and transform them into new features. There will be new features at the word level.
NB This conversion will not work for versions `4` and `4b`.
There are already `qere` and `qere_utf8` features in the MQL of the core data.
However, there are several problems with those:
* `qere_trailer` and `qere_trailer_utf8` are missing;
* the difference between an empty `qere` and the absence of a `qere` is not marked.

That is why we reconstruct ketiv and qere from special files that are used by the ETCBC.
import os,sys,re,collections
from tf.fabric import Fabric
from tf.transcription import Transcription
import utils
# Default settings, used when no driver has defined SCRIPT beforehand.
# NOTE(review): the indentation of this cell was lost; all four assignments
# are assumed to belong to the `if` body, so that a driver that predefines
# SCRIPT can predefine the other settings as well — confirm.
if 'SCRIPT' not in locals():
    SCRIPT = False      # False: interactive run, stop() never exits
    FORCE = True        # passed as force= to utils.mustRun
    CORE_NAME = 'bhsa'  # repository directory below the etcbc base directory
    VERSION= 'c'        # data version: selects the source, temp and tf directories
def stop(good=False):
    """Exit the process when running as a script; do nothing otherwise.

    good: when true the exit status is 0, otherwise it is 1.
    """
    if not SCRIPT:
        return
    exitStatus = 1
    if good:
        exitStatus = 0
    sys.exit(exitStatus)
The conversion is executed in an environment of directories, so that sources, temp files and results are in convenient places and do not have to be shifted around.
# All locations are derived from the repository directory and the version.
repoBase = os.path.expanduser('~/github/etcbc')
thisRepo = '%s/%s' % (repoBase, CORE_NAME)

# input: the ETCBC source files for this version
thisSource = '%s/source/%s' % (thisRepo, VERSION)

# scratch: new features are written here first
thisTemp = '%s/_temp/%s' % (thisRepo, VERSION)
thisTempTf = '%s/tf' % (thisTemp,)

# output: the TF dataset of this version
thisTf = '%s/tf/%s' % (thisRepo, VERSION)

# the feature whose compiled file is used to decide whether work is needed
testFeature = 'qere_trailer'
Check whether this conversion is needed in the first place. Only when run as a script.
# Script runs only: ask utils.mustRun whether the conversion has to be
# carried out, passing the path of the compiled form of the test feature.
if SCRIPT:
    (good, work) = utils.mustRun(None, '{}/.tf/{}.tfx'.format(thisTf, testFeature), force=FORCE)
    # presumably `good` is False when the check itself failed and `work` is
    # False when there is nothing to do — verify against utils.mustRun
    if not good: stop(good=False)
    if not work: stop(good=True)
The `otext` feature of TF, based on lexical features.
We select the version specific otext material, falling back on a default if nothing appropriate has been specified in `oText`.

We do not do this for the older versions `4` and `4b`.
# Metadata that is copied into every feature written by this notebook.
provenanceMetadata = {
    'dataset': 'BHSA',
    'version': VERSION,
    'datasetName': 'Biblia Hebraica Stuttgartensia Amstelodamensis',
    'author': 'Eep Talstra Centre for Bible and Computer',
    'encoders': 'Constantijn Sikkel (QDF), and Dirk Roorda (TF)',
    'website': 'https://shebanq.ancient-data.org',
    'email': 'shebanq@ancient-data.org',
}
# Text formats that prefer the qere over the ketiv (`{qere/g_word}`), plus
# explicit ketiv variants. Each line has the form `@key=value`.
_KQ_FORMATS = '''
@fmt:text-orig-full={qere_utf8/g_word_utf8}{qere_trailer_utf8/trailer_utf8}
@fmt:text-orig-full-ketiv={g_word_utf8}{trailer_utf8}
@fmt:text-trans-full={qere/g_word}{qere_trailer/trailer}
@fmt:text-trans-full-ketiv={g_word}{trailer}'''

# Version -> otext material. At present every listed version gets the same
# formats; give a version its own string here if that ever changes.
oText = dict.fromkeys(('_temp', 'c', '2017', '2016'), _KQ_FORMATS)
# Pick the text formats for this version; a version without an entry in
# oText gets no additional formats.
thisOtext = oText.get(VERSION, '')

# Test for emptiness by value: `is ''` compares identity, which only works by
# accident of string interning and triggers a SyntaxWarning on Python 3.8+.
if not thisOtext:
    utils.caption(0, 'No additional text formats provided')
    otextInfo = {}
else:
    utils.caption(0, 'New text formats')
    # every line is `@key=value`: drop the leading `@`, split on the first `=`
    otextInfo = dict(line[1:].split('=', 1) for line in thisOtext.strip('\n').split('\n'))
    for x in sorted(otextInfo.items()):
        utils.caption(0, '{:<30} = "{}"'.format(*x))
| 0.00s New text formats | 0.00s fmt:text-orig-full = "{qere_utf8/g_word_utf8}{qere_trailer_utf8/trailer_utf8}" | 0.00s fmt:text-orig-full-ketiv = "{g_word_utf8}{trailer_utf8}" | 0.00s fmt:text-trans-full = "{qere/g_word}{qere_trailer/trailer}" | 0.00s fmt:text-trans-full-ketiv = "{g_word}{trailer}"
utils.caption(4, 'Load the existing TF dataset')
# Load the features that are used below to match ketivs against the words
# of the text: verse labels, word forms and trailers.
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('label g_word g_cons trailer_utf8')
# Puts the API handles in the global namespace; the code below uses F, L and T.
api.makeAvailableIn(globals())
.............................................................................................. . 0.01s Load the existing TF dataset . .............................................................................................. This is Text-Fabric 3.0.2 Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api Tutorial : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb Example data : https://github.com/Dans-labs/text-fabric-data 101 features found and 0 ignored 0.00s loading features ... | 0.16s B g_cons from /Users/dirk/github/etcbc/bhsa/tf/c | 0.22s B g_word from /Users/dirk/github/etcbc/bhsa/tf/c | 0.10s B trailer_utf8 from /Users/dirk/github/etcbc/bhsa/tf/c | 0.01s B label from /Users/dirk/github/etcbc/bhsa/tf/c | 0.00s Feature overview: 96 for nodes; 4 for edges; 1 configs; 7 computed 5.53s All features loaded/computed - for details use loadLog()
The ketiv-qere files deal with different verse labels. We make a mapping between verse labels and nodes.
utils.caption(0, 'Mapping between verse labels and verse nodes')

# verse label (as found in the `label` feature) -> verse node;
# when a label occurs more than once, the last verse node wins
nodeFromLabel = {
    F.label.v(verseNode): verseNode
    for verseNode in F.otype.s('verse')
}

utils.caption(0, '{} verses'.format(len(nodeFromLabel)))
| 5.57s Mapping between verse labels and verse nodes | 5.61s 23213 verses
utils.caption(4, 'Parsing Ketiv-Qere data')

# verse node -> list of (ketiv, qere, qere trailer), in file order
verseInfo = collections.defaultdict(list)
# verse labels in the file for which the dataset has no verse node
notFound = set()
# problem registers, filled when the annotations are matched against the text
missing = collections.defaultdict(list)
missed = collections.defaultdict(list)

# maximum number of individual problems listed in each problem report
error_limit = 10

kqFile = '{}/ketivqere.txt'.format(thisSource)

ln = 0
# `with` closes the file even when a malformed line raises an exception
with open(kqFile) as kqHandle:
    for line in kqHandle:
        ln += 1
        # the first 10 characters are the verse label, the remainder holds
        # whitespace separated fields of which the first two are ketiv and qere
        vlab = line[0:10]
        fields = line.rstrip('\n')[10:].split()
        (ketiv, qere) = fields[0:2]
        # presumably splits the qere into the word proper and its trailing
        # material — verify against Transcription.suffix_and_finales
        (qtrim, qtrailer) = Transcription.suffix_and_finales(qere)
        vnode = nodeFromLabel.get(vlab)
        if vnode is None:
            notFound.add(vlab)
            continue
        verseInfo[vnode].append((ketiv, qtrim, qtrailer))

utils.caption(0, '\tRead {} ketiv-qere annotations'.format(ln))
.............................................................................................. . 5.67s Parsing Ketiv-Qere data . .............................................................................................. | 5.72s Read 1892 ketiv-qere annotations
# Match every ketiv-qere annotation with a word node of its verse.
# Result: a list of tuples
#   (word node, ketiv, qere, qere trailer, qere in Hebrew, qere trailer in Hebrew)
data = []

for vnode in verseInfo:
    # ketiv form -> word nodes in this verse with that form, in text order
    wlookup = collections.defaultdict(list)
    # ketiv form -> index of the last occurrence that has been consumed
    wvisited = collections.defaultdict(lambda: -1)
    wnodes = L.d(vnode, otype='word')

    # Collect the ketiv words of the verse: those with `*` in g_word.
    for w in wnodes:
        gw = F.g_word.v(w)
        if '*' in gw:
            # build the lookup key from the consonantal form: `.` stands for
            # an empty form, and `-` is appended when the word has no trailer
            gw = F.g_cons.v(w)
            if gw == '':
                gw = '.'
            if F.trailer_utf8.v(w) == '':
                gw += '-'
            wlookup[gw].append(w)

    # Assign annotations to words: the n-th annotation with a given ketiv
    # goes to the n-th word in the verse with that ketiv.
    for (ketiv, qere, qtrailer) in verseInfo[vnode]:
        wvisited[ketiv] += 1
        windex = wvisited[ketiv]
        ws = wlookup.get(ketiv)
        if ws is None or windex >= len(ws):
            # the file has more occurrences of this ketiv than the text
            missing[vnode].append((windex, ketiv, qere))
            continue
        w = ws[windex]
        qereU = Transcription.to_hebrew(qere)
        qtrailerU = Transcription.to_hebrew(qtrailer)
        data.append((
            w,
            ketiv,
            qere,
            qtrailer.replace('\n', ''),
            qereU,
            qtrailerU.replace('\n', ''),
        ))

    # Ketiv words in the text for which the file has too few annotations;
    # the number of unannotated occurrences is recorded.
    for ketiv in wlookup:
        if ketiv not in wvisited or len(wlookup[ketiv]) - 1 > wvisited[ketiv]:
            missed[vnode].append((len(wlookup[ketiv]) - (wvisited.get(ketiv, -1) + 1), ketiv))

utils.caption(0, '\tParsed {} ketiv-qere annotations'.format(len(data)))
| 5.86s Parsed 1892 ketiv-qere annotations
# Interactive runs only: show the first ten result tuples, one per line.
if not SCRIPT:
    print(*map(repr, data[:10]), sep='\n')
(297571, '<NWJ', '<:ANIJ.;J', '&', 'עֲנִיֵּי', '־') (297646, 'W-', 'W:', '', 'וְ', '') (297647, 'NCQH', 'NIC:Q:<@73H', ' ', 'נִשְׁקְעָ֖ה', ' ') (297937, 'M<LWTW', 'MA<:ALOWT@80JW', ' ', 'מַעֲלֹותָ֔יו', ' ') (370147, 'M>WM', 'MW.m04', ' ', 'מוּם֩', ' ') (370611, 'L-', 'L:', '', 'לְ', '') (370612, '<BDJK', '<AB:D@73k:', ' ', 'עַבְדָ֖ךְ', ' ') (370620, 'L-', 'L:', '', 'לְ', '') (370621, 'KFDJ>', 'KAF:D.@>;80J', ' ', 'כַשְׂדָּאֵ֔י', ' ') (370703, 'HZMNTWN', 'HIZ:D.:MIN:T.W.n03', ' ', 'הִזְדְּמִנְתּוּן֙', ' ')
# Report 1: verse labels in the ketiv-qere file without a verse node.
if not notFound:
    utils.caption(0, '\tAll verses entries found in index')
else:
    utils.caption(0, '\tWARNING: Could not find {} verses: {}'.format(len(notFound), sorted(notFound)))

# Report 2: annotations for which no word was found in the text.
# At most error_limit individual cases are listed.
if not missing:
    utils.caption(0, '\tAll ketivs found in the text')
else:
    utils.caption(0, '\tWARNING: Could not locate ketivs in the text: {} verses'.format(len(missing)))
    nListed = 0
    for vnode in sorted(missing):
        if nListed > error_limit:
            break
        vlab = F.label.v(vnode)
        for (windex, ketiv, qere) in missing[vnode]:
            nListed += 1
            if nListed > error_limit:
                break
            utils.caption(0, '\t\tNOT IN TEXT: {:<10} {:<20} #{} {}'.format(vlab, ketiv, windex, qere))

# Report 3: ketiv words in the text without an annotation in the file.
# At most error_limit individual cases are listed.
if not missed:
    utils.caption(0, '\tAll ketivs found in the data')
else:
    utils.caption(0, '\tCould not lookup qeres in the data: {} verses'.format(len(missed)))
    nListed = 0
    for vnode in sorted(missed):
        if nListed > error_limit:
            break
        vlab = F.label.v(vnode)
        for (windex, ketiv) in missed[vnode]:
            nListed += 1
            if nListed > error_limit:
                break
            utils.caption(0, '\t\tNOT IN DATA: {:<10} {:<20} #{}'.format(vlab, ketiv, windex))
| 5.93s All verses entries found in index | 5.93s All ketivs found in the text | 5.93s All ketivs found in the data
utils.caption(0, 'Prepare TF ketiv qere features')

# The new features, in the order in which their values occur in the tuples
# of `data`, starting at position 2 (position 0 is the word node).
newFeatures = '''
qere
qere_trailer
qere_utf8
qere_trailer_utf8
'''.strip().split()

# feature name -> {word node: value}
nodeFeatures = {
    feature: {row[0]: row[column] for row in data}
    for (column, feature) in enumerate(newFeatures, start=2)
}
| 5.95s Prepare TF ketiv qere features
We update the `otext` feature with new/changed formats.
utils.caption(0, 'Update the otext feature')

# otext: the existing configuration, overridden by the new text formats
otextMeta = dict()
for source in (T.config, otextInfo):
    otextMeta.update(source)

metaData = {'otext': otextMeta}

# every new feature carries the provenance metadata and is string valued
for feature in nodeFeatures:
    metaData[feature] = dict(provenanceMetadata, valueType='str')
| 5.97s Update the otext feature
# The names of the new data features ...
changedDataFeatures = set(nodeFeatures)
# ... and everything that is written: the data features plus the otext config.
changedFeatures = changedDataFeatures | {'otext'}
Transform the collected information into feature-like data structures, and write it all out to `.tf` files.
utils.caption(4, 'write new/changed features to TF ...')
# Write to the temporary TF directory, not to the dataset itself, so that
# the result can be compared with the existing features before delivery.
TF = Fabric(locations=thisTempTf, silent=True)
TF.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)
.............................................................................................. . 6.00s write new/changed features to TF ... . .............................................................................................. | 0.01s T qere to /Users/dirk/github/etcbc/bhsa/_temp/c/tf | 0.01s T qere_trailer to /Users/dirk/github/etcbc/bhsa/_temp/c/tf | 0.01s T qere_trailer_utf8 to /Users/dirk/github/etcbc/bhsa/_temp/c/tf | 0.01s T qere_utf8 to /Users/dirk/github/etcbc/bhsa/_temp/c/tf | 0.00s M otext to /Users/dirk/github/etcbc/bhsa/_temp/c/tf
Check differences with previous versions.
The new dataset has been created in a temporary directory, and has not yet been copied to its destination.
Here is your opportunity to compare the newly created features with the older features. You expect some differences in some features.
We check the differences between the previous version of the features and what has been generated. We list features that will be added and deleted and changed. For each changed feature we show the first line where the new feature differs from the old one. We ignore changes in the metadata, because the timestamp in the metadata will always change.
# Compare the freshly written features (temp) with the existing ones (dataset),
# restricted to the features touched by this notebook.
utils.checkDiffs(thisTempTf, thisTf, only=changedFeatures)
.............................................................................................. . 6.06s Check differences with previous version . .............................................................................................. | 6.07s 2 features to add | 6.07s qere_trailer | 6.07s qere_trailer_utf8 | 6.07s no features to delete | 6.07s 3 features in common | 6.07s otext ... differences | 6.07s line 5 OLD -->@dateWritten=2017-10-03T05:39:09Z<-- | 6.07s line 5 NEW -->@dateWritten=2017-10-03T05:43:50Z<-- | 6.07s line 12 OLD -->@fmt:text-orig-full={g_word_utf8}{traile ...<-- | 6.07s line 12 NEW -->@fmt:text-orig-full={qere_utf8/g_word_ut ...<-- | 6.07s line 13 OLD -->@fmt:text-orig-plain={g_cons_utf8}{trail ...<-- | 6.07s line 13 NEW -->@fmt:text-orig-full-ketiv={g_word_utf8}{ ...<-- | 6.07s line 14 OLD -->@fmt:text-trans-full={g_word}{trailer}<-- | 6.07s line 14 NEW -->@fmt:text-orig-plain={g_cons_utf8}{trail ...<-- | 6.07s qere ... differences after the metadata | 6.11s line 2 OLD --><-- | 6.11s line 2 NEW -->3897 HAJ:Y;74><-- | 6.11s line 3 OLD --><-- | 6.11s line 3 NEW -->4420 >@H:@LO75W<-- | 6.11s line 4 OLD --><-- | 6.11s line 4 NEW -->5645 >@H:@LO92W<-- | 6.11s line 5 OLD --><-- | 6.11s line 5 NEW -->5912 >@95H:@LOW03<-- | 6.11s qere_utf8 ... differences after the metadata | 6.14s line 2 OLD --><-- | 6.14s line 2 NEW -->3897 הַיְצֵ֣א<-- | 6.14s line 3 OLD --><-- | 6.14s line 3 NEW -->4420 אָהֳלֹֽו<-- | 6.14s line 4 OLD --><-- | 6.14s line 4 NEW -->5645 אָהֳלֹ֑ו<-- | 6.14s line 5 OLD --><-- | 6.14s line 5 NEW -->5912 אָֽהֳלֹו֙<-- | 6.14s Done
Copy the new TF dataset from the temporary location where it has been created to its final destination.
# Deliver the touched features from the temp directory to the dataset directory.
utils.deliverFeatures(thisTempTf, thisTf, changedFeatures)
.............................................................................................. . 6.15s Deliver features to /Users/dirk/github/etcbc/bhsa/tf/c . .............................................................................................. | 6.15s qere | 6.15s qere_trailer | 6.16s qere_utf8 | 6.16s otext | 6.16s qere_trailer_utf8
We load the new features, use the new format, check some values
utils.caption(4, 'Load and compile the new TF features')
# Load from the dataset directory again, now including the new qere features.
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('g_word_utf8 g_word trailer_utf8 trailer {}'.format(' '.join(changedDataFeatures)))
# Rebinds the API handles (F, L, T, ...) in the global namespace to the new load.
api.makeAvailableIn(globals())
.............................................................................................. . 6.17s Load and compile the new TF features . .............................................................................................. This is Text-Fabric 3.0.2 Api reference : https://github.com/Dans-labs/text-fabric/wiki/Api Tutorial : https://github.com/Dans-labs/text-fabric/blob/master/docs/tutorial.ipynb Example data : https://github.com/Dans-labs/text-fabric-data 103 features found and 0 ignored 0.00s loading features ... | 0.18s B g_word from /Users/dirk/github/etcbc/bhsa/tf/c | 0.25s B g_word_utf8 from /Users/dirk/github/etcbc/bhsa/tf/c | 0.01s T qere from /Users/dirk/github/etcbc/bhsa/tf/c | 0.01s T qere_trailer from /Users/dirk/github/etcbc/bhsa/tf/c | 0.01s T qere_trailer_utf8 from /Users/dirk/github/etcbc/bhsa/tf/c | 0.01s T qere_utf8 from /Users/dirk/github/etcbc/bhsa/tf/c | 0.09s B trailer from /Users/dirk/github/etcbc/bhsa/tf/c | 0.10s B trailer_utf8 from /Users/dirk/github/etcbc/bhsa/tf/c | 0.00s M otext from /Users/dirk/github/etcbc/bhsa/tf/c | | 0.09s C __sections__ from otype, oslots, otext, __levUp__, __levels__, book, chapter, verse | 0.00s Feature overview: 98 for nodes; 4 for edges; 1 configs; 7 computed 5.57s All features loaded/computed - for details use loadLog()
utils.caption(4, 'Basic tests')

def showKq(w):
    """Show the ketiv and the qere of word node `w`, in Hebrew and in transcription.

    Every form is shown together with its trailer. A feature that has no
    value for `w` (its `.v(w)` is None) is shown as an empty string, so the
    function does not fail on words without a qere or without a qere trailer.
    """
    def value(feature):
        # None cannot be concatenated with a string: show absence as ''
        v = feature.v(w)
        return '' if v is None else v

    hw = value(F.g_word_utf8)
    tw = value(F.g_word)
    ht = value(F.trailer_utf8)
    tt = value(F.trailer)
    qhw = value(F.qere_utf8)
    qtw = value(F.qere)
    qht = value(F.qere_trailer_utf8)
    qtt = value(F.qere_trailer)
    utils.caption(0, '{:<20} {}'.format('hebrew', hw + ht))
    utils.caption(0, '{:<20} {}'.format('hebrew qere', qhw + qht))
    utils.caption(0, '{:<20} {}'.format('transcription', tw + tt))
    utils.caption(0, '{:<20} {}'.format('transcription qere', qtw + qtt))
# Spot checks on fixed word nodes: a missing value (None) is shown as NA.
# Each feature is looked up once per node, and None is tested by identity.
utils.caption(0, '{:<30}: {}'.format(
    'absence of qere',
    ' '.join(
        'NA' if value is None else value
        for value in (F.qere.v(w) for w in range(24700, 24710))
    ),
))
utils.caption(0, '{:<30}: {}'.format(
    'presence of qere trailer',
    ' '.join(
        'NA' if value is None else value
        for value in (F.qere_trailer.v(w) for w in range(30190, 30195))
    ),
))

# Render the verse that contains word node 122073 in every available text format.
showNode = L.u(122073, otype='verse')[0]
showVerse = T.sectionFromNode(showNode)
utils.caption(4, '{} {}:{} in all formats'.format(*showVerse))
for fmt in T.formats:
    utils.caption(0, '{:<30} {}'.format(fmt, T.text(L.d(showNode, otype='word'), fmt=fmt)))
.............................................................................................. . 12s Basic tests . .............................................................................................. | 12s absence of qere : NA WA J.I45C:T.AX:AW.75W. NA NA NA NA NA NA NA | 12s presence of qere trailer : NA NA & NA .............................................................................................. . 12s Joshua 15:53 in all formats . .............................................................................................. | 12s text-orig-plain וינים ובית תפוח ואפקה׃ | 12s text-orig-full וְיָנ֥וּם וּבֵית־תַּפּ֖וּחַ וַאֲפֵֽקָה׃ | 12s lex-orig-plain ו ינום ו בית תפוח ו אפקה | 12s lex-orig-full ו ינים וּ בֵית תַּפּוּחַ וַ אֲפֵקָה | 12s text-orig-full-ketiv וינים וּבֵית־תַּפּ֖וּחַ וַאֲפֵֽקָה׃ | 12s text-trans-full-ketiv *W-*JNJM W.-B;JT&T.AP.73W.XA WA->:AP;75Q@H00 | 12s text-trans-full W:J@N71W.m W.-B;JT&T.AP.73W.XA WA->:AP;75Q@H00 | 12s lex-trans-plain TC</ TC</ TC</ TC</ TC</ TC</ | 12s lex-trans-full W- JNJM W.- B;JT_T.AP.W.XA WA- >:AP;Q@H | 12s text-trans-plain WJNJM WBJT_TPWX W>PQH00