#!/usr/bin/env python
# coding: utf-8

# # Ketiv Qere
#
# This notebook can read ketiv-qere info in files issued by the ETCBC and transform them
# into new features.
# There will be new features at the word level.
#
# **NB** This conversion will not work for versions `4` and `4b`.
#
# ## Discussion
# There are already `qere` and `qere_utf8` features in the MQL of the core data.
# However, there are several problems with those:
#
# * features that contain the after-word material, `qere_trailer` and `qere_trailer_utf8`
#   are missing;
# * if there is no qere, both features are filled with the empty string.
#   In this way we can make no distinction between a truly empty `qere` and the absence of a `qere`.
#
# That is why we reconstruct ketiv and qere from special files that are used by the ETCBC.

# In[1]:

import os
import sys
import re
import collections

from tf.fabric import Fabric
from tf.transcription import Transcription

import utils


# # Pipeline
# See [operation](https://github.com/ETCBC/pipeline/blob/master/README.md#operation)
# for how to run this script in the pipeline.

# In[2]:

# Defaults for interactive (notebook) use; they are only set when SCRIPT
# has not been defined beforehand.
if 'SCRIPT' not in locals():
    SCRIPT = False
    FORCE = True
    CORE_NAME = 'bhsa'
    VERSION = 'c'


def stop(good=False):
    """Terminate the process when running as a script.

    Does nothing when ``SCRIPT`` is false.  Otherwise exits with status 0
    if ``good`` is true and with status 1 if it is false.
    """
    if not SCRIPT:
        return
    exitCode = 0 if good else 1
    sys.exit(exitCode)


# # Setting up the context: source file and target directories
#
# The conversion is executed in an environment of directories, so that sources, temp files and
# results are in convenient places and do not have to be shifted around.

# In[3]:

repoBase = os.path.expanduser('~/github/etcbc')
thisRepo = '{base}/{core}'.format(base=repoBase, core=CORE_NAME)
thisSource = '{repo}/source/{version}'.format(repo=thisRepo, version=VERSION)
thisTemp = '{repo}/_temp/{version}'.format(repo=thisRepo, version=VERSION)
thisTempTf = '{temp}/tf'.format(temp=thisTemp)
thisTf = '{repo}/tf/{version}'.format(repo=thisRepo, version=VERSION)


# In[4]:

testFeature = 'qere_trailer'


# # Test
#
# Check whether this conversion is needed in the first place.
# Only when run as a script.
# In[5]:

if SCRIPT:
    (good, work) = utils.mustRun(
        None,
        '{}/.tf/{}.tfx'.format(thisTf, testFeature),
        force=FORCE,
    )
    # not good: stop with a failure status; not work: stop with a success status.
    if not good:
        stop(good=False)
    if not work:
        stop(good=True)


# # TF Settings
#
# * a piece of metadata that will go into these features; the time will be added automatically
# * new text formats for the `otext` feature of TF, based on lexical features.
#   We select the version specific otext material,
#   falling back on a default if nothing appropriate has been specified in oText.
#
# We do not do this for the older versions `4` and `4b`.

# In[6]:

provenanceMetadata = dict(
    dataset='BHSA',
    version=VERSION,
    datasetName='Biblia Hebraica Stuttgartensia Amstelodamensis',
    author='Eep Talstra Centre for Bible and Computer',
    encoders='Constantijn Sikkel (QDF), and Dirk Roorda (TF)',
    website='https://shebanq.ancient-data.org',
    email='shebanq@ancient-data.org',
)

# All supported versions use the same four formats, so the text is written
# once and shared.  The lines must start in column 0: the parser below drops
# the first character of each line (the '@').
kqTextFormats = '''
@fmt:text-orig-full={qere_utf8/g_word_utf8}{qere_trailer_utf8/trailer_utf8}
@fmt:text-orig-full-ketiv={g_word_utf8}{trailer_utf8}
@fmt:text-trans-full={qere/g_word}{qere_trailer/trailer}
@fmt:text-trans-full-ketiv={g_word}{trailer}'''

oText = dict.fromkeys(('_temp', 'c', '2017', '2016'), kqTextFormats)

thisOtext = oText.get(VERSION, '')

# Truthiness test instead of `is ''`: identity comparison with a string
# literal is implementation dependent and a SyntaxWarning on Python >= 3.8.
if not thisOtext:
    utils.caption(0, 'No additional text formats provided')
    otextInfo = {}
else:
    utils.caption(0, 'New text formats')
    # Each line looks like '@key=value'; strip the '@' and split on the first '='.
    otextInfo = dict(
        line[1:].split('=', 1)
        for line in thisOtext.strip('\n').split('\n')
    )
    for x in sorted(otextInfo.items()):
        utils.caption(0, '{:<30} = "{}"'.format(*x))


# In[7]:

utils.caption(4, 'Load the existing TF dataset')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('label g_word g_cons trailer_utf8')
api.makeAvailableIn(globals())


# # Verse labels
# The ketiv-qere files deal with different verse labels.
# We make a mapping between verse labels and nodes.

# In[8]:

utils.caption(0, 'Mapping between verse labels and verse nodes')
nodeFromLabel = {F.label.v(vs): vs for vs in F.otype.s('verse')}
utils.caption(0, '{} verses'.format(len(nodeFromLabel)))


# # Read the Ketiv-Qere file

# In[9]:

utils.caption(4, 'Parsing Ketiv-Qere data')

# verse node -> list of (ketiv, qere without trailer, qere trailer), in file order
verseInfo = collections.defaultdict(list)
# verse labels in the file that have no verse node
notFound = set()
# verse node -> (occurrence index, ketiv, qere) from the file without a matching word
missing = collections.defaultdict(list)
# verse node -> (number of unmatched words, ketiv) from the text without file data
missed = collections.defaultdict(list)

# maximum number of individual problems listed per report below
error_limit = 10

kqFile = '{}/ketivqere.txt'.format(thisSource)

ln = 0
# The context manager closes the file even if a line fails to parse.
with open(kqFile) as kqHandle:
    for (ln, line) in enumerate(kqHandle, start=1):
        # The first 10 characters hold the verse label,
        # the rest holds whitespace separated fields: ketiv, qere, ...
        vlab = line[0:10]
        fields = line.rstrip('\n')[10:].split()
        (ketiv, qere) = fields[0:2]
        (qtrim, qtrailer) = Transcription.suffix_and_finales(qere)
        vnode = nodeFromLabel.get(vlab)
        if vnode is None:
            notFound.add(vlab)
            continue
        verseInfo[vnode].append((ketiv, qtrim, qtrailer))
utils.caption(0, '\tRead {} ketiv-qere annotations'.format(ln))


# In[10]:

# One entry per matched word:
# (word node, ketiv, qere, qere trailer, qere via to_hebrew, qere trailer via to_hebrew)
data = []

# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# source -- confirm against the original notebook.
for vnode in verseInfo:
    # lookup key -> word nodes of this verse with that key, in text order
    wlookup = collections.defaultdict(list)
    # ketiv -> index of its most recent occurrence in the file data for this verse
    wvisited = {}

    for w in L.d(vnode, otype='word'):
        gw = F.g_word.v(w)
        # Only words whose g_word contains '*' are indexed
        # (presumably the ketiv marker -- confirm).
        if '*' in gw:
            gw = F.g_cons.v(w)
            if gw == '':
                gw = '.'
            if F.trailer_utf8.v(w) == '':
                gw += '-'
            wlookup[gw].append(w)

    for (ketiv, qere, qtrailer) in verseInfo[vnode]:
        # The n-th occurrence of a ketiv in the file is matched with
        # the n-th word in the verse that has the same lookup key.
        windex = wvisited.get(ketiv, -1) + 1
        wvisited[ketiv] = windex
        ws = wlookup.get(ketiv)
        if ws is None or windex >= len(ws):
            missing[vnode].append((windex, ketiv, qere))
            continue
        w = ws[windex]
        qereU = Transcription.to_hebrew(qere)
        qtrailerU = Transcription.to_hebrew(qtrailer)
        data.append((
            w,
            ketiv,
            qere,
            qtrailer.replace('\n', ''),
            qereU,
            qtrailerU.replace('\n', ''),
        ))

    # Indexed words in the text for which the file gave no (or too few) entries.
    for (ketiv, ws) in wlookup.items():
        nMatched = wvisited.get(ketiv, -1) + 1
        if len(ws) > nMatched:
            missed[vnode].append((len(ws) - nMatched, ketiv))

utils.caption(0, '\tParsed {} ketiv-qere annotations'.format(len(data)))


# In[11]:

if not SCRIPT:
    print('\n'.join(repr(d) for d in data[0:10]))


# In[12]:

if notFound:
    utils.caption(0, '\tWARNING: Could not find {} verses: {}'.format(
        len(notFound), sorted(notFound),
    ))
else:
    utils.caption(0, '\tAll verses entries found in index')

if missing:
    utils.caption(0, '\tWARNING: Could not locate ketivs in the text: {} verses'.format(
        len(missing),
    ))
    # List at most error_limit individual cases.
    e = 0
    for vnode in sorted(missing):
        if e > error_limit:
            break
        vlab = F.label.v(vnode)
        for (windex, ketiv, qere) in missing[vnode]:
            e += 1
            if e > error_limit:
                break
            utils.caption(0, '\t\tNOT IN TEXT: {:<10} {:<20} #{} {}'.format(
                vlab, ketiv, windex, qere,
            ))
else:
    utils.caption(0, '\tAll ketivs found in the text')

if missed:
    utils.caption(0, '\tCould not lookup qeres in the data: {} verses'.format(len(missed)))
    # List at most error_limit individual cases.
    e = 0
    for vnode in sorted(missed):
        if e > error_limit:
            break
        vlab = F.label.v(vnode)
        for (windex, ketiv) in missed[vnode]:
            e += 1
            if e > error_limit:
                break
            utils.caption(0, '\t\tNOT IN DATA: {:<10} {:<20} #{}'.format(vlab, ketiv, windex))
else:
    utils.caption(0, '\tAll ketivs found in the data')


# # Prepare TF features

# In[13]:

utils.caption(0, 'Prepare TF ketiv qere features')

newFeatures = '''
    qere
    qere_trailer
    qere_utf8
    qere_trailer_utf8
'''.strip().split()

# Each feature maps a word node (x[0]) to one column of the collected data.
nodeFeatures = dict(
    qere={x[0]: x[2] for x in data},
    qere_trailer={x[0]: x[3] for x in data},
    qere_utf8={x[0]: x[4] for x in data},
    qere_trailer_utf8={x[0]: x[5] for x in data},
)


# We update the `otext` feature with new/changed formats

# In[14]:

utils.caption(0, 'Update the otext feature')

metaData = {}
# Start from the existing otext configuration, then add/override the formats.
metaData['otext'] = dict()
metaData['otext'].update(T.config)
metaData['otext'].update(otextInfo)
for f in nodeFeatures:
    metaData[f] = dict(provenanceMetadata, valueType='str')


# In[15]:

changedDataFeatures = set(nodeFeatures)
changedFeatures = changedDataFeatures | {'otext'}


# # Write new features
# Transform the collected information in feature-like datastructures, and write it all
# out to `.tf` files.

# In[16]:

utils.caption(4, 'write new/changed features to TF ...')
TF = Fabric(locations=thisTempTf, silent=True)
TF.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)


# # Diffs
#
# Check differences with previous versions.
#
# The new dataset has been created in a temporary directory,
# and has not yet been copied to its destination.
#
# Here is your opportunity to compare the newly created features with the older features.
# You expect some differences in some features.
#
# We check the differences between the previous version of the features and what has been generated.
# We list features that will be added and deleted and changed.
# For each changed feature we show the first line where the new feature differs from the old one.
# We ignore changes in the metadata, because the timestamp in the metadata will always change.

# In[17]:

utils.checkDiffs(thisTempTf, thisTf, only=changedFeatures)


# # Deliver
#
# Copy the new TF dataset from the temporary location where it has been created to its final destination.
# In[18]:

utils.deliverFeatures(thisTempTf, thisTf, changedFeatures)


# # Compile TF
#
# We load the new features, use the new format, check some values

# In[19]:

utils.caption(4, 'Load and compile the new TF features')

TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('g_word_utf8 g_word trailer_utf8 trailer {}'.format(
    ' '.join(changedDataFeatures),
))
api.makeAvailableIn(globals())


# # Examples

# In[20]:

utils.caption(4, 'Basic tests')


def showKq(w):
    """Show word node ``w`` with its trailer, as written and as qere.

    Four lines are emitted through ``utils.caption``: the word from the
    ``*_utf8`` features, its qere, the word from the plain features, and
    its qere.  When a qere feature has no value for ``w`` the qere line
    shows ``NA`` instead of failing on ``None + None``.
    """
    hw = F.g_word_utf8.v(w)
    tw = F.g_word.v(w)
    ht = F.trailer_utf8.v(w)
    tt = F.trailer.v(w)
    qhw = F.qere_utf8.v(w)
    qtw = F.qere.v(w)
    qht = F.qere_trailer_utf8.v(w)
    qtt = F.qere_trailer.v(w)

    # A missing feature value comes back as None (see the tests below).
    qereHebrew = 'NA' if qhw is None or qht is None else qhw + qht
    qereTrans = 'NA' if qtw is None or qtt is None else qtw + qtt

    utils.caption(0, '{:<20} {}'.format('hebrew', hw + ht))
    utils.caption(0, '{:<20} {}'.format('hebrew qere', qereHebrew))
    utils.caption(0, '{:<20} {}'.format('transcription', tw + tt))
    utils.caption(0, '{:<20} {}'.format('transcription qere', qereTrans))


# Each feature is looked up once per node; None (no value) is shown as NA.
utils.caption(0, '{:<30}: {}'.format(
    'absence of qere',
    ' '.join(
        'NA' if q is None else q
        for q in (F.qere.v(w) for w in range(24700, 24710))
    ),
))
utils.caption(0, '{:<30}: {}'.format(
    'presence of qere trailer',
    ' '.join(
        'NA' if qt is None else qt
        for qt in (F.qere_trailer.v(w) for w in range(30190, 30195))
    ),
))

# Render the verse that contains node 122073 in every available text format.
showNode = L.u(122073, otype='verse')[0]
showVerse = T.sectionFromNode(showNode)
utils.caption(4, '{} {}:{} in all formats'.format(*showVerse))
for fmt in T.formats:
    utils.caption(0, '{:<30} {}'.format(fmt, T.text(L.d(showNode, otype='word'), fmt=fmt)))


# In[ ]: