#!/usr/bin/env python
# coding: utf-8

# Table of Contents

# # Booknames (multilingual)
#
# This notebook adds multilingual book names to a
# [BHSA](https://github.com/ETCBC/bhsa) dataset in
# [text-Fabric](https://github.com/Dans-labs/text-fabric)
# format.
#
# ## Discussion
#
# We add the features
# `book@`*iso*
# where *iso* is a
# [two letter ISO-639](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
# language code of a modern language.
# We use a source file `blang.py` that contains the names of the books of the bible
# in modern languages (around 20, most big languages are covered).
# This data has been gleaned mostly from Wikipedia.
#
# We assume that the dataset has the `book` feature present, holding *Latin* book names.
#
# This program works for all datasets and versions that have this feature with the
# intended meaning.

# In[1]:


import os
import sys
import utils
from tf.fabric import Fabric
from blang import bookLangs, bookNames


# # Pipeline
# See [operation](https://github.com/ETCBC/pipeline/blob/master/README.md#operation)
# for how to run this script in the pipeline.

# In[2]:


# Defaults that only take effect when SCRIPT is not already defined in the
# executing namespace (presumably the pipeline pre-defines these names when it
# runs this file as a script -- confirm against the pipeline documentation).
if "SCRIPT" not in locals():
    SCRIPT = False
    FORCE = True
    CORE_NAME = "bhsa"
    VERSION = "2021"


def stop(good=False):
    """Terminate the process, but only when running in script mode.

    When ``SCRIPT`` is falsy this is a no-op and execution simply continues.

    Parameters
    ----------
    good: boolean, optional `False`
        Whether the run ended successfully; selects exit status 0 (good)
        or 1 (not good).
    """
    if not SCRIPT:
        return
    exitStatus = 0 if good else 1
    sys.exit(exitStatus)


# # Setting up the context: source file and target directories
#
# The conversion is executed in an environment of directories, so that sources, temp files and
# results are in convenient places and do not have to be shifted around.

# In[3]:


# All locations hang off the local clone directory of the etcbc repositories.
repoBase = os.path.expanduser("~/github/etcbc")
thisRepo = f"{repoBase}/{CORE_NAME}"

# Freshly generated features are first written below the temp directory ...
thisTemp = f"{thisRepo}/_temp/{VERSION}"
thisTempTf = f"{thisTemp}/tf"

# ... and this is the directory holding the TF features of this version.
thisTf = f"{thisRepo}/tf/{VERSION}"


# # Collect
#
# We collect the book names.
# In[4]:


utils.caption(4, "Book names")

# Metadata keyed by feature name; the entry under the empty key holds the
# dataset-wide fields that are not tied to one particular feature.
metaData = {
    "": {
        "dataset": "BHSA",
        "version": VERSION,
        "datasetName": "Biblia Hebraica Stuttgartensia Amstelodamensis",
        "author": "Eep Talstra Centre for Bible and Computer",
        "provenance": "book names from wikipedia and other sources",
        "encoders": "Dirk Roorda (TF)",
        "website": "https://shebanq.ancient-data.org",
        "email": "shebanq@ancient-data.org",
    },
}

# One `book@<iso>` feature per language listed in bookLangs, in bookLangs order.
metaData.update(
    {
        f"book@{langCode}": {
            "valueType": "str",
            "language": langName,
            "languageCode": langCode,
            "languageEnglish": langEnglish,
        }
        for (langCode, (langEnglish, langName)) in bookLangs.items()
    }
)

# Every metadata key except the generic (empty) one names a new feature.
newFeatures = sorted(set(metaData) - {""})
newFeaturesStr = " ".join(newFeatures)

utils.caption(0, f"{len(newFeatures)} languages ...")


# # Test
#
# Check whether this conversion is needed in the first place.
# Only when run as a script.

# In[5]:


if SCRIPT:
    # The check is made against the .tfx file of the first new feature.
    (good, work) = utils.mustRun(
        None,
        f"{thisTf}/.tf/{newFeatures[0]}.tfx",
        force=FORCE,
    )
    if not good:
        # The check itself failed: stop with a failure status.
        stop(good=False)
    if not work:
        # Nothing needs to be done: stop with a success status.
        stop(good=True)


# # Load existing data

# In[6]:


utils.caption(4, "Loading relevant features")

TF = Fabric(locations=thisTf, modules=[""])
api = TF.load("book")
# F is used below without being defined in this file; presumably this call
# injects it into the global namespace -- confirm in the Text-Fabric docs.
api.makeAvailableIn(globals())

# The book nodes, in the order in which the dataset delivers them.
bookNodes = list(F.otype.s("book"))

# The existing values of the `book` feature become the feature `book@la`.
nodeFeatures = {"book@la": {node: F.book.v(node) for node in bookNodes}}

# The names of each language are paired positionally with the book nodes.
for (langCode, langBookNames) in bookNames.items():
    nodeFeatures[f"book@{langCode}"] = dict(zip(bookNodes, langBookNames))

utils.caption(0, f"{len(nodeFeatures)} book name features created")


# # Write new features

# In[7]:


utils.caption(4, "Write book name features as TF")

# Save into the temp location first.
TF = Fabric(locations=thisTempTf, silent=True)
TF.save(nodeFeatures=nodeFeatures, edgeFeatures={}, metaData=metaData)


# # Diffs
#
# Check differences with previous versions.
# In[8]:


# Compare only the newly made features between the temp and the final location.
utils.checkDiffs(thisTempTf, thisTf, only=set(newFeatures))


# # Deliver
#
# Copy the new Text-Fabric features from the temporary location where they have been created to their final destination.

# In[9]:


utils.deliverFeatures(thisTempTf, thisTf, newFeatures)


# # Compile TF

# In[10]:


utils.caption(4, "Load and compile the new TF features")

TF = Fabric(locations=thisTf, modules=[""])
api = TF.load("")
# F and T are used below without being defined in this file; presumably this
# call injects them into the global namespace -- confirm in the Text-Fabric docs.
api.makeAvailableIn(globals())


# # Examples

# In[11]:


utils.caption(4, "Genesis in all languages")

# The first book node is taken to be Genesis.
genesisNode = F.otype.s("book")[0]

# Show the name of that book in every language, ordered by language code.
for lang in sorted(T.languages):
    langInfo = T.languages[lang]
    language = langInfo["language"]
    langEng = langInfo["languageEnglish"]
    book = T.sectionFromNode(genesisNode, lang=lang)[0]
    utils.caption(
        0,
        f"{lang:<2} = {langEng:<20} Genesis is {book:<20} in {language:<20}",
    )

utils.caption(0, "Done")