#!/usr/bin/env python # coding: utf-8 # # # # # # # Statistics # # This notebook adds statistical features to a # [BHSA](https://github.com/ETCBC/bhsa) dataset in # [text-Fabric](https://github.com/Dans-labs/text-fabric) # format. # # ## Discussion # # We add the features # `freq_occ freq_lex rank_occ rank_lex`. # # We assume that the dataset has these features present: # # * LANG_FEATURE (typically `languageISO`) for determining if the word is Hebrew or Aramaic # * OCC_FEATURE (typically `g_cons`) to get the word string in consonantal transcription # * LEX_FEATURE (typically `lex`) to get the lexical identifier in consonantal transcription # # This program works for all datasets and versions that have these features with the # intended meanings. The exact names of these features can be passed as parameters. # Note that the old version `3` uses very different names for many features. # # #### Languages # We will not identify lexemes and word occurrences across language. # So if two occurrences or lexemes exhibit the same string, but they are categorized as belonging # to different languages, they will not be identified. # # #### Occurrences # We group occurrences by their consonantal transcriptions. # So if two occurrences differ only in pointing, we count them as two occurrences of the same value. # # #### Lexemes # Lexemes are identified by the `lex` feature within a biblical language. # We will not identify lexemes across language. # In[4]: import os import sys import collections import utils from tf.fabric import Fabric # # Pipeline # See [operation](https://github.com/ETCBC/pipeline/blob/master/README.md#operation) # for how to run this script in the pipeline. 
# In[5]: if "SCRIPT" not in locals(): SCRIPT = False FORCE = True CORE_NAME = "bhsa" VERSION = "2021" LANG_FEATURE = "languageISO" OCC_FEATURE = "g_cons" LEX_FEATURE = "lex" def stop(good=False): if SCRIPT: sys.exit(0 if good else 1) # # Setting up the context: source file and target directories # # The conversion is executed in an environment of directories, so that sources, temp files and # results are in convenient places and do not have to be shifted around. # In[6]: repoBase = os.path.expanduser("~/github/etcbc") thisRepo = "{}/{}".format(repoBase, CORE_NAME) thisTemp = "{}/_temp/{}".format(thisRepo, VERSION) thisTempTf = "{}/tf".format(thisTemp) thisTf = "{}/tf/{}".format(thisRepo, VERSION) # In[7]: newFeaturesStr = """ freq_occ freq_lex rank_occ rank_lex """ newFeatures = newFeaturesStr.strip().split() # # Test # # Check whether this conversion is needed in the first place. # Only when run as a script. # In[8]: if SCRIPT: (good, work) = utils.mustRun( None, "{}/.tf/{}.tfx".format(thisTf, newFeatures[0]), force=FORCE ) if not good: stop(good=False) if not work: stop(good=True) # # Collect # # We collect the statistics. 
# In[9]:

# ---------------------------------------------------------------------------
# Collect: load the needed word features and count occurrences and lexemes.
# ---------------------------------------------------------------------------

utils.caption(4, "Loading relevant features")

TF = Fabric(locations=thisTf, modules=[""])
api = TF.load("{} {} {}".format(LANG_FEATURE, LEX_FEATURE, OCC_FEATURE))
api.makeAvailableIn(globals())

# Newer BHSA versions have a dedicated `lex` node type; older ones do not.
hasLex = "lex" in set(F.otype.all)


# In[10]:

utils.caption(0, "Counting occurrences")

# Frequencies and ranks are kept per language (lan), so identical strings in
# different languages are never conflated.
wstats = {
    "freqs": {
        "lex": collections.defaultdict(collections.Counter),
        "occ": collections.defaultdict(collections.Counter),
    },
    "ranks": {
        "lex": collections.defaultdict(dict),
        "occ": collections.defaultdict(dict),
    },
}

langs = set()

for w in F.otype.s("word"):
    occ = Fs(OCC_FEATURE).v(w)
    lex = Fs(LEX_FEATURE).v(w)
    lan = Fs(LANG_FEATURE).v(w)
    wstats["freqs"]["lex"][lan][lex] += 1
    wstats["freqs"]["occ"][lan][occ] += 1
    langs.add(lan)

# Assign ranks per language and per type (lex/occ). Items are sorted by
# descending frequency (ties broken by the string value); equal frequencies
# share a rank, and the next distinct frequency skips ahead by the size of
# the tie group ("competition" ranking, 0-based: 0, 1, 1, 3, ...).
for lan in langs:
    for tp in ["lex", "occ"]:
        rank = -1
        prev_n = -1
        amount = 1
        for (x, n) in sorted(
            wstats["freqs"][tp][lan].items(), key=lambda y: (-y[1], y[0])
        ):
            if n == prev_n:
                amount += 1
            else:
                rank += amount
                amount = 1
                prev_n = n
            wstats["ranks"][tp][lan][x] = rank


# In[11]:

utils.caption(0, "Making statistical features")

metaData = {
    "": dict(
        dataset="BHSA",
        version=VERSION,
        datasetName="Biblia Hebraica Stuttgartensia Amstelodamensis",
        author="Eep Talstra Centre for Bible and Computer",
        provenance="computed addition to core set of features",
        encoders="Dirk Roorda (TF)",
        website="https://shebanq.ancient-data.org",
        email="shebanq@ancient-data.org",
    ),
}

nodeFeatures = {}
edgeFeatures = {}

for ft in newFeatures:
    nodeFeatures[ft] = {}
    metaData.setdefault(ft, {})["valueType"] = "int"

# Every word node gets all four features.
for w in F.otype.s("word"):
    lan = Fs(LANG_FEATURE).v(w)
    occ = Fs(OCC_FEATURE).v(w)
    lex = Fs(LEX_FEATURE).v(w)
    nodeFeatures["freq_occ"][w] = str(wstats["freqs"]["occ"][lan][occ])
    nodeFeatures["rank_occ"][w] = str(wstats["ranks"]["occ"][lan][occ])
    nodeFeatures["freq_lex"][w] = str(wstats["freqs"]["lex"][lan][lex])
    nodeFeatures["rank_lex"][w] = str(wstats["ranks"]["lex"][lan][lex])

# Lex nodes (when present) inherit the lexeme statistics of their occurrences;
# all occurrences of a lexeme carry the same value, so the first one will do.
if hasLex:
    for lx in F.otype.s("lex"):
        firstOcc = L.d(lx, otype="word")[0]
        nodeFeatures["freq_lex"][lx] = nodeFeatures["freq_lex"][firstOcc]
        nodeFeatures["rank_lex"][lx] = nodeFeatures["rank_lex"][firstOcc]


# In[12]:

utils.caption(4, "Write statistical features as TF")
TF = Fabric(locations=thisTempTf, silent=True)
TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)


# # Diffs
#
# Check differences with previous versions.

# In[13]:

utils.checkDiffs(thisTempTf, thisTf, only=set(newFeatures))


# # Deliver
#
# Copy the new TF features from the temporary location where they have been
# created to their final destination.

# In[14]:

utils.deliverFeatures(thisTempTf, thisTf, newFeatures)


# # Compile TF

# In[17]:

utils.caption(4, "Load and compile the new TF features")

TF = Fabric(locations=thisTf, modules=[""])
api = TF.load("{} {}".format(LEX_FEATURE, newFeaturesStr))
api.makeAvailableIn(globals())


# # Examples

# In[16]:

utils.caption(4, "Basic test")

mostFrequent = set()
topX = 10
lexIndex = {}

# Collect the topX most frequent lexemes by walking words in order of
# decreasing freq_lex; remember one representative word node per lexeme.
utils.caption(0, "Top {} frequent lexemes (computed on otype=word)".format(topX))
for w in sorted(F.otype.s("word"), key=lambda w: -F.freq_lex.v(w)):
    lex = Fs(LEX_FEATURE).v(w)
    mostFrequent.add(lex)
    lexIndex[lex] = w
    if len(mostFrequent) == topX:
        break

mostFrequentWord = sorted((-F.freq_lex.v(lexIndex[lex]), lex) for lex in mostFrequent)
for (freq, lex) in mostFrequentWord:
    utils.caption(0, "{:<10} {:>6}x".format(lex, -freq))

if hasLex:
    utils.caption(4, "Top {} frequent lexemes (computed on otype=lex)".format(topX))
    # Use topX here (was a hard-coded 10): otherwise changing topX would make
    # the two lists differ in length and always trigger the mismatch warning.
    mostFrequentLex = sorted(
        (-F.freq_lex.v(lx), F.lex.v(lx)) for lx in F.otype.s("lex")
    )[0:topX]
    for (freq, lex) in mostFrequentLex:
        utils.caption(0, "{:<10} {:>6}x".format(lex, -freq))
    # Sanity check: the lex-node route must agree with the word-node route.
    if mostFrequentWord != mostFrequentLex:
        utils.caption(
            0, "\tWARNING: Mismatch in lexeme frequencies computed by lex vs by word"
        )
    else:
        utils.caption(0, "\tINFO: Same lexeme frequencies computed by lex vs by word")

utils.caption(0, "Done")


# In[14]:

if SCRIPT:
    stop(good=True)