#!/usr/bin/env python
# coding: utf-8
#
# # Statistics
#
# This notebook adds statistical features to a
# [BHSA](https://github.com/ETCBC/bhsa) dataset in
# [Text-Fabric](https://github.com/Dans-labs/text-fabric)
# format.
#
# ## Discussion
#
# We add the features
# `freq_occ`, `freq_lex`, `rank_occ`, and `rank_lex`.
#
# We assume that the dataset has these features present:
#
# * LANG_FEATURE (typically `language`) to determine whether a word is Hebrew or Aramaic
# * OCC_FEATURE (typically `g_cons`) to get the word string in consonantal transcription
# * LEX_FEATURE (typically `lex`) to get the lexical identifier in consonantal transcription
#
# This program works for all datasets and versions that have these features with the
# intended meanings. The exact names of these features can be passed as parameters.
# Note that the old version `3` uses very different names for many features.
#
# #### Languages
# We will not identify lexemes or word occurrences across languages.
# So if two occurrences or lexemes exhibit the same string but are categorized as belonging
# to different languages, they will not be identified with each other.
#
# #### Occurrences
# We group occurrences by their consonantal transcription.
# So if two occurrences differ only in pointing, they count as occurrences of the same value.
#
# #### Lexemes
# Lexemes are identified by the `lex` feature within a biblical language.
# We will not identify lexemes across languages.
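#
# For illustration, here is a minimal sketch of this grouping policy, with hypothetical
# transcriptions (the real code below keys nested dictionaries by language instead of
# using tuple keys):
#
#     from collections import Counter
#
#     freqOcc = Counter()
#     for (lan, cons) in [('Hebrew', 'MLK'), ('Hebrew', 'MLK'), ('Aramaic', 'MLK')]:
#         freqOcc[(lan, cons)] += 1
#
#     # the two Hebrew occurrences (which may differ in pointing) share one key,
#     # while the identically written Aramaic occurrence is counted separately
#     assert freqOcc[('Hebrew', 'MLK')] == 2
#     assert freqOcc[('Aramaic', 'MLK')] == 1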
# In[1]:
import collections
import os
import sys
import utils
from tf.fabric import Fabric
# # Pipeline
# See [operation](https://github.com/ETCBC/pipeline/blob/master/README.md#operation)
# for how to run this script in the pipeline.
# In[2]:
if 'SCRIPT' not in locals():
    SCRIPT = False
    FORCE = True
    CORE_NAME = 'bhsa'
    VERSION = 'c'
    LANG_FEATURE = 'language'
    OCC_FEATURE = 'g_cons'
    LEX_FEATURE = 'lex'

def stop(good=False):
    if SCRIPT: sys.exit(0 if good else 1)
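# When this file is run interactively, `SCRIPT` is undefined and the defaults above apply.
# The pipeline is expected to define `SCRIPT` (and its own values for the other parameters)
# before executing this code, so that the block above is skipped. A hypothetical driver
# (see the pipeline README linked above for the actual operation):
#
#     SCRIPT = True
#     FORCE = False
#     CORE_NAME = 'bhsa'
#     VERSION = '2017'
#     LANG_FEATURE = 'language'
#     OCC_FEATURE = 'g_cons'
#     LEX_FEATURE = 'lex'
#     exec(open('stats.py').read())  # hypothetical script name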
# # Setting up the context: source file and target directories
#
# The conversion is executed in an environment of directories, so that sources, temp files and
# results are in convenient places and do not have to be shifted around.
# In[3]:
repoBase = os.path.expanduser('~/github/etcbc')
thisRepo = '{}/{}'.format(repoBase, CORE_NAME)
thisTemp = '{}/_temp/{}'.format(thisRepo, VERSION)
thisTempTf = '{}/tf'.format(thisTemp)
thisTf = '{}/tf/{}'.format(thisRepo, VERSION)
# In[4]:
newFeaturesStr = '''
freq_occ
freq_lex
rank_occ
rank_lex
'''
newFeatures = newFeaturesStr.strip().split()
# # Test
#
# Check whether this conversion is needed in the first place.
# Only when run as a script.
# In[5]:
if SCRIPT:
    (good, work) = utils.mustRun(None, '{}/.tf/{}.tfx'.format(thisTf, newFeatures[0]), force=FORCE)
    if not good: stop(good=False)
    if not work: stop(good=True)
# # Collect
#
# We collect the statistics.
# In[6]:
utils.caption(4, 'Loading relevant features')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('{} {} {}'.format(LANG_FEATURE, LEX_FEATURE, OCC_FEATURE))
api.makeAvailableIn(globals())
hasLex = 'lex' in set(F.otype.all)
# In[7]:
utils.caption(0, 'Counting occurrences')
wstats = {
    'freqs': {
        'lex': collections.defaultdict(collections.Counter),
        'occ': collections.defaultdict(collections.Counter),
    },
    'ranks': {
        'lex': collections.defaultdict(dict),
        'occ': collections.defaultdict(dict),
    },
}
langs = set()
for w in F.otype.s('word'):
    occ = Fs(OCC_FEATURE).v(w)
    lex = Fs(LEX_FEATURE).v(w)
    lan = Fs(LANG_FEATURE).v(w)
    wstats['freqs']['lex'][lan][lex] += 1
    wstats['freqs']['occ'][lan][occ] += 1
    langs.add(lan)
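# At this point wstats['freqs'][tp][lan] maps each string value to its number of
# occurrences. With hypothetical keys:
#
#     wstats['freqs']['occ']['Hebrew']['MLK']   # how often the consonantal string MLK occurs in Hebrew
#     wstats['freqs']['lex']['Hebrew']['MLK/']  # how often the lexeme MLK/ occurs in Hebrew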
for lan in langs:
    for tp in ['lex', 'occ']:
        rank = -1
        prev_n = -1
        amount = 1
        for (x, n) in sorted(wstats['freqs'][tp][lan].items(), key=lambda y: (-y[1], y[0])):
            if n == prev_n:
                amount += 1
            else:
                rank += amount
                amount = 1
            prev_n = n
            wstats['ranks'][tp][lan][x] = rank
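# The ranking is competition style: values are sorted by descending frequency (ties broken
# by the string), tied values share a rank, and the next distinct frequency skips ahead by
# the number of tied values. A worked example with hypothetical frequencies:
#
#     freqs:  A: 5   B: 3   C: 3   D: 1
#     ranks:  A: 0   B: 1   C: 1   D: 3   (rank 2 is skipped because B and C tie)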
# In[8]:
utils.caption(0, 'Making statistical features')
metaData = {
    '': dict(
        dataset='BHSA',
        version=VERSION,
        datasetName='Biblia Hebraica Stuttgartensia Amstelodamensis',
        author='Eep Talstra Centre for Bible and Computer',
        provenance='computed addition to core set of features',
        encoders='Dirk Roorda (TF)',
        website='https://shebanq.ancient-data.org',
        email='shebanq@ancient-data.org',
    ),
}
nodeFeatures = {}
edgeFeatures = {}
for ft in newFeatures:
    nodeFeatures[ft] = {}
    metaData.setdefault(ft, {})['valueType'] = 'int'
for w in F.otype.s('word'):
    lan = Fs(LANG_FEATURE).v(w)
    occ = Fs(OCC_FEATURE).v(w)
    lex = Fs(LEX_FEATURE).v(w)
    nodeFeatures['freq_occ'][w] = str(wstats['freqs']['occ'][lan][occ])
    nodeFeatures['rank_occ'][w] = str(wstats['ranks']['occ'][lan][occ])
    nodeFeatures['freq_lex'][w] = str(wstats['freqs']['lex'][lan][lex])
    nodeFeatures['rank_lex'][w] = str(wstats['ranks']['lex'][lan][lex])
if hasLex:
    for lx in F.otype.s('lex'):
        firstOcc = L.d(lx, otype='word')[0]
        nodeFeatures['freq_lex'][lx] = nodeFeatures['freq_lex'][firstOcc]
        nodeFeatures['rank_lex'][lx] = nodeFeatures['rank_lex'][firstOcc]
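# All occurrences under a given lex node share one language and lexeme, so they all carry
# the same freq_lex and rank_lex; copying from the first occurrence is therefore safe.
# A sanity check one could run here (sketch, assuming the loops above have filled
# nodeFeatures):
#
#     for lx in F.otype.s('lex'):
#         values = {nodeFeatures['freq_lex'][w] for w in L.d(lx, otype='word')}
#         assert len(values) == 1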
# In[9]:
utils.caption(4, 'Write statistical features as TF')
TF = Fabric(locations=thisTempTf, silent=True)
TF.save(nodeFeatures=nodeFeatures, edgeFeatures=edgeFeatures, metaData=metaData)
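# Each feature ends up as a plain text file, e.g. freq_occ.tf, carrying the metadata
# above and then one value per line, implicitly assigned to nodes 1, 2, ...
# Roughly like this (illustrative values only):
#
#     @node
#     @author=Eep Talstra Centre for Bible and Computer
#     ...
#     @valueType=int
#
#     363
#     17
#     ...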
# # Diffs
#
# Check differences with previous versions.
# In[10]:
utils.checkDiffs(thisTempTf, thisTf, only=set(newFeatures))
# # Deliver
#
# Copy the new TF features from the temporary location where they have been created to their final destination.
# In[11]:
utils.deliverFeatures(thisTempTf, thisTf, newFeatures)
# # Compile TF
# In[12]:
utils.caption(4, 'Load and compile the new TF features')
TF = Fabric(locations=thisTf, modules=[''])
api = TF.load('{} {}'.format(LEX_FEATURE, newFeaturesStr))
api.makeAvailableIn(globals())
# # Examples
# In[13]:
utils.caption(4, 'Basic test')
mostFrequent = set()
topX = 10
lexIndex = {}
utils.caption(0, 'Top {} frequent lexemes (computed on otype=word)'.format(topX))
for w in sorted(F.otype.s('word'), key=lambda w: -F.freq_lex.v(w)):
    lex = Fs(LEX_FEATURE).v(w)
    mostFrequent.add(lex)
    lexIndex[lex] = w
    if len(mostFrequent) == topX: break
mostFrequentWord = sorted((-F.freq_lex.v(lexIndex[lex]), lex) for lex in mostFrequent)
for (freq, lex) in mostFrequentWord:
    utils.caption(0, '{:<10} {:>6}x'.format(lex, -freq))
if hasLex:
    utils.caption(4, 'Top {} frequent lexemes (computed on otype=lex)'.format(topX))
    mostFrequentLex = sorted((-F.freq_lex.v(lx), F.lex.v(lx)) for lx in F.otype.s('lex'))[0:topX]
    for (freq, lex) in mostFrequentLex:
        utils.caption(0, '{:<10} {:>6}x'.format(lex, -freq))
    if mostFrequentWord != mostFrequentLex:
        utils.caption(0, '\tWARNING: Mismatch in lexeme frequencies computed by lex vs by word')
    else:
        utils.caption(0, '\tINFO: Same lexeme frequencies computed by lex vs by word')
utils.caption(0, 'Done')