#!/usr/bin/env python
# coding: utf-8
#
# # BHSA as a Big Table
# This notebook exports the [BHSA](etcbc.png) database to an R data frame.
# The nodes are exported as rows; they correspond to the text objects, such as word, phrase, clause, sentence, verse, chapter, book, and a few others.
#
# The BHSA features become the columns, so each row tells what values the features have for the corresponding object.
#
# The edges corresponding to the BHSA features *mother*, *functional_parent*, and *distributional_parent* are
# exported as extra columns. For each row, such a column indicates the target of the corresponding outgoing edge.
#
# We also write the data that says which objects are contained in which.
# To each row we add the following columns:
#
# * for each object type except `word`, a column named after that object type, containing
# the identifier of the object of that type that contains the row's object (if any), as illustrated below.
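#
# For example (an illustrative sketch: the identifiers are placeholders and most columns are omitted),
# a `word` row could look like this:
#
#     n    otype    in.phrase      in.clause      ...    in.book
#     1    word     <phrase id>    <clause id>    ...    <book id>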
#
# Extra data, such as the lexicon (including frequency and rank features), phonetic transcription, and ketiv-qere, is also included.
# We compose the big table and save it as a tab-delimited file.
# The result can be processed by R and Pandas,
# which can convert the table to their internal formats
# for quicker loading.
# It turns out that for data of this size Pandas is a bit quicker than R.
#
# Also, because we remain in a Python environment, working with Pandas
# is easier when you want to use configurations and libraries from the text-fabric sphere
# (a minimal loading example is sketched at the end of this notebook).
#
# See
# [bigTablesR](bigTablesR.ipynb)
# and
# [bigTablesP](bigTablesP.ipynb).
# In[9]:
import os
from tf.fabric import Fabric
# # Data source
# In[10]:
locations = '~/github/etcbc'
coreModule = 'bhsa'
sources = [coreModule, 'phono']
version = '2017'

# where the resulting table will be written
tempDir = os.path.expanduser('{}/{}/_temp/{}/r'.format(locations, coreModule, version))
tableFile = '{}/{}{}.txt'.format(tempDir, coreModule, version)
# In[11]:
modules = ['{}/tf/{}'.format(s, version) for s in sources]
TF = Fabric(locations=locations, modules=modules)
# # Load ALL features
# In[12]:
# First load the bare minimum, so that we can ask TF which features exist;
# then load all loadable features.
api = TF.load('')
allFeatures = TF.explore(silent=False, show=True)
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
api = TF.load(loadableFeatures)
api.makeAvailableIn(globals())
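# As a quick sanity check we can count what has been loaded:
# `Fall()` and `Eall()` list the names of all loaded node and edge features.

# In[ ]:

print('{} node features, {} edge features loaded'.format(len(Fall()), len(Eall())))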
# # Writing R data
# In[43]:
info("Writing R feature data")

if not os.path.exists(tempDir):
    os.makedirs(tempDir)

hr = open(tableFile, 'w')

# otype and oslots are structural features; otype gets its own column below
skipFeatures = '''
otype
oslots
'''.strip().split()

# also skip the language-dependent feature variants (the ones with @ in their names)
for f in (Fall() + Eall()):
    if '@' in f:
        skipFeatures.append(f)

# the object types for which every row gets a containment column
levelFeatures = '''
subphrase phrase_atom phrase clause_atom clause sentence_atom sentence
half_verse verse chapter book
'''.strip().split()
inLevelFeatures = ['in.' + x for x in levelFeatures]

allNodeFeatures = sorted(set(Fall()) - set(skipFeatures))
allEdgeFeatures = sorted(set(Eall()) - set(skipFeatures))

# header line: node number, object type, containment columns, edge columns, node feature columns
hr.write('{}\t{}\t{}\t{}\t{}\n'.format(
    'n',
    'otype',
    '\t'.join(inLevelFeatures),
    '\t'.join(allEdgeFeatures),
    '\t'.join(allNodeFeatures),
))

chunkSize = 100000
i = 0
s = 0
NA = ['']

for n in N():
    # the first containing object per level (empty if there is none)
    levelValues = [(L.u(n, otype=level) or NA)[0] for level in levelFeatures]
    # the target of the first outgoing edge per edge feature (empty if there is none)
    edgeValues = [str((Es(f).f(n) or NA)[0]) for f in allEdgeFeatures]
    nodeValues = [str(Fs(f).v(n) or '') for f in allNodeFeatures]
    hr.write('{}\t{}\t{}\t{}\t{}\n'.format(
        n,
        F.otype.v(n),
        '\t'.join(str(x) for x in levelValues),
        '\t'.join(edgeValues),
        # some feature values contain newlines: strip them, they would break the rows
        '\t'.join(nodeValues).replace('\n', ''),
    ))
    i += 1
    s += 1
    if s == chunkSize:
        s = 0
        info('{:>7} nodes written'.format(i))

hr.close()
info('{:>7} nodes written and done'.format(i))
# In[44]:
get_ipython().system('ls -lh {tempDir}')
# The R export is ready now, but the file is rather large.
# We can get a much leaner file by loading it into R and saving it in `.rds` format.
#
# We do that in a separate notebook, not running Python but R:
# [bigTablesR](bigTablesR.ipynb) in this same directory.
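#
# For Pandas users, here is a minimal sketch of the corresponding step
# (it assumes pandas is installed; the `.pkl` file name is chosen just for illustration).
# `keep_default_na=False` keeps empty cells as empty strings instead of turning them into NaN.

# In[ ]:

import pandas as pd

# load the tab-delimited export written above
bigTable = pd.read_csv(tableFile, sep='\t', keep_default_na=False, low_memory=False)

# save it in a binary format for quicker loading later on
bigTable.to_pickle('{}/{}{}.pkl'.format(tempDir, coreModule, version))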
# In[ ]: