#!/usr/bin/env python
# coding: utf-8

# # BHSA as a Big Table

# This notebook exports the [BHSA](etcbc.png) database to an R data frame.
# The nodes are exported as rows; they correspond to the text objects, such as word, phrase, clause, sentence, verse, chapter, book, and a few others.
#
# The BHSA features become the columns, so each row tells what values the features have for the corresponding object.
#
# The edges corresponding to the BHSA features *mother*, *functional_parent*, and *distributional_parent* are
# exported as extra columns. For each row, such a column indicates the target of the corresponding outgoing edge.
#
# We also write the data that says which objects are contained in which.
# To each row we add the following columns:
#
# * for each object type except `word`, there is a column named after that object type (prefixed with `in.`), containing
#   the identifier of the containing object of that type for the row object (if any).
#
# Extra data, such as the lexicon (including frequency and rank features), the phonetic transcription, and the ketiv-qere data, is also included.

# We compose the big table and save it as a tab-delimited file.
# The result can be processed by R and Pandas,
# which can convert the table to their internal formats
# for quicker loading.
# It turns out that for data of this size Pandas is a bit quicker than R.
#
# Also, because we remain in a Python environment, working with Pandas
# is easier when you want to use configurations and libraries from the Text-Fabric sphere.
#
# See
# [bigTablesR](bigTablesR.ipynb)
# and
# [bigTablesP](bigTablesP.ipynb)

# In[9]:


import os, sys, collections
from tf.fabric import Fabric


# # Data source

# In[10]:


locations = '~/github/etcbc'
coreModule = 'bhsa'
sources = [coreModule, 'phono']
version = '2017'
tempDir = os.path.expanduser('{}/{}/_temp/{}/r'.format(locations, coreModule, version))
tableFile = '{}/{}{}.txt'.format(tempDir, coreModule, version)


# In[11]:


modules = ['{}/tf/{}'.format(s, version) for s in sources]
TF = Fabric(locations=locations, modules=modules)


# # Load ALL features

# In[12]:


# First load the bare minimum, so that we can explore what is available,
# then load all loadable node and edge features.
api = TF.load('')
allFeatures = TF.explore(silent=False, show=True)
loadableFeatures = allFeatures['nodes'] + allFeatures['edges']
api = TF.load(loadableFeatures)
api.makeAvailableIn(globals())


# # Writing R data

# In[43]:


info("Writing R feature data")
if not os.path.exists(tempDir):
    os.makedirs(tempDir)

hr = open(tableFile, 'w')

skipFeatures = '''
    otype
    oslots
'''.strip().split()

# Also skip the language-dependent features (they have an @ in their names)
for f in (Fall() + Eall()):
    if '@' in f:
        skipFeatures.append(f)

levelFeatures = '''
    subphrase phrase_atom phrase clause_atom clause
    sentence_atom sentence half_verse verse chapter book
'''.strip().split()
inLevelFeatures = ['in.' + x for x in levelFeatures]

allNodeFeatures = sorted(set(Fall()) - set(skipFeatures))
allEdgeFeatures = sorted(set(Eall()) - set(skipFeatures))

# Header line: node number, object type, containment columns, edge columns, node feature columns
hr.write('{}\t{}\t{}\t{}\t{}\n'.format(
    'n',
    'otype',
    '\t'.join(inLevelFeatures),
    '\t'.join(allEdgeFeatures),
    '\t'.join(allNodeFeatures),
))

chunkSize = 100000
i = 0
s = 0
NA = ['']
NAe = [['']]

for n in N():
    # for each object type: the identifier of the containing object of that type, if any
    levelValues = [(L.u(n, otype=level) or NA)[0] for level in levelFeatures]
    # for each edge feature: the target of the outgoing edge, if any
    edgeValues = [str((Es(f).f(n) or NA)[0]) for f in allEdgeFeatures]
    # for each node feature: its value, or the empty string
    nodeValues = [str(Fs(f).v(n) or '') for f in allNodeFeatures]
    hr.write('{}\t{}\t{}\t{}\t{}\n'.format(
        n,
        F.otype.v(n),
        '\t'.join(str(x) for x in levelValues),
        '\t'.join(edgeValues),
        '\t'.join(nodeValues).replace('\n', ''),
    ))
    i += 1
    s += 1
    if s == chunkSize:
        s = 0
        info('{:>7} nodes written'.format(i))
hr.close()
info('{:>7} nodes written and done'.format(i))
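# Before looking at the file size, a quick sanity check on the export.
# The next cell is a minimal sketch, not part of the original pipeline:
# it assumes the export above has completed, and it peeks at the first
# rows of the table with Pandas.

# In[ ]:


import pandas as pd

# Read only a handful of rows: the full table is large.
# Empty cells mean "no value", so suppress the NaN conversion.
preview = pd.read_csv(
    tableFile,
    sep='\t',
    nrows=5,
    dtype=str,
    keep_default_na=False,
)
print(preview.shape)
preview.head()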
# In[44]:


get_ipython().system('ls -lh {tempDir}')


# The R export is ready now, but it is a bit large.
# We can get a much leaner file by using R to load this file and save it in `.rds` format.
#
# We do that in a separate notebook, not running Python, but R: [bigTablesR](bigTablesR.ipynb) in this same directory.

# In[ ]:
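# For the Pandas route, the analogue of the `.rds` step is to save the table
# in a binary format. What follows is a minimal sketch of that idea; the
# output path `pandasFile` is hypothetical, and [bigTablesP](bigTablesP.ipynb)
# does the real conversion.

import pandas as pd

pandasFile = '{}/{}{}.pkl'.format(tempDir, coreModule, version)  # hypothetical path

# Parse the tab-delimited export once, keeping empty cells as empty strings,
table = pd.read_csv(tableFile, sep='\t', dtype=str, keep_default_na=False)

# and store it as a pickle, which reloads much faster than re-parsing the text.
table.to_pickle(pandasFile)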