#!/usr/bin/env python
# coding: utf-8

# # Converter file

# ## Latest and Greatest

# In[184]:

import os
import re
import collections
import json
import csv
# from glob import glob
from tf.fabric import Fabric
from tf.convert.walker import CV
# from tf.compose import modify

source_dirs = 'input'    # "input" is the name of the input folder that contains the source file
output_dirs = 'output'   # "output" is the name of the output folder into which the finished TF files are written

# "OTt4" is the name of the source file in the input folder; split() splits on whitespace
bo2book = {line.split()[0]: line.split()[1] for line in '''
OTt4 Old_Testament
'''.split('\n') if line}

# patts = {'section': re.compile('(\d*):(\d*)\.(\d*)')}
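# In[ ]:

# A minimal sketch (toy data, not from the corpus) of the line format the
# director below expects: each word is one tab-separated row whose first 26
# fields are word data, and any remaining fields are joined into a single
# morphology string.
demo_row = '\t'.join(f'col{i}' for i in range(1, 29))  # 28 hypothetical fields
demo_data = demo_row.split('\t')
demo_word_data = demo_data[:26]             # the 26 fixed columns
demo_morphology = ' '.join(demo_data[26:])  # everything after column 26
assert len(demo_word_data) == 26 and demo_morphology == 'col27 col28'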
def director(cv):
    '''
    Walks through the LXX source and triggers slot and node creation events.
    '''

    # process books in order
    for bo, book in bo2book.items():
        book_loc = os.path.join(source_dirs, f'{bo}.txt')

        print(f'\thandling {book_loc}...')

        with open(book_loc, 'r', encoding="utf8") as infile:
            text = [w for w in infile.read().split('\n') if w]

        this_book = cv.node('book')

        # keep track of when to trigger paragraph, chapter, and verse objects
        # para_track = 1   # keep count of paragraphs
        prev_book = "Gen"  # start at Genesis
        prev_chap = 1      # start at 1
        prev_verse = 1     # start at 1
        prev_subverse = ''
        wrdnum = 0         # start at 0

        this_chap = cv.node('chapter')
        # this_para = cv.node('paragraph')
        this_verse = cv.node('verse')
        this_subverse = cv.node('subverse')

        # iterate through words and construct objects
        for word in text:
            wrdnum += 1
            data = word.split('\t')

            # word_data, lemmas = data[:7], data[7:]
            word_data = data[:26]             # the first 26 tab-separated columns
            morphology = ' '.join(data[26:])  # any trailing columns joined; note that
                                              # the unpacking below reassigns
                                              # `morphology` from column 13 of word_data

            # segment out word data
            # bo_code, ref, brake, ketiv, qere, morph, strongs = word_data
            (orig_order, book, chapter, verse, subverse, word, lex_utf8,
             g_cons_utf8, translit_SBL, lemma_gloss, strong, sp, morphology,
             case, nu, gn, degree, tense, voice, mood, ps, lemma_translit,
             abc_order, freq_lemma, BOL_lexeme_dict, BOL_gloss) = word_data

            # if chapter == "Prolog":
            #     chapter = 0
            #     subverse = ""
            # try:
            #     verse = int(verse)
            # except ValueError:
            #     subverse = verse[-1:]
            #     verse = verse[:-1]
            if verse == "":
                print(f'{orig_order}: {verse} {subverse}')

            # strongs_lemma, anlex_lemma = ' '.join(lemmas).split('!')  # reconstitute lemmas and split on !
            # chapt, verse, wrdnum = [int(v) for v in patts['section'].match(ref).groups()]

            # -- handle TF events --

            # detect book boundary
            if prev_book != book:
                # end subverse
                cv.feature(this_subverse, subverse=prev_subverse)
                cv.terminate(this_subverse)
                # end verse
                cv.feature(this_verse, verse=prev_verse)
                cv.terminate(this_verse)
                # end chapter
                cv.feature(this_chap, chapter=prev_chap)
                cv.terminate(this_chap)
                # end book
                cv.feature(this_book, book=prev_book)
                cv.terminate(this_book)
                # new book, chapter, verse, and subverse begin
                this_book = cv.node('book')
                prev_book = book
                this_chap = cv.node('chapter')
                prev_chap = chapter
                this_verse = cv.node('verse')
                prev_verse = verse
                this_subverse = cv.node('subverse')
                prev_subverse = subverse
                wrdnum = 1

            # detect chapter boundary
            elif prev_chap != chapter:
                # end subverse
                cv.feature(this_subverse, subverse=prev_subverse)
                cv.terminate(this_subverse)
                # end verse
                cv.feature(this_verse, verse=prev_verse)
                cv.terminate(this_verse)
                # end chapter
                cv.feature(this_chap, chapter=prev_chap)
                cv.terminate(this_chap)
                # new chapter, verse, and subverse begin
                this_chap = cv.node('chapter')
                prev_chap = chapter
                this_verse = cv.node('verse')
                prev_verse = verse
                this_subverse = cv.node('subverse')
                prev_subverse = subverse
                wrdnum = 1

            # detect verse boundary
            elif prev_verse != verse:
                # end subverse
                cv.feature(this_subverse, subverse=prev_subverse)
                cv.terminate(this_subverse)
                # end verse
                cv.feature(this_verse, verse=prev_verse)
                cv.terminate(this_verse)
                # new verse and subverse begin
                this_verse = cv.node('verse')
                prev_verse = verse
                this_subverse = cv.node('subverse')
                prev_subverse = subverse
                wrdnum = 1

            # detect subverse boundary
            elif prev_subverse != subverse:
                cv.feature(this_subverse, subverse=prev_subverse)
                cv.terminate(this_subverse)
                this_subverse = cv.node('subverse')
                prev_subverse = subverse

            # detect paragraph boundary
            # if brake == 'P':
            #     cv.feature(this_para, para=para_track)
            #     cv.terminate(this_para)
            #     this_para = cv.node('paragraph')  # start a new paragraph
            #     para_track += 1                   # count paragraphs in the book

            # make word object
            this_word = cv.slot()
            cv.feature(this_word,
                       orig_order=orig_order,
                       book=book,
                       chapter=chapter,
                       verse=verse,
                       subverse=subverse,
                       word=word,
                       lex_utf8=lex_utf8,
                       g_cons_utf8=g_cons_utf8,
                       translit_SBL=translit_SBL,
                       lemma_gloss=lemma_gloss,
                       strong=strong,
                       sp=sp,
                       morphology=morphology,
                       case=case,
                       nu=nu,
                       gn=gn,
                       degree=degree,
                       tense=tense,
                       voice=voice,
                       mood=mood,
                       ps=ps,
                       lemma_translit=lemma_translit,
                       abc_order=abc_order,
                       freq_lemma=freq_lemma,
                       BOL_lexeme_dict=BOL_lexeme_dict,
                       BOL_gloss=BOL_gloss,
                       # ketiv=ketiv,
                       # qere=qere,
                       # strongs=strongs,
                       # str_lem=strongs_lemma.strip(),
                       # anlex_lem=anlex_lemma.strip()
                       )
            cv.terminate(this_word)

        # end book and its objects
        # - end subverse
        cv.feature(this_subverse, subverse=prev_subverse)
        cv.terminate(this_subverse)
        # - end verse
        cv.feature(this_verse, verse=prev_verse)
        cv.terminate(this_verse)
        # - end paragraph
        # cv.feature(this_para, para=para_track)
        # cv.terminate(this_para)
        # - end chapter
        cv.feature(this_chap, chapter=prev_chap)
        cv.terminate(this_chap)
        # - end book
        cv.feature(this_book, book=prev_book)
        cv.terminate(this_book)
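# In[ ]:

# For orientation, a minimal, hypothetical director illustrating the walker
# event pattern used above: cv.node() opens a container, cv.slot() creates a
# word slot, cv.feature() attaches feature values, and cv.terminate() closes
# a node. This sketch assumes a two-word, one-verse corpus; like director,
# it is only ever called by cv.walk, not directly.
def tiny_director(cv):
    verse = cv.node('verse')         # open a verse container
    for token in ('alpha', 'beta'):  # two hypothetical words
        w = cv.slot()                # create the next word slot
        cv.feature(w, word=token)    # attach the text feature
        cv.terminate(w)              # close the slot
    cv.feature(verse, verse=1)       # label the verse ...
    cv.terminate(verse)              # ... and close it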
slotType = 'word'

otext = {'fmt:text-orig-full': '{word} ',
         'sectionTypes': 'book,chapter,verse',
         'sectionFeatures': 'book,chapter,verse'}

generic = {'Name': 'LXX',
           'Version': '1935',
           'Author': 'Rahlfs',
           'Editors': 'CCAT, Eliran Wong',
           'Converter': 'Adrian Negrea, Oliver Glanz',
           'Source': 'https://github.com/eliranwong/LXX-Rahlfs-1935',
           'Note': '?'}

intFeatures = {'chapter', 'verse'}

featureMeta = {
    'orig_order': {'description': 'original word order in corpus'},
    'book': {'description': 'book'},
    'chapter': {'description': 'chapter'},
    'verse': {'description': 'verse'},
    'subverse': {'description': 'subverse'},
    'word': {'description': 'text realized word'},
    'lex_utf8': {'description': 'normalized word'},
    'g_cons_utf8': {'description': 'word without accents'},
    'translit_SBL': {'description': 'SBL transliteration'},
    'lemma_gloss': {'description': 'English gloss'},
    'strong': {'description': 'Strong numbers'},
    'sp': {'description': 'part of speech'},
    'morphology': {'description': 'morphology'},
    'case': {'description': 'case'},
    'nu': {'description': 'number'},
    'gn': {'description': 'gender'},
    'degree': {'description': 'degree'},
    'tense': {'description': 'tense'},
    'voice': {'description': 'voice'},
    'mood': {'description': 'mood'},
    'ps': {'description': 'person'},
    'lemma_translit': {'description': 'lemma transliteration'},
    'abc_order': {'description': 'dictionary order'},
    'freq_lemma': {'description': 'frequency of word in corpus'},
    'BOL_lexeme_dict': {'description': 'BOL dictionary form of lemma'},
    'BOL_gloss': {'description': 'BOL English gloss'},
    # 'para': {'description': 'A paragraph number'},
    # 'ketiv': {'description': 'The text as it is written in the printed Tischendorf'},
    # 'qere': {'description': 'The text as the editor thinks it should have been'},
    # 'strongs': {'description': "A word's number in Strongs"},
    # 'str_lem': {'description': "Word lemma that corresponds to The NEW Strong's Complete Dictionary of Bible Words"},
    # 'anlex_lem': {'description': "Word lemma that corresponds to Friberg, Friberg and Miller's ANLEX"},
}

# configure metadata/output
version = '1935'
generic['Version'] = version
output = os.path.join(output_dirs, version)

print(f'Processing Version {version}')

# note: output_dirs contains no {version} placeholder, so this resolves to
# plain 'output'; the versioned path computed above in `output` is unused
output_dir = output_dirs.format(version=version)

TF = Fabric(locations=output_dir, silent=True)
cv = CV(TF)

cv.walk(director,
        slotType,
        otext=otext,
        generic=generic,
        intFeatures=intFeatures,
        featureMeta=featureMeta,
        warn=True,
        force=False,
        )
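# In[ ]:

# Optional sanity check (a sketch; it assumes the walk above completed and
# wrote its .tf files to output_dir): reload the data with a fresh Fabric
# instance and load a few features by name.
TFcheck = Fabric(locations=output_dir, silent=True)
apicheck = TFcheck.load('book chapter verse word', silent=True)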
# In[6]:

# First, I load the different modules that I use for analyzing and plotting the data:
import sys, os, collections
import pandas as pd
import numpy as np
import re
import csv
import seaborn as sns
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib.pyplot import figure
from collections import Counter

# Second, I load the Text-Fabric machinery and app loader
from tf.fabric import Fabric
from tf.app import use


# In[133]:

featureadd = pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/LXX_source_v1.3.xlsx', sheet_name='FULL_data')
pd.set_option('display.max_columns', 50)
featureadd.head(10)


# In[134]:

from unidecode import unidecode


# In[135]:

featureadd['lemma_translit'] = featureadd['lex_utf8'].apply(unidecode)
featureadd.head(5)


# In[136]:

ABC1 = featureadd[['orig_order', 'lex_utf8']]
ABC1.head(5)


# In[137]:

ABC1.describe()


# In[138]:

ABCdict = ABC1.drop_duplicates(['lex_utf8']).sort_values(by='lex_utf8', ascending=[True])
ABCdict.head(10)


# In[139]:

ABCdict.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/feature-dev/ABC1order.xlsx')


# In[140]:

ABC2 = pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/feature-dev/ABC2order.xlsx')
pd.set_option('display.max_columns', 50)
ABC2.head(10)


# In[141]:

ABC2 = ABC2.drop(['orig_order'], axis=1)
ABC2.head()


# In[142]:

featureadd.describe()


# In[143]:

featureadd = pd.merge(featureadd, ABC2, on='lex_utf8', how='outer')
featureadd.head(5)


# In[144]:

featureadd.describe()


# In[159]:

featureaddstage2 = featureadd
featureaddstage2.head(5)


# In[160]:

featureaddstage2.describe()


# In[161]:

featureaddstage2["freq_lemma"] = featureaddstage2.groupby(["lex_utf8"])["lex_utf8"].transform("count")
featureaddstage2.head(5)


# In[162]:

featureaddstage2.describe()


# In[163]:

featureaddstage2.sort_values(['orig_order'], ascending=True).head(10)


# In[164]:

featureaddstage2.describe()
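# In[ ]:

# A toy illustration (made-up lemmas, not corpus data) of the
# groupby/transform pattern used in In[161] above: transform('count')
# returns one value per row, so every occurrence of a lemma receives that
# lemma's total frequency.
import pandas as pd
toy = pd.DataFrame({'lex_utf8': ['θεός', 'λόγος', 'θεός']})
toy['freq_lemma'] = toy.groupby('lex_utf8')['lex_utf8'].transform('count')
print(toy)  # the θεός rows get 2, the λόγος row gets 1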
# In[165]:

BOLgreekDICT = pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/NA1904_dictionary_v1.0.xlsx')
pd.set_option('display.max_columns', 50)
BOLgreekDICT.head(10)


# In[166]:

BOLgreekDICT = BOLgreekDICT[['Lexeme', 'Lexeme_dict', 'gloss']]
BOLgreekDICT.head(10)


# In[174]:

BOLgreekDICT = BOLgreekDICT.rename({'Lexeme': 'lex_utf8',
                                    'Lexeme_dict': 'BOL_lexeme_dict',
                                    'gloss': 'BOL_gloss'}, axis=1)
BOLgreekDICT.head(5)


# In[175]:

BOLgreekDICT.describe()


# In[176]:

featureaddstage3 = featureaddstage2


# In[177]:

featureaddstage3.describe()


# In[178]:

featureaddstage4 = pd.merge(featureaddstage3, BOLgreekDICT, on='lex_utf8', how='left')
featureaddstage4.head(5)


# In[180]:

featureaddstage4.describe()


# In[181]:

featureaddstage4 = featureaddstage4.drop_duplicates(['orig_order']).sort_values(by='orig_order', ascending=[True])
featureaddstage4.head(10)


# In[182]:

featureaddstage4.describe()


# In[183]:

featureaddstage4.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/LXX_source_v1.4.xlsx')


# # Testing

# In[185]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[1]:

# First, I load the different modules that I use for analyzing and plotting the data:
import sys, os, collections
import pandas as pd
import numpy as np
import re
import csv
import seaborn as sns
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib.pyplot import figure
from collections import Counter

# Second, I load the Text-Fabric machinery and app loader
from tf.fabric import Fabric
from tf.app import use


# In[4]:

# LXX = use('CCATLXX/tf/1994_v1', hoist=globals())
LXX = use('CCATLXX/tf/1994_v2', hoist=globals())
LXX = use('D:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/CCATLXX/tf/1994_v2', hoist=globals())


# In[3]:

Search0 = '''
book book=Gen
  chapter chapter=1
    verse verse=1
      word
'''
Search0 = LXX.search(Search0)
LXX.show(Search0, start=1, end=1, condensed=True, colorMap={1: 'pink'},
         extraFeatures={'orig_order', 'book', 'chapter', 'verse', 'subverse',
                        'word', 'lex_utf8', 'g_cons_utf8', 'translit_SBL',
                        'lemma_gloss', 'strong', 'sp', 'morphology', 'case',
                        'nu', 'gn', 'degree', 'tense', 'voice', 'mood', 'ps',
                        'lemma_translit', 'abc_order', 'freq_lemma',
                        'BOL_lexeme_dict', 'BOL_gloss'})


# In[7]:

Search1 = '''
verse book=Gen chapter=1 verse=1
  word
'''
Search1 = LXX.search(Search1)
LXX.show(Search1, start=1, end=1, condensed=True, colorMap={1: 'pink'},
         extraFeatures={'subverse'})


# In[8]:

Search2 = '''
book book=Gen
  chapter chapter=1
    verse verse=1
      word word* lex_utf8 g_cons_utf8 morphology* translit_SBL
'''
Search2 = LXX.search(Search2)
LXX.show(Search2, start=1, end=1, condensed=True, colorMap={1: 'pink'},
         extraFeatures={'subverse'})


# In[18]:

Search3 = '''
book book=Num
  chapter chapter=21
    verse verse=3
'''
Search3 = LXX.search(Search3)
LXX.show(Search3, start=1, end=1, condensed=True, colorMap={1: 'pink'},
         extraFeatures={'word', 'lemma_translit', 'case'})


# In[39]:

Eisakouw = '''
book book#Esth
  verse
    word lemma_translit=epakouo
    word lemma_translit=theos|kurios
    word lemma_translit=phone case=Gen
'''
Eisakouw = LXX.search(Eisakouw)
LXX.show(Eisakouw, start=1, end=100, condensed=True, colorMap={1: 'pink'},
         extraFeatures={'word', 'lemma_translit', 'case', 'lex_utf8', 'BOL_lexeme_dict'})
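# In[ ]:

# A note on the query syntax (Text-Fabric search templates): indentation
# expresses containment, `=` filters a feature to a value, `#` means "not
# equal" (book#Esth above excludes Esther), and `|` separates alternative
# values (theos|kurios). A minimal hypothetical template in the same style:
SearchDemo = '''
book book=Gen
  verse
    word lemma_translit=theos
'''
# SearchDemo = LXX.search(SearchDemo)  # run only with the LXX app loaded above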
# # New Feature Development

# # Morphology

# ## MorphGNT:sblgnt Data Description
# Here: https://github.com/morphgnt/sblgnt

# In[406]:

# NB: translitadd is assumed to have been loaded in an earlier cell that is
# not part of this export.
translitadd['lemma'] = translitadd[0]
translitadd.head(5)


# In[407]:

translitadd = translitadd[['lemma']]
translitadd.head(5)


# In[408]:

translitadd['orig_order'] = translitadd.index + 1
translitadd.head(5)


# In[409]:

from unidecode import unidecode


# In[410]:

s = "βίβλος"
s = unidecode(s)
print(s)


# In[411]:

translitadd['translit'] = translitadd['lemma'].apply(unidecode)


# In[412]:

translitadd.head(5)


# In[413]:

translitadd['translit'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_translit.tf', index=None)


# # ABC dictionary order

# In[414]:

ABC1 = pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf', header=None, delimiter='\t', encoding='utf-8')
pd.set_option('display.max_columns', 50)
ABC1.head(10)


# In[415]:

ABC1['lemma'] = ABC1[0]  # was `lemma[0]`, which is undefined at this point
ABC1.head(5)


# In[416]:

ABC1['orig_order'] = ABC1.index + 1
ABC1.head(5)


# In[417]:

ABC1 = ABC1[['orig_order', 'lemma']]
ABC1.head(5)


# In[418]:

ABC1.describe()


# In[419]:

ABCdict = ABC1.drop_duplicates(['lemma']).sort_values(by='lemma', ascending=[True])
ABCdict.head(10)


# In[420]:

ABCdict.describe()


# In[421]:

ABC1.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ABC1order.xlsx', encoding='utf-8')


# Now I am ordering the words alphabetically with LibreOffice, since I cannot do that in pandas (yet?).

# In[422]:

ABC2 = pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ABC2order.xlsx')
pd.set_option('display.max_columns', 50)
ABC2.head(10)


# Now we merge the ABC-order dataframe with the original lemma dataframe.

# In[423]:

lemma_ABC = pd.merge(ABC1, ABC2, on='lemma', how='outer')
lemma_ABC.head(5)


# In[424]:

lemma_ABC.describe()


# In[425]:

lemma_ABC.sort_values(['orig_order_x'], ascending=True).head(10)


# In[426]:

lemma_ABC.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_abc.xlsx')


# # Word Frequency

# In[427]:

frequencyadd = pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf', header=None, delimiter='\t', encoding='utf-8')
pd.set_option('display.max_columns', 50)
frequencyadd.head(20)


# In[428]:

frequencyadd['orig_order'] = frequencyadd.index + 1
frequencyadd['lemma'] = frequencyadd[0]
frequencyadd = frequencyadd[['orig_order', 'lemma']]
frequencyadd.head(5)


# In[429]:

frequencyadd["freq_lemma"] = frequencyadd.groupby(["lemma"])["lemma"].transform("count")  # "count" invokes pandas' count aggregation
frequencyadd.head(20)


# In[430]:

frequencyadd.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_freq.xlsx')


# # English Dictionary

# Let's first load the NA1904 BibleOL dictionary:

# In[431]:

BOLgreekDICT = pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/NA1904_dictionary_v1.0.xlsx')
pd.set_option('display.max_columns', 50)
BOLgreekDICT.head(20)


# In[432]:

BOLgreekDICT = BOLgreekDICT[['Lexeme', 'Lexeme_dict', "Strong's number", 'gloss']]
BOLgreekDICT.head(10)


# In[433]:

BOLgreekDICT.describe()


# Let's load the SBLGNT lemmas:

# In[434]:

SBLGNTlemmas = pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf', header=None, delimiter='\t', encoding='utf-8')
pd.set_option('display.max_columns', 50)
SBLGNTlemmas.head(2)


# In[436]:

SBLGNTlemmas['orig_order'] = SBLGNTlemmas.index + 1
SBLGNTlemmas['Lexeme'] = SBLGNTlemmas[0]
SBLGNTlemmas = SBLGNTlemmas[['orig_order', 'Lexeme']]
SBLGNTlemmas.head(5)


# In[437]:

SBLGNTlemmas.describe()


# Now let's try a merge of the two files:

# In[438]:

SBLGNTglosses = pd.merge(SBLGNTlemmas, BOLgreekDICT, on='Lexeme', how='outer')
SBLGNTglosses.head(5)


# In[439]:

SBLGNTglosses.describe()


# In[440]:

SBLGNTglosses.head(20)


# In[441]:

SBLGNTglosses.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/SBLGNTglosses.xlsx')
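# In[ ]:

# A toy contrast (made-up data) of the two join types used in this notebook:
# how='left' (as in In[178]) keeps every corpus word and leaves missing
# glosses as NaN, while how='outer' (as in In[438] above) additionally keeps
# dictionary entries that never occur in the corpus.
import pandas as pd
toy_words = pd.DataFrame({'orig_order': [1, 2], 'Lexeme': ['λόγος', 'θεός']})
toy_dict = pd.DataFrame({'Lexeme': ['λόγος', 'ἀρχή'], 'gloss': ['word', 'beginning']})
print(pd.merge(toy_words, toy_dict, on='Lexeme', how='left'))   # 2 rows; θεός has NaN gloss
print(pd.merge(toy_words, toy_dict, on='Lexeme', how='outer'))  # 3 rows; ἀρχή kept too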