#!/usr/bin/env python
# coding: utf-8
#
# Table of Contents
#
# # Converter file
# ## Latest and Greatest
# In[184]:
import os
import re
import collections
import json
import csv
# from glob import glob
from tf.fabric import Fabric
from tf.convert.walker import CV
# from tf.compose import modify
source_dirs = 'input' # "input" is the name of the input folder that contains the source file
output_dirs = 'output' # "output" is the name of the output folder into which the finished TF files will be written
bo2book = {line.split()[0]:line.split()[1] for line in '''
OTt4 Old_Testament
'''.split('\n') if line} # "OTt4" is the name of the file in the input folder; split() separates file name and book name at the space
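# bo2book now maps file stem to book name, i.e. {'OTt4': 'Old_Testament'}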
# patts = {'section': re.compile('(\d*):(\d*)\.(\d*)')}
def director(cv):
'''
Walks through LXX and triggers
slot and node creation events.
'''
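    # CV walker contract: cv.node(type) opens a node, cv.slot() adds a word slot
    # (automatically linked to every currently open node), cv.feature(node, **f)
    # assigns feature values, and cv.terminate(node) closes a node.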
# process books in order
for bo, book in bo2book.items():
book_loc = os.path.join(source_dirs, f'{bo}.txt')
print(f'\thandling {book_loc}...')
with open(book_loc, 'r', encoding="utf8") as infile:
text = [w for w in infile.read().split('\n') if w]
this_book = cv.node('book')
# keep track of when to trigger paragraph, chapter, and verse objects
# para_track = 1 # keep counts of paragraphs
prev_book = "Gen" # start at Genesis
prev_chap = 1 # start at 1
prev_verse = 1 # start at 1
prev_subverse = ''
wrdnum = 0 # start at 0
this_chap = cv.node('chapter')
# this_para = cv.node('paragraph')
this_verse = cv.node('verse')
this_subverse = cv.node('subverse')
# iterate through words and construct objects
for word in text:
wrdnum += 1
data = word.split('\t')
# word_data, lemmas = data[:7], data[7:]
            word_data = data[:26]  # the first 26 tab-separated columns hold the per-word data
            # morphology = ' '.join(data[26:])  # dead code: the unpacking below
            # supplies `morphology` from column 13 and would overwrite this value
# segment out word data
# bo_code, ref, brake, ketiv, qere, morph, strongs = word_data
orig_order, book, chapter, verse, subverse, word, lex_utf8, g_cons_utf8, translit_SBL, lemma_gloss, strong, sp, morphology, case, nu, gn, degree, tense, voice, mood, ps, lemma_translit, abc_order, freq_lemma, BOL_lexeme_dict, BOL_gloss = word_data
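            # For orientation, a hedged sketch of the start of one tab-separated
            # row (values illustrative only, not from the real source file):
            #   1  Gen  1  1  ""  Ἐν  ἐν  εν  en  in  ...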
# if chapter == "Prolog":
# chapter = 0
#try:
# verse = int(verse)
#except ValueError:
# subverse = verse[-1:]
# verse = verse[:-1]
if verse == "":
print(f'{orig_order}: {verse} {subverse}')
# strongs_lemma, anlex_lemma = ' '.join(lemmas).split('!') # reconstitute lemmas and split on !
# chapt, verse, wrdnum = [int(v) for v in patts['section'].match(ref).groups()]
# -- handle TF events --
# detect book boundary
if prev_book != book:
# end subverse
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
# end verse
cv.feature(this_verse, verse=prev_verse)
cv.terminate(this_verse)
# end chapter
cv.feature(this_chap, chapter=prev_chap)
cv.terminate(this_chap)
# end book
cv.feature(this_book, book=prev_book)
cv.terminate(this_book)
# new book, chapter, verse, and subverse begin
this_book = cv.node('book')
prev_book = book
this_chap = cv.node('chapter')
prev_chap = chapter
this_verse = cv.node('verse')
prev_verse = verse
this_subverse = cv.node('subverse')
prev_subverse = subverse
wrdnum = 1
# detect chapter boundary
elif prev_chap != chapter:
# end subverse
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
# end verse
cv.feature(this_verse, verse=prev_verse)
cv.terminate(this_verse)
# end chapter
cv.feature(this_chap, chapter=prev_chap)
cv.terminate(this_chap)
# new chapter, verse, and subverse begin
this_chap = cv.node('chapter')
prev_chap = chapter
this_verse = cv.node('verse')
prev_verse = verse
this_subverse = cv.node('subverse')
prev_subverse = subverse
wrdnum = 1
# detect verse boundary
elif prev_verse != verse:
# end subverse
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
# end verse
cv.feature(this_verse, verse=prev_verse)
cv.terminate(this_verse)
# new verse and subverse begin
this_verse = cv.node('verse')
prev_verse = verse
this_subverse = cv.node('subverse')
prev_subverse = subverse
wrdnum = 1
# detect subverse boundary
elif prev_subverse != subverse:
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
this_subverse = cv.node('subverse')
prev_subverse = subverse
# detect paragraph boundary
# if brake == 'P':
# cv.feature(this_para, para=para_track)
# cv.terminate(this_para)
# this_para = cv.node('paragraph') # start a new paragraph
# para_track += 1 # count paragraphs in the book
# make word object
this_word = cv.slot()
cv.feature(this_word,
orig_order=orig_order,
book=book,
chapter=chapter,
verse=verse,
subverse=subverse,
word=word,
lex_utf8=lex_utf8,
g_cons_utf8=g_cons_utf8,
translit_SBL=translit_SBL,
lemma_gloss=lemma_gloss,
strong=strong,
sp=sp,
morphology=morphology,
case=case,
nu=nu,
gn=gn,
degree=degree,
tense=tense,
voice=voice,
mood=mood,
ps=ps,
lemma_translit=lemma_translit,
abc_order=abc_order,
freq_lemma=freq_lemma,
BOL_lexeme_dict=BOL_lexeme_dict,
BOL_gloss=BOL_gloss,
# ketiv=ketiv,
# qere=qere,
# strongs=strongs,
# str_lem=strongs_lemma.strip(),
# anlex_lem=anlex_lemma.strip()
)
cv.terminate(this_word)
# end book and its objects
# - end subverse
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
# - end verse
cv.feature(this_verse, verse=prev_verse)
cv.terminate(this_verse)
# - end paragraph
# cv.feature(this_para, para=para_track)
# cv.terminate(this_para)
# - end chapter
cv.feature(this_chap, chapter=prev_chap)
cv.terminate(this_chap)
# - end book
cv.feature(this_book, book=prev_book)
cv.terminate(this_book)
slotType = 'word'
otext = {'fmt:text-orig-full':'{word} ',
'sectionTypes':'book,chapter,verse',
'sectionFeatures':'book,chapter,verse'}
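# In otext, 'fmt:text-orig-full' tells TF to render each word slot as its `word`
# feature plus a trailing space; sectionTypes/sectionFeatures register
# book/chapter/verse as the three section levels for the Text API.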
generic = {'Name': 'LXX',
'Version': '1935',
'Author': 'Rahlfs',
'Editors': 'CCAT, Eliran Wong',
'Converter': 'Adrian Negrea, Oliver Glanz',
           'Source':'https://github.com/eliranwong/LXX-Rahlfs-1935',
'Note':'?'}
intFeatures = {'chapter', 'verse'}
featureMeta = {
'orig_order': {'description': 'original word order in corpus'},
'book': {'description': 'book'},
'chapter': {'description': 'chapter'},
'verse': {'description': 'verse'},
'subverse': {'description': 'subverse'},
'word': {'description': 'text realized word'},
'lex_utf8': {'description': 'normalized word'},
'g_cons_utf8': {'description': 'word without accents'},
'translit_SBL': {'description': 'SBL transliteration'},
'lemma_gloss': {'description': 'English gloss'},
'strong': {'description': 'Strong numbers'},
'sp': {'description': 'part of speech'},
'morphology': {'description': 'morphology'},
'case': {'description': 'case'},
'nu': {'description': 'number'},
'gn': {'description': 'gender'},
'degree': {'description': 'degree'},
'tense': {'description': 'tense'},
'voice': {'description': 'voice'},
'mood': {'description': 'mood'},
'ps': {'description': 'person'},
'lemma_translit': {'description': 'lemma transliteration'},
'abc_order': {'description': 'dictionary order'},
'freq_lemma': {'description': 'frequency of word in corpus'},
'BOL_lexeme_dict': {'description': 'BOL dictionary form of lemma'},
'BOL_gloss': {'description': 'BOL English gloss'},
# 'para': {'description': 'A paragraph number'},
# 'ketiv': {'description': 'The text as it is written in the printed Tischendorf'},
# 'qere': {'description': 'The text as the editor thinks it should have been'},
# 'strongs': {'description': 'A word\'s number in Strongs'},
# 'str_lem': {'description': 'Word lemma that corresponds to The NEW Strong\'s Complete Dictionary of Bible Words'},
# 'anlex_lem': {'description': 'Word lemma that corresponds to Friberg, Friberg and Miller\'s ANLEX'}
}
# configure metadata/output
version = '1935'
generic['Version'] = version
output = os.path.join(output_dirs, version)
print(f'Processing Version {version}')
TF = Fabric(locations=output, silent=True)
cv = CV(TF)
cv.walk(director,
slotType,
otext=otext,
generic=generic,
intFeatures=intFeatures,
featureMeta=featureMeta,
warn=True,
force=False,)
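# A minimal load-back sketch (my addition) to sanity-check the conversion:
# reopen the freshly written TF files and count the word slots.
TF_check = Fabric(locations=output, silent=True)
api = TF_check.load('book chapter verse word')
print(f'{api.F.otype.maxSlot} word slots written')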
# In[6]:
# First, I load the different modules that I use for analyzing and plotting the data:
import sys, os, collections
import pandas as pd
import numpy as np
import re
import csv
import seaborn as sns
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib.pyplot import figure
from collections import Counter
# Second, I have to load the Text Fabric app
from tf.fabric import Fabric
from tf.app import use
# In[133]:
featureadd=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/LXX_source_v1.3.xlsx',sheet_name='FULL_data')
pd.set_option('display.max_columns', 50)
featureadd.head(10)
# In[134]:
from unidecode import unidecode
# In[135]:
featureadd['lemma_translit']=featureadd['lex_utf8'].apply(unidecode)
featureadd.head(5)
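# unidecode transliterates Greek to plain ASCII, e.g. unidecode('λόγος') -> 'logos'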
# In[136]:
ABC1=featureadd[['orig_order','lex_utf8']]
ABC1.head(5)
# In[137]:
ABC1.describe()
# In[138]:
ABCdict = ABC1.drop_duplicates(['lex_utf8']).sort_values(by='lex_utf8', ascending=[True])
ABCdict.head(10)
# In[139]:
ABCdict.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/feature-dev/ABC1order.xlsx')
# In[140]:
ABC2=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/feature-dev/ABC2order.xlsx')
pd.set_option('display.max_columns', 50)
ABC2.head(10)
# In[141]:
ABC2=ABC2.drop(['orig_order'], axis=1)
ABC2.head()
# In[142]:
featureadd.describe()
# In[143]:
featureadd = pd.merge(featureadd, ABC2,
                      on='lex_utf8',
                      how='outer')
featureadd.head(5)
# In[144]:
featureadd.describe()
# In[159]:
featureaddstage2 = featureadd  # NB: this binds a second name to the same DataFrame, not a copy
featureaddstage2.head(5)
# In[160]:
featureaddstage2.describe()
# In[161]:
featureaddstage2["freq_lemma"]=featureaddstage2.groupby(["lex_utf8"])["lex_utf8"].transform("count")
featureaddstage2.head(5)
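# groupby('lex_utf8').transform('count') computes each lemma's group size and
# broadcasts it back onto every row, so every token carries its lemma's corpus frequency.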
# In[162]:
featureaddstage2.describe()
# In[163]:
featureaddstage2.sort_values(['orig_order'], ascending=True).head(10)
# In[164]:
featureaddstage2.describe()
# In[165]:
BOLgreekDICT=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/NA1904_dictionary_v1.0.xlsx')
pd.set_option('display.max_columns', 50)
BOLgreekDICT.head(10)
# In[166]:
BOLgreekDICT=BOLgreekDICT[['Lexeme','Lexeme_dict', 'gloss']]
BOLgreekDICT.head(10)
# In[174]:
BOLgreekDICT = BOLgreekDICT.rename({'Lexeme':'lex_utf8', 'Lexeme_dict':'BOL_lexeme_dict', 'gloss':'BOL_gloss'}, axis=1)
BOLgreekDICT.head(5)
# In[175]:
BOLgreekDICT.describe()
# In[176]:
featureaddstage3=featureaddstage2
# In[177]:
featureaddstage3.describe()
# In[178]:
featureaddstage4 = pd.merge(featureaddstage3, BOLgreekDICT,
                            on='lex_utf8',
                            how='left')
featureaddstage4.head(5)
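# A hedged sanity check (my addition): after the left merge, tokens whose lemma
# has no entry in the BOL dictionary carry NaN in the new columns.
print(featureaddstage4['BOL_gloss'].isna().sum(), 'tokens without a BOL gloss')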
# In[180]:
featureaddstage4.describe()
# In[181]:
featureaddstage4 = featureaddstage4.drop_duplicates(['orig_order']).sort_values(by='orig_order', ascending=[True])
featureaddstage4.head(10)
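# Deduplicating on orig_order guards against row duplication from the earlier
# outer merge and restores exactly one row per corpus token, in corpus order.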
# In[182]:
featureaddstage4.describe()
# In[183]:
featureaddstage4.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/LXX_source_v1.4.xlsx')
# # Testing
# In[185]:
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# In[1]:
# First, I load the different modules that I use for analyzing and plotting the data:
import sys, os, collections
import pandas as pd
import numpy as np
import re
import csv
import seaborn as sns
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib.pyplot import figure
from collections import Counter
# Second, I have to load the Text Fabric app
from tf.fabric import Fabric
from tf.app import use
# In[4]:
#LXX = use('CCATLXX/tf/1994_v1', hoist=globals())
#LXX = use('CCATLXX/tf/1994_v2', hoist=globals())
LXX = use('D:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/CCATLXX/tf/1994_v2', hoist=globals())
# In[3]:
Search0 = '''
book book=Gen
chapter chapter=1
verse verse=1
word
'''
Search0 = LXX.search(Search0)
LXX.show(Search0, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'orig_order','book','chapter','verse','subverse','word','lex_utf8','g_cons_utf8','translit_SBL','lemma_gloss','strong','sp','morphology','case','nu','gn','degree','tense','voice','mood','ps','lemma_translit','abc_order','freq_lemma','BOL_lexeme_dict','BOL_gloss'})
# In[7]:
Search1 = '''
verse book=Gen chapter=1 verse=1
word
'''
Search1 = LXX.search(Search1)
LXX.show(Search1, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'subverse'})
# In[8]:
Search2 = '''
book book=Gen
chapter chapter=1
verse verse=1
word word* lex_utf8 g_cons_utf8 morphology* translit_SBL
'''
Search2 = LXX.search(Search2)
LXX.show(Search2, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'subverse'})
# In[18]:
Search3 = '''
book book=Num
chapter chapter=21
verse verse=3
'''
Search3 = LXX.search(Search3)
LXX.show(Search3, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'word', 'lemma_translit', 'case'})
# In[39]:
Eisakouw = '''
book book#Esth
verse
word lemma_translit=epakouo
word lemma_translit=theos|kurios
word lemma_translit=phone case=Gen
'''
Eisakouw = LXX.search(Eisakouw)
LXX.show(Eisakouw, start=1, end=100, condensed=True, colorMap={1:'pink'}, extraFeatures={'word', 'lemma_translit', 'case', 'lex_utf8', 'BOL_lexeme_dict'})
# # New Feature Development
# # Morphology
# ## MorphGNT:sblgnt Data Description
# Here: https://github.com/morphgnt/sblgnt
#
# In[406]:
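# (translitadd is assumed to have been loaded in an earlier cell, presumably by
# reading the lemma list into a headerless DataFrame as done for ABC1 below.)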
translitadd['lemma']=translitadd[0]
translitadd.head(5)
# In[407]:
translitadd=translitadd[['lemma']]
translitadd.head(5)
# In[408]:
translitadd['orig_order'] = translitadd.index +1
translitadd.head(5)
# In[409]:
from unidecode import unidecode
# In[410]:
s = "βίβλος"
s = unidecode(s)
print(s)
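# prints: biblos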
# In[411]:
translitadd['translit'] = translitadd['lemma'].apply(unidecode)
# In[412]:
translitadd.head(5)
# In[413]:
translitadd['translit'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_translit.tf', index=None)
# # ABC dictionary order
#
# In[414]:
ABC1=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
ABC1.head(10)
# In[415]:
ABC1['lemma']=ABC1[0]
ABC1.head(5)
# In[416]:
ABC1['orig_order'] = ABC1.index +1
ABC1.head(5)
# In[417]:
ABC1=ABC1[['orig_order','lemma']]
ABC1.head(5)
# In[418]:
ABC1.describe()
# In[419]:
ABCdict = ABC1.drop_duplicates(['lemma']).sort_values(by='lemma', ascending=[True])
ABCdict.head(10)
# In[420]:
ABCdict.describe()
# In[421]:
ABC1.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ABC1order.xlsx', encoding='utf-8')
# Now I order the words alphabetically with LibreOffice Writer, since I cannot do that in pandas (yet?).
#
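# A hypothetical pandas-only alternative (a sketch, not the route taken here):
# sort the unique lemmas and number them directly. Note that sort_values orders
# Greek strings by Unicode code point, which may differ from dictionary collation.
ABC_alt = ABC1.drop_duplicates(['lemma']).sort_values('lemma').reset_index(drop=True)
ABC_alt['abc_order'] = ABC_alt.index + 1
ABC_alt.head(5)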
# In[422]:
ABC2=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ABC2order.xlsx')
pd.set_option('display.max_columns', 50)
ABC2.head(10)
# Now we merge the ABCorder dataframe with the original lemma DF.
# In[423]:
lemma_ABC = pd.merge(ABC1, ABC2,
                     on='lemma',
                     how='outer')
lemma_ABC.head(5)
# In[424]:
lemma_ABC.describe()
# In[425]:
lemma_ABC.sort_values(['orig_order_x'], ascending=True).head(10)
# In[426]:
lemma_ABC.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_abc.xlsx')
# # Word Frequency
# In[427]:
frequencyadd=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
frequencyadd.head(20)
# In[428]:
frequencyadd['orig_order'] = frequencyadd.index +1
frequencyadd['lemma']=frequencyadd[0]
frequencyadd=frequencyadd[['orig_order','lemma']]
frequencyadd.head(5)
# In[429]:
frequencyadd["freq_lemma"]=frequencyadd.groupby(["lemma"])["lemma"].transform("count")
#("count") is actually utilizing the 'count' function!
frequencyadd.head(20)
# In[430]:
frequencyadd.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_freq.xlsx')
# # English Dictionary
# Let's first load the NA1904 BibleOL dictionary:
# In[431]:
BOLgreekDICT=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/NA1904_dictionary_v1.0.xlsx')
pd.set_option('display.max_columns', 50)
BOLgreekDICT.head(20)
# In[432]:
BOLgreekDICT=BOLgreekDICT[['Lexeme','Lexeme_dict', 'Strong\'s number', 'gloss']]
BOLgreekDICT.head(10)
# In[433]:
BOLgreekDICT.describe()
# Let's load the SBLGNT lemmas:
# In[434]:
SBLGNTlemmas=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
SBLGNTlemmas.head(2)
# In[436]:
SBLGNTlemmas['orig_order']=SBLGNTlemmas.index +1
SBLGNTlemmas['Lexeme']=SBLGNTlemmas[0]
SBLGNTlemmas=SBLGNTlemmas[['orig_order','Lexeme']]
SBLGNTlemmas.head(5)
# In[437]:
SBLGNTlemmas.describe()
# Now let's merge the two files:
# In[438]:
SBLGNTglosses = pd.merge(SBLGNTlemmas, BOLgreekDICT,
                         on='Lexeme',
                         how='outer')
SBLGNTglosses.head(5)
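# Since this is an outer merge, rows with NaN orig_order are dictionary entries
# unattested in SBLGNT, while rows with NaN gloss lack a BOL dictionary entry.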
# In[439]:
SBLGNTglosses.describe()
# In[440]:
SBLGNTglosses.head(20)
# In[441]:
SBLGNTglosses.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/SBLGNTglosses.xlsx')