import os
import re
import collections
import json
import csv
# from glob import glob
from tf.fabric import Fabric
from tf.convert.walker import CV
# from tf.compose import modify
source_dirs = 'input' # "input" is the name of the input folder that contains the source file
output_dirs = 'output' # "output" is the name of the output folder to which the finished TF files will be dumped into
bo2book = {line.split()[0]:line.split()[1] for line in '''
OTt4 Old_Testament
'''.split('\n') if line} # "OT" is the name of the file in the input folder AND "split()" splits at space
# patts = {'section': re.compile('(\d*):(\d*)\.(\d*)')}
def director(cv):
'''
Walks through LXX and triggers
slot and node creation events.
'''
# process books in order
for bo, book in bo2book.items():
book_loc = os.path.join(source_dirs, f'{bo}.txt')
print(f'\thandling {book_loc}...')
with open(book_loc, 'r', encoding="utf8") as infile:
text = [w for w in infile.read().split('\n') if w]
this_book = cv.node('book')
# keep track of when to trigger paragraph, chapter, and verse objects
# para_track = 1 # keep counts of paragraphs
prev_book = "Gen" # start at Genesis
prev_chap = 1 # start at 1
prev_verse = 1 # start at 1
prev_subverse = ''
wrdnum = 0 # start at 0
this_chap = cv.node('chapter')
# this_para = cv.node('paragraph')
this_verse = cv.node('verse')
this_subverse = cv.node('subverse')
# iterate through words and construct objects
for word in text:
wrdnum += 1
data = word.split('\t')
# word_data, lemmas = data[:7], data[7:]
word_data = data[:26] #the number here is the amount of columns
morphology = ' '.join(data[26:]) #the number here is the amount of columns
# segment out word data
# bo_code, ref, brake, ketiv, qere, morph, strongs = word_data
orig_order, book, chapter, verse, subverse, word, lex_utf8, g_cons_utf8, translit_SBL, lemma_gloss, strong, sp, morphology, case, nu, gn, degree, tense, voice, mood, ps, lemma_translit, abc_order, freq_lemma, BOL_lexeme_dict, BOL_gloss = word_data
# if chapter == "Prolog":
# chapter = 0
subverse == ""
#try:
# verse = int(verse)
#except ValueError:
# subverse = verse[-1:]
# verse = verse[:-1]
if verse == "":
print(f'{orig_order}: {verse} {subverse}')
# strongs_lemma, anlex_lemma = ' '.join(lemmas).split('!') # reconstitute lemmas and split on !
# chapt, verse, wrdnum = [int(v) for v in patts['section'].match(ref).groups()]
# -- handle TF events --
# detect book boundary
if prev_book != book:
# end subverse
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
# end verse
cv.feature(this_verse, verse=prev_verse)
cv.terminate(this_verse)
# end chapter
cv.feature(this_chap, chapter=prev_chap)
cv.terminate(this_chap)
# end book
cv.feature(this_book, book=prev_book)
cv.terminate(this_book)
# new book, chapter, verse, and subverse begin
this_book = cv.node('book')
prev_book = book
this_chap = cv.node('chapter')
prev_chap = chapter
this_verse = cv.node('verse')
prev_verse = verse
this_subverse = cv.node('subverse')
prev_subverse = subverse
wrdnum = 1
# detect chapter boundary
elif prev_chap != chapter:
# end subverse
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
# end verse
cv.feature(this_verse, verse=prev_verse)
cv.terminate(this_verse)
# end chapter
cv.feature(this_chap, chapter=prev_chap)
cv.terminate(this_chap)
# new chapter, verse, and subverse begin
this_chap = cv.node('chapter')
prev_chap = chapter
this_verse = cv.node('verse')
prev_verse = verse
this_subverse = cv.node('subverse')
prev_subverse = subverse
wrdnum = 1
# detect verse boundary
elif prev_verse != verse:
# end subverse
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
# end verse
cv.feature(this_verse, verse=prev_verse)
cv.terminate(this_verse)
# new verse and subverse begin
this_verse = cv.node('verse')
prev_verse = verse
this_subverse = cv.node('subverse')
prev_subverse = subverse
wrdnum = 1
# detect subverse boundary
elif prev_subverse != subverse:
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
this_subverse = cv.node('subverse')
prev_subverse = subverse
# detect paragraph boundary
# if brake == 'P':
# cv.feature(this_para, para=para_track)
# cv.terminate(this_para)
# this_para = cv.node('paragraph') # start a new paragraph
# para_track += 1 # count paragraphs in the book
# make word object
this_word = cv.slot()
cv.feature(this_word,
orig_order=orig_order,
book=book,
chapter=chapter,
verse=verse,
subverse=subverse,
word=word,
lex_utf8=lex_utf8,
g_cons_utf8=g_cons_utf8,
translit_SBL=translit_SBL,
lemma_gloss=lemma_gloss,
strong=strong,
sp=sp,
morphology=morphology,
case=case,
nu=nu,
gn=gn,
degree=degree,
tense=tense,
voice=voice,
mood=mood,
ps=ps,
lemma_translit=lemma_translit,
abc_order=abc_order,
freq_lemma=freq_lemma,
BOL_lexeme_dict=BOL_lexeme_dict,
BOL_gloss=BOL_gloss,
# ketiv=ketiv,
# qere=qere,
# strongs=strongs,
# str_lem=strongs_lemma.strip(),
# anlex_lem=anlex_lemma.strip()
)
cv.terminate(this_word)
# end book and its objects
# - end subverse
cv.feature(this_subverse, subverse=prev_subverse)
cv.terminate(this_subverse)
# - end verse
cv.feature(this_verse, verse=prev_verse)
cv.terminate(this_verse)
# - end paragraph
# cv.feature(this_para, para=para_track)
# cv.terminate(this_para)
# - end chapter
cv.feature(this_chap, chapter=prev_chap)
cv.terminate(this_chap)
# - end book
cv.feature(this_book, book=prev_book)
cv.terminate(this_book)
slotType = 'word'
otext = {'fmt:text-orig-full':'{word} ',
'sectionTypes':'book,chapter,verse',
'sectionFeatures':'book,chapter,verse'}
generic = {'Name': 'LXX',
'Version': '1935',
'Author': 'Rahlfs',
'Editors': 'CCAT, Eliran Wong',
'Converter': 'Adrian Negrea, Oliver Glanz',
'Source:':'https://github.com/eliranwong/LXX-Rahlfs-1935',
'Note':'?'}
intFeatures = {'chapter', 'verse'}
featureMeta = {
'orig_order': {'description': 'original word order in corpus'},
'book': {'description': 'book'},
'chapter': {'description': 'chapter'},
'verse': {'description': 'verse'},
'subverse': {'description': 'subverse'},
'word': {'description': 'text realized word'},
'lex_utf8': {'description': 'normalized word'},
'g_cons_utf8': {'description': 'word without accents'},
'translit_SBL': {'description': 'SBL transliteration'},
'lemma_gloss': {'description': 'English gloss'},
'strong': {'description': 'Strong numbers'},
'sp': {'description': 'part of speech'},
'morphology': {'description': 'morphology'},
'case': {'description': 'case'},
'nu': {'description': 'number'},
'gn': {'description': 'gender'},
'degree': {'description': 'degree'},
'tense': {'description': 'tense'},
'voice': {'description': 'voice'},
'mood': {'description': 'mood'},
'ps': {'description': 'person'},
'lemma_translit': {'description': 'lemma transliteration'},
'abc_order': {'description': 'dictionary order'},
'freq_lemma': {'description': 'frequency of word in corpus'},
'BOL_lexeme_dict': {'description': 'BOL dictionary form of lemma'},
'BOL_gloss': {'description': 'BOL English gloss'},
# 'para': {'description': 'A paragraph number'},
# 'ketiv': {'descrption': 'The text as it is written in the printed Tischendorf'},
# 'qere': {'description': 'The text as the editor thinks it should have been'},
# 'strongs': {'description': 'A word\'s number in Strongs'},
# 'str_lem': {'description': 'Word lemma that corresponds to The NEW Strong\'sComplete Dictionary of Bible Words'},
# 'anlex_lem': {'description': 'Word lemma that corresponds to Friberg, Friberg and Miller\'s ANLEX'}
}
# configure metadata/output
version = '1935'
generic['Version'] = version
output = os.path.join(output_dirs, version)
print(f'Processing Version {version}')
output_dir = output_dirs.format(version=version)
TF = Fabric(locations=output_dir, silent=True)
cv = CV(TF)
cv.walk(director,
slotType,
otext=otext,
generic=generic,
intFeatures=intFeatures,
featureMeta=featureMeta,
warn=True,
force=False,)
Processing Version 1935 0.00s Importing data from walking through the source ... | 0.00s Preparing metadata... | 0.00s No structure nodes will be set up | SECTION TYPES: book, chapter, verse | SECTION FEATURES: book, chapter, verse | STRUCTURE TYPES: | STRUCTURE FEATURES: | TEXT FEATURES: | | text-orig-full word | 0.00s OK | 0.00s Following director... handling input\OTt4.txt... | 11s "edge" actions: 0 | 11s "feature" actions: 685735 | 11s "node" actions: 62042 | 11s "resume" actions: 0 | 11s "slot" actions: 623693 | 11s "terminate" actions: 685735 | 57 x "book" node | 1193 x "chapter" node | 30420 x "subverse" node | 30372 x "verse" node | 623693 x "word" node = slot type | 685735 nodes of all types | 11s OK | 0.03s Removing unlinked nodes ... | | 0.00s 1 unlinked "chapter" node: [1] | | 0.00s 1 unlinked "verse" node: [1] | | 0.00s 1 unlinked "subverse" node: [1] | | 0.00s 3 unlinked nodes | | 0.00s Leaving 685732 nodes | 0.00s checking for nodes and edges ... | 0.00s OK | 0.00s checking features ... | 0.48s OK | 0.00s reordering nodes ... | 0.18s Sorting 57 nodes of type "book" | 0.24s Sorting 1192 nodes of type "chapter" | 0.31s Sorting 30419 nodes of type "subverse" | 0.42s Sorting 30371 nodes of type "verse" | 0.53s Max node = 685732 | 0.54s OK | 0.00s reassigning feature values ... | | 1.14s node feature "BOL_gloss" with 623693 nodes | | 1.36s node feature "BOL_lexeme_dict" with 623693 nodes | | 1.58s node feature "abc_order" with 623693 nodes | | 1.79s node feature "book" with 623750 nodes | | 2.00s node feature "case" with 623693 nodes | | 2.21s node feature "chapter" with 624885 nodes | | 2.43s node feature "degree" with 623693 nodes | | 2.65s node feature "freq_lemma" with 623693 nodes | | 2.89s node feature "g_cons_utf8" with 623693 nodes | | 3.11s node feature "gn" with 623693 nodes | | 3.36s node feature "lemma_gloss" with 623693 nodes | | 3.58s node feature "lemma_translit" with 623693 nodes | | 3.83s node feature "lex_utf8" with 623693 nodes | | 4.07s node feature "mood" with 623693 nodes | | 4.32s node feature "morphology" with 623693 nodes | | 4.53s node feature "nu" with 623693 nodes | | 4.76s node feature "orig_order" with 623693 nodes | | 4.98s node feature "ps" with 623693 nodes | | 5.21s node feature "sp" with 623693 nodes | | 5.41s node feature "strong" with 623693 nodes | | 5.66s node feature "subverse" with 654112 nodes | | 5.91s node feature "tense" with 623693 nodes | | 6.13s node feature "translit_SBL" with 623693 nodes | | 6.36s node feature "verse" with 654064 nodes | | 6.59s node feature "voice" with 623693 nodes | | 6.83s node feature "word" with 623693 nodes | 6.05s OK 0.00s Exporting 27 node and 1 edge and 1 config features to output: 0.00s VALIDATING oslots feature 0.09s VALIDATING oslots feature 0.09s maxSlot= 623693 0.09s maxNode= 685732 0.10s OK: oslots is valid | 1.12s T BOL_gloss to output | 1.21s T BOL_lexeme_dict to output | 1.13s T abc_order to output | 1.14s T book to output | 1.00s T case to output | 1.00s T chapter to output | 1.01s T degree to output | 1.12s T freq_lemma to output | 1.14s T g_cons_utf8 to output | 1.06s T gn to output | 1.15s T lemma_gloss to output | 1.02s T lemma_translit to output | 1.14s T lex_utf8 to output | 0.98s T mood to output | 1.07s T morphology to output | 1.34s T nu to output | 1.66s T orig_order to output | 0.32s T otype to output | 1.20s T ps to output | 1.28s T sp to output | 1.17s T strong to output | 1.08s T subverse to output | 1.13s T tense to output | 1.21s T translit_SBL to output | 1.34s T verse to output | 1.02s T voice to output | 1.32s T word to output | 0.52s T oslots to output | 0.00s M otext to output 31s Exported 27 node features and 1 edge features and 1 config features to output
True
# First, I have to laod different modules that I use for analyzing the data and for plotting:
import sys, os, collections
import pandas as pd
import numpy as np
import re
import csv
import seaborn as sns
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib.pyplot import figure
from collections import Counter
# Second, I have to load the Text Fabric app
from tf.fabric import Fabric
from tf.app import use
featureadd=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/LXX_source_v1.3.xlsx',sheet_name='FULL_data')
pd.set_option('display.max_columns', 50)
featureadd.head(10)
orig_order | book | chapter | verse | subverse | word | lex_utf8 | g_cons_utf8 | translit_SBL | lemma_gloss | strong | sp | morphology | case | nu | gn | degree | tense | voice | mood | ps | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Gen | 1 | 1 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 2 | Gen | 1 | 1 | NaN | ἀρχῇ | ἀρχή | αρχη | archē | origin; beginning | G746 | noun | N.DSF | Dat | Sing | Fem | NaN | NaN | NaN | NaN | NaN |
2 | 3 | Gen | 1 | 1 | NaN | ἐποίησεν | ποιέω | ποιεω | epoiēsen | do; make | G4160 | verb | V.AAI3S | NaN | Sing | NaN | NaN | Aor | Act | Ind | 3rd |
3 | 4 | Gen | 1 | 1 | NaN | ὁ | ὁ | ο | ho | the | G3588 | pronoun, article | RA.NSM | Nom | Sing | Masc | NaN | NaN | NaN | NaN | NaN |
4 | 5 | Gen | 1 | 1 | NaN | θεὸς | θεός | θεος | theos | God | G2316 | noun | N.NSM | Nom | Sing | Masc | NaN | NaN | NaN | NaN | NaN |
5 | 6 | Gen | 1 | 1 | NaN | τὸν | ὁ | ο | ton | the | G3588 | pronoun, article | RA.ASM | Acc | Sing | Masc | NaN | NaN | NaN | NaN | NaN |
6 | 7 | Gen | 1 | 1 | NaN | οὐρανὸν | οὐρανός | ουρανος | ouranon | sky; heaven | G3772 | noun | N.ASM | Acc | Sing | Masc | NaN | NaN | NaN | NaN | NaN |
7 | 8 | Gen | 1 | 1 | NaN | καὶ | καί | και | kai | and; even | G2532 | conjunction | C | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
8 | 9 | Gen | 1 | 1 | NaN | τὴν | ὁ | ο | tēn | the | G3588 | pronoun, article | RA.ASF | Acc | Sing | Fem | NaN | NaN | NaN | NaN | NaN |
9 | 10 | Gen | 1 | 1 | NaN | γῆν | γῆ | γη | gēn | earth; land | G1093 | noun | N.ASF | Acc | Sing | Fem | NaN | NaN | NaN | NaN | NaN |
from unidecode import unidecode
featureadd['lemma_translit']=featureadd['lex_utf8'].apply(unidecode)
featureadd.head(5)
orig_order | book | chapter | verse | subverse | word | lex_utf8 | g_cons_utf8 | translit_SBL | lemma_gloss | strong | sp | morphology | case | nu | gn | degree | tense | voice | mood | ps | lemma_translit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Gen | 1 | 1 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en |
1 | 2 | Gen | 1 | 1 | NaN | ἀρχῇ | ἀρχή | αρχη | archē | origin; beginning | G746 | noun | N.DSF | Dat | Sing | Fem | NaN | NaN | NaN | NaN | NaN | arkhe |
2 | 3 | Gen | 1 | 1 | NaN | ἐποίησεν | ποιέω | ποιεω | epoiēsen | do; make | G4160 | verb | V.AAI3S | NaN | Sing | NaN | NaN | Aor | Act | Ind | 3rd | poieo |
3 | 4 | Gen | 1 | 1 | NaN | ὁ | ὁ | ο | ho | the | G3588 | pronoun, article | RA.NSM | Nom | Sing | Masc | NaN | NaN | NaN | NaN | NaN | o |
4 | 5 | Gen | 1 | 1 | NaN | θεὸς | θεός | θεος | theos | God | G2316 | noun | N.NSM | Nom | Sing | Masc | NaN | NaN | NaN | NaN | NaN | theos |
ABC1=featureadd[['orig_order','lex_utf8']]
ABC1.head(5)
orig_order | lex_utf8 | |
---|---|---|
0 | 1 | ἐν |
1 | 2 | ἀρχή |
2 | 3 | ποιέω |
3 | 4 | ὁ |
4 | 5 | θεός |
ABC1.describe()
orig_order | |
---|---|
count | 623693.000000 |
mean | 311847.000000 |
std | 180044.805057 |
min | 1.000000 |
25% | 155924.000000 |
50% | 311847.000000 |
75% | 467770.000000 |
max | 623693.000000 |
ABCdict = ABC1.drop_duplicates(['lex_utf8']).sort_values(by='lex_utf8', ascending=[True])
ABCdict.head(10)
orig_order | lex_utf8 | |
---|---|---|
141279 | 141280 | Ααλαφ |
256210 | 256211 | Ααρα |
302080 | 302081 | Αβαδια |
296208 | 296209 | Αβαδιας |
256426 | 256427 | Αβαδων |
274105 | 274106 | Αβαιαν |
149542 | 149543 | Αβαισαν |
598778 | 598779 | Αβαλ |
579703 | 579704 | Αβαμα |
236283 | 236284 | Αβανα |
ABCdict.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/feature-dev/ABC1order.xlsx')
ABC2=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/feature-dev/ABC2order.xlsx')
pd.set_option('display.max_columns', 50)
ABC2.head(10)
orig_order | lex_utf8 | abc_order | |
---|---|---|---|
0 | 1 | ἐν | 4638 |
1 | 2 | ἀρχή | 1835 |
2 | 3 | ποιέω | 10580 |
3 | 4 | ὁ | 9434 |
4 | 5 | θεός | 6191 |
5 | 7 | οὐρανός | 9842 |
6 | 8 | καί | 7030 |
7 | 10 | γῆ | 3082 |
8 | 12 | δέ | 3302 |
9 | 14 | εἰμί | 4092 |
ABC2=ABC2.drop(['orig_order'], axis=1)
ABC2.head()
lex_utf8 | abc_order | |
---|---|---|
0 | ἐν | 4638 |
1 | ἀρχή | 1835 |
2 | ποιέω | 10580 |
3 | ὁ | 9434 |
4 | θεός | 6191 |
featureadd.describe()
orig_order | chapter | verse | |
---|---|---|---|
count | 623693.000000 | 623693.000000 | 623693.000000 |
mean | 311847.000000 | 19.200501 | 16.949878 |
std | 180044.805057 | 20.679681 | 13.994139 |
min | 1.000000 | 0.000000 | 0.000000 |
25% | 155924.000000 | 6.000000 | 7.000000 |
50% | 311847.000000 | 13.000000 | 14.000000 |
75% | 467770.000000 | 25.000000 | 23.000000 |
max | 623693.000000 | 151.000000 | 176.000000 |
featureadd=pd.merge (featureadd, ABC2,
on='lex_utf8',
how='outer')
featureadd.head(5)
orig_order | book | chapter | verse | subverse | word | lex_utf8 | g_cons_utf8 | translit_SBL | lemma_gloss | strong | sp | morphology | case | nu | gn | degree | tense | voice | mood | ps | lemma_translit | abc_order | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Gen | 1 | 1 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 |
1 | 86 | Gen | 1 | 6 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 |
2 | 232 | Gen | 1 | 11 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 |
3 | 264 | Gen | 1 | 12 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 |
4 | 291 | Gen | 1 | 14 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 |
featureadd.describe()
orig_order | chapter | verse | abc_order | |
---|---|---|---|---|
count | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 |
mean | 311847.000000 | 19.200501 | 16.949878 | 7261.430627 |
std | 180044.805058 | 20.679681 | 13.994139 | 3447.810620 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
25% | 155924.000000 | 6.000000 | 7.000000 | 4421.000000 |
50% | 311847.000000 | 13.000000 | 14.000000 | 7096.000000 |
75% | 467770.000000 | 25.000000 | 23.000000 | 9456.000000 |
max | 623693.000000 | 151.000000 | 176.000000 | 14174.000000 |
featureaddstage2 = featureadd
featureaddstage2.head(5)
orig_order | book | chapter | verse | subverse | word | lex_utf8 | g_cons_utf8 | translit_SBL | lemma_gloss | strong | sp | morphology | case | nu | gn | degree | tense | voice | mood | ps | lemma_translit | abc_order | freq_lemma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Gen | 1 | 1 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
1 | 86 | Gen | 1 | 6 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
2 | 232 | Gen | 1 | 11 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
3 | 264 | Gen | 1 | 12 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
4 | 291 | Gen | 1 | 14 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
featureaddstage2.describe()
orig_order | chapter | verse | abc_order | freq_lemma | |
---|---|---|---|---|---|
count | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 |
mean | 311847.000000 | 19.200501 | 16.949878 | 7261.430627 | 21743.617390 |
std | 180044.805058 | 20.679681 | 13.994139 | 3447.810620 | 32782.706194 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
25% | 155924.000000 | 6.000000 | 7.000000 | 4421.000000 | 244.000000 |
50% | 311847.000000 | 13.000000 | 14.000000 | 7096.000000 | 2522.000000 |
75% | 467770.000000 | 25.000000 | 23.000000 | 9456.000000 | 29396.000000 |
max | 623693.000000 | 151.000000 | 176.000000 | 14174.000000 | 88444.000000 |
featureaddstage2["freq_lemma"]=featureaddstage2.groupby(["lex_utf8"])["lex_utf8"].transform("count")
featureaddstage2.head(5)
orig_order | book | chapter | verse | subverse | word | lex_utf8 | g_cons_utf8 | translit_SBL | lemma_gloss | strong | sp | morphology | case | nu | gn | degree | tense | voice | mood | ps | lemma_translit | abc_order | freq_lemma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Gen | 1 | 1 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
1 | 86 | Gen | 1 | 6 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
2 | 232 | Gen | 1 | 11 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
3 | 264 | Gen | 1 | 12 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
4 | 291 | Gen | 1 | 14 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
featureaddstage2.describe()
orig_order | chapter | verse | abc_order | freq_lemma | |
---|---|---|---|---|---|
count | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 |
mean | 311847.000000 | 19.200501 | 16.949878 | 7261.430627 | 21743.617390 |
std | 180044.805058 | 20.679681 | 13.994139 | 3447.810620 | 32782.706194 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
25% | 155924.000000 | 6.000000 | 7.000000 | 4421.000000 | 244.000000 |
50% | 311847.000000 | 13.000000 | 14.000000 | 7096.000000 | 2522.000000 |
75% | 467770.000000 | 25.000000 | 23.000000 | 9456.000000 | 29396.000000 |
max | 623693.000000 | 151.000000 | 176.000000 | 14174.000000 | 88444.000000 |
featureaddstage2.sort_values(['orig_order'], ascending=True).head(10)
orig_order | book | chapter | verse | subverse | word | lex_utf8 | g_cons_utf8 | translit_SBL | lemma_gloss | strong | sp | morphology | case | nu | gn | degree | tense | voice | mood | ps | lemma_translit | abc_order | freq_lemma | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Gen | 1 | 1 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 |
14316 | 2 | Gen | 1 | 1 | NaN | ἀρχῇ | ἀρχή | αρχη | archē | origin; beginning | G746 | noun | N.DSF | Dat | Sing | Fem | NaN | NaN | NaN | NaN | NaN | arkhe | 1835 | 236 |
14552 | 3 | Gen | 1 | 1 | NaN | ἐποίησεν | ποιέω | ποιεω | epoiēsen | do; make | G4160 | verb | V.AAI3S | NaN | Sing | NaN | NaN | Aor | Act | Ind | 3rd | poieo | 10580 | 3386 |
17938 | 4 | Gen | 1 | 1 | NaN | ὁ | ὁ | ο | ho | the | G3588 | pronoun, article | RA.NSM | Nom | Sing | Masc | NaN | NaN | NaN | NaN | NaN | o | 9434 | 88444 |
106382 | 5 | Gen | 1 | 1 | NaN | θεὸς | θεός | θεος | theos | God | G2316 | noun | N.NSM | Nom | Sing | Masc | NaN | NaN | NaN | NaN | NaN | theos | 6191 | 4009 |
17939 | 6 | Gen | 1 | 1 | NaN | τὸν | ὁ | ο | ton | the | G3588 | pronoun, article | RA.ASM | Acc | Sing | Masc | NaN | NaN | NaN | NaN | NaN | o | 9434 | 88444 |
110391 | 7 | Gen | 1 | 1 | NaN | οὐρανὸν | οὐρανός | ουρανος | ouranon | sky; heaven | G3772 | noun | N.ASM | Acc | Sing | Masc | NaN | NaN | NaN | NaN | NaN | ouranos | 9842 | 682 |
111073 | 8 | Gen | 1 | 1 | NaN | καὶ | καί | και | kai | and; even | G2532 | conjunction | C | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | kai | 7030 | 62231 |
17940 | 9 | Gen | 1 | 1 | NaN | τὴν | ὁ | ο | tēn | the | G3588 | pronoun, article | RA.ASF | Acc | Sing | Fem | NaN | NaN | NaN | NaN | NaN | o | 9434 | 88444 |
173304 | 10 | Gen | 1 | 1 | NaN | γῆν | γῆ | γη | gēn | earth; land | G1093 | noun | N.ASF | Acc | Sing | Fem | NaN | NaN | NaN | NaN | NaN | ge | 3082 | 3173 |
featureaddstage2.describe()
orig_order | chapter | verse | abc_order | freq_lemma | |
---|---|---|---|---|---|
count | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 |
mean | 311847.000000 | 19.200501 | 16.949878 | 7261.430627 | 21743.617390 |
std | 180044.805058 | 20.679681 | 13.994139 | 3447.810620 | 32782.706194 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
25% | 155924.000000 | 6.000000 | 7.000000 | 4421.000000 | 244.000000 |
50% | 311847.000000 | 13.000000 | 14.000000 | 7096.000000 | 2522.000000 |
75% | 467770.000000 | 25.000000 | 23.000000 | 9456.000000 | 29396.000000 |
max | 623693.000000 | 151.000000 | 176.000000 | 14174.000000 | 88444.000000 |
BOLgreekDICT=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/NA1904_dictionary_v1.0.xlsx')
pd.set_option('display.max_columns', 50)
BOLgreekDICT.head(10)
orig abc order | Occurrences | Lexeme | orig abc order.1 | Lexeme_dict | Strong's number | Strong's unreliable? | gloss | |
---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | Ἀαρών | 1 | Ἀαρών, ὁ | 2 | no | Aaron |
1 | 2 | 1 | Ἀβαδδών | 2 | Ἀβαδδών, ὁ | 3 | no | Abaddon |
2 | 3 | 1 | ἀβαρής | 3 | ἀβαρής, -ές | 4 | no | not burdensome |
3 | 4 | 3 | ἀββά | 4 | ἀββά, ὁ | 5 | no | Father |
4 | 5 | 4 | Ἅβελ | 5 | Ἅβελ, ὁ | 6 | no | Abel |
5 | 6 | 3 | Ἀβιά | 6 | Ἀβιά, ὁ | 7 | no | Abijah |
6 | 7 | 1 | Ἀβιαθάρ | 7 | Ἀβιαθάρ, ὁ | 8 | no | Abiathar |
7 | 8 | 1 | Ἀβιληνή | 8 | Ἀβιληνή, -ῆς, ἡ | 9 | no | Abilene |
8 | 9 | 2 | Ἀβιούδ | 9 | Ἀβιούδ, ὁ | 10 | no | Abiud |
9 | 10 | 73 | Ἀβραάμ | 10 | Ἀβραάμ, ὁ | 11 | no | Abraham |
BOLgreekDICT=BOLgreekDICT[['Lexeme','Lexeme_dict', 'gloss']]
BOLgreekDICT.head(10)
Lexeme | Lexeme_dict | gloss | |
---|---|---|---|
0 | Ἀαρών | Ἀαρών, ὁ | Aaron |
1 | Ἀβαδδών | Ἀβαδδών, ὁ | Abaddon |
2 | ἀβαρής | ἀβαρής, -ές | not burdensome |
3 | ἀββά | ἀββά, ὁ | Father |
4 | Ἅβελ | Ἅβελ, ὁ | Abel |
5 | Ἀβιά | Ἀβιά, ὁ | Abijah |
6 | Ἀβιαθάρ | Ἀβιαθάρ, ὁ | Abiathar |
7 | Ἀβιληνή | Ἀβιληνή, -ῆς, ἡ | Abilene |
8 | Ἀβιούδ | Ἀβιούδ, ὁ | Abiud |
9 | Ἀβραάμ | Ἀβραάμ, ὁ | Abraham |
BOLgreekDICT = BOLgreekDICT.rename({'Lexeme':'lex_utf8', 'Lexeme_dict':'BOL_lexeme_dict', 'gloss':'BOL_gloss'}, axis=1)
BOLgreekDICT.head(5)
lex_utf8 | BOL_lexeme_dict | BOL_gloss | |
---|---|---|---|
0 | Ἀαρών | Ἀαρών, ὁ | Aaron |
1 | Ἀβαδδών | Ἀβαδδών, ὁ | Abaddon |
2 | ἀβαρής | ἀβαρής, -ές | not burdensome |
3 | ἀββά | ἀββά, ὁ | Father |
4 | Ἅβελ | Ἅβελ, ὁ | Abel |
BOLgreekDICT.describe()
lex_utf8 | BOL_lexeme_dict | BOL_gloss | |
---|---|---|---|
count | 5433 | 5335 | 5433 |
unique | 5400 | 5297 | 5172 |
top | ταχύς | ταχύς, -εῖα | leave behind |
freq | 4 | 4 | 4 |
featureaddstage3=featureaddstage2
featureaddstage3.describe()
orig_order | chapter | verse | abc_order | freq_lemma | |
---|---|---|---|---|---|
count | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 |
mean | 311847.000000 | 19.200501 | 16.949878 | 7261.430627 | 21743.617390 |
std | 180044.805058 | 20.679681 | 13.994139 | 3447.810620 | 32782.706194 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
25% | 155924.000000 | 6.000000 | 7.000000 | 4421.000000 | 244.000000 |
50% | 311847.000000 | 13.000000 | 14.000000 | 7096.000000 | 2522.000000 |
75% | 467770.000000 | 25.000000 | 23.000000 | 9456.000000 | 29396.000000 |
max | 623693.000000 | 151.000000 | 176.000000 | 14174.000000 | 88444.000000 |
featureaddstage4=pd.merge (featureaddstage3, BOLgreekDICT,
on='lex_utf8',
how='left')
featureaddstage4.head(5)
orig_order | book | chapter | verse | subverse | word | lex_utf8 | g_cons_utf8 | translit_SBL | lemma_gloss | strong | sp | morphology | case | nu | gn | degree | tense | voice | mood | ps | lemma_translit | abc_order | freq_lemma | BOL_lexeme_dict | BOL_gloss | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Gen | 1 | 1 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 | ἐν | in, on, among |
1 | 86 | Gen | 1 | 6 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 | ἐν | in, on, among |
2 | 232 | Gen | 1 | 11 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 | ἐν | in, on, among |
3 | 264 | Gen | 1 | 12 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 | ἐν | in, on, among |
4 | 291 | Gen | 1 | 14 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 | ἐν | in, on, among |
featureaddstage4.describe()
orig_order | chapter | verse | abc_order | freq_lemma | |
---|---|---|---|---|---|
count | 628353.000000 | 628353.000000 | 628353.000000 | 628353.000000 | 628353.000000 |
mean | 312013.178089 | 19.180892 | 16.949748 | 7267.016749 | 21585.945001 |
std | 179969.157374 | 20.658195 | 13.994791 | 3445.968577 | 32711.827142 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
25% | 156202.000000 | 6.000000 | 7.000000 | 4425.000000 | 243.000000 |
50% | 312320.000000 | 13.000000 | 14.000000 | 7103.000000 | 2223.000000 |
75% | 467690.000000 | 25.000000 | 23.000000 | 9497.000000 | 29396.000000 |
max | 623693.000000 | 151.000000 | 176.000000 | 14174.000000 | 88444.000000 |
featureaddstage4 = featureaddstage4.drop_duplicates(['orig_order']).sort_values(by='orig_order', ascending=[True])
featureaddstage4.head(10)
orig_order | book | chapter | verse | subverse | word | lex_utf8 | g_cons_utf8 | translit_SBL | lemma_gloss | strong | sp | morphology | case | nu | gn | degree | tense | voice | mood | ps | lemma_translit | abc_order | freq_lemma | BOL_lexeme_dict | BOL_gloss | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Gen | 1 | 1 | NaN | ἐν | ἐν | εν | en | in | G1722 | preposition | P | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | en | 4638 | 14316 | ἐν | in, on, among |
14316 | 2 | Gen | 1 | 1 | NaN | ἀρχῇ | ἀρχή | αρχη | archē | origin; beginning | G746 | noun | N.DSF | Dat | Sing | Fem | NaN | NaN | NaN | NaN | NaN | arkhe | 1835 | 236 | ἀρχή, -ῆς, ἡ | ruler, beginning |
14552 | 3 | Gen | 1 | 1 | NaN | ἐποίησεν | ποιέω | ποιεω | epoiēsen | do; make | G4160 | verb | V.AAI3S | NaN | Sing | NaN | NaN | Aor | Act | Ind | 3rd | poieo | 10580 | 3386 | ποιέω | do, make |
17938 | 4 | Gen | 1 | 1 | NaN | ὁ | ὁ | ο | ho | the | G3588 | pronoun, article | RA.NSM | Nom | Sing | Masc | NaN | NaN | NaN | NaN | NaN | o | 9434 | 88444 | ὁ, ἡ, τό | the |
106382 | 5 | Gen | 1 | 1 | NaN | θεὸς | θεός | θεος | theos | God | G2316 | noun | N.NSM | Nom | Sing | Masc | NaN | NaN | NaN | NaN | NaN | theos | 6191 | 4009 | θεός, -οῦ, ὁ | God, god |
17939 | 6 | Gen | 1 | 1 | NaN | τὸν | ὁ | ο | ton | the | G3588 | pronoun, article | RA.ASM | Acc | Sing | Masc | NaN | NaN | NaN | NaN | NaN | o | 9434 | 88444 | ὁ, ἡ, τό | the |
110391 | 7 | Gen | 1 | 1 | NaN | οὐρανὸν | οὐρανός | ουρανος | ouranon | sky; heaven | G3772 | noun | N.ASM | Acc | Sing | Masc | NaN | NaN | NaN | NaN | NaN | ouranos | 9842 | 682 | οὐρανός, -οῦ, ὁ | sky, heaven |
111073 | 8 | Gen | 1 | 1 | NaN | καὶ | καί | και | kai | and; even | G2532 | conjunction | C | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | kai | 7030 | 62231 | καί | and, even, also, namely |
17940 | 9 | Gen | 1 | 1 | NaN | τὴν | ὁ | ο | tēn | the | G3588 | pronoun, article | RA.ASF | Acc | Sing | Fem | NaN | NaN | NaN | NaN | NaN | o | 9434 | 88444 | ὁ, ἡ, τό | the |
173304 | 10 | Gen | 1 | 1 | NaN | γῆν | γῆ | γη | gēn | earth; land | G1093 | noun | N.ASF | Acc | Sing | Fem | NaN | NaN | NaN | NaN | NaN | ge | 3082 | 3173 | γῆ, γῆς, ἡ | earth, soil, land |
featureaddstage4.describe()
orig_order | chapter | verse | abc_order | freq_lemma | |
---|---|---|---|---|---|
count | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 | 623693.000000 |
mean | 311847.000000 | 19.200501 | 16.949878 | 7261.430627 | 21743.617390 |
std | 180044.805057 | 20.679681 | 13.994139 | 3447.810620 | 32782.706194 |
min | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
25% | 155924.000000 | 6.000000 | 7.000000 | 4421.000000 | 244.000000 |
50% | 311847.000000 | 13.000000 | 14.000000 | 7096.000000 | 2522.000000 |
75% | 467770.000000 | 25.000000 | 23.000000 | 9456.000000 | 29396.000000 |
max | 623693.000000 | 151.000000 | 176.000000 | 14174.000000 | 88444.000000 |
featureaddstage4.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/LXX_source_v1.4.xlsx')
%load_ext autoreload
%autoreload 2
# First, I have to laod different modules that I use for analyzing the data and for plotting:
import sys, os, collections
import pandas as pd
import numpy as np
import re
import csv
import seaborn as sns
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib.pyplot import figure
from collections import Counter
# Second, I have to load the Text Fabric app
from tf.fabric import Fabric
from tf.app import use
#LXX = use('CCATLXX/tf/1994_v1', hoist=globals())
LXX = use('CCATLXX/tf/1994_v2', hoist=globals())
LXX = use('D:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_CCATLXX/CCATLXX/tf/1994_v2', hoist=globals())
This is Text-Fabric 9.1.11 Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html 29 features found and 0 ignored
rate limit is 60 requests per hour, with 57 left for this hour To increase the rate,see https://annotation.github.io/text-fabric/tf/advanced/repo.html/ connecting to online GitHub repo annotation/app-D ... failed GitHub says: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/reference/repos#get-a-repository"} The requested TF-app is not available offline
No online connection
This is Text-Fabric 9.1.11 Api reference : https://annotation.github.io/text-fabric/tf/cheatsheet.html 0 features found and 0 ignored 0.00s Not all of the warp features otype and oslots are present in None/D/ 0.00s Only the Feature and Edge APIs will be enabled 0.00s Warp feature "otext" not found. Working without Text-API
Search0 = '''
book book=Gen
chapter chapter=1
verse verse=1
word
'''
Search0 = LXX.search(Search0)
LXX.show(Search0, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'orig_order','book','chapter','verse','subverse','word','lex_utf8','g_cons_utf8','translit_SBL','lemma_gloss','strong','sp','morphology','case','nu','gn','degree','tense','voice','mood','ps','lemma_translit','abc_order','freq_lemma','BOL_lexeme_dict','BOL_gloss'})
0.42s 10 results
verse 1
Search1 = '''
verse book=Gen chapter=1 verse=1
word
'''
Search1 = LXX.search(Search1)
LXX.show(Search1, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'subverse'})
0.15s 0 results
Search2 = '''
book book=Gen
chapter chapter=1
verse verse=1
word word* lex_utf8 g_cons_utf8 morphology* translit_SBL
'''
Search2 = LXX.search(Search2)
LXX.show(Search2, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'subverse'})
2.74s 10 results
verse 1
Search3 = '''
book book=Num
chapter chapter=21
verse verse=3
'''
Search3 = LXX.search(Search3)
LXX.show(Search3, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'word', 'lemma_translit', 'case'})
0.03s 1 result
verse 1
Eisakouw = '''
book book#Esth
verse
word lemma_translit=epakouo
word lemma_translit=theos|kurios
word lemma_translit=phone case=Gen
'''
Eisakouw = LXX.search(Eisakouw)
LXX.show(Eisakouw, start=1, end=100, condensed=True, colorMap={1:'pink'}, extraFeatures={'word', 'lemma_translit', 'case', 'lex_utf8', 'BOL_lexeme_dict'})
1.98s 12 results
verse 1
verse 2
verse 3
verse 4
verse 5
translitadd['lemma']=translitadd[0]
translitadd.head(5)
0 | lemma | |
---|---|---|
0 | βίβλος | βίβλος |
1 | γένεσις | γένεσις |
2 | Ἰησοῦς | Ἰησοῦς |
3 | Χριστός | Χριστός |
4 | υἱός | υἱός |
translitadd=translitadd[['lemma']]
translitadd.head(5)
lemma | |
---|---|
0 | βίβλος |
1 | γένεσις |
2 | Ἰησοῦς |
3 | Χριστός |
4 | υἱός |
translitadd['orig_order'] = translitadd.index +1
translitadd.head(5)
lemma | orig_order | |
---|---|---|
0 | βίβλος | 1 |
1 | γένεσις | 2 |
2 | Ἰησοῦς | 3 |
3 | Χριστός | 4 |
4 | υἱός | 5 |
from unidecode import unidecode
s = "βίβλος"
s = unidecode(s)
print(s)
biblos
translitadd['translit'] = translitadd['lemma'].apply(unidecode)
translitadd.head(5)
lemma | orig_order | translit | |
---|---|---|---|
0 | βίβλος | 1 | biblos |
1 | γένεσις | 2 | genesis |
2 | Ἰησοῦς | 3 | Iesous |
3 | Χριστός | 4 | Khristos |
4 | υἱός | 5 | uios |
translitadd['translit'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_translit.tf', index=None)
ABC1=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
ABC1.head(10)
0 | |
---|---|
0 | βίβλος |
1 | γένεσις |
2 | Ἰησοῦς |
3 | Χριστός |
4 | υἱός |
5 | Δαυίδ |
6 | υἱός |
7 | Ἀβραάμ |
8 | Ἀβραάμ |
9 | γεννάω |
ABC1['lemma']=lemma[0]
ABC1.head(5)
0 | lemma | |
---|---|---|
0 | βίβλος | βίβλος |
1 | γένεσις | γένεσις |
2 | Ἰησοῦς | Ἰησοῦς |
3 | Χριστός | Χριστός |
4 | υἱός | υἱός |
ABC1['orig_order'] = ABC1.index +1
ABC1.head(5)
0 | lemma | orig_order | |
---|---|---|---|
0 | βίβλος | βίβλος | 1 |
1 | γένεσις | γένεσις | 2 |
2 | Ἰησοῦς | Ἰησοῦς | 3 |
3 | Χριστός | Χριστός | 4 |
4 | υἱός | υἱός | 5 |
ABC1=ABC1[['orig_order','lemma']]
ABC1.head(5)
orig_order | lemma | |
---|---|---|
0 | 1 | βίβλος |
1 | 2 | γένεσις |
2 | 3 | Ἰησοῦς |
3 | 4 | Χριστός |
4 | 5 | υἱός |
ABC1.describe()
orig_order | |
---|---|
count | 137554.000000 |
mean | 68777.500000 |
std | 39708.563801 |
min | 1.000000 |
25% | 34389.250000 |
50% | 68777.500000 |
75% | 103165.750000 |
max | 137554.000000 |
ABCdict = ABC1.drop_duplicates(['lemma']).sort_values(by='lemma', ascending=[True])
ABCdict.head(10)
orig_order | lemma | |
---|---|---|
68479 | 68480 | Αἰγύπτιος |
69633 | 69634 | Αἰθίοψ |
70464 | 70465 | Αἰνέας |
50739 | 50740 | Αἰνών |
679 | 680 | Αἴγυπτος |
30811 | 30812 | Αὐγοῦστος |
87589 | 87590 | Βάαλ |
75520 | 75521 | Βέροια |
171 | 172 | Βαβυλών |
128506 | 128507 | Βαλάκ |
ABCdict.describe()
orig_order | |
---|---|
count | 5461.000000 |
mean | 55051.110969 |
std | 42441.209940 |
min | 1.000000 |
25% | 12643.000000 |
50% | 48785.000000 |
75% | 90141.000000 |
max | 137334.000000 |
ABC1.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ABC1order.xlsx', encoding='utf-8')
Now I am ordering the word alphabetically iwth libreoffice writer since I cannot do that in pandas (yet?).
ABC2=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ABC2order.xlsx')
pd.set_option('display.max_columns', 50)
ABC2.head(10)
Unnamed: 0 | orig_order | lemma | ABC order | |
---|---|---|---|---|
0 | 29678 | 29679 | Ἀαρών | 1 |
1 | 131340 | 131341 | Ἀβαδδών | 2 |
2 | 100253 | 100254 | ἀβαρής | 3 |
3 | 28001 | 28002 | αββα | 4 |
4 | 14094 | 14095 | Ἅβελ | 5 |
5 | 108 | 109 | Ἀβιά | 6 |
6 | 19523 | 19524 | Ἀβιαθάρ | 7 |
7 | 31682 | 31683 | Ἀβιληνή | 8 |
8 | 190 | 191 | Ἀβιούδ | 9 |
9 | 7 | 8 | Ἀβραάμ | 10 |
Now we merge the ABCorder dataframe with the original lemma DF.
lemma_ABC=pd.merge (ABC1, ABC2,
on='lemma',
how='outer')
lemma_ABC.head(5)
orig_order_x | lemma | Unnamed: 0 | orig_order_y | ABC order | |
---|---|---|---|---|---|
0 | 1 | βίβλος | 0 | 1 | 970 |
1 | 26440 | βίβλος | 0 | 1 | 970 |
2 | 31717 | βίβλος | 0 | 1 | 970 |
3 | 45660 | βίβλος | 0 | 1 | 970 |
4 | 64886 | βίβλος | 0 | 1 | 970 |
lemma_ABC.describe()
orig_order_x | Unnamed: 0 | orig_order_y | ABC order | |
---|---|---|---|---|
count | 137554.000000 | 137554.000000 | 137554.000000 | 137554.00000 |
mean | 68777.500000 | 7050.531566 | 7051.531566 | 2676.19798 |
std | 39708.563801 | 20152.248998 | 20152.248998 | 1339.74175 |
min | 1.000000 | 0.000000 | 1.000000 | 1.00000 |
25% | 34389.250000 | 25.000000 | 26.000000 | 1501.00000 |
50% | 68777.500000 | 400.000000 | 401.000000 | 2727.00000 |
75% | 103165.750000 | 2097.250000 | 2098.250000 | 3598.00000 |
max | 137554.000000 | 137333.000000 | 137334.000000 | 5461.00000 |
lemma_ABC.sort_values(['orig_order_x'], ascending=True).head(10)
orig_order_x | lemma | Unnamed: 0 | orig_order_y | ABC order | |
---|---|---|---|---|---|
0 | 1 | βίβλος | 0 | 1 | 970 |
10 | 2 | γένεσις | 1 | 2 | 1074 |
15 | 3 | Ἰησοῦς | 2 | 3 | 2406 |
921 | 4 | Χριστός | 3 | 4 | 5385 |
1449 | 5 | υἱός | 4 | 5 | 5053 |
1824 | 6 | Δαυίδ | 5 | 6 | 1156 |
1450 | 7 | υἱός | 4 | 5 | 5053 |
1883 | 8 | Ἀβραάμ | 7 | 8 | 10 |
1884 | 9 | Ἀβραάμ | 7 | 8 | 10 |
1956 | 10 | γεννάω | 9 | 10 | 1077 |
lemma_ABC.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_abc.xlsx')
frequencyadd=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
frequencyadd.head(20)
0 | |
---|---|
0 | βίβλος |
1 | γένεσις |
2 | Ἰησοῦς |
3 | Χριστός |
4 | υἱός |
5 | Δαυίδ |
6 | υἱός |
7 | Ἀβραάμ |
8 | Ἀβραάμ |
9 | γεννάω |
10 | ὁ |
11 | Ἰσαάκ |
12 | Ἰσαάκ |
13 | δέ |
14 | γεννάω |
15 | ὁ |
16 | Ἰακώβ |
17 | Ἰακώβ |
18 | δέ |
19 | γεννάω |
frequencyadd['orig_order'] = frequencyadd.index +1
frequencyadd['lemma']=frequencyadd[0]
frequencyadd=frequencyadd[['orig_order','lemma']]
frequencyadd.head(5)
orig_order | lemma | |
---|---|---|
0 | 1 | βίβλος |
1 | 2 | γένεσις |
2 | 3 | Ἰησοῦς |
3 | 4 | Χριστός |
4 | 5 | υἱός |
frequencyadd["freq_lemma"]=frequencyadd.groupby(["lemma"])["lemma"].transform("count")
#("count") is actually utilizing the 'count' function!
frequencyadd.head(20)
orig_order | lemma | freq_lemma | |
---|---|---|---|
0 | 1 | βίβλος | 10 |
1 | 2 | γένεσις | 5 |
2 | 3 | Ἰησοῦς | 906 |
3 | 4 | Χριστός | 528 |
4 | 5 | υἱός | 375 |
5 | 6 | Δαυίδ | 59 |
6 | 7 | υἱός | 375 |
7 | 8 | Ἀβραάμ | 73 |
8 | 9 | Ἀβραάμ | 73 |
9 | 10 | γεννάω | 97 |
10 | 11 | ὁ | 19769 |
11 | 12 | Ἰσαάκ | 20 |
12 | 13 | Ἰσαάκ | 20 |
13 | 14 | δέ | 2766 |
14 | 15 | γεννάω | 97 |
15 | 16 | ὁ | 19769 |
16 | 17 | Ἰακώβ | 27 |
17 | 18 | Ἰακώβ | 27 |
18 | 19 | δέ | 2766 |
19 | 20 | γεννάω | 97 |
frequencyadd.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_freq.xlsx')
Lets first load the NA1904 BibleOL dictionary:
BOLgreekDICT=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/NA1904_dictionary_v1.0.xlsx')
pd.set_option('display.max_columns', 50)
BOLgreekDICT.head(20)
orig abc order | Occurrences | Lexeme | Lexeme_dict | Strong's number | Strong's unreliable? | gloss | |
---|---|---|---|---|---|---|---|
0 | 1 | 5 | Ἀαρών | Ἀαρών, ὁ | 2 | no | Aaron |
1 | 2 | 1 | Ἀβαδδών | Ἀβαδδών, ὁ | 3 | no | Abaddon |
2 | 3 | 1 | ἀβαρής | ἀβαρής, -ές | 4 | no | not burdensome |
3 | 4 | 3 | ἀββά | ἀββά, ὁ | 5 | no | Father |
4 | 5 | 4 | Ἅβελ | Ἅβελ, ὁ | 6 | no | Abel |
5 | 6 | 3 | Ἀβιά | Ἀβιά, ὁ | 7 | no | Abijah |
6 | 7 | 1 | Ἀβιαθάρ | Ἀβιαθάρ, ὁ | 8 | no | Abiathar |
7 | 8 | 1 | Ἀβιληνή | Ἀβιληνή, -ῆς, ἡ | 9 | no | Abilene |
8 | 9 | 2 | Ἀβιούδ | Ἀβιούδ, ὁ | 10 | no | Abiud |
9 | 10 | 73 | Ἀβραάμ | Ἀβραάμ, ὁ | 11 | no | Abraham |
10 | 11 | 9 | ἄβυσσος | ἄβυσσος, -ου, ἡ | 12 | no | abyss, unfathomable depth |
11 | 12 | 2 | Ἅγαβος | Ἅγαβος, -ου, ὁ | 13 | no | Agabus |
12 | 13 | 2 | ἀγαθοεργέω | ἀγαθοεργέω | 14 | no | perform good deeds |
13 | 14 | 9 | ἀγαθοποιέω | ἀγαθοποιέω | 15 | no | do that which is good |
14 | 15 | 1 | ἀγαθοποιΐα | ἀγαθοποιΐα, -ας, ἡ | 16 | no | doing of that which is good |
15 | 16 | 1 | ἀγαθοποιός | ἀγαθοποιός, -οῦ, ὁ | 17 | no | a doer of that which is good |
16 | 17 | 102 | ἀγαθός | ἀγαθός, -ή, -όν | 18 | no | good |
17 | 18 | 4 | ἀγαθωσύνη | ἀγαθωσύνη, -ης, ἡ | 19 | no | goodness |
18 | 19 | 5 | ἀγαλλίασις | ἀγαλλίασις, -εως, ἡ | 20 | no | exultation, exhilaration |
19 | 20 | 11 | ἀγαλλιάω | ἀγαλλιάω | 21 | no | exult, am full of joy |
BOLgreekDICT=BOLgreekDICT[['Lexeme','Lexeme_dict', 'Strong\'s number', 'gloss']]
BOLgreekDICT.head(10)
Lexeme | Lexeme_dict | Strong's number | gloss | |
---|---|---|---|---|
0 | Ἀαρών | Ἀαρών, ὁ | 2 | Aaron |
1 | Ἀβαδδών | Ἀβαδδών, ὁ | 3 | Abaddon |
2 | ἀβαρής | ἀβαρής, -ές | 4 | not burdensome |
3 | ἀββά | ἀββά, ὁ | 5 | Father |
4 | Ἅβελ | Ἅβελ, ὁ | 6 | Abel |
5 | Ἀβιά | Ἀβιά, ὁ | 7 | Abijah |
6 | Ἀβιαθάρ | Ἀβιαθάρ, ὁ | 8 | Abiathar |
7 | Ἀβιληνή | Ἀβιληνή, -ῆς, ἡ | 9 | Abilene |
8 | Ἀβιούδ | Ἀβιούδ, ὁ | 10 | Abiud |
9 | Ἀβραάμ | Ἀβραάμ, ὁ | 11 | Abraham |
BOLgreekDICT.describe()
Strong's number | |
---|---|
count | 5433.000000 |
mean | 2798.407878 |
std | 1638.197697 |
min | 1.000000 |
25% | 1370.000000 |
50% | 2754.000000 |
75% | 4237.000000 |
max | 5624.000000 |
Lets load the SBLGNT lemmas
SBLGNTlemmas=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
SBLGNTlemmas.head(2)
0 | |
---|---|
0 | βίβλος |
1 | γένεσις |
SBLGNTlemmas['orig_order']=SBLGNTlemmas.index +1
SBLGNTlemmas['Lexeme']=SBLGNTlemmas[0]
SBLGNTlemmas=SBLGNTlemmas[['orig_order','Lexeme']]
SBLGNTlemmas.head(5)
orig_order | Lexeme | |
---|---|---|
0 | 1 | βίβλος |
1 | 2 | γένεσις |
2 | 3 | Ἰησοῦς |
3 | 4 | Χριστός |
4 | 5 | υἱός |
SBLGNTlemmas.describe()
orig_order | |
---|---|
count | 137554.000000 |
mean | 68777.500000 |
std | 39708.563801 |
min | 1.000000 |
25% | 34389.250000 |
50% | 68777.500000 |
75% | 103165.750000 |
max | 137554.000000 |
Now lets try a merge of the two files
SBLGNTglosses=pd.merge (SBLGNTlemmas,BOLgreekDICT,
on='Lexeme',
how='outer')
SBLGNTglosses.head(5)
orig_order | Lexeme | Lexeme_dict | Strong's number | gloss | |
---|---|---|---|---|---|
0 | 1.0 | βίβλος | NaN | NaN | NaN |
1 | 26440.0 | βίβλος | NaN | NaN | NaN |
2 | 31717.0 | βίβλος | NaN | NaN | NaN |
3 | 45660.0 | βίβλος | NaN | NaN | NaN |
4 | 64886.0 | βίβλος | NaN | NaN | NaN |
SBLGNTglosses.describe()
orig_order | Strong's number | |
---|---|---|
count | 138318.000000 | 54761.000000 |
mean | 68756.590379 | 3008.488377 |
std | 39712.532678 | 1209.032138 |
min | 1.000000 | 1.000000 |
25% | 34366.250000 | 2041.000000 |
50% | 68753.500000 | 3588.000000 |
75% | 103162.750000 | 3706.000000 |
max | 137554.000000 | 5624.000000 |
SBLGNTglosses.head(20)
orig_order | Lexeme | Lexeme_dict | Strong's number | gloss | |
---|---|---|---|---|---|
0 | 1.0 | βίβλος | NaN | NaN | NaN |
1 | 26440.0 | βίβλος | NaN | NaN | NaN |
2 | 31717.0 | βίβλος | NaN | NaN | NaN |
3 | 45660.0 | βίβλος | NaN | NaN | NaN |
4 | 64886.0 | βίβλος | NaN | NaN | NaN |
5 | 68873.0 | βίβλος | NaN | NaN | NaN |
6 | 76865.0 | βίβλος | NaN | NaN | NaN |
7 | 107214.0 | βίβλος | NaN | NaN | NaN |
8 | 128928.0 | βίβλος | NaN | NaN | NaN |
9 | 136490.0 | βίβλος | NaN | NaN | NaN |
10 | 2.0 | γένεσις | NaN | NaN | NaN |
11 | 281.0 | γένεσις | NaN | NaN | NaN |
12 | 29821.0 | γένεσις | NaN | NaN | NaN |
13 | 120472.0 | γένεσις | NaN | NaN | NaN |
14 | 121080.0 | γένεσις | NaN | NaN | NaN |
15 | 3.0 | Ἰησοῦς | Ἰησοῦς | 2424.0 | Jesus |
16 | 243.0 | Ἰησοῦς | Ἰησοῦς | 2424.0 | Jesus |
17 | 278.0 | Ἰησοῦς | Ἰησοῦς | 2424.0 | Jesus |
18 | 357.0 | Ἰησοῦς | Ἰησοῦς | 2424.0 | Jesus |
19 | 436.0 | Ἰησοῦς | Ἰησοῦς | 2424.0 | Jesus |
SBLGNTglosses.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/SBLGNTglosses.xlsx')