#!/usr/bin/env python
# coding: utf-8
#
# Table of Contents
#
# In[2]:
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# In[3]:
# First, I have to load different modules that I use for analyzing the data and for plotting:
import sys, os, collections
import pandas as pd
import numpy as np
import re
import csv
import seaborn as sns
import matplotlib.pyplot as plt; plt.rcdefaults()
from matplotlib.pyplot import figure
from collections import Counter
# Second, I have to load the Text Fabric app
from tf.fabric import Fabric
from tf.app import use
# In[35]:
# Load two local versions of the SBLGNT Text-Fabric dataset.
# `hoist=globals()` injects the TF API helpers into the notebook namespace.
SBLGNTv1 = use('SBLGNT/tf/6.12_v1', hoist=globals())
# In[72]:
SBLGNTv2 = use('SBLGNT/tf/6.12_v2', hoist=globals())
# In[57]:
# Text-Fabric search template: each line names a node type with optional
# feature constraints; a trailing `feature*` asks for the feature to be shown.
Searchv1_0 = '''
book book=Matthew book_code*
chapter chapter=1
sentence sentence=1
verse
'''
Searchv1_0 = SBLGNTv1.search(Searchv1_0)
SBLGNTv1.show(Searchv1_0, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'verse','vrsnum','orig_order'})
# In[76]:
# Same kind of query against the second dataset version (Matthew 1:21).
Searchv2_0 = '''
book book=Matthew
chapter chapter=1
verse verse=21
'''
Searchv2_0 = SBLGNTv2.search(Searchv2_0)
SBLGNTv2.show(Searchv2_0, start=1, end=5, condensed=True, colorMap={3:'pink'}, extraFeatures={'verse','vrsnum', 'orig_order'})
# In[17]:
Search1 = '''
verse book=Matthew chapter=1 verse=1
word
'''
Search1 = SBLGNT.search(Search1)
SBLGNT.show(Search1, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'verse','vrsnum'})
# In[15]:
Search2 = '''
book book=Revelation book_code*
chapter chapter=1
sentence
clause
word lemma* morph* normalized_word* case* degree* gn* mood* nu* orig_order* ps* voice* tense* lemma_translit* dict_abc_order* lemma_freq* lemma_dictform* lemma_freq* lemma_gloss* lemma_strongs*
'''
Search2 = SBLGNT.search(Search2)
SBLGNT.show(Search2, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'verse','vrsnum'})
# # New Feature Development
# # Morphology
# ## MorphGNT:sblgnt Data Description
# Here: https://github.com/morphgnt/sblgnt
#
# ![image.png](attachment:image.png)
#
# ## Extracting morphological features
# In[381]:
# Read the raw morphology feature file: one tab-separated value per word,
# in corpus order. The first row is taken as the header, so the file is
# assumed to provide a 'morphology' column — used by all cells below.
featureprep=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/morph_copy.tf',delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
featureprep.head(5)
# In[382]:
# TF slot numbers are 1-based, hence index + 1.
featureprep['orig_order'] = featureprep.index +1
featureprep.head(5)
# ### Adding Part of Speech `sp`
# In[383]:
def spconditions(row):
    """Translate a MorphGNT morphology code into a part-of-speech tag.

    The leading character(s) of the code identify the word class
    (e.g. 'N-' noun, 'RA' definite article). Unrecognized or missing
    codes yield the empty string.
    """
    code = str(row)
    # Ordered (pattern, tag) rules; the first matching prefix wins.
    rules = (
        ('^A.*', 'adj'),
        ('^C.*', 'conj'),
        ('^D.*', 'adv'),
        ('^I.*', 'interj'),
        ('^N-.*', 'noun'),
        ('^P.*', 'prep'),
        ('^RA.*', 'art-def'),
        ('^RD.*', 'pron-dem'),
        ('^RI.*', 'pron-inter'),
        ('^RP.*', 'pron-prs'),
        ('^RR.*', 'pron-rela'),
        ('^V-.*', 'verb'),
        ('^X.*', 'partcl'),
    )
    for pattern, tag in rules:
        if re.search(pattern, code):
            return tag
    return ''
# In[384]:
# Derive the part-of-speech column from the raw morphology codes.
featureprep['sp']=featureprep['morphology'].apply(spconditions)
featureprep.head(20)
# ### Adding Gender `gn`
# In[385]:
def gender(row):
    """Return grammatical gender ('f'/'m'/'n') from a morphology code.

    The gender marker sits in the second-to-last position of the code;
    codes without one yield the empty string.
    """
    code = str(row)
    for pattern, tag in (('.*F.$', 'f'), ('.*M.$', 'm'), ('.*N.$', 'n')):
        if re.search(pattern, code):
            return tag
    return ''
# In[386]:
# Derive the gender column from the morphology codes.
featureprep['gn']=featureprep['morphology'].apply(gender)
featureprep.head(20)
# ### Adding Number `nu`
# In[387]:
def number(row):
    """Return grammatical number ('sg' or 'pl') from a morphology code.

    The number marker (S or P) sits in the third-to-last position of the
    code; codes without one yield the empty string.
    """
    # BUG FIX (dead code removed): the original also tested '.*S[MFN].$'
    # and '.*P[MFN].$', but everything those patterns match is already
    # matched by '.*S..$' / '.*P..$' checked first, so those branches were
    # unreachable. Behavior is unchanged.
    if re.search('.*S..$', str(row)):
        return 'sg'
    if re.search('.*P..$', str(row)):
        return 'pl'
    return ''
# In[388]:
# Derive the number column from the morphology codes.
featureprep['nu']=featureprep['morphology'].apply(number)
featureprep.head(50)
# ### Adding Person `ps`
# In[389]:
def person(row):
    """Return grammatical person ('p1'/'p2'/'p3') from a morphology code.

    Digits are checked in the order 1, 2, 3 anywhere in the code; codes
    without a digit yield the empty string.
    """
    code = str(row)
    for digit, tag in (('1', 'p1'), ('2', 'p2'), ('3', 'p3')):
        if digit in code:
            return tag
    return ''
# In[390]:
# Derive the person column from the morphology codes.
featureprep['ps']=featureprep['morphology'].apply(person)
featureprep.head(20)
# ### Adding Case `case`
# In[391]:
def case(row):
    """Return the noun case from a morphology code.

    The case marker (N/G/D/A) sits in the fourth-to-last position of a
    code at least ten characters long; anything else yields ''.
    """
    code = str(row)
    table = (
        ('......N...$', 'nominative'),
        ('......G...$', 'genitive'),
        ('......D...$', 'dative'),
        ('......A...$', 'accusative'),
    )
    for pattern, tag in table:
        if re.search(pattern, code):
            return tag
    return ''
# In[392]:
# Derive the case column from the morphology codes.
featureprep['case']=featureprep['morphology'].apply(case)
featureprep.head(20)
# ### Adding Tense `vt`
# In[393]:
def tense(row):
    """Return the verb tense from a morphology code.

    Matches a tense marker (A/P/F/I/Y/X) preceded by at least three
    characters; codes without one yield the empty string.
    """
    code = str(row)
    table = (
        ('...A.*', 'aorist'),
        ('...P.*', 'present'),
        ('...F.*', 'future'),
        ('...I.*', 'imperfect'),
        ('...Y.*', 'plusquamperfect'),
        ('...X.*', 'perfect'),
    )
    for pattern, tag in table:
        if re.search(pattern, code):
            return tag
    return ''
# In[394]:
# Derive the tense column (named `vt`) from the morphology codes.
featureprep['vt']=featureprep['morphology'].apply(tense)
featureprep.head(20)
# ### Adding Voice `voice`
# In[395]:
def voice(row):
    """Return the verb voice from a morphology code.

    The voice marker (A/M/P) sits in the sixth-to-last position of a
    code at least ten characters long; anything else yields ''.
    """
    code = str(row)
    for pattern, tag in (('....A.....$', 'active'),
                         ('....M.....$', 'middle'),
                         ('....P.....$', 'passive')):
        if re.search(pattern, code):
            return tag
    return ''
# In[396]:
# Derive the voice column from the morphology codes.
featureprep['voice']=featureprep['morphology'].apply(voice)
featureprep.head(20)
# ### Adding Mood `mood`
# In[397]:
def mood(row):
    """Return the verb mood from a morphology code.

    The mood marker sits in the fifth-to-last position of a code at
    least ten characters long; anything else yields the empty string.
    """
    code = str(row)
    table = (
        ('.....I....$', 'indicative'),
        ('.....D....$', 'imperative'),
        ('.....N....$', 'infinitive'),
        ('.....O....$', 'optative'),
        ('.....P....$', 'participle'),
        ('.....S....$', 'subjunctive'),
    )
    for pattern, tag in table:
        if re.search(pattern, code):
            return tag
    return ''
# In[398]:
# Derive the mood column from the morphology codes.
featureprep['mood']=featureprep['morphology'].apply(mood)
featureprep.head(50)
# ### Adding degree `degree`
# In[399]:
def degree(row):
    """Return adjective degree from a morphology code.

    A trailing 'C' marks the comparative, a trailing 'S' the
    superlative; anything else yields the empty string.
    """
    code = str(row)
    for pattern, tag in (('C$', 'comparative'), ('S$', 'superlative')):
        if re.search(pattern, code):
            return tag
    return ''
# In[400]:
# BUG FIX: the original called `.apply(propernoun)`, but no function named
# `propernoun` exists anywhere in this file (it would raise NameError).
# This section builds the `degree` feature, so apply the `degree` function
# defined in the previous cell, matching every other feature cell.
featureprep['degree']=featureprep['morphology'].apply(degree)
featureprep.head(50)
# ### Exporting process
# In[401]:
featureprep.head()
# #### reorder
# sorting first...
# In[402]:
featureprep.sort_values(['orig_order'], ascending=True).head(10)
# #### to excel spreadsheet...
# In[403]:
featureprep.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/morphology.xlsx', index=None)
# In[404]:
# export single features into tf files
# NOTE(review): `index=None` is passed where pandas documents a bool; it is
# treated as falsy (no index column) by the versions this notebook targets —
# confirm with the installed pandas version.
featureprep['sp'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/sp2.tf', index=None)
featureprep['gn'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/gn2.tf', index=None)
featureprep['nu'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/nu.tf', index=None)
featureprep['ps'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ps2.tf', index=None)
featureprep['case'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/case.tf', index=None)
featureprep['vt'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/vt.tf', index=None)
featureprep['voice'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/voice.tf', index=None)
featureprep['mood'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/mood.tf', index=None)
featureprep['degree'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/degree.tf', index=None)
# # Transliteration
# In[4]:
# One lemma per line, no header; corpus order.
translitadd=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
translitadd.head(10)
# In[5]:
# Copy the single unnamed column (0) to a readable column name.
translitadd['lemma']=translitadd[0]
translitadd.head(5)
# In[6]:
translitadd=translitadd[['lemma']]
translitadd.head(5)
# In[7]:
translitadd['orig_order'] = translitadd.index +1
translitadd.head(5)
# In[9]:
# unidecode is a third-party package that ASCII-transliterates Unicode text.
from unidecode import unidecode
# In[10]:
# Quick sanity check of the transliteration on a single Greek word.
s = "βίβλος"
s = unidecode(s)
print(s)
# In[11]:
translitadd['translit'] = translitadd['lemma'].apply(unidecode)
# In[12]:
translitadd.head(5)
# In[413]:
translitadd['translit'].to_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_translit.tf', index=None)
# # ABC dictionary order
#
# In[414]:
# Re-read the lemma list (one lemma per line, no header, corpus order).
ABC1=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
ABC1.head(10)
# In[415]:
# BUG FIX: the original read `ABC1['lemma']=lemma[0]`, but no variable named
# `lemma` exists anywhere in this script (NameError at runtime). Every
# parallel cell (translitadd, frequencyadd, SBLGNTlemmas) copies column 0 of
# its own dataframe, so do the same here.
ABC1['lemma']=ABC1[0]
ABC1.head(5)
# In[416]:
# TF slot numbers are 1-based, hence index + 1.
ABC1['orig_order'] = ABC1.index +1
ABC1.head(5)
# In[417]:
ABC1=ABC1[['orig_order','lemma']]
ABC1.head(5)
# In[418]:
ABC1.describe()
# In[419]:
# Build the dictionary: unique lemmas, sorted (plain string sort).
ABCdict = ABC1.drop_duplicates(['lemma']).sort_values(by='lemma', ascending=[True])
ABCdict.head(10)
# In[420]:
ABCdict.describe()
# In[421]:
# NOTE(review): the `encoding` keyword of `to_excel` was deprecated and later
# removed from pandas — confirm it is accepted by the installed version.
ABC1.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ABC1order.xlsx', encoding='utf-8')
# Now I am ordering the words alphabetically with LibreOffice since I cannot do that in pandas (yet?).
#
# In[422]:
ABC2=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/ABC2order.xlsx')
pd.set_option('display.max_columns', 50)
ABC2.head(10)
# Now we merge the ABCorder dataframe with the original lemma DF.
# In[423]:
# Outer merge on the lemma keeps rows from both sides; both frames carry an
# `orig_order` column, so pandas suffixes them as orig_order_x / orig_order_y.
lemma_ABC=pd.merge (ABC1, ABC2,
on='lemma',
how='outer')
lemma_ABC.head(5)
# In[424]:
lemma_ABC.describe()
# In[425]:
lemma_ABC.sort_values(['orig_order_x'], ascending=True).head(10)
# In[426]:
lemma_ABC.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_abc.xlsx')
# # Word Frequency
# In[427]:
# Re-read the lemma list (one lemma per line, no header, corpus order).
frequencyadd=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
frequencyadd.head(20)
# In[428]:
frequencyadd['orig_order'] = frequencyadd.index +1
frequencyadd['lemma']=frequencyadd[0]
frequencyadd=frequencyadd[['orig_order','lemma']]
frequencyadd.head(5)
# In[429]:
# groupby + transform broadcasts each lemma's occurrence count back onto
# every row of that lemma, keeping the original row order.
frequencyadd["freq_lemma"]=frequencyadd.groupby(["lemma"])["lemma"].transform("count")
#("count") is actually utilizing the 'count' function!
frequencyadd.head(20)
# In[430]:
frequencyadd.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_freq.xlsx')
# # English Dictionary
# Let's first load the NA1904 BibleOL dictionary:
# In[431]:
BOLgreekDICT=pd.read_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/NA1904_dictionary_v1.0.xlsx')
pd.set_option('display.max_columns', 50)
BOLgreekDICT.head(20)
# In[432]:
# Keep only the columns needed for the gloss merge.
BOLgreekDICT=BOLgreekDICT[['Lexeme','Lexeme_dict', 'Strong\'s number', 'gloss']]
BOLgreekDICT.head(10)
# In[433]:
BOLgreekDICT.describe()
# Let's load the SBLGNT lemmas
# In[434]:
SBLGNTlemmas=pd.read_csv('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/lemma_copy.tf',header=None, delimiter='\t',encoding='utf-8')
pd.set_option('display.max_columns', 50)
SBLGNTlemmas.head(2)
# In[436]:
SBLGNTlemmas['orig_order']=SBLGNTlemmas.index +1
SBLGNTlemmas['Lexeme']=SBLGNTlemmas[0]
SBLGNTlemmas=SBLGNTlemmas[['orig_order','Lexeme']]
SBLGNTlemmas.head(5)
# In[437]:
SBLGNTlemmas.describe()
# Now let's try a merge of the two files
# In[438]:
# NOTE(review): an outer merge also keeps dictionary-only lexemes, which get
# NaN in `orig_order` — verify this is intended before exporting as a feature.
SBLGNTglosses=pd.merge (SBLGNTlemmas,BOLgreekDICT,
on='Lexeme',
how='outer')
SBLGNTglosses.head(5)
# In[439]:
SBLGNTglosses.describe()
# In[440]:
SBLGNTglosses.head(20)
# In[441]:
SBLGNTglosses.to_excel('d:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/feature-dev/SBLGNTglosses.xlsx')
# In[ ]:
# # Comparing with Tischendorf
# In[442]:
# Load the Tischendorf Text-Fabric app for comparison.
TISCH = use('tisch', hoist=globals())
# In[443]:
Tisch1 = '''
book book=Matthew book_code*
chapter chapter=1
word lex_og*
'''
Tisch1 = TISCH.search(Tisch1)
TISCH.show(Tisch1, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'verse','vrsnum'})
# # Testing SBLGNT
# In[444]:
# This is where `SBLGNT` (used by several earlier cells) is actually defined.
SBLGNT = use('SBLGNT/tf/6.12', hoist=globals())
# In[445]:
# Smoke test: request all the newly developed features (note `vt*`, the
# name the tense feature was saved under). `Search1` is reused here,
# overwriting the variable from cell In[17].
Search1 = '''
book book=Revelation book_code*
chapter chapter=1
sentence
clause
word lemma* morph* normalized_word* case* degree* gn* mood* nu* orig_order* ps* voice* vt* lemma_translit* dict_abc_order* lemma_freq* lemma_dictform* lemma_freq* lemma_gloss* lemma_strongs*
'''
Search1 = SBLGNT.search(Search1)
SBLGNT.show(Search1, start=1, end=1, condensed=True, colorMap={1:'pink'}, extraFeatures={'verse','vrsnum'})
# ### Searching for word combinations
# #### Searching for "the testimony of Jesus"
# As we search for the phrase we want to make sure that "martyria" can have any case and "Iesou" needs to be in the genitive case.
# The operator `<:` determines that `w2` needs to follow `<` `w1` without any word in between `:`.
# In[446]:
TestimonyOfJesus = '''
clause
w1:word lemma=μαρτυρία
w2:word lemma=Ἰησοῦς case=genitive
w1 <: w2
'''
TestimonyOfJesus=SBLGNT.search(TestimonyOfJesus)
SBLGNT.show(TestimonyOfJesus, start=1, end=10, condensed=True, colorMap={2: 'red', 3: 'orange'})
# ## Complex searches
# ### Let's search for correlated conjunctions
# In Col 2:16-17 we find a syntactical construction that involves a series of conjunctions (καί and ἤ). The critical edition of the [NA28](https://ref.ly/logosres/na28?ref=BibleNA27.Col2.18) presents the text in the following way:
# ![image.png](attachment:image.png)
#
# The syntactical structure is differently analyzed by scholars. Below you see a comparison between the [cascadia](https://ref.ly/logosres/csgntsbl?ref=BibleSBLGNT.Col2.16) and [opentext](https://ref.ly/logosres/opentextgraph?ref=BibleNA27.Col2.16) analysis. One needs to keep in mind that the NA edition renders the text slightly differently (see ἤ => καί) than the Mehrheitstext that is followed by the SBL edition (see καί => ἤ):
# ![image-2.png](attachment:image-2.png)
#
# Let's look first at the conjunction sequence that we have in the NA28. Does this appear elsewhere in the SBLGNT text?
#
# In[447]:
# Find a verse containing a word followed (anywhere later, `<` = "comes
# before") by καί, then ἤ, then ἤ.
KaiHeHe ='''
v1:verse
w1:word
w2:word lemma=καί
w3:word lemma=ἤ
w4:word lemma=ἤ
w1 < w2
w2 < w3
w3 < w4
'''
KaiHeHe=SBLGNT.search(KaiHeHe)
SBLGNT.show(KaiHeHe, start=1, end=20, condensed=True, colorMap={2: 'grey', 3: 'red', 4: 'magenta', 5: 'magenta'})
# In[448]:
# Same query tightened with proximity constraints. NOTE(review): per TF
# search syntax, `w1 <10: w2` should mean w2 starts within 10 slots after
# w1 and `v1 =20: w1` constrains the start positions of v1 and w1 to be
# within 20 slots — confirm against the Text-Fabric documentation.
KaiHeHe2 ='''
v1:verse
w1:word
w2:word lemma=καί
w3:word lemma=ἤ
w4:word lemma=ἤ
v1 =20: w1
w1 < w2
w1 <10: w2
w2 < w3
w2 <6: w3
w3 < w4
w3 <6: w4
'''
KaiHeHe2=SBLGNT.search(KaiHeHe2)
SBLGNT.show(KaiHeHe2, start=1, end=200, condensed=True, colorMap={2: 'grey', 3: 'red', 4: 'magenta', 5: 'magenta'})
# No "either ... or" construction in:
# - Mat 6:31 (NA28)
# - Mark 10:29
# - Luk 18:29
# - Rom 8:35
# - 1 Cor 5:11 (it's not the first ἢ but the previous ἐάν that initiates the "either")
# - 1 Cor 14:6 (it's not the first ἢ but the previous ἐάν that initiates the "either")
# - 1 Thes 2:19
# - 1 Peter 4:15
#
# The only exception is Mk 13:35. But the construction is not triggering the "either ... or" function (see also modern translations). It rather hints at a text-critical issue which is also well documented in the text-critical apparatus:
#
# ![image.png](attachment:image.png)
#
# Consequently, the proper translation is "X or Y or Z...".
#
#
# **Conclusion: it is highly unlikely that Col 2:16 resembles an "either ... or" construction. By default if there are three ἢ conjunctions appearing in a sequence they trigger the meaning "or ... or".**
#
# The following query does not exclude a preceding καὶ:
# In[449]:
# Three ἤ conjunctions in sequence within one verse.
HeHeHe ='''
verse
word lemma=ἤ
< word lemma=ἤ
< word lemma=ἤ
'''
HeHeHe=SBLGNT.search(HeHeHe)
SBLGNT.show(HeHeHe, start=1, end=10, condensed=True, colorMap={2: 'magenta', 3: 'magenta', 4: 'magenta'})
# # Producing some graphical representations
# Since we will be using the python based visualization tools we need to first export our TF search results as a table and then load that table as a pandas dataframe.
# ## The distribution of Jesus as subject (nominative case) in the NT
# In[450]:
JesusInNT = '''
book
clause
word lemma=Ἰησοῦς case=nominative
'''
JesusInNT=SBLGNT.search(JesusInNT)
SBLGNT.show(JesusInNT, start=1, end=3, condensed=True, colorMap={2: 'gold'})
# In[451]:
# Export the hits to a TSV file for the pandas-based plots below.
SBLGNT.export(JesusInNT, toDir='D:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/', toFile='JesusInNT.tsv')
# In[452]:
# Re-import the exported search results; the export is read back as
# UTF-16 encoded TSV.
JesusInNT=pd.read_csv('D:/OneDrive/1200_AUS-research/Fabric-TEXT/0_data_SBLGNT/JesusInNT.tsv',delimiter='\t',encoding='utf-16')
JesusInNT.head()
# In[453]:
figure(num=None, figsize=(5, 5), dpi=80, facecolor='w', edgecolor='k')
# Count hits per value of column S1 and plot as a horizontal bar chart.
JesusInNT.groupby("S1").size().sort_values(ascending=True).plot.barh()
plt.xlabel('occurrence of Jesus as subject')  # BUG FIX: label typo 'occurence'
plt.ylabel('NT books')
plt.title('Jesus as subject (case=nominative)')
plt.show()
# In[454]:
# Same counts as a pie chart (unsorted slice order).
JesusInNT.S1.value_counts(sort=False).plot.pie(autopct='%1.0f%%', shadow=True, startangle=90)
plt.show()
# In[ ]:
# In[ ]:
# In[ ]: