#!/usr/bin/env python
# coding: utf-8
# # Identifying use of critical signs in the text (N1904LFT)
# ## Table of contents
# * 1 - Introduction
# * 2 - Load Text-Fabric app and data
# * 3 - Performing the queries
# * 3.1 - Getting an overview of leading critical signs
# * 3.2 - Query for all words that contain some critical marks
# * 3.3 - Collect critical marks before and after word
# * 3.4 - Comparing with the print edition
# * 3.5 - Frequency of mark order
# # 1 - Introduction
# ##### [Back to TOC](#TOC)
#
# This Jupyter Notebook investigates the presence of 'odd' values for the feature 'after'.
# # 2 - Load Text-Fabric app and data
# ##### [Back to TOC](#TOC)
# In[1]:
# IPython magics: load the autoreload extension and set mode '2' so every
# imported module is re-imported before each cell runs (convenient while
# the underlying Text-Fabric code is being edited).
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# In[1]:
# Loading the New Testament TextFabric code
# Note: it is assumed Text-Fabric is installed in your environment.
from collections import Counter

from tf.fabric import Fabric
from tf.app import use
# In[2]:
# load the app and data
# Load the Nestle1904LFT Text-Fabric app and data, pinned to version 0.5.
# hoist=globals() injects the TF API handles (F, T, L, ...) into the
# notebook's global namespace; the rest of this script relies on F.
N1904 = use ("tonyjurg/Nestle1904LFT", version="0.5", hoist=globals())
# # 3 - Performing the queries
# ##### [Back to TOC](#TOC)
# ## 3.1 - Getting an overview of leading critical signs
# ##### [Back to TOC](#TOC)
# First get a list of all unique words in unicode (including punctuations, critical signs and trailing spaces):
# In[3]:
# Frequency list over the 'unicode' feature: one entry per distinct
# surface form (word plus punctuation, critical signs, trailing space).
unicodeList = F.unicode.freqList()
print(f'Number of results: {len(unicodeList)}')
# Now just look at the first character:
# In[4]:
# Initialize an empty dictionary to store the frequencies
# Tally how often each character occurs as the *first* character of a
# distinct surface form, and separately tally the leading critical signs.
# collections.Counter replaces the hand-rolled dict.get(..., 0) + 1 idiom;
# the former dict variables were also misleadingly named "...List".
firstCharCounts = Counter()
criticalSignCounts = Counter()
criticalSigns = {"—", "[", "("}
# unicodeList holds (surface form, frequency) tuples, so each distinct
# form is counted once regardless of its corpus frequency.
for item in unicodeList:
    firstChar = item[0][0]
    firstCharCounts[firstChar] += 1
    if firstChar in criticalSigns:
        criticalSignCounts[firstChar] += 1
# Print as plain dicts to keep the original output format (Counter's own
# repr would differ); insertion order matches the original loop order.
print("Frequency list of all first character:")
print(dict(firstCharCounts))
print("\nFrequency list of critical character:")
print(dict(criticalSignCounts))
# ## 3.2 - Query for all words that contain some critical marks
# ##### [Back to TOC](#TOC)
# In[53]:
# Library to format table
from tabulate import tabulate

# Query for word nodes whose 'word' feature matches one of the critical
# marks ( [ — ) ] . The regex character class is kept exactly as-is.
SearchCriticalMarks = '''
word word~[(\(\[—\)\])]
'''
MarksList = N1904.search(SearchCriticalMarks)

# Postprocess the query results into printable rows.
# The loop variable was renamed: it previously shadowed the builtin 'tuple'.
Results = []
for queryResult in MarksList:
    node = queryResult[0]
    location = "{} {}:{}".format(F.book.v(node), F.chapter.v(node), F.verse.v(node))
    result = (location, F.unicode.v(node), F.word.v(node), F.after.v(node))
    Results.append(result)

# Produce the table
headers = ["location", "unicode", "word", "after"]
print(tabulate(Results, headers=headers, tablefmt='fancy_grid'))
# Note: The following site can be used to build and verify a regular expression: [regex101.com](https://regex101.com/) (choose the 'Python' flavor)
# ## 3.3 - Collect critical marks before and after word
# ##### [Back to TOC](#TOC)
# In[5]:
# Library to format table
from tabulate import tabulate

# Translation tables strip critical marks resp. punctuation characters in
# a single pass via str.translate.
criticalMarkCharacters = "[]()—"
punctuationCharacters = ",.;·"
translationTableMarkers = str.maketrans("", "", criticalMarkCharacters)
translationTablePunctuations = str.maketrans("", "", punctuationCharacters)
punctuations = ('.', ',', ';', '·')

# Query for word nodes whose full surface form (feature 'unicode')
# contains one of the critical marks ( [ — ) ] .
SearchCriticalMarkers = '''
word unicode~[(\(\[—\)\])]
'''
MarksList = N1904.search(SearchCriticalMarkers)

# Postprocess: split each raw surface form into the bare word, any mark
# before/after it, and trailing punctuation, then classify the relative
# order of punctuation and mark.
# (Loop variable renamed: it previously shadowed the builtin 'tuple'.)
Results = []
for queryResult in MarksList:
    node = queryResult[0]
    location = "{} {}:{}".format(F.book.v(node), F.chapter.v(node), F.verse.v(node))
    rawWord = F.unicode.v(node)
    cleanWord = rawWord.translate(translationTableMarkers)                    # marks removed
    rawWithoutPunctuations = rawWord.translate(translationTablePunctuations)  # punctuation removed
    PunctuationMarkOrder = "No mark"
    # Initialized up front so both are always defined when the row is
    # built below; previously they were only set inside the 'if', which
    # could raise NameError for a match without a mark outside punctuation.
    markAfter = markBefore = ''
    # Split off a single trailing punctuation character, if present.
    if cleanWord[-1] in punctuations:
        punctuation = cleanWord[-1]
        after = punctuation + ' '
        word = cleanWord[:-1]
    else:
        after = ' '
        word = cleanWord
        punctuation = ''
    if rawWithoutPunctuations != word:
        if rawWord.find(word) == 0:
            # The bare word starts the raw string: mark comes after it.
            markAfter = rawWithoutPunctuations.replace(word, "")
            if punctuation != '':
                if rawWord.find(markAfter) - rawWord.find(punctuation) > 0:
                    PunctuationMarkOrder = "(-1) punct. before mark."
                else:
                    PunctuationMarkOrder = "(1) punct. after mark."
            else:
                PunctuationMarkOrder = "(0) no punctuation, mark after word"
        else:
            markBefore = rawWithoutPunctuations.replace(word, "")
            PunctuationMarkOrder = "(na) mark is before word"
    # repr() makes whitespace (space, tab, newline) explicitly visible.
    result = (location, repr(rawWord), repr(markBefore), repr(word),
              repr(markAfter), repr(after), PunctuationMarkOrder)
    Results.append(result)

# Produce the table
headers = ["location", "rawWord", "markBefore", "word", "markAfter", "after", "punct. mark. order"]
print(tabulate(Results, headers=headers, tablefmt='fancy_grid'))
# ## 3.4 - Comparing with the print edition
# ##### [Back to TOC](#TOC)
#
# Some selections from the Nestle print edition @ [archive.org](https://archive.org/details/the-greek-new-testament-nestle-1904-us-edition):
#
#
# **Mark 7:2-4:**
#
#
#
# **Luke 2:35-36:**
#
#
#
# **Luke 2:35-36:**
#
#
#
# **John 10:12-13:**
#
#
#
# **2 Cor 12:2:**
#
#
#
#
#
#
# ## 3.5 - Frequency of mark order
# ##### [Back to TOC](#TOC)
# In[6]:
# Frequency list of the 'markorder' feature over all word nodes; the
# meaning of each code value is tabulated in the comment cell below.
F.markorder.freqList()
# Put into a table:
#
# markorder | Description | Frequency
# --- | --- | ---
# ` ` | No critical marks | 137694
# `0` | Mark is before word | 34
# `1` | Mark is after word, no punctuations after word | 9
# `2` | Mark is after word, punctuations is after mark | 10
# `3` | Mark is after word, punctuations is before mark | 32
# In[ ]: