#!/usr/bin/env python
# coding: utf-8
# # Various text formats (Nestle1904LFT)
# ## Table of content
# * 1 - Introduction
# * 2 - Load Text-Fabric app and data
# * 3 - Performing the queries
# * 3.1 - Display the formatting options available for this corpus
# * 3.2 - Showcasing the various formats
# * 3.3 - Normalized text
# * 3.4 - Unaccented text
# * 3.5 - Transliterated text
# * 3.6 - Text with text critical markers
# * 3.7 - Nestle version 1904 and version 1913 (Mark 1:1)
#
# # 1 - Introduction
# ##### [Back to TOC](#TOC)
#
# This Jupyter Notebook is designed to demonstrate the predefined text formats available in this Text-Fabric dataset, specifically focusing on displaying the Greek surface text of the New Testament.
# # 2 - Load Text-Fabric app and data
# ##### [Back to TOC](#TOC)
# In[1]:
# Enable the IPython autoreload extension so edited Python modules are
# re-imported automatically before each cell executes (mode '2' reloads
# all modules, not only those registered with %aimport).
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
# In[2]:
# Loading the Text-Fabric code
# Note: it is assumed Text-Fabric is installed in your environment
from tf.fabric import Fabric
from tf.app import use
# In[3]:
# load the N1904 app and data
# 'hoist=globals()' injects the standard Text-Fabric handles (F, T, L, ...)
# into this notebook's global namespace; later cells rely on F and T.
N1904 = use ("tonyjurg/Nestle1904LFT", version="0.7", hoist=globals())
# In[4]:
# The following will push the Text-Fabric stylesheet to this notebook (to facilitate proper display with notebook viewer)
N1904.dh(N1904.getCss())
# In[5]:
# Set default view in a way to limit noise as much as possible.
# condensed=True groups query results per container node; the two False
# flags suppress extra feature annotations in displayed results.
N1904.displaySetup(condensed=True, multiFeatures=False, queryFeatures=False)
# # 3 - Performing the queries
# ##### [Back to TOC](#TOC)
# ## 3.1 - Display the formatting options available for this corpus
# ##### [Back to TOC](#TOC)
#
# The output of the following command provides details on available formats to present the text of the corpus.
#
# See also [module tf.advanced.options
# Display Settings](https://annotation.github.io/text-fabric/tf/advanced/options.html).
# In[7]:
# Display the predefined text formats (name plus template) for this corpus
N1904.showFormats()
# Note 1: This data originates from the file `otext.tf`:
#
# >
# ```
# @config
# ...
# @fmt:text-orig-full={word}{after}
# ...
# ```
#
# Note 2: The names of the available formats can also be obtained by using the following call. However, this will not display the features that are included in the format. The function will return a list of ordered tuples that can easily be postprocessed:
# In[8]:
# Retrieve only the names of the available formats (without the feature
# templates) — convenient for programmatic postprocessing
T.formats
# ## 3.2 - Showcasing the various formats
# ##### [Back to TOC](#TOC)
#
# The following will show the differences between the displayed text for the various formats. The verse to be printed is from Mark 1:1. The associated verse node is 139200.
# In[9]:
# Render the same verse (Mark 1:1, verse node 139200) in each available
# text format so the differences can be compared side by side.
for textFormat in T.formats:
    print(f'fmt={textFormat}\t: {T.text(139200,textFormat)}')
# ## 3.3 - Normalized text
# ##### [Back to TOC](#TOC)
#
# The normalized Greek text refers to a standardized and consistent representation of Greek characters and linguistic elements in a text. Using normalized text ensures a consistent presentation, which, in turn, allows for easier postprocessing. The relevance of normalized text becomes evident through the following demonstration.
#
# In the upcoming code segment, a list will be created to display the top 10 differences in values between the "word" feature and the "normalized" feature on the same word node.
# In[10]:
# Library to format table
from tabulate import tabulate
# Counter replaces the hand-rolled frequency bookkeeping (stdlib)
from collections import Counter

# get a node list for all word nodes
WordQuery = '''
word
'''
# The option 'silent=True' has been added in the next line to prevent printing the number of nodes found
WordResult = N1904.search(WordQuery, silent=True)

# Tally every case where feature 'normalized' differs from feature 'word'
ResultCounter = Counter()
NumberOfChanges = 0
for result in WordResult:  # 'result' avoids shadowing the builtin 'tuple'
    node = result[0]
    word = F.word.v(node)
    normalized = F.normalized.v(node)
    if word != normalized:
        NumberOfChanges += 1
        ResultCounter[f"{word} -> {normalized}"] += 1
print(f"{NumberOfChanges} differences found between feature word and feature normalized.")

# most_common() yields (change, frequency) pairs sorted by descending frequency
# In this example the table will be truncated
max_rows = 10  # Set your desired number of rows here
TruncatedTable = ResultCounter.most_common(max_rows)
# Produce the table
headers = ["word -> normalized", "frequency"]
print(tabulate(TruncatedTable, headers=headers, tablefmt='fancy_grid'))
# Add a warning using markdown (API call N1904.dm) allowing it to be printed in bold type
N1904.dm("**Warning: table truncated!**")
# Now, it would be interesting to check whether καί and δέ already exist (with these accents) in the feature "word."
# In[11]:
# One template serves both lookups; formatting it with the accented form
# yields exactly the same query text as writing each query out by hand.
AccentQueryTemplate = '''
word word={}
'''
# get a node list for all word nodes with feature word=καί
KaiResult = N1904.search(AccentQueryTemplate.format('καί'))
# get a node list for all word nodes with feature word=δέ
DeResult = N1904.search(AccentQueryTemplate.format('δέ'))
# This demonstrates the presence of variant accents for καί and δέ in the feature word. Consequently, constructing queries based on a single accent variant would result in the omission of certain results.
# ## 3.4 - Unaccented text
# ##### [Back to TOC](#TOC)
# A similar case can be made regarding postprocessing with respect to the unaccented text; however, accents do play a significant role in understanding some Greek words (homographs). It is important to realize that the accents were not part of the original text, which was in unaccented capital letters (uncials) without spaces between words.
# In[12]:
# get a node list for all word nodes containing some variants in accents
# NOTE: this query was previously named 'KosmosQuery', but its regular
# expression matches λόγος/λογός (logos), not κόσμος — renamed accordingly.
LogosQuery = '''
word word~λ[όο]γ[όο]ς
'''
PneumaQuery = '''
word word~πν[εέ][ῦυ]μα
'''
KuriosQuery = '''
word word~κ[ύυ]ρ[ίι]ος
'''
HemeraQuery = '''
word word~ἡμ[έε]ρα
'''
# Run one of the queries and print every matching surface form
Result = N1904.search(KuriosQuery)
for result in Result:  # 'result' avoids shadowing the builtin 'tuple'
    word = F.word.v(result[0])
    print(word)
# ## 3.5 - Transliterated text
# ##### [Back to TOC](#TOC)
# Using transliterated text can be convenient when creating queries, as it allows you to use your normal keyboard without the need to include Greek characters. See the following example:
# In[13]:
# Query on the transliterated feature — no Greek keyboard needed
LatinQuery = '''
word wordtranslit=logos
'''
Result = N1904.search(LatinQuery)
# Print only the first match; 'break' stops after one example
for result in Result:  # 'result' avoids shadowing the builtin 'tuple'
    word = F.word.v(result[0])
    print(word)
    break
# ## 3.6 - Text with text critical markers
# ##### [Back to TOC](#TOC)
#
# A limited number of critical markers are included in the dataset, stored in the features "markbefore" and "markafter." To get an impression of their quantity:
# In[14]:
# Frequency list of the critical marks stored AFTER a word
F.markafter.freqList()
# In[15]:
# Frequency list of the critical marks stored BEFORE a word
F.markbefore.freqList()
# A quick investigation was conducted to check the dataset's consistency. Note that an automated check for '—' is not implemented below, as it is difficult to determine whether this marker indicates a start or an end.
# In[16]:
# get a node list for all word nodes
WordQuery = '''
word
'''
# The option 'silent=True' has been added in the next line to prevent printing the number of nodes found
WordResult = N1904.search(WordQuery, silent=True)

# Map each bracket mark to (state flag, new state, message). Closing marks
# carry a trailing '\n' so print() emits a blank line after each closed
# pair, matching the original report layout.
BracketActions = {
    "(":  ("round",  True,  "set single round"),
    ")":  ("round",  False, "unset single round\n"),
    "[":  ("square", True,  "set single square"),
    "]":  ("square", False, "unset single square\n"),
    "[[": ("double", True,  "set double square"),
    "]]": ("double", False, "unset double square\n"),
}
# Current open/closed state per bracket type
OpenState = {"round": False, "square": False, "double": False}

for result in WordResult:  # 'result' avoids shadowing the builtin 'tuple'
    node = result[0]
    # A word carries its mark in either 'markafter' or 'markbefore';
    # concatenating both yields the mark (or an empty string).
    Mark = F.markafter.v(node) + F.markbefore.v(node)
    if Mark in BracketActions:
        flag, NewState, message = BracketActions[Mark]
        # Only compute the location for nodes that actually carry a mark
        location = "{} {}:{}".format(F.book.v(node), F.chapter.v(node), F.verse.v(node))
        # Opening an already-open bracket (or closing a closed one)
        # signals an unbalanced sequence in the dataset
        if OpenState[flag] == NewState:
            print("Sequence problem?")
        OpenState[flag] = NewState
        print(f"{location}: {message}")
# ## 3.7 - Nestle version 1904 and version 1913 (Mark 1:1)
# ##### [Back to TOC](#TOC)
# The dataset seems to be (also) compiled based upon the Nestle version of 1913, as explained on [https://sites.google.com/site/nestle1904/faq]:
#
# > *What are your sources?*
# > For the text, I used the scanned books available at the Internet Archive (The first edition of 1904, and a reprinting from 1913 – the latter one has a better quality).
#
# Print Mark 1:1 from Text-Fabric data:
# In[17]:
# Render Mark 1:1 (verse node 139200) using the format that includes the
# text-critical markers
T.text(139200,fmt='text-critical')
# The result can be verified by examining the scans of the following printed versions:
# * Nestle version 1904: [@ archive.org](https://archive.org/details/the-greek-new-testament-nestle-1904-us-edition/page/84/mode/2up)
# * Nestle version 1913: [@ archive.org](https://archive.org/details/hkainediathekete00lond/page/88/mode/1up)
#
# Or, in an image, placed side by side:
#