#!/usr/bin/env python
# coding: utf-8

# # Identifying use of critical signs in the text (N1904LFT)

# ## Table of content
# * 1 - Introduction
# * 2 - Load Text-Fabric app and data
# * 3 - Performing the queries
#   * 3.1 - Getting an overview of leading critical signs
#   * 3.2 - Query for all words that contain some critical marks
#   * 3.3 - Collect critical marks before and after word
#   * 3.4 - Comparing with print edition
#   * 3.5 - Frequency of markorder

# # 1 - Introduction
# ##### [Back to TOC](#TOC)
#
# This Jupyter Notebook investigates the presence of 'odd' values for feature 'after'.

# # 2 - Load Text-Fabric app and data
# ##### [Back to TOC](#TOC)

# In[1]:

get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# In[1]:

# Loading the New Testament TextFabric code
# Note: it is assumed Text-Fabric is installed in your environment.
from tf.fabric import Fabric
from tf.app import use

# In[2]:

# Load the app and data; hoist=globals() injects the feature API (F, L, T, ...)
# into the module namespace, which is why bare `F` is usable below.
N1904 = use("tonyjurg/Nestle1904LFT", version="0.5", hoist=globals())

# # 3 - Performing the queries
# ##### [Back to TOC](#TOC)

# ## 3.1 - Getting an overview of leading critical signs
# ##### [Back to TOC](#TOC)
#
# First get a list of all unique words in unicode (including punctuations,
# critical signs and trailing spaces):

# In[3]:

unicodeList = F.unicode.freqList()
print('Number of results:', len(unicodeList))

# Now just look at the first character:

# In[4]:

# Frequency of each distinct first character over all unique surface forms,
# plus a separate tally restricted to the leading critical signs we care about.
frequencyList = {}
criticalSignsList = {}
criticalSigns = {"—", "[", "("}

# unicodeList is a list of (value, frequency) tuples; item[0][0] is the
# first character of the surface form.
for item in unicodeList:
    firstChar = item[0][0]
    # Update the frequency in the dictionary for the full list
    frequencyList[firstChar] = frequencyList.get(firstChar, 0) + 1
    # Add to the second tally only if the first character is a critical sign
    if firstChar in criticalSigns:
        criticalSignsList[firstChar] = criticalSignsList.get(firstChar, 0) + 1

print("Frequency list of all first character:")
print(frequencyList)
print("\nFrequency list of critical character:")
print(criticalSignsList)

# ## 3.2 - Query for all words that contain some critical marks
# ##### [Back to TOC](#TOC)

# In[53]:

# Library to format table
from tabulate import tabulate

# The actual query: match word nodes whose `word` feature contains any of
# the critical-mark characters ( ( [ — ) ] ).
# Raw string so the backslash escapes reach the regex engine unchanged.
SearchCriticalMarks = r'''
word word~[(\(\[—\)\])]
'''
MarksList = N1904.search(SearchCriticalMarks)

# Postprocess the query results
Results = []
for resultTuple in MarksList:        # each result is a 1-tuple holding the word node
    node = resultTuple[0]
    location = "{} {}:{}".format(F.book.v(node), F.chapter.v(node), F.verse.v(node))
    result = (location, F.unicode.v(node), F.word.v(node), F.after.v(node))
    Results.append(result)

# Produce the table
headers = ["location", "unicode", "word", "after"]
print(tabulate(Results, headers=headers, tablefmt='fancy_grid'))

# Note: The following site can be used to build and verify a regular expression:
# [regex101.com](https://regex101.com/) (choose the 'Python flavor')

# ## 3.3 - Collect critical marks before and after word
# ##### [Back to TOC](#TOC)

# In[5]:

# Library to format table
from tabulate import tabulate

# Translation tables to strip unwanted characters in a single C-level pass.
criticalMarkCharacters = "[]()—"
punctuationCharacters = ",.;·"
translationTableMarkers = str.maketrans("", "", criticalMarkCharacters)
translationTablePunctuations = str.maketrans("", "", punctuationCharacters)
punctuations = ('.', ',', ';', '·')

# Query for words whose `unicode` feature (surface form) contains critical markers.
SearchCriticalMarkers = r'''
word unicode~[(\(\[—\)\])]
'''
MarksList = N1904.search(SearchCriticalMarkers)

# Postprocess the query results: for each hit, separate the bare word from
# any critical mark before/after it and any trailing punctuation, and
# classify the relative order of punctuation and mark.
Results = []
for resultTuple in MarksList:
    node = resultTuple[0]
    location = "{} {}:{}".format(F.book.v(node), F.chapter.v(node), F.verse.v(node))
    rawWord = F.unicode.v(node)
    cleanWord = rawWord.translate(translationTableMarkers)        # marks removed
    rawWithoutPunctuations = rawWord.translate(translationTablePunctuations)  # punctuation removed
    PunctuationMarkOrder = "No mark"
    # Initialize per item so values cannot leak over from a previous
    # iteration and the result tuple below is always well-defined.
    markAfter = markBefore = ''
    # Guard against a token consisting solely of critical marks
    # (cleanWord would be empty and cleanWord[-1] would raise IndexError).
    if cleanWord and cleanWord[-1] in punctuations:
        punctuation = cleanWord[-1]
        after = punctuation + ' '
        word = cleanWord[:-1]
    else:
        after = ' '
        word = cleanWord
        punctuation = ''
    if rawWithoutPunctuations != word:
        if rawWord.find(word) == 0:
            # Word starts the raw form, so the mark must follow it.
            markAfter = rawWithoutPunctuations.replace(word, "")
            if punctuation != '':
                if rawWord.find(markAfter) - rawWord.find(punctuation) > 0:
                    PunctuationMarkOrder = "(-1) punct. before mark."
                else:
                    PunctuationMarkOrder = "(1) punct. after mark."
            else:
                PunctuationMarkOrder = "(0) no punctuation, mark after word"
        else:
            # Word does not start the raw form, so the mark precedes it.
            markBefore = rawWithoutPunctuations.replace(word, "")
            PunctuationMarkOrder = "(na) mark is before word"
    # Built-in Python function repr() explicitly shows spaces
    # (incl. whitespace characters like space, tab, and newline).
    result = (location, repr(rawWord), repr(markBefore), repr(word),
              repr(markAfter), repr(after), PunctuationMarkOrder)
    Results.append(result)

# Produce the table
headers = ["location", "rawWord", "markBefore", "word", "markAfter", "after", "punct. mark. order"]
print(tabulate(Results, headers=headers, tablefmt='fancy_grid'))

# ## 3.4 - Comparing with the print edition
# ##### [Back to TOC](#TOC)
#
# Some selections from the Nestle print edition @
# [archive.org](https://archive.org/details/the-greek-new-testament-nestle-1904-us-edition):
#
# **Mark 7:2-4:**
#
# **Luke 2:35-36:**
#
# **Luke 2:35-36:**
#
# **John 10:12-13:**
#
# **2 Cor 12:2:**

# ## 3.5 - Frequency of markorder
# ##### [Back to TOC](#TOC)

# In[6]:

F.markorder.freqList()

# Put into a table:
#
# markorder | Description | Frequency
# --- | --- | ---
# ` ` | No critical marks | 137694
# `0` | Mark is before word | 34
# `1` | Mark is after word, no punctuations after word | 9
# `2` | Mark is after word, punctuations is after mark | 10
# `3` | Mark is after word, punctuations is before mark | 32

# In[ ]: