#!/usr/bin/env python
# coding: utf-8

# # Identifying use of critical signs in the text (N1904GBI)

# ## Table of contents
# * 1 - Introduction
# * 2 - Load Text-Fabric app and data
# * 3 - Performing the queries
#     * 3.1 - Getting an overview of leading critical signs
#     * 3.2 - Query for all words that contain some critical marks

# # 1 - Introduction
# ##### [Back to TOC](#TOC)
#
# This Jupyter Notebook investigates the presence of critical signs in the text, which also show up as 'odd' values for the feature 'after'.

# # 2 - Load Text-Fabric app and data
# ##### [Back to TOC](#TOC)

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[1]:


# Load the New Testament Text-Fabric code
# Note: it is assumed Text-Fabric is installed in your environment.
from tf.fabric import Fabric
from tf.app import use


# In[2]:


# Load the app and data
N1904 = use("tonyjurg/Nestle1904GBI", version="0.4", hoist=globals())


# # 3 - Performing the queries
# ##### [Back to TOC](#TOC)

# ## 3.1 - Getting an overview of leading critical signs
# ##### [Back to TOC](#TOC)

# First get a frequency list of all unique word forms in Unicode (including punctuation, critical signs and trailing spaces):

# In[4]:


unicodeList = F.word.freqList()
print('Number of results:', len(unicodeList))


# Now look only at the first character of each word form:

# In[8]:


# Initialize empty dictionaries to store the frequencies
frequencyList = {}
criticalSignsList = {}
criticalSigns = {"—", "[", "("}

# Iterate through the list (which is a list of ordered tuples)
for item in unicodeList:
    # Get the first character of the word form
    firstChar = item[0][0]
    # Update the frequency in the dictionary for the full list
    frequencyList[firstChar] = frequencyList.get(firstChar, 0) + 1
    # Also count it separately if it is a critical sign
    if firstChar in criticalSigns:
        criticalSignsList[firstChar] = criticalSignsList.get(firstChar, 0) + 1

print("Frequency list of all first characters:")
print(frequencyList)
print("\nFrequency list of critical sign characters:")
print(criticalSignsList)


# ## 3.2 - Query for all words that contain some critical marks
# ##### [Back to TOC](#TOC)

# In[10]:


# Library to format the result table
from tabulate import tabulate

# The actual query: match any word whose surface form contains one of the critical signs ( [ — ) ]
SearchCriticalMarks = r'''
word word~[(\(\[—\)\])]
'''
MarksList = N1904.search(SearchCriticalMarks)

# Postprocess the query results
Results = []
for item in MarksList:
    node = item[0]
    location = "{} {}:{}".format(F.book.v(node), F.chapter.v(node), F.verse.v(node))
    result = (location, F.word.v(node), F.after.v(node))
    Results.append(result)

# Produce the table
headers = ["location", "word", "after"]
print(tabulate(Results, headers=headers, tablefmt='fancy_grid'))


# Note: the following site can be used to build and verify a regular expression: [regex101.com](https://regex101.com/) (choose the 'Python' flavor).
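
# As a quick sanity check that does not require the Text-Fabric dataset, the sketch below runs the same
# character class with Python's built-in `re` module against a few made-up sample strings (the samples
# are illustrative only and are not taken from the corpus):

# In[ ]:


import re

# The same character class as used in the Text-Fabric query above:
# it matches any of the critical signs ( [ — ) ]
criticalMarkPattern = re.compile(r'[(\[—)\]]')

# Hypothetical sample strings, for illustration only
samples = ['λόγος', '[λόγος]', '(καί', 'θεός—', 'ἀρχῇ']

for sample in samples:
    match = criticalMarkPattern.search(sample)
    print(f"{sample!r}: {'contains a critical sign' if match else 'clean'}")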