#!/usr/bin/env python
# coding: utf-8

# Identifying use of critical signs in the text (N1904GBI)

# 1 - Introduction

This Jupyter Notebook investigates the presence of 'odd' values for feature 'after'.

# 2 - Load Text-Fabric app and data

# Activate IPython's autoreload extension and switch it to mode '2'.
# Both magics are issued through the same interactive-shell handle.
_shell = get_ipython()
_shell.run_line_magic('load_ext', 'autoreload')
_shell.run_line_magic('autoreload', '2')

# Loading the New Testament TextFabric code
# Note: it is assumed Text-Fabric is installed in your environment.
from tf.fabric import Fabric
from tf.app import use

# Load the Text-Fabric app together with version 0.4 of the N1904GBI data.
# hoist=globals() hands this namespace to the app so that API names used
# further down (such as `F`) are available without a prefix.
N1904 = use(
    "tonyjurg/Nestle1904GBI",
    version="0.4",
    hoist=globals(),
)

# 3 - Performing the queries

## 3.1 - Getting an overview of leading critical signs

First, get a list of all unique words in Unicode (including punctuation, critical signs and trailing spaces):

# Frequency list of the `word` feature: one entry per distinct value.
unicodeList = F.word.freqList()
distinctCount = len(unicodeList)
print('Number of results:', distinctCount)

Now just look at the first character:

# Characters regarded as critical signs when they open a word form
criticalSigns = {"—", "[", "("}

# Tallies keyed by first character:
#   frequencyList     - every first character encountered
#   criticalSignsList - only the first characters found in criticalSigns
frequencyList = {}
criticalSignsList = {}

# Each entry of unicodeList is a tuple whose first element is the word form.
# Every entry adds exactly one to the tally, so the numbers count distinct
# word forms per first character, not occurrences in the text.
for entry in unicodeList:
    wordForm = entry[0]
    initial = wordForm[0]

    if initial not in frequencyList:
        frequencyList[initial] = 0
    frequencyList[initial] += 1

    if initial not in criticalSigns:
        continue
    if initial not in criticalSignsList:
        criticalSignsList[initial] = 0
    criticalSignsList[initial] += 1

print("Frequency list of all first character:")
print(frequencyList)
print("\nFrequency list of critical character:")
print(criticalSignsList)

## 3.2 - Query for all words that contain some critical marks

# Library to format table
from tabulate import tabulate

# The actual query: every word node whose `word` feature matches the regular
# expression, i.e. contains one of the characters ( [ — ) ].
# A raw string is used so the regex backslashes are not read as (invalid)
# Python escape sequences; the resulting template text is unchanged.
SearchLeadingCriticalMarks = r'''
word
  word~[(\(\[—\)\])]
'''
MarksList = N1904.search(SearchLeadingCriticalMarks)

# Postprocess the query results into (location, word, after) rows.
# Each result is a tuple whose first element is the matched word node.
Results = []
for match in MarksList:
    node = match[0]
    location = "{} {}:{}".format(F.book.v(node), F.chapter.v(node), F.verse.v(node))
    Results.append((location, F.word.v(node), F.after.v(node)))

# Produce the table; one header per element of the row tuples built above,
# so that every column is labelled with the value it actually holds.
headers = ["location", "word", "after"]
print(tabulate(Results, headers=headers, tablefmt='fancy_grid'))

Note: The following site can be used to build and verify a regular expression: [regex101.com](https://regex101.com/) (choose the 'Python' flavor).