#!/usr/bin/env python
# coding: utf-8

# # Vocalisation of the Tetragrammaton (BHSA)

# # 1 - Introduction

# The Old Testament contains the Tetragrammaton יהוה written with different vowels, for example with the vowels of אֲדֹנַי (Adonai, ETCBC transliteration: >:ADON@J).

# # 2 - Load Text-Fabric app and data

# In[1]:


# Load IPython's autoreload extension and switch it to mode 2, so that
# imported modules are reloaded before each cell executes.
# NOTE: get_ipython() only exists when running under IPython/Jupyter.
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[2]:


# Loading the Text-Fabric code
# Note: it is assumed Text-Fabric is installed in your environment.
from tf.fabric import Fabric
from tf.app import use


# In[3]:


# Load the BHSA app and data.
# hoist=globals() injects the Text-Fabric API names into this namespace;
# the cells below rely on that for F, T, Fall and Fs.
BHS = use ("etcbc/BHSA",hoist=globals())


# Note: The feature documentation can be found at [ETCBC GitHub](https://github.com/ETCBC/bhsa/blob/master/docs/features/0_home.md)

# In[4]:


# Push the Text-Fabric stylesheet into this notebook: getCss() supplies the
# CSS and dh() displays it, so that the output also renders properly in a
# notebook viewer.
BHS.dh(BHS.getCss())


# # 3 - Performing the queries

# ## 3.1 - Get overview of all pointed versions

# First get all occurrences of the Tetragrammaton יהוה (so without vowel pointing and other diacritical marks). See also notes on [feature g_word](https://github.com/ETCBC/bhsa/blob/master/docs/features/g_word.md).

# In[5]:


# Search template: every word whose g_cons feature equals JHWH, nested
# inside its verse, chapter and book. Each result is therefore a tuple
# (book, chapter, verse, word); later cells read the word node at index 3.
JHWHQuery = '''
book
  chapter
    verse
      word g_cons=JHWH
'''
# Run the template against the loaded BHSA data
JHWHResults = BHS.search(JHWHQuery)


# Now post process the results to create a nice table.

# In[6]:


# Libraries for table formatting and regular expressions
import re
import pandas as pd
from IPython.display import display

# Tally of every distinct vocalisation, keyed by the digit-free
# transliteration. Each value is a list:
# [frequency, section of the first occurrence, Hebrew form of the first occurrence]
resultDict = {}

for item in JHWHResults:
    # The word node is the fourth member of each result tuple
    node = item[3]

    # Transliterated (g_word) and Hebrew-script (g_word_utf8) forms of this occurrence
    pointedWord = F.g_word.v(node)
    hebrewWord = F.g_word_utf8.v(node)

    # Drop the digits, which represent the cantillation marks in the BHSA transliteration
    vocalizedWord = re.sub(r'\d', '', pointedWord)

    if vocalizedWord not in resultDict:
        # New vocalisation: start counting and remember where it was first seen
        firstOccurrence = T.sectionFromNode(node)
        resultDict[vocalizedWord] = [1, firstOccurrence, hebrewWord]
        continue

    # Known vocalisation: one more occurrence
    resultDict[vocalizedWord][0] += 1

# One row per vocalisation, most frequent first
tableData = pd.DataFrame(
    [[key, *value] for key, value in resultDict.items()],
    columns=["Pointed Word", "Frequency", "First Occurrence", "Hebrew Word"],
).sort_values(by="Frequency", ascending=False)

# Show the resulting table
display(tableData)


# ## 3.2 Plotting the vocalisations of the Tetragrammaton

# In[7]:


import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.layouts import column

# Render Bokeh plots inline in the notebook
output_notebook()

# Pin the column names that the glyph and the tooltips below refer to
tableData.columns = ["Pointed Word", "Frequency", "First Occurrence", "Hebrew Word"]

# Wrap the table so that Bokeh can address its columns by name
source = ColumnDataSource(tableData)

# Categorical x-axis: one category per Hebrew form, in table order
p = figure(
    x_range=tableData['Hebrew Word'].tolist(),
    height=800,
    width=1000,
    title="Frequency of Tetragrammaton vocalisation in biblical text",
    toolbar_location="right",
)

# One vertical bar per vocalisation, as tall as its frequency
p.vbar(x='Hebrew Word', top='Frequency', width=0.5, source=source)

# Axis titles
p.xaxis.axis_label = "Hebrew Word"
p.yaxis.axis_label = "Frequency"

# Large, horizontal tick labels so the Hebrew pointing stays legible
p.xaxis.major_label_orientation = "horizontal"
p.xaxis.major_label_text_font_size = "26pt"

# Tooltip listing all four columns of the hovered bar
hover = HoverTool(
    tooltips=[
        ("Pointed Word", "@{Pointed Word}"),
        ("Frequency", "@Frequency"),
        ("First Occurrence", "@{First Occurrence}"),
        ("Hebrew Word", "@{Hebrew Word}"),
    ]
)
p.add_tools(hover)

# Display the interactive plot
show(p)


# ## 3.3 Some other playing around

# Add another condition to the query. This is to select for the vowels of adOnAi, transliterated as O and @, which should be around the Waw.

# In[8]:


# Search template: words with g_cons equal to JHWH whose g_word matches
# the regular expression O.*W.*@ (an O somewhere before the W and an @
# somewhere after it).
adonaiQuery = '''
word g_cons=JHWH
     g_word~O.*W.*@
'''
# Run the template against the loaded BHSA data
adonaiResults = BHS.search(adonaiQuery)


# In[9]:


# Display the matches as an uncondensed table, additionally requesting the
# voc_lex feature for the displayed words.
BHS.table(adonaiResults, condensed=False, extraFeatures={'voc_lex'})


# In[10]:


# Same vowel pattern as the previous query, but selecting the words by
# lexeme (lex=JHWH/) instead of by consonantal form.
adonaiQuery2 = '''
word lex=JHWH/
     g_word~O.*W.*@
'''
# Run the template against the loaded BHSA data
adonaiResults2 = BHS.search(adonaiQuery2)


# Print the features of a word node that contain data

# In[11]:


# Names of all node features, as returned by Text-Fabric's Fall()
featureList = Fall()

# Only the first search result is inspected; the loop ends after one pass
for item in adonaiResults2:
    # The query has a single element, so the word node sits at index 0
    Node = item[0]
    for feature in featureList:
        # Fs(name) looks a feature up by name; .v(node) gives its value for the node
        featureValue = Fs(feature).v(Node)
        # Skip the features that carry no value for this node
        if featureValue is not None:
            print(feature, '=', featureValue)
    break


# In[12]:


import re
import pandas as pd
from IPython.display import display

# Frequency table of the vocalisations, keyed by the digit-free
# transliteration; each value is [count, first occurrence, Hebrew form]
resultDict = {}

# Walk over all occurrences found by the JHWH query
for item in JHWHResults:
    node = item[3]  # word node: last member of (book, chapter, verse, word)

    # Transliterated and Hebrew-script forms of this occurrence
    pointedWord = F.g_word.v(node)
    hebrewWord = F.g_word_utf8.v(node)

    # Remove cantillations in the BHSA (presented by digits)
    vocalizedWord = re.sub(r'\d', '', pointedWord)

    if vocalizedWord not in resultDict:
        # First sighting: record the location and Hebrew form; the count
        # starts at zero because the shared increment below runs next
        firstOccurrence = T.sectionFromNode(node)
        resultDict[vocalizedWord] = [0, firstOccurrence, hebrewWord]

    # Count this occurrence
    resultDict[vocalizedWord][0] += 1

# Turn the tally into a table with one row per vocalisation
tableData = pd.DataFrame(
    [[key] + value for key, value in resultDict.items()],
    columns=["Pointing", "Frequency", "First Occurrence", "Hebrew Word"],
)

# Most frequent vocalisation first
tableData = tableData.sort_values("Frequency", ascending=False)

# Display the table
display(tableData)


# In[13]:


# Search template: words with g_cons equal to JHWH that also have the
# qere_utf8 feature (a feature name given without a value).
# NOTE(review): presumably this selects words that carry a qere reading; confirm against the Text-Fabric search documentation.
qereQuery = '''
word qere_utf8
     g_cons=JHWH
'''
# Run the template against the loaded BHSA data
qereResults = BHS.search(qereQuery)


# In[14]:


# Inspect the first qere result only; the loop stops after one pass
for item in qereResults:
    # Single-element query, so the word node is at index 0
    node = item[0]

    # Written form (g_word) and qere reading of that word
    pointedWord = F.g_word.v(node)
    qereWord = F.qere.v(node)

    # Qere reading with all digits removed
    uncantQereWord = re.sub(r'\d', '', qereWord)

    print(pointedWord, qereWord, uncantQereWord)
    break


# # 4 - Required libraries

# The scripts in this notebook require (beside `text-fabric`) the following Python libraries to be installed in the environment:
# 
# bokeh
# IPython
# pandas
# re
# 
# You can install any missing library from within Jupyter Notebook using either `pip` or `pip3`.

# # 5 - Notebook details
# 
# Author: Tony Jurg
# Date: 4 November 2024