#!/usr/bin/env python # coding: utf-8 # # Petucha and setuma in the Torah (BHSA) # ## Table of content (TOC) # # * 1 - Introduction # * 2 - Load Text-Fabric app and data # * 3 - Performing the queries # * 3.1 - Locate the parashot petuchot and setumot # * 3.2 - Run some basic stats # * 3.3 - Petucha length distribution over the books # * 3.4 - Plotting Petucha length per books # * 3.5 - Setuma length distribution over the books # * 3.6 - Plotting setuma length per book # * 4 - Attribution and footnotes # * 5 - Required libraries # * 6 - Notebook details # # 1 - Introduction # ##### [Back to TOC](#TOC) # The surface text of the Torah is devided into *pisqot* (units that can be compared to paragrahps). This devision consist of two types of sections which are marked by th Hebrew letters פ (pe) and ס (samekh): # # - *Petucha* (Open Section): Marked by the letter **פ**, it usually starts a new paragraph or major section, often separated by a line break or significant spacing. # - *Setuma* (Closed Section): Marked by the letter **ס**, it indicates a smaller division within the text, often separated by a smaller space. # # These markings help structure the text and convey interpretative cues within the Torah. In this notebook we will perform some statistic analysis on these surface text features. # # Detailed information regarding petuchot and setumot can be found in “The Text of the Tanak” by Russel Fuller.1 # # # 2 - Load Text-Fabric app and data # ##### [Back to TOC](#TOC) # This NoteBook uses the ETCBC BHSA as dataset representing the Hebrew text of the TeNaCh. # In[1]: get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') # In[2]: # Loading the Text-Fabric code # Note: it is assumed Text-Fabric is installed in your environment. from tf.fabric import Fabric from tf.app import use # In[3]: # load the BHSL app and data BHS = use ("etcbc/BHSA",hoist=globals()) # Note: The Text-Fabric feature documentation can be found at [ETCBC GitHub](https://github.com/ETCBC/bhsa/blob/master/docs/features/0_home.md) # In[4]: # The following will push the Text-Fabric stylesheet to this notebook (to facilitate proper display with notebook viewer) BHS.dh(BHS.getCss()) # # 3 - Performing the queries # ##### [Back to TOC](#TOC) # ## 3.1 - Locate the parashot petuchot and setumot # Occurrences of פ (pe) and ס (samekh), which function as section breakers in the BHSA Text-Fabric dataset, are available in the [trailer](https://github.com/ETCBC/bhsa/blob/master/docs/features/trailer.md) feature. # # To begin, we shall generate a frequency table for this feature, noting that it pertains to the full TeNaCh. As observed from the output, the Hebrew letters are displayed in their transliterated format, with trailing P and S representing pe and samekh, respectively. # In[5]: F.trailer.freqList() # In[6]: # find the parashots petuchot petuchaQuery = ''' book book=Genesis|Exodus|Leviticus|Numeri|Deuteronomium word trailer~_P ''' petuchaResults = BHS.search(petuchaQuery) # In[7]: # find the parashot setumot setumaQuery = ''' book book=Genesis|Exodus|Leviticus|Numeri|Deuteronomium word trailer~_S ''' setumaResults = BHS.search(setumaQuery) # ## 3.2 - Run some basic stats # These two scripts count the occurrences of petuchot and setumot per book. They generate formatted tables that summarize these counts. # In[8]: # Import necessary libraries from collections import defaultdict # Initialize a dictionary to store counts per book petuchaCounts = defaultdict(int) # Iterate over the results and count petuchot per book for book, petucha in petuchaResults: petuchaCounts[book] += 1 # Sort the books alphabetically sortedBooks = sorted(petuchaCounts.keys()) # Display the results in a formatted table print(f"{'Book':<20}{'Number of Petuchot'}") print('-' * 35) for book in sortedBooks: print(f"{F.book.v(book):<20}{petuchaCounts[book]}") # In[9]: # Import necessary libraries from collections import defaultdict # Initialize a dictionary to store counts per book setumaCounts = defaultdict(int) # Iterate over the results and count setuma per book for book, setuma in setumaResults: setumaCounts[book] += 1 # Sort the books alphabetically sortedBooks = sorted(setumaCounts.keys()) # Display the results in a formatted table print(f"{'Book':<20}{'Number of Setumot'}") print('-' * 35) for book in sortedBooks: print(f"{F.book.v(book):<20}{setumaCounts[book]}") # ## 3.3 - Petucha length distribution over the books # The following script creates a statistical overview of the petucha length per book. # # In[10]: # Import necessary libraries import pandas as pd from tf.app import use # Function to get reference string from verse nodes def getVerseReference(node): section = T.sectionFromNode(node) return f"{section[0]} {section[1]}:{section[2]}" if section else 'Unknown Reference' # Function to process each petucha and append to list def addPetuchaInfo(petuchaList, index, startWordNode, endWordNode, length, bookName): startVerseNodes = L.u(startWordNode, otype='verse') endVerseNodes = L.u(endWordNode, otype='verse') startRefStr = getVerseReference(startVerseNodes[0]) if startVerseNodes else 'Unknown Reference' endRefStr = getVerseReference(endVerseNodes[0]) if endVerseNodes else 'Unknown Reference' petuchaList.append({ 'Index': index, 'StartRef': startRefStr, 'EndRef': endRefStr, 'Length': length, 'Book': bookName }) # Initialize variables petuchaInfo = [] currentPetuchaLength = 0 currentPetuchaStartWord = None index = 1 # Find all words in the Torah wordsInTorahQuery = ''' book book=Genesis|Exodus|Leviticus|Numeri|Deuteronomium word ''' wordsInTorah = BHS.search(wordsInTorahQuery) # Iterate over all words in the dataset for bookNode, wordNode in wordsInTorah: # Get the trailer feature of the word trailer = F.trailer.v(wordNode) or '' # prevent 'NoneType' errors # If starting a new petucha, record the start word if currentPetuchaStartWord is None: currentPetuchaStartWord = wordNode currentBookName = F.book.v(bookNode) # Increment the length counter currentPetuchaLength += 1 # Check if the word ends with a petucha (represented by 'P' in the trailer) if 'P' in trailer: addPetuchaInfo(petuchaInfo, index, currentPetuchaStartWord, wordNode, currentPetuchaLength, currentBookName) # Reset the variables for the next petucha currentPetuchaLength = 0 currentPetuchaStartWord = None index += 1 # Handle any remaining words after the last petucha if currentPetuchaLength > 0 and currentPetuchaStartWord is not None: addPetuchaInfo(petuchaInfo, index, currentPetuchaStartWord, wordNode, currentPetuchaLength, currentBookName) # Convert the petuchaInfo list to a pandas DataFrame for analysis df = pd.DataFrame(petuchaInfo) # Define the desired book order orderedBooks = ['Genesis', 'Exodus', 'Leviticus', 'Numeri', 'Deuteronomium'] # Display per-book statistics using a specified formatting print("\nStatistical overview of petucha lengths per book:") bookStats = df.groupby('Book')['Length'].describe().round(2) bookStats['count'] = bookStats['count'].astype(int) bookStats['min'] = bookStats['min'].astype(int) bookStats['max'] = bookStats['max'].astype(int) # Calculate total row across all books totalStats = pd.DataFrame({ 'count': [int(bookStats['count'].sum())], 'mean': [round(bookStats['mean'].mean(), 2)], 'std': [round(bookStats['std'].mean(), 2)], 'min': [int(bookStats['min'].min())], '25%': [round(bookStats['25%'].mean(), 2)], '50%': [round(bookStats['50%'].mean(), 2)], '75%': [round(bookStats['75%'].mean(), 2)], 'max': [int(bookStats['max'].max())] }, index=['Total']) # Concatenate the total row with the book_stats DataFrame bookStats = pd.concat([bookStats, totalStats]) # Reorder bookStats based on the original order of books in petuchaInfo bookStats = bookStats.reindex(orderedBooks + ['Total']) # Configure display options to show all data on a single line for each book pd.set_option('display.max_columns', None) # Show all columns pd.set_option('display.width', 1000) # Wide display to avoid line wrapping print(bookStats) # ## 3.4 - Plotting petucha length per book # # The following script creates a scatter plot displaying the length distribution of each petucha sections. # Hovering over the datapoints provids more details like word-count, and the start and end-verse. This script uses the data created by the previous script. # In[11]: from bokeh.plotting import figure, show, output_notebook from bokeh.models import ColumnDataSource, HoverTool from bokeh.transform import factor_cmap from bokeh.palettes import Category10 # Add a petucha index for plotting df['petuchaIndex'] = df.index + 1 # Prepare data for Bokeh source = ColumnDataSource(df) # Define the color palette books = df['Book'].unique() palette = Category10[len(books)] color_map = factor_cmap('Book', palette=palette, factors=books) # Create the figure output_notebook() # To display the plot in a Jupyter notebook p = figure( width=1000, height=700, title='Petucha lengths in the Torah (in words)', x_axis_label='Petucha index', y_axis_label='Length (in words)', tools="pan,wheel_zoom,box_zoom,reset,save" ) # Add the scatter plot using scatter() p.scatter( x='petuchaIndex', y='Length', source=source, size=8, color=color_map, legend_field='Book', marker='circle', line_color='black', fill_alpha=0.8 ) # Add hover tool hover = HoverTool() hover.tooltips = [ ('Petucha index', '@Index'), ('Length', '@Length'), ('Start verse', '@StartRef'), ('End verse', '@EndRef'), (' ', ' ') # to get a blank line when multiple datapoint are grouped when hovering ] p.add_tools(hover) # Customize legend p.legend.location = 'top_right' p.legend.click_policy = 'hide' # Show the plot show(p) # ## 3.5 - Setuma length distribution over the books # The following script creates a statistical overview of the setuma length per book. # In[12]: # Import necessary libraries import pandas as pd from tf.app import use # Function to get reference string from verse nodes def getVerseReference(node): section = T.sectionFromNode(node) return f"{section[0]} {section[1]}:{section[2]}" if section else 'Unknown Reference' # Function to process each setuma and append to list def addSetumaInfo(setumaList, index, startWordNode, endWordNode, length, bookName): startVerseNodes = L.u(startWordNode, otype='verse') endVerseNodes = L.u(endWordNode, otype='verse') startRefStr = getVerseReference(startVerseNodes[0]) if startVerseNodes else 'Unknown Reference' endRefStr = getVerseReference(endVerseNodes[0]) if endVerseNodes else 'Unknown Reference' setumaList.append({ 'Index': index, 'StartRef': startRefStr, 'EndRef': endRefStr, 'Length': length, 'Book': bookName }) # Initialize variables setumaInfo = [] currentSetumaLength = 0 currentSetumaStartWord = None index = 1 # Find all words in the Torah wordsInTorahQuery = ''' book book=Genesis|Exodus|Leviticus|Numeri|Deuteronomium word ''' wordsInTorah = BHS.search(wordsInTorahQuery) # Iterate over all words in the dataset for bookNode, wordNode in wordsInTorah: # Get the trailer feature of the word trailer = F.trailer.v(wordNode) or '' # prevent 'NoneType' errors # If starting a new setuma, record the start word if currentSetumaStartWord is None: currentSetumaStartWord = wordNode currentBookName = F.book.v(bookNode) # Increment the length counter currentSetumaLength += 1 # Check if the word ends with a Setuma (represented by 'S' in the trailer) if 'S' in trailer: addSetumaInfo(setumaInfo, index, currentSetumaStartWord, wordNode, currentSetumaLength, currentBookName) # Reset the variables for the next setuma currentSetumaLength = 0 currentSetumaStartWord = None index += 1 # Handle any remaining words after the last setuma if currentSetumaLength > 0 and currentSetumaStartWord is not None: addSetumaInfo(setumaInfo, index, currentSetumaStartWord, wordNode, currentSetumaLength, currentBookName) # Convert the setumaInfo list to a pandas DataFrame for analysis df = pd.DataFrame(setumaInfo) # Define the desired book order orderedBooks = ['Genesis', 'Exodus', 'Leviticus', 'Numeri', 'Deuteronomium'] # Display per-book statistics using a specified formatting print("\nStatistical overview of setuma lengths per book:") bookStats = df.groupby('Book')['Length'].describe().round(2) bookStats['count'] = bookStats['count'].astype(int) bookStats['min'] = bookStats['min'].astype(int) bookStats['max'] = bookStats['max'].astype(int) # Calculate total row across all books totalStats = pd.DataFrame({ 'count': [int(bookStats['count'].sum())], 'mean': [round(bookStats['mean'].mean(), 2)], 'std': [round(bookStats['std'].mean(), 2)], 'min': [int(bookStats['min'].min())], '25%': [round(bookStats['25%'].mean(), 2)], '50%': [round(bookStats['50%'].mean(), 2)], '75%': [round(bookStats['75%'].mean(), 2)], 'max': [int(bookStats['max'].max())] }, index=['Total']) # Concatenate the total row with the book_stats DataFrame bookStats = pd.concat([bookStats, totalStats]) # Reorder bookStats based on the original order of books in setumaInfo bookStats = bookStats.reindex(orderedBooks + ['Total']) # Configure display options to show all data on a single line for each book pd.set_option('display.max_columns', None) # Show all columns pd.set_option('display.width', 1000) # Wide display to avoid line wrapping print(bookStats) # ## 3.6 - Plotting setuma length per book # # The following script creates a scatter plot displaying the length distribution of each setuma sections. # Hovering over the datapoints provids more details like word-count, and the start and end-verse. This script uses the data created by the previous script. # In[13]: from bokeh.plotting import figure, show, output_notebook from bokeh.models import ColumnDataSource, HoverTool from bokeh.transform import factor_cmap from bokeh.palettes import Category10 # Add a setuma index for plotting df['setumaIndex'] = df.index + 1 # Prepare data for Bokeh source = ColumnDataSource(df) # Define the color palette books = df['Book'].unique() palette = Category10[len(books)] color_map = factor_cmap('Book', palette=palette, factors=books) # Create the figure output_notebook() # To display the plot in a Jupyter notebook p = figure( width=1000, height=700, title='Setuma lengths in the Torah (in words)', x_axis_label='Setuma index', y_axis_label='Length (in words)', tools="pan,wheel_zoom,box_zoom,reset,save" ) # Add the scatter plot using scatter() p.scatter( x='setumaIndex', y='Length', source=source, size=8, color=color_map, legend_field='Book', marker='circle', line_color='black', fill_alpha=0.8 ) # Add hover tool hover = HoverTool() hover.tooltips = [ ('Setuma index', '@Index'), ('Length', '@Length'), ('Start verse', '@StartRef'), ('End verse', '@EndRef'), (' ', ' ') # to get a blank line when multiple datapoint are grouped when hovering ] p.add_tools(hover) # Customize legend p.legend.location = 'top_right' p.legend.click_policy = 'hide' # Show the plot show(p) # # 4 - Attribution and footnotes # ##### [Back to TOC](#TOC) # # #### Footnotes: # # 1 Russell Fuller, “The Text of the Tanak,” in A History of Biblical Interpretation: The Medieval through the Reformation Periods, ed. Alan J. Hauser, Duane F. Watson, and Schuyler Kaufman (Grand Rapids, MI; Cambridge, U.K.: William B. Eerdmans Publishing Company, 2009), 206. # # 5 - Required libraries # ##### [Back to TOC](#TOC) # # The scripts in this notebook require (beside `text-fabric`) the following Python libraries to be installed in the environment: # # collections # pandas # bokeh # IPython # # You can install any missing library from within Jupyter Notebook using either`pip` or `pip3`. # # 6 - Notebook details # ##### [Back to TOC](#TOC) # #
Author | #Tony Jurg | #
Version | #1.0 | #
Date | #4 Novermber 2024 | #