#!/usr/bin/env python
# coding: utf-8

# # Parasha statistic analysis with BHSA (Text-Fabric)

# # Table of Contents (ToC)
#
# * 1 - Introduction
# * 2 - Load Text-Fabric app and data
# * 3 - Performing the queries
# * 4 - Display the results
# * 4.1 - Verbal forms distribution in the parasha
# * 4.2 - Ratio of direct speech versus narrative
# * 4.3 - Term frequency (excluding stopwords)
# * 4.4 - Clause types and phrase functions
# * 5 - References
# * 6 - Notebook version details

# # 1 - Introduction
#
# In this notebook we first select a Torah portion by its numeric value (1-54), using the BHSaddons feature [`parashanum`](https://tonyjurg.github.io/BHSaddons/docs/features/parashanum.html), and then perform various analyses on that parasha:
#
# - *Data loading:* Initialize Text-Fabric and load the BHSA corpus together with the BHSaddons parasha features.
# - *Selection:* Extract all verses and words belonging to the chosen parasha.
# - *Verbal forms distribution:* Analyze the frequency of Hebrew verb forms (e.g. wayyiqtol, qatal/perfect, yiqtol/imperfect) in the parasha. We use the BHSA morphological feature `vt` (verbal tense), which categorizes verb forms (`perf` = perfect/qatal, `impf` = imperfect/yiqtol, `wayq` = wayyiqtol, etc.; see [Vt - BHSA](https://etcbc.github.io/bhsa/features/vt)).
# - *Direct speech vs narrative:* Compute the ratio of text that is direct speech versus narrative. BHSA marks each clause's discourse type with the `domain` (text type) feature (`Q` = quotation/direct speech, `N` = narrative, `D` = discursive; see [Domain - BHSA](https://etcbc.github.io/bhsa/features/domain/)). We measure how much of the parasha is quoted speech versus narrative description.
# - *Term frequency (excluding stopwords):* Determine the most frequent words (lexemes) in the parasha, excluding common function words (articles, conjunctions, prepositions, pronouns, etc.) to focus on content words. This is presented as a frequency list and a chart.
# - *Structural patterns:* Examine syntactic structure by looking at clause and phrase markers. We note how BHSA classifies clause types (e.g. `WayX` for a wayyiqtol clause with explicit subject, `Way0` without explicit subject; see [Typ - BHSA](https://etcbc.github.io/bhsa/features/typ/)) and demonstrate analysis of phrase functions (syntactic roles like subject, object, and predicate; see [Function - BHSA](https://etcbc.github.io/bhsa/features/function/)) within the parasha.

# # 2 - Load Text-Fabric app and data
# ##### [Back to ToC](#TOC)
#
# First, we load the BHSA corpus with the BHSaddons module for handling parashot. We use `mod="tonyjurg/BHSaddons/tf/"` to include the parasha features.

# In[1]:

# Import necessary libraries
from tf.app import use
from collections import Counter
import pandas as pd  # for easy data manipulation (optional, for frequency tables)
from bokeh.io import output_file, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Select, CustomJS, HoverTool
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load BHSA with BHSaddons features
# This loads the BHSA data (version 2021) and the additional parashot markers.
A = use("etcbc/BHSA", version="2021", mod="tonyjurg/BHSaddons/tf/", hoist=globals())
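# Before moving on, we can check that the added parasha features are actually available. This is a minimal sketch (assuming the load above succeeded); `freqList()` is the standard Text-Fabric way to tabulate the values of a node feature, so we expect 54 distinct parasha numbers in total.

# In[ ]:

# Optional sanity check: show the five most frequent parashanum values
# (freqList returns (value, frequency) pairs, sorted by frequency)
for value, frequency in F.parashanum.freqList()[:5]:
    print(f"parasha {value}: {frequency} nodes")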
# # 3 - Performing the queries
# ##### [Back to ToC](#TOC)
#
# In this step we set `parasha_num` to the portion we want to analyze. The number corresponds to the traditional sequence of weekly Torah readings (1 = Bereshit, 2 = Noach, ..., 54 = V'Zot HaBerakhah). The code below finds all verses belonging to that parasha and then gathers all the words in those verses for analysis.

# In[2]:

# Select the parasha by its number (1 to 54)
parasha_num = 9  # <-- Change this number to select a different parasha

# Find all verse nodes that belong to the chosen parasha
verses_in_parasha = [v for v in F.otype.s("verse") if F.parashanum.v(v) == parasha_num]

if verses_in_parasha:
    # Identify the range of verses (from first to last)
    first_verse = verses_in_parasha[0]
    last_verse = verses_in_parasha[-1]
    # Get the parasha name in transliteration and in Hebrew
    parasha_name_trans = F.parashatrans.v(first_verse)
    parasha_name_hebrew = F.parashahebr.v(first_verse)
    print(f"Selected Parasha {parasha_num}: {parasha_name_trans} ({parasha_name_hebrew})")
    start_ref = T.sectionFromNode(first_verse)  # (book, chapter, verse)
    end_ref = T.sectionFromNode(last_verse)
    print(f"Parasha range: {start_ref[0]} {start_ref[1]}:{start_ref[2]} "
          f"through {end_ref[0]} {end_ref[1]}:{end_ref[2]}")
else:
    print("No verses found for the given parasha number. Make sure the number is 1-54.")

# Now we gather all word tokens in these verses, and also collect clause and phrase units for later structural analysis:

# In[3]:

# Gather all word nodes in the selected parasha
words_in_parasha = []
for v in verses_in_parasha:
    words_in_parasha += L.d(v, "word")  # all word objects descending from the verse

# Also gather all clauses and phrases in the parasha (for later analysis)
clauses_in_parasha = []
phrases_in_parasha = []
for v in verses_in_parasha:
    clauses_in_parasha += L.d(v, "clause")
    phrases_in_parasha += L.d(v, "phrase")

print(f"Total verses:  {len(verses_in_parasha)}")
print(f"Total words:   {len(words_in_parasha)}")
print(f"Total clauses: {len(clauses_in_parasha)}")
print(f"Total phrases: {len(phrases_in_parasha)}")

# This outputs the counts of verses, words, clauses, and phrases in the parasha. We will use `words_in_parasha` for lexical statistics and `clauses_in_parasha`/`phrases_in_parasha` for discourse and syntax analysis.
#
# *(Note: "word", "clause", and "phrase" are Text-Fabric object types in BHSA. A word is the smallest textual unit, generally corresponding to a lexical item, including prefixes if attached. "Clause" here refers to a clause or clause atom in the syntactic hierarchy, and "phrase" to a phrase or subphrase. The BHSA annotations let us traverse these levels easily via the `L` API for local relations, as the short sketch below illustrates.)*
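# A minimal illustration of the `L` (locality) API, assuming the parasha selection above succeeded: `L.d(node, type)` walks down to embedded nodes, `L.u(node, type)` walks up to containing nodes.

# In[ ]:

# Take the first word of the parasha and walk up to its containing clause
sample_word = words_in_parasha[0]
containing_clause = L.u(sample_word, "clause")[0]
print(T.text(sample_word), "| clause domain:", F.domain.v(containing_clause))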
# # 4 - Display the results
# ##### [Back to ToC](#TOC)
#
# ## 4.1 - Verbal forms distribution in the parasha
#
# Biblical Hebrew verbs appear in different conjugations/forms, such as *qatal* (perfect), *yiqtol* (imperfect), *wayyiqtol* (the narrative past form with prefixed *waw*), imperative, infinitive, and participle. We quantify how often each form occurs in this parasha.
#
# The BHSA feature `vt` (verbal tense) classifies verb *words* by form ([Vt - BHSA](https://etcbc.github.io/bhsa/features/vt/)). Possible values include `"perf"` (perfect/qatal), `"impf"` (imperfect/yiqtol), `"wayq"` (wayyiqtol), `"impv"` (imperative), `"infa"`/`"infc"` (infinitive absolute/construct), `"ptca"`/`"ptcp"` (participle active/passive), and `"NA"` for words that are not verbs.
#
# Let's count the verb instances by `vt` value:

# In[4]:

# Filter the words to only verbs and count each verb form (vt value)
verb_words = [w for w in words_in_parasha if F.sp.v(w) == "verb"]  # sp = part of speech
verb_form_counts = Counter(F.vt.v(w) for w in verb_words)

# Pretty-print the counts of each verb form
print("Verb Form Distribution:")
for form, count in verb_form_counts.most_common():
    if form == "NA":
        continue  # skip "NA" (if any, non-verbs)
    print(f"  {form}: {count}")

# This lists the verb forms present and their frequencies.
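# Relative shares are often easier to compare across parashot than raw counts. A small sketch, reusing the counter built above:

# In[ ]:

# Express the verb-form counts as percentages of all verbs ("NA" excluded)
total_verbs = sum(count for form, count in verb_form_counts.items() if form != "NA")
for form, count in verb_form_counts.most_common():
    if form == "NA":
        continue
    print(f"  {form}: {count / total_verbs:.1%}")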
# Next, we create a bar chart to visualize this distribution. Additionally, we make it interactive, allowing filtering by narrative versus direct-speech contexts.
# To enable this, we use the clause `domain` feature (text type) to separate verbs used in direct speech (domain = `Q`) from verbs used in narrative (domain = `N` or `D`). We prepare counts for:
# - *All* occurrences (default),
# - *Narrative* (including discursive, i.e. domain != Q),
# - *Direct speech* (domain = Q).
#
# Then, using a Bokeh `Select` widget, the user can switch the view between All/Narrative/Direct speech.

# In[5]:

from bokeh.layouts import column
from bokeh.models import ColumnDataSource, CustomJS, Select, Range1d
from bokeh.plotting import figure, output_file, show
from collections import Counter
from bokeh.io import output_notebook

# Prepare the Bokeh output to display in the notebook
output_notebook()

# Prepare data for the interactive bar chart
verb_form_counts_all = Counter(F.vt.v(w) for w in verb_words)
verb_form_counts_narr = Counter(F.vt.v(w) for w in verb_words
                                if F.domain.v(L.u(w, "clause")[0]) != "Q")
verb_form_counts_direct = Counter(F.vt.v(w) for w in verb_words
                                  if F.domain.v(L.u(w, "clause")[0]) == "Q")

# Define the categories (verb forms) to plot (excluding "NA")
verb_forms = [vf for vf in verb_form_counts_all.keys() if vf != "NA"]
verb_forms.sort()  # sort alphabetically

# Create data source for Bokeh
data = {
    'form': verb_forms,
    'count_all': [verb_form_counts_all.get(vf, 0) for vf in verb_forms],
    'count_narr': [verb_form_counts_narr.get(vf, 0) for vf in verb_forms],
    'count_direct': [verb_form_counts_direct.get(vf, 0) for vf in verb_forms],
    # Use 'count' as the currently selected counts (start with all by default)
    'count': [verb_form_counts_all.get(vf, 0) for vf in verb_forms],
}
source = ColumnDataSource(data=data)

# Compute the maximum count across all categories and add padding (10%)
max_count = max(max(data['count_all']), max(data['count_narr']), max(data['count_direct']))
y_end = max_count * 1.1

# Create a Bokeh bar chart with a fixed y_range
p = figure(x_range=verb_forms, height=300, width=500,
           title=f"Parasha #{parasha_num}: {parasha_name_trans} - Verb form distribution",
           x_axis_label="Verb Form", y_axis_label="Frequency",
           toolbar_location='right', y_range=Range1d(start=0, end=y_end))
p.vbar(x='form', top='count', width=0.8, source=source, color="#718dbf")

# Lock the y_range so it cannot be changed by interactions
p.y_range.bounds = (0, y_end)

# Deactivate any active drag or scroll tools (if the toolbar exists)
if p.toolbar:
    p.toolbar.active_drag = None
    p.toolbar.active_scroll = None

# Configure x-axis labels for better readability (rotate if needed)
p.xaxis.major_label_orientation = 1.0

# Add a dropdown to filter by context (All / Narrative / Direct)
select = Select(title="Filter by Text Type:", value="All",
                options=["All", "Narrative only", "Direct speech only"])

# JavaScript callback to update the bar heights based on the selection
callback_code = """
const data = source.data;
const filter = cb_obj.value;
if (filter === 'Narrative only') {
    data['count'] = data['count_narr'];
} else if (filter === 'Direct speech only') {
    data['count'] = data['count_direct'];
} else {
    data['count'] = data['count_all'];
}
source.change.emit();
"""
select.js_on_change('value', CustomJS(args={'source': source}, code=callback_code))

# Combine the plot and dropdown in a layout and show
layout = column(p, select)
show(layout)

# Now also make a static image:

# In[6]:

from bokeh.layouts import column
from bokeh.models import ColumnDataSource, Range1d, LabelSet
from bokeh.plotting import figure, output_file, show
from bokeh.transform import dodge
from collections import Counter
from bokeh.io import output_notebook

# Prepare the Bokeh output to display in the notebook
output_notebook()

# Prepare data for the grouped bar chart:
# count verb forms for narrative and direct speech
verb_form_counts_narr = Counter(F.vt.v(w) for w in verb_words
                                if F.domain.v(L.u(w, "clause")[0]) != "Q")
verb_form_counts_direct = Counter(F.vt.v(w) for w in verb_words
                                  if F.domain.v(L.u(w, "clause")[0]) == "Q")

# Define the categories (verb forms) to plot (excluding "NA")
verb_forms = [vf for vf in set(verb_form_counts_narr.keys()) | set(verb_form_counts_direct.keys())
              if vf != "NA"]
verb_forms.sort()  # sort alphabetically

# Create data source for Bokeh
data = {
    'form': verb_forms,
    'count_narr': [verb_form_counts_narr.get(vf, 0) for vf in verb_forms],
    'count_direct': [verb_form_counts_direct.get(vf, 0) for vf in verb_forms],
}
source = ColumnDataSource(data=data)

# Compute the maximum count for setting the y_range (with a 10% padding)
max_count = max(max(data['count_narr']), max(data['count_direct']))
y_end = max_count * 1.1

# Create a Bokeh figure for the grouped bar chart
p = figure(x_range=verb_forms, height=300, width=500,
           title=f"Parasha #{parasha_num}: {parasha_name_trans} - Verb form distribution",
           x_axis_label="Verb Form", y_axis_label="Frequency",
           toolbar_location='right', y_range=Range1d(start=0, end=y_end))

# Draw bars for direct speech and narrative using dodge to position them side by side
bar_width = 0.3
p.vbar(x=dodge('form', -0.15, range=p.x_range), top='count_direct', width=bar_width,
       source=source, color="#718dbf", legend_label="Direct speech")
p.vbar(x=dodge('form', 0.15, range=p.x_range), top='count_narr', width=bar_width,
       source=source, color="#c9d9d3", legend_label="Narrative")

# Add count labels on top of each bar (removed render_mode attribute)
labels_direct = LabelSet(x=dodge('form', -0.15, range=p.x_range), y='count_direct',
                         text='count_direct', source=source,
                         text_align='center', text_baseline='bottom')
labels_narr = LabelSet(x=dodge('form', 0.15, range=p.x_range), y='count_narr',
                       text='count_narr', source=source,
                       text_align='center', text_baseline='bottom')
p.add_layout(labels_direct)
p.add_layout(labels_narr)

# Lock the y_range so it cannot be changed by interactions
p.y_range.bounds = (0, y_end)

# Optionally, disable active drag/scroll tools
if p.toolbar:
    p.toolbar.active_drag = None
    p.toolbar.active_scroll = None

# Improve x-axis label readability
p.xaxis.major_label_orientation = 1.0
p.legend.location = "top_left"

# Display the plot
show(p)

# note: save image as 'verbform_distribution.png'
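# The note above suggests saving the chart as 'verbform_distribution.png'. One way to do that programmatically is Bokeh's `export_png` (a sketch; this optional Bokeh feature requires the selenium package plus a browser webdriver to be installed):

# In[ ]:

from bokeh.io import export_png

# Render the current figure headlessly and write it to a PNG file
export_png(p, filename="verbform_distribution.png")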
# ## 4.2 - Ratio of direct speech versus narrative
#
# Next, let's quantify the portion of the parasha that is direct speech versus narrative description. We measure this by the number of words in each category. Using the clause text type (`domain`), we mark each word as belonging to a *quotation (Q)* or *non-quotation (N/D)* context. For simplicity, we treat both narrative and discursive (`N` and `D`) as "narrative" here (i.e., not direct speech).
#
# We calculate the percentage of words in direct speech versus narrative, and display it as a pie chart for a quick overview:

# In[7]:

# Calculate the number of words in direct speech vs narrative
words_direct = [w for w in words_in_parasha if F.domain.v(L.u(w, "clause")[0]) == "Q"]
words_narrative = [w for w in words_in_parasha if F.domain.v(L.u(w, "clause")[0]) in ("N", "D")]
count_direct = len(words_direct)
count_narr = len(words_narrative)
total_words = count_direct + count_narr
pct_direct = (count_direct / total_words * 100) if total_words else 0
pct_narr = (count_narr / total_words * 100) if total_words else 0
print(f"Direct speech word count: {count_direct} ({pct_direct:.1f}%)")
print(f"Narrative word count: {count_narr} ({pct_narr:.1f}%)")

# This prints the raw counts and percentages. Now we create a pie chart using Bokeh's wedge glyph, with a hover tooltip that displays the percentages:

# In[8]:

from math import pi
import numpy as np
import pandas as pd
from bokeh.io import output_file, output_notebook, show
from bokeh.plotting import figure
from bokeh.transform import cumsum
from bokeh.models import ColumnDataSource, LabelSet

# Set up output
output_notebook()

# Prepare data for the pie chart
pie_data = pd.DataFrame({
    'category': ['Direct Speech', 'Narrative'],
    'count': [count_direct, count_narr],
})
total_count = pie_data['count'].sum()
pie_data['angle'] = pie_data['count'] / total_count * 2 * pi
pie_data['percentage'] = pie_data['count'] / total_count * 100  # compute percentages
pie_data['color'] = ["#ff7f0e", "#1f77b4"]  # colors for the two categories

# Create a label column combining count and percentage
pie_data['label'] = pie_data['count'].astype(str) + " (" + pie_data['percentage'].round(1).astype(str) + "%)"

# Calculate cumulative angles to determine the middle angle of each wedge
pie_data['angle_cumsum'] = pie_data['angle'].cumsum()
pie_data['angle_mid'] = pie_data['angle_cumsum'] - pie_data['angle'] / 2

# Calculate label positions (adjust the radius factor as needed)
pie_data['x'] = 0.3 * np.cos(pie_data['angle_mid'])
pie_data['y'] = 0.3 * np.sin(pie_data['angle_mid'])

source_pie = ColumnDataSource(pie_data)

# Create the pie chart figure
p_pie = figure(
    height=250, width=400,
    title=f"Parasha #{parasha_num}: {parasha_name_trans} - Speech vs Narrative Ratio",
    tooltips="@category: @count words (@percentage{0.0}%)",
    x_range=(-0.5, 1.0),
    toolbar_location="right",
)

# Shift the plot area to align with other plots
p_pie.min_border_left = 53

p_pie.wedge(
    x=0, y=0, radius=0.4,
    start_angle=cumsum('angle', include_zero=True),
    end_angle=cumsum('angle'),
    line_color="white", fill_color='color',
    legend_field='category', source=source_pie,
)

# Add labels showing count and percentage inside each wedge
labels = LabelSet(x='x', y='y', text='label', source=source_pie, text_align='center')
p_pie.add_layout(labels)

p_pie.legend.location = "top_right"
p_pie.legend.label_text_font_size = "8pt"
p_pie.axis.visible = False  # hide axes
p_pie.grid.grid_line_color = None

show(p_pie)

# note: save image as 'speech_narrative_ratio.png'
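# As a cross-check, the same ratio can be computed at the clause level rather than the word level (a minimal sketch reusing `clauses_in_parasha` from above); the distribution over `Q`, `N`, and `D` should tell roughly the same story:

# In[ ]:

# Count clauses per discourse domain
clause_domains = Counter(F.domain.v(c) for c in clauses_in_parasha)
for dom, cnt in clause_domains.most_common():
    print(f"  domain {dom}: {cnt} clauses")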
# ## 4.3 - Term frequency (excluding stopwords)
#
# Now we analyze the vocabulary of the parasha. We want to find the most frequent words (lexemes) while excluding very common "stop words" that are not content-rich. In Hebrew, such stopwords include conjunctions (like "ו" = "and"), prepositions ("ב/כ/ל" = "in/like/to", etc.), the definite article ("ה" = "the"), pronouns, and a few particles like the negative "לא" or the accusative marker "את".
#
# Words with the following part-of-speech tags (`sp` feature; see [Sp - BHSA](https://etcbc.github.io/bhsa/features/sp/)) constitute our "stop words" for this analysis and are filtered out:
# - `prep` (preposition),
# - `conj` (conjunction),
# - `art` (article),
# - `prps`/`prde`/`prin` (pronouns: personal, demonstrative, interrogative),
# - `nega` (negative),
# - `inrg` (interrogative particle),
# - `intj` (interjection).
#
# Next, the remaining lexemes are counted. BHSA provides a `lex` feature (the lexical root form) and `gloss` (an English gloss/translation) for each word. We count by lexeme so that different inflected forms are counted together. For display, we use the English glosses of the top lexemes for readability, and also show the Hebrew lexeme.

# In[9]:

# Define parts of speech to treat as stop words (function words)
stop_sp = {"prep", "conj", "art", "prps", "prde", "prin", "nega", "inrg", "intj"}

# Build a frequency counter of lexemes (excluding stopword POS)
lex_counts = Counter()
for w in words_in_parasha:
    if F.sp.v(w) in stop_sp:
        continue
    lex_node = L.u(w, 'lex')[0]  # get the lexeme object for this word
    lex_counts[lex_node] += 1

# Get the 10 most frequent lexemes (excluding stops)
top_lexemes = [lex for lex, cnt in lex_counts.most_common(10)]
print("Top 10 frequent content lexemes:")
for lex in top_lexemes:
    gloss = F.gloss.v(lex)   # English gloss of the lexeme
    heb = F.lex_utf8.v(lex)  # Hebrew lexeme in Hebrew script
    freq = lex_counts[lex]
    print(f"  {heb} ({gloss}): {freq}")

# Now let's visualize these frequencies in a bar chart, labeling each bar with the lexeme's English gloss:

# In[10]:

from bokeh.io import output_notebook

# Prepare the Bokeh output to display in the notebook
output_notebook()

# Prepare data for the bar chart of top terms
top_lex_nodes = [lex for lex, cnt in lex_counts.most_common(15)]
top_lex_heb = [F.lex_utf8.v(lex) for lex in top_lex_nodes]       # Hebrew form
top_lex_gloss = [F.gloss.v(lex) or "" for lex in top_lex_nodes]  # English gloss (if available)
top_lex_freq = [lex_counts[lex] for lex in top_lex_nodes]

# Create a DataFrame for convenience
freq_df = pd.DataFrame({
    "Lexeme (Hebrew)": top_lex_heb,
    "Gloss (English)": top_lex_gloss,
    "Frequency": top_lex_freq,
})

# Display the table of top words
display(freq_df)

# This tabular output shows the top 15 content words with their glosses and counts.
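# The `WordCloud` import at the top of the notebook can turn these same counts into a word cloud. A sketch follows; it uses the English glosses rather than the Hebrew lexemes, since right-to-left Hebrew needs a suitable font and text reshaping to render correctly (note that distinct lexemes sharing a gloss would collapse into one entry here):

# In[ ]:

# Build a gloss -> frequency mapping for the 50 most frequent content lexemes
gloss_freqs = {F.gloss.v(lex) or F.lex_utf8.v(lex): cnt
               for lex, cnt in lex_counts.most_common(50)}

# Generate and display the word cloud with matplotlib
wc = WordCloud(width=500, height=300, background_color="white")
wc.generate_from_frequencies(gloss_freqs)
plt.figure(figsize=(6, 4))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()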
# Now the bar chart using Bokeh:

# In[11]:

from bokeh.models import Range1d, HoverTool, ColumnDataSource, LabelSet
from bokeh.plotting import figure, output_file, show
from bokeh.io import export_svgs
from bokeh.embed import components
from bokeh.models import Div

# Create the list of terms, combining gloss and Hebrew as needed
terms = [f"{gloss} ({heb})" if gloss else heb for gloss, heb in zip(top_lex_gloss, top_lex_heb)]

# Compute the maximum frequency and add padding (e.g., 10%)
max_freq = max(top_lex_freq)
y_end = max_freq * 1.1

# Create the figure with a fixed y_range
p_terms = figure(x_range=terms, height=300, width=500,
                 title=f"Parasha #{parasha_num}: {parasha_name_trans} - Top terms (excluding stopwords)",
                 x_axis_label="Lexeme", y_axis_label="Frequency",
                 toolbar_location="right", y_range=Range1d(start=0, end=y_end))

# Create a ColumnDataSource to hold the data
source = ColumnDataSource(data=dict(x=terms, top=top_lex_freq))

# Draw the bars
p_terms.vbar(x='x', top='top', width=0.8, color="#2ca02c", source=source)

# Lock the y_range to prevent zooming or panning
p_terms.y_range.bounds = (0, y_end)

# If a toolbar exists, ensure no active drag or scroll tools are set
if p_terms.toolbar:
    p_terms.toolbar.active_drag = None
    p_terms.toolbar.active_scroll = None

# Rotate x-axis labels and adjust the font size to prevent overlap
p_terms.xaxis.major_label_orientation = 0.9
p_terms.xaxis.major_label_text_font_size = "8pt"

# Add labels above each bar
labels = LabelSet(x='x', y='top', text='top', level='glyph',
                  x_offset=-10, y_offset=1, source=source)
p_terms.add_layout(labels)

# Display the plot
show(p_terms)

# note: save image as 'top_terms.png'

# ## 4.4 - Clause types and phrase functions
#
# Finally, we consider some syntactic patterns and markers in the parasha. The BHSA dataset provides detailed syntactic analysis:
# - *Clause types:* Each clause is classified by its structure and leading element. For example, a clause beginning with a *wayyiqtol* verb is labeled `Wayyiqtol-X clause` (`WayX`) if it has an explicit subject, or `Wayyiqtol-0 clause` (`Way0`) if no subject is explicitly present ([Typ - BHSA](https://etcbc.github.io/bhsa/features/typ/)). There are many such codes (e.g., clauses starting with *weqatal*, infinitives, or participles, as listed in the BHSA documentation). These clause-type patterns often correlate with narrative structure (wayyiqtol chains for narrative sequence, X-qatal for past background, etc.); a quick tally for this parasha is sketched right after this list.
# - *Phrase functions:* Each phrase in the syntax tree has a grammatical function, such as subject (`Subj`), object (`Objc`), predicate (`Pred`), or adjunct (`Adju`) ([Function - BHSA](https://etcbc.github.io/bhsa/features/function/)). This tells us the role a phrase plays in the clause.
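# A minimal sketch of the clause-type side, counting the `typ` values of the clauses gathered earlier:

# In[ ]:

# Count clause types (typ feature) in the parasha
clause_type_counts = Counter(F.typ.v(c) for c in clauses_in_parasha)
print("Most common clause types:")
for ctype, cnt in clause_type_counts.most_common(10):
    print(f"  {ctype}: {cnt}")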
# In[12]:

# Count phrase functions in the parasha
function_counts = Counter(F.function.v(ph) for ph in phrases_in_parasha if F.function.v(ph))

# Exclude Unknown or None
if None in function_counts:
    del function_counts[None]
if "Unkn" in function_counts:
    del function_counts["Unkn"]

# Get the most frequent functions
common_funcs = function_counts.most_common(10)
print("Most common phrase functions:")
for func, count in common_funcs:
    print(f"  {func}: {count}")

# Let's visualize this in a bar chart:

# In[13]:

from bokeh.layouts import column
from bokeh.models import Range1d, LabelSet, ColumnDataSource
from bokeh.plotting import figure, output_file, show

# Prepare data for the phrase function chart
func_labels = [func for func, cnt in common_funcs]
func_counts = [cnt for func, cnt in common_funcs]

# Compute the maximum count and add a little padding (e.g., 10%)
max_count = max(func_counts)
y_end = max_count * 1.1

# Create the figure with a fixed y_range
p_funcs = figure(x_range=func_labels, height=300, width=400,
                 title=f"Parasha {parasha_num}: {parasha_name_trans} - Phrase function distribution",
                 x_axis_label="Phrase Function", y_axis_label="Count",
                 toolbar_location='right', y_range=Range1d(start=0, end=y_end))
p_funcs.vbar(x=func_labels, top=func_counts, width=0.6, color="#8c564b")

# Lock the y_range to prevent any zooming/panning adjustments
p_funcs.y_range.bounds = (0, y_end)

# If a toolbar exists, deactivate any active drag or scroll tools
if p_funcs.toolbar:
    p_funcs.toolbar.active_drag = None
    p_funcs.toolbar.active_scroll = None

# Add labels above the bars (optional)
func_source = ColumnDataSource(data={'func': func_labels, 'count': func_counts, 'pos': func_counts})
labels = LabelSet(x='func', y='pos', text='count', level='glyph',
                  x_offset=-13, y_offset=3, source=func_source)
p_funcs.add_layout(labels)

show(p_funcs)

# note: save image as 'phrase_function_distribution.png'

# # 5 - References
#
# For more details on the BHSA dataset and its features, see the [ETCBC BHSA documentation](https://etcbc.github.io/bhsa/).
#
# The BHSaddons repository ([GitHub - tonyjurg/BHSaddons](https://tonyjurg.github.io/BHSaddons/)) documents the parasha-specific features we use.
#
# Additionally, the BHSA feature documentation covers the morphological and syntactic features, e.g.:
# - the `vt` codes for verb tense ([Vt - BHSA](https://etcbc.github.io/bhsa/features/vt/)),
# - the `domain` clause text-type codes ([Domain - BHSA](https://etcbc.github.io/bhsa/features/domain/)),
# - the `function` codes for phrase roles ([Function - BHSA](https://etcbc.github.io/bhsa/features/function/)).

# # 6 - Notebook version details
# ##### [Back to ToC](#TOC)
#
# - Author: Tony Jurg
# - Version: 1.0
# - Date: 26 March 2025