#!/usr/bin/env python # coding: utf-8 # # Differences in MT and SP in parasha #8: Vayishlach (Genesis 32:4-36:43) # ## Table of Contents (ToC) # # * 1 - Introduction # * 2 - Load Text-Fabric app and data # * 3 - Compare surface texts of SP and MT # * 4 - Compare texts using minimum Levenshtein distance # * 5 - Comparison of spelling of proper nouns between SP and MT # * 6 - References and acknowledgement # * 7 - Required libraries # * 8 - Notebook version details # # 1 - Introduction # ##### [Back to ToC](#TOC) # # The Samaritan Pentateuch (SP) is a version of the Torah preserved by the Samaritan community, differing from the Masoretic Text (MT) in several aspects, including language, orthography, and occasionally theological emphasis. This notebook compares the text of the Masoretic Text, based on the BHSA dataset in Text-Fabric, with the Samaritan Pentateuch, also available as a Text-Fabric dataset.[1] # # In this analysis, we focus on comparing the text of the verses in a specific parasha, highlighting differences in wording and orthography. Additionally, special attention is given to spelling variations of proper nouns between the two traditions. This notebook draws inspiration from the notebook provided by Martijn Naaijer[2] and aims to explore the textual nuances between these two important versions of the Torah. # # 2 - Load Text-Fabric app and data # ##### [Back to ToC](#TOC) # # The following code will load the Text-Fabric version of the [Samaritan Pentateuch](https://github.com/DT-UCPH/sp), the [Biblia Hebraica Stuttgartensia (Amstelodamensis)](https://etcbc.github.io/bhsa/) together with the additional parasha-related features from [tonyjurg/BHSaddons](https://github.com/tonyjurg/BHSaddons). # In[1]: from tf.app import use # Load the SP data, and rename the node features class F, # the locality class L and the text class T, # so that they cannot be overwritten while loading the MT. 
SP = use('DT-UCPH/sp', version='3.4') Fsp, Lsp, Tsp = SP.api.F, SP.api.L, SP.api.T # Do the same for the MT dataset (BHSA) together with BHSaddons MT = use('etcbc/bhsa', version='2021',mod="tonyjurg/BHSaddons/tf/") Fmt, Lmt, Tmt = MT.api.F, MT.api.L, MT.api.T # # 3 - Compare surface texts of SP and MT # ##### [Back to ToC](#TOC) # # In this section, we compare the surface texts of the Samaritan Pentateuch (SP) and the Masoretic Text (MT) at the verse level. By analyzing the wording and structure of these texts, we aim to identify variations. # In[2]: # find all word nodes for this parasha (we can either use the transliterated name or the sequence number) parashaQuery = ''' verse parashanum=8 ''' parashaResults = MT.search(parashaQuery) # In[3]: # Extract book, chapter, and verse information bookChapterVerseList = [ Tmt.sectionFromNode(verse[0]) for verse in parashaResults ] # Store parashname, start and end verse for future use startNode=parashaResults[0][0] endNode=parashaResults[-1][0] parashaNameHebrew=Fmt.parashahebr.v(startNode) parashaNameEnglish=Fmt.parashatrans.v(startNode) bookStart,chapterStart,startVerse=Tmt.sectionFromNode(startNode) parashaStart=f'{bookStart} {chapterStart}:{startVerse}' bookEnd,chapterEnd,startEnd=Tmt.sectionFromNode(endNode) parashaEnd=f'{chapterEnd}:{startEnd}' htmlStart='
' htmlFooter=f'Data generated by `delta_mt_and_sp.ipynb` at `github.com/tonyjurg/Parashot`
`' # In[4]: # Function to reconstruct verses def reconstructVerses(F, L, T, textFeature, inputList): """Reconstruct text for each verse.""" verseTexts = {} for verseName in inputList: verseText = '' verseNode = T.nodeFromSection(verseName) wordNodes = L.d(verseNode, 'word') for wordNode in wordNodes: wordText = eval(f'F.{textFeature}.v(wordNode)') trailer = F.trailer.v(wordNode) if wordText: verseText += wordText + (trailer if trailer else ' ') verseTexts[verseName] = verseText.strip() return verseTexts SPverses = reconstructVerses(Fsp, Lsp, Tsp, 'g_cons', bookChapterVerseList) MTverses = reconstructVerses(Fmt, Lmt, Tmt, 'g_cons', bookChapterVerseList) # In[5]: from difflib import SequenceMatcher from IPython.display import HTML, display def highlightMatches(baseText, comparisonText): matcher = SequenceMatcher(None, baseText, comparisonText) highlightedComparisonText = "" for tag, i1, i2, j1, j2 in matcher.get_opcodes(): if tag == "equal": # Identical parts highlightedComparisonText += comparisonText[j1:j2] else: # Non-matching parts highlightedComparisonText += f'{comparisonText[j1:j2]}' return highlightedComparisonText def cleanText(text): replacements = [ # for the transcoded strings ('00_P', ''), # Remove '00_P' ('00_S', ''), # Remove '00_S' ('00', ''), # Remove '00' ('&', ' '), # Replace '&' with a space # for the Hebrew strings ('ס ', ''), # Final Samekh ('פ ', ''), # Final Pe ('׃', ''), # End of verse ('־',' ') # maqaf ] # Apply each replacement for old, new in replacements: text = text.replace(old, new) return text # Function to format and highlight verse differences between MT and SP def formatAndHighlight(label, MTverseText, SPverseText): book, chapter, verse = label MTverseNode = Tmt.nodeFromSection(label) MTtext = cleanText(Tmt.text(MTverseNode, "text-orig-plain")) SPverseNode = Tsp.nodeFromSection(label) SPtext = Tsp.text(SPverseNode) SPmarkedText = highlightMatches(MTtext, SPtext) MTmarkedText = highlightMatches(SPtext, MTtext) formattedDiff = ( f'SP: 
{SPmarkedText}
MT: {MTmarkedText}
Levenshtein Distance: {levDistance}
' # Add the distance MT.dm(formattedDiff) htmlContent += formattedDiff # Append to the HTML content # Save the content to an HTML file fileName = f"levenshtein_differences_MT_SP({parashaNameEnglish.replace(' ','%20')}).html" with open(fileName, "w", encoding="utf-8") as file: file.write(htmlContent) # wrap html header and footer and display a download button htmlContentFull = f'{htmlStart}{htmlContent}{htmlFooter}' downloadButton = f""" """ display(HTML(downloadButton)) # # 5 - Comparison of spelling of proper nouns between SP and MT # ##### [Back to ToC](#TOC) # # This section focuses on comparing the spelling of proper nouns between the Samaritan Pentateuch (SP) and the Masoretic Text (MT). Proper nouns, including names of people, places, and unique terms, often exhibit variations in spelling # In[7]: import collections def collectProperNounSpellings(F, L, T, inputList): """ Collect proper noun spellings and their associated word node numbers. Ensures only one tuple is stored for each lexeme-to-spelling mapping. """ properNounsSpellings = {} for bookChapterVerse in inputList: verseNode = T.nodeFromSection(bookChapterVerse) wordNodes = L.d(verseNode, 'word') for wordNode in wordNodes: if F.sp.v(wordNode) == 'nmpr': # Check if the word is a proper noun lex = F.lex.v(wordNode) # Lexical form spelling = F.g_cons.v(wordNode) # Spelling # Store only the first occurrence for each lex-to-cons mapping if lex not in properNounsSpellings or spelling not in {item[0] for item in properNounsSpellings[lex]}: properNounsSpellings.setdefault(lex, []).append((spelling, wordNode)) return properNounsSpellings SPspellingDict = collectProperNounSpellings(Fsp, Lsp, Tsp, bookChapterVerseList) MTspellingDict = collectProperNounSpellings(Fmt, Lmt, Tmt, bookChapterVerseList) # In[8]: from IPython.display import HTML, display # Initialize HTML content htmlContent = f'Author | #Tony Jurg | #
Version | #1.1 | #
Date | #18 November 2024 | #