This notebook contains the Pandas instructions to load the Pandas export of the BHSA.
We then perform some simple information extraction on the data.
The direct download link is data-2021.pd
The pandas file is over 50 MB, a bit too large for GitHub without large file support. So I attached it to the latest release.
If you want to do it yourself,
/pandas
subdirectory.
import os
import pandas as pd # pip3 install pandas
We set up some variables for the location of the Pandas file and a location where we will save the full text of this corpus.
# Version tag of the BHSA export; it is part of both file names below.
VERSION = "2021"

# Directory holding the Pandas export, relative to the current working directory.
PANDAS_DIR = os.path.abspath("../pandas")
# Directory that will receive the plain-text rendering of the corpus.
TEXT_DIR = os.path.abspath(os.path.expanduser("~/Downloads/text"))

TABLE_FILE_PD = f"{PANDAS_DIR}/data-{VERSION}.pd"
TABLE_FILE_TXT = f"{TEXT_DIR}/data-{VERSION}.txt"

# Create the output directory if needed. `exist_ok=True` replaces the
# separate existence check, so a directory that appears in between (or is
# already there) does not raise.
os.makedirs(TEXT_DIR, exist_ok=True)
# Load the complete export into a single DataFrame; the file is read as
# Parquet through the pyarrow engine.
frame = pd.read_parquet(TABLE_FILE_PD, engine="pyarrow")

# DataFrame.size is the total number of cells (rows x columns).
cellCount = frame.size
print("Done. Size={}".format(cellCount))
Done. Size=104171832
# (number of rows, number of columns) of the loaded table.
frame.shape
(1446831, 72)
# Display the first 30 rows of the table.
frame.head(30)
nd | otype | g_cons | g_cons_utf8 | g_lex | g_lex_utf8 | g_word | g_word_utf8 | lex | lex_utf8 | ... | tab | txt | typ | uvf | vbe | vbs | verse | voc_lex | vs | vt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 426591 | book | ... | <NA> | <NA> | ||||||||||||||||
1 | 426630 | chapter | ... | <NA> | <NA> | ||||||||||||||||
2 | 1414389 | verse | ... | <NA> | 1 | ||||||||||||||||
3 | 1172308 | sentence | ... | <NA> | <NA> | ||||||||||||||||
4 | 1236025 | sentence_atom | ... | <NA> | <NA> | ||||||||||||||||
5 | 427559 | clause | ... | <NA> | ? | xQtX | <NA> | ||||||||||||||
6 | 515690 | clause_atom | ... | <NA> | xQtX | <NA> | |||||||||||||||
7 | 606394 | half_verse | ... | <NA> | <NA> | ||||||||||||||||
8 | 651573 | phrase | ... | <NA> | PP | <NA> | |||||||||||||||
9 | 904776 | phrase_atom | ... | <NA> | PP | <NA> | |||||||||||||||
10 | 1437602 | lex | B | ב | ... | <NA> | <NA> | B.: | |||||||||||||
11 | 1 | word | B | ב | B.:- | בְּ | B.:- | בְּ | B | ב | ... | <NA> | absent | n/a | n/a | <NA> | B.: | NA | NA | ||
12 | 1437603 | lex | R>CJT/ | ראשׁית֜ | ... | <NA> | <NA> | R;>CIJT | |||||||||||||
13 | 2 | word | R>CJT | ראשׁית | R;>CIJT | רֵאשִׁית | R;>CI73JT | רֵאשִׁ֖ית | R>CJT/ | ראשׁית | ... | <NA> | absent | n/a | n/a | <NA> | R;>CIJT | NA | NA | ||
14 | 1437604 | lex | BR>[ | ברא | ... | <NA> | <NA> | BR> | |||||||||||||
15 | 651574 | phrase | ... | <NA> | VP | <NA> | |||||||||||||||
16 | 904777 | phrase_atom | ... | <NA> | VP | <NA> | |||||||||||||||
17 | 3 | word | BR> | ברא | B.@R@> | בָּרָא | B.@R@74> | בָּרָ֣א | BR>[ | ברא | ... | <NA> | absent | absent | <NA> | BR> | qal | perf | |||
18 | 1437605 | lex | >LHJM/ | אלהים֜ | ... | <NA> | <NA> | >:ELOHIJM | |||||||||||||
19 | 651575 | phrase | ... | <NA> | NP | <NA> | |||||||||||||||
20 | 904778 | phrase_atom | ... | <NA> | NP | <NA> | |||||||||||||||
21 | 4 | word | >LHJM | אלהים | >:ELOH | אֱלֹה | >:ELOHI92JM | אֱלֹהִ֑ים | >LHJM/ | אלהים | ... | <NA> | absent | n/a | n/a | <NA> | >:ELOHIJM | NA | NA | ||
22 | 606395 | half_verse | ... | <NA> | <NA> | ||||||||||||||||
23 | 651576 | phrase | ... | <NA> | PP | <NA> | |||||||||||||||
24 | 904779 | phrase_atom | ... | <NA> | PP | <NA> | |||||||||||||||
25 | 1300539 | subphrase | ... | <NA> | <NA> | ||||||||||||||||
26 | 1437606 | lex | >T | את | ... | <NA> | <NA> | >;T | |||||||||||||
27 | 5 | word | >T | את | >;T | אֵת | >;71T | אֵ֥ת | >T | את | ... | <NA> | absent | n/a | n/a | <NA> | >;T | NA | NA | ||
28 | 1437607 | lex | H | ה | ... | <NA> | <NA> | HA | |||||||||||||
29 | 6 | word | H | ה | HA- | הַ | HA- | הַ | H | ה | ... | <NA> | absent | n/a | n/a | <NA> | HA | NA | NA |
30 rows × 72 columns
# All column names of the table as a plain Python list.
columnList = list(frame.columns)
columnList
['nd', 'otype', 'g_cons', 'g_cons_utf8', 'g_lex', 'g_lex_utf8', 'g_word', 'g_word_utf8', 'lex', 'lex_utf8', 'phono', 'phono_trailer', 'qere', 'qere_trailer', 'qere_trailer_utf8', 'qere_utf8', 'trailer', 'trailer_utf8', 'voc_lex_utf8', 'in_book', 'in_chapter', 'in_verse', 'in_lex', 'in_half_verse', 'in_sentence', 'in_sentence_atom', 'in_clause', 'in_clause_atom', 'in_phrase', 'in_phrase_atom', 'in_subphrase', 'in_word', 'crossref', 'mother', 'book', 'chapter', 'code', 'det', 'domain', 'freq_lex', 'function', 'gloss', 'gn', 'label', 'language', 'ls', 'nametype', 'nme', 'nu', 'number', 'pargr', 'pdp', 'pfm', 'prs', 'prs_gn', 'prs_nu', 'prs_ps', 'ps', 'rank_lex', 'rela', 'sp', 'st', 'tab', 'txt', 'typ', 'uvf', 'vbe', 'vbs', 'verse', 'voc_lex', 'vs', 'vt']
Let us extract some data. First a list of the book names.
# Take the `book` column of the rows whose object type is "book".
books = frame.loc[frame.otype == "book", "book"]
print(" ".join(map(str, books)))
Genesis Exodus Leviticus Numeri Deuteronomium Josua Judices Samuel_I Samuel_II Reges_I Reges_II Jesaia Jeremia Ezechiel Hosea Joel Amos Obadia Jona Micha Nahum Habakuk Zephania Haggai Sacharia Maleachi Psalmi Iob Proverbia Ruth Canticum Ecclesiastes Threni Esther Daniel Esra Nehemia Chronica_I Chronica_II
Now the complete text of the whole Bible.
# Select the rows of type "word"; the text of each word is its
# `g_word_utf8` value followed by its `trailer_utf8` value.
words = frame.loc[frame.otype == "word"]
text = words.g_word_utf8 + words.trailer_utf8

# The text is Hebrew, so the encoding is fixed to UTF-8 instead of relying
# on the platform default (which is not UTF-8 everywhere).
with open(TABLE_FILE_TXT, "w", encoding="utf-8") as pt:
    # Start a new line after every sof pasuq (U+05C3) and turn literal
    # backslash-n sequences in the data into real newlines.
    pt.write(("".join(text)).replace("\u05C3", "\u05C3\n").replace("\\n", "\n"))
    pt.write("\n")
# IPython shell escape: show the first lines of the text file.
!head {TABLE_FILE_TXT}
בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃ וְהָאָ֗רֶץ הָיְתָ֥ה תֹ֨הוּ֙ וָבֹ֔הוּ וְחֹ֖שֶׁךְ עַל־פְּנֵ֣י תְהֹ֑ום וְר֣וּחַ אֱלֹהִ֔ים מְרַחֶ֖פֶת עַל־פְּנֵ֥י הַמָּֽיִם׃ וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י אֹ֑ור וַֽיְהִי־אֹֽור׃ וַיַּ֧רְא אֱלֹהִ֛ים אֶת־הָאֹ֖ור כִּי־טֹ֑וב וַיַּבְדֵּ֣ל אֱלֹהִ֔ים בֵּ֥ין הָאֹ֖ור וּבֵ֥ין הַחֹֽשֶׁךְ׃ וַיִּקְרָ֨א אֱלֹהִ֤ים׀ לָאֹור֙ יֹ֔ום וְלַחֹ֖שֶׁךְ קָ֣רָא לָ֑יְלָה וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר יֹ֥ום אֶחָֽד׃ פ וַיֹּ֣אמֶר אֱלֹהִ֔ים יְהִ֥י רָקִ֖יעַ בְּתֹ֣וךְ הַמָּ֑יִם וִיהִ֣י מַבְדִּ֔יל בֵּ֥ין מַ֖יִם לָמָֽיִם׃ וַיַּ֣עַשׂ אֱלֹהִים֮ אֶת־הָרָקִיעַ֒ וַיַּבְדֵּ֗ל בֵּ֤ין הַמַּ֨יִם֙ אֲשֶׁר֙ מִתַּ֣חַת לָרָקִ֔יעַ וּבֵ֣ין הַמַּ֔יִם אֲשֶׁ֖ר מֵעַ֣ל לָרָקִ֑יעַ וַֽיְהִי־כֵֽן׃ וַיִּקְרָ֧א אֱלֹהִ֛ים לָֽרָקִ֖יעַ שָׁמָ֑יִם וַֽיְהִי־עֶ֥רֶב וַֽיְהִי־בֹ֖קֶר יֹ֥ום שֵׁנִֽי׃ פ וַיֹּ֣אמֶר אֱלֹהִ֗ים יִקָּו֨וּ הַמַּ֜יִם מִתַּ֤חַת הַשָּׁמַ֨יִם֙ אֶל־מָקֹ֣ום אֶחָ֔ד וְתֵרָאֶ֖ה הַיַּבָּשָׁ֑ה וַֽיְהִי־כֵֽן׃ וַיִּקְרָ֨א אֱלֹהִ֤ים׀ לַיַּבָּשָׁה֙ אֶ֔רֶץ וּלְמִקְוֵ֥ה הַמַּ֖יִם קָרָ֣א יַמִּ֑ים וַיַּ֥רְא אֱלֹהִ֖ים כִּי־טֹֽוב׃
Let us get the words from the first verse.
How do we know the node of the first verse? See the end of the bigTable notebook.
# Node number (`nd`) taken to be the first verse; its type is checked next.
v1 = 1414389
Is this really a verse?
# Look up the object type stored for the row whose node number is v1.
element = frame.loc[frame.nd == v1, "otype"].iloc[0]
element
'verse'
# Node numbers of the word rows whose `in_verse` value is v1.
wordIds = frame.loc[(frame.otype == "word") & (frame.in_verse == v1), "nd"]
print(wordIds.values)
<IntegerArray> [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11] Length: 11, dtype: Int64
Now the text of the first verse.
# Word rows of verse v1; each word contributes its text plus its trailer.
words = frame.loc[(frame.otype == "word") & (frame.in_verse == v1)]
text = words["g_word_utf8"] + words["trailer_utf8"]
print(
    "".join(text)
    .replace("\u05C3", "\u05C3\n")
    .replace("\\n", "\n")
)
בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ׃
Let us get the words and text of an arbitrary passage, say Psalmi 131:2
First the id of the chunk (i.e. the Text-Fabric node number):
# Find the single verse row for Psalmi 131:2 and take its node number.
verse_id = frame.loc[
    (frame.otype == "verse")
    & (frame.book == "Psalmi")
    & (frame.chapter == 131)
    & (frame.verse == 2),
    "nd",
].iloc[0]
print(verse_id)
1431812
Now the word ids of that verse:
# Word rows whose containing verse is the verse found above.
words = frame.loc[(frame.otype == "word") & (frame.in_verse == verse_id)]
print(words["nd"].values)
<IntegerArray> [333425, 333426, 333427, 333428, 333429, 333430, 333431, 333432, 333433, 333434, 333435, 333436, 333437, 333438, 333439] Length: 15, dtype: Int64
And, finally, the text of those words.
# Each word contributes its text followed by its trailer.
text = words.g_word_utf8 + words.trailer_utf8
# Same post-processing as the other text cells: break the line after every
# sof pasuq (U+05C3) and turn literal backslash-n sequences into newlines
# (this cell previously skipped the second step).
print(("".join(text)).replace("\u05C3", "\u05C3\n").replace("\\n", "\n"))
אִם־לֹ֤א שִׁוִּ֨יתִי׀ וְדֹומַ֗מְתִּי נַ֫פְשִׁ֥י כְּ֭גָמֻל עֲלֵ֣י אִמֹּ֑ו כַּגָּמֻ֖ל עָלַ֣י נַפְשִֽׁי׃
Now let us organize this in a few functions: one that returns the text of the words in a given object, two that return the verse or chapter object for a given passage, and two that combine these to return the text of a verse or chapter.
def object2text(nd):
    """Return the text of all words contained in the object with node `nd`.

    The object's type is looked up in the table; the words are the rows of
    type "word" whose `in_<type>` column equals `nd`. A line break is
    inserted after every sof pasuq (U+05C3) and literal backslash-n
    sequences are turned into newlines.

    Raises IndexError (from `.iloc[0]`) when no row has node number `nd`.
    """
    node_type = frame.loc[frame.nd == nd, "otype"].iloc[0]
    container_column = "in_" + node_type

    is_word = frame.otype == "word"
    is_inside = frame[container_column] == nd
    members = frame[is_word & is_inside]

    pieces = members["g_word_utf8"] + members["trailer_utf8"]
    joined = "".join(pieces)
    return joined.replace("\u05C3", "\u05C3\n").replace("\\n", "\n")
def verse2object(book, chapter, verse):
    """Return the node number of the verse row matching the passage.

    A row matches when its type is "verse" and its `book`, `chapter` and
    `verse` columns equal the arguments. The first match is returned.

    Raises IndexError (from `.iloc[0]`) when nothing matches.
    """
    is_verse = frame.otype == "verse"
    same_book = frame.book == book
    same_chapter = frame.chapter == chapter
    same_verse = frame.verse == verse

    hits = frame[is_verse & same_book & same_chapter & same_verse]
    return hits["nd"].iloc[0]
def verse2text(book, chapter, verse):
    """Return the text of the verse at `book` `chapter`:`verse`."""
    verse_node = verse2object(book, chapter, verse)
    return object2text(verse_node)
def chapter2object(book, chapter):
    """Return the node number of the chapter row matching the passage.

    A row matches when its type is "chapter" and its `book` and `chapter`
    columns equal the arguments. The first match is returned.

    Raises IndexError (from `.iloc[0]`) when nothing matches.
    """
    is_chapter = frame.otype == "chapter"
    same_book = frame.book == book
    same_chapter = frame.chapter == chapter

    hits = frame[is_chapter & same_book & same_chapter]
    return hits["nd"].iloc[0]
def chapter2text(book, chapter):
    """Return the text of the chapter `chapter` of `book`."""
    chapter_node = chapter2object(book, chapter)
    return object2text(chapter_node)
# Text of a single verse, looked up by book name, chapter and verse number.
print(verse2text("Psalmi", 131, 2))
אִם־לֹ֤א שִׁוִּ֨יתִי׀ וְדֹומַ֗מְתִּי נַ֫פְשִׁ֥י כְּ֭גָמֻל עֲלֵ֣י אִמֹּ֑ו כַּגָּמֻ֖ל עָלַ֣י נַפְשִֽׁי׃
# Text of a whole chapter, looked up by book name and chapter number.
print(chapter2text("Psalmi", 131))
שִׁ֥יר הַֽמַּֽעֲלֹ֗ות לְדָ֫וִ֥ד יְהוָ֤ה׀ לֹא־גָבַ֣הּ לִ֭בִּי וְלֹא־רָמ֣וּ עֵינַ֑י וְלֹֽא־הִלַּ֓כְתִּי׀ בִּגְדֹלֹ֖ות וּבְנִפְלָאֹ֣ות מִמֶּֽנִּי׃ אִם־לֹ֤א שִׁוִּ֨יתִי׀ וְדֹומַ֗מְתִּי נַ֫פְשִׁ֥י כְּ֭גָמֻל עֲלֵ֣י אִמֹּ֑ו כַּגָּמֻ֖ל עָלַ֣י נַפְשִֽׁי׃ יַחֵ֣ל יִ֝שְׂרָאֵל אֶל־יְהוָ֑ה מֵֽ֝עַתָּ֗ה וְעַד־עֹולָֽם׃
We make a list of verse-bound bi-grams of lexemes, one per word. The two lexemes are separated by an underscore `_`.
# Filter the word rows once; every series below is derived from this
# selection instead of re-filtering the full table each time.
wordRows = frame[frame.otype == "word"]

# NOTE: the names below are kept as they were, but `shift(1)` moves values
# one position DOWN, so the shifted series hold the value of the PRECEDING
# word, not the following one.
vsNext = wordRows.in_verse        # verse of each word
vsPrev = vsNext.shift(1)          # verse of the preceding word
lex = wordRows.lex_utf8           # lexeme of each word
lexNext = lex.shift(1)            # lexeme of the preceding word

# True where a word's verse differs from the preceding word's verse, i.e.
# at the first word of each verse. The very first word has no predecessor;
# if the comparison yields a missing value there (as nullable dtypes do),
# count it as a verse start too, so it does not keep a missing partner.
lastInVs = (vsPrev != vsNext).fillna(True)

# Bigrams do not cross verse boundaries: blank the partner at verse starts.
lexNext[lastInVs] = ""

# One string per word, formatted as "<lexeme>_<preceding lexeme>"; the
# first word of a verse gives "<lexeme>_".
bigram = ["{}_{}".format(*p) for p in zip(lex, lexNext)]
bigram[10_000:10_030]
['אשׁה_אם', 'מן_אשׁה', 'ארץ_מן', 'מצרים_ארץ', 'ו_', 'היה_ו', 'ב_היה', 'ה_ב', 'עת_ה', 'ה_עת', 'היא_ה', 'ו_היא', 'אמר_ו', 'אבימלך_אמר', 'ו_אבימלך', 'פיכל_ו', 'שׂר_פיכל', 'צבא_שׂר', 'אל_צבא', 'אברהם_אל', 'ל_אברהם', 'אמר_ל', 'אלהים_אמר', 'עם_אלהים', 'ב_עם', 'כל_ב', 'אשׁר_כל', 'אתה_אשׁר', 'עשׂה_אתה', 'ו_']