#!/usr/bin/env python
# coding: utf-8

# # Import BHSA data into Pandas
#
# This notebook contains the Pandas instructions to load the
# [Pandas export](export.ipynb) of the BHSA.
#
# We then perform some simple information extraction on the data.

# # How to get the Pandas file
#
# The direct download link is
# [data-2021.pd](https://github.com/ETCBC/bhsa/releases/download/v1.8/data-2021.pd)
#
# The pandas file is over 50 MB, a bit too large for GitHub without large file support.
# So I attached it to the
# [latest release](https://github.com/ETCBC/bhsa/releases/tag/v1.8).
#
# ## Reproduction
#
# If you want to do it yourself,
#
# * clone this repo
# * find the [export](export.ipynb) notebook
# * run it in Jupyterlab
# * pick up the newly generated file from the `/pandas` subdirectory.

# In[5]:


import os

import pandas as pd  # pip3 install pandas


# # File locations
#
# We set up some variables for the location of the Pandas file and a location
# where we will save the full text of this corpus.

# In[6]:


VERSION = "2021"
PANDAS_DIR = os.path.abspath("../pandas")
TEXT_DIR = os.path.abspath(os.path.expanduser("~/Downloads/text"))
TABLE_FILE_PD = f"{PANDAS_DIR}/data-{VERSION}.pd"
TABLE_FILE_TXT = f"{TEXT_DIR}/data-{VERSION}.txt"

# Create the output directory if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(TEXT_DIR, exist_ok=True)


# # Load the dataframe

# In[7]:


# Despite its `.pd` extension, the file is read as Parquet.
frame = pd.read_parquet(TABLE_FILE_PD, engine="pyarrow")
print(f"Done. Size={frame.size}")


# In[4]:


# Bare expressions like this one are displayed by the notebook;
# they have no effect when the file is run as a plain script.
frame.shape


# In[5]:


frame.head(30)


# In[10]:


columnList = frame.columns.values.tolist()
columnList


# # Books
# Let us extract some data.
# First a list of the book names.

# In[11]:


# Rows whose otype is "book" carry the book name in the `book` column.
books = frame[frame.otype == "book"].book
print(" ".join(str(x) for x in books))


# # Text
#
# Now the complete text of the whole Bible.
# In[12]:


words = frame.loc[frame.otype == "word"]
# Each word contributes its own text followed by its trailer.
text = words.g_word_utf8 + words.trailer_utf8
# The text is Hebrew, so write it as UTF-8 explicitly instead of relying on
# the platform's default encoding.
with open(TABLE_FILE_TXT, "w", encoding="utf-8") as pt:
    # Start a new line after every sof pasuq (U+05C3) and turn literal
    # backslash-n sequences in the data into real newlines.
    pt.write(("".join(text)).replace("\u05C3", "\u05C3\n").replace("\\n", "\n"))
    pt.write("\n")


# In[13]:


# IPython-only shell escape: shows the first lines of the file just written.
get_ipython().system('head {TABLE_FILE_TXT}')


# # Drill down to a passage
# Let us get the words from the first verse.
#
# How do we know the node of the first verse? See the end of the
# [bigTable](bigTable.ipynb) notebook.

# In[14]:


v1 = 1414389


# Is this really a verse?

# In[15]:


element = frame[frame.nd == v1].otype.iloc[0]
element


# In[16]:


wordIds = frame[(frame.otype == "word") & (frame.in_verse == v1)].nd
print(wordIds.values)


# Now the *text* of the first verse.

# In[17]:


words = frame[(frame.otype == "word") & (frame.in_verse == v1)]
text = words.g_word_utf8 + words.trailer_utf8
print(("".join(text)).replace("\u05C3", "\u05C3\n").replace("\\n", "\n"))


# Let us get the words and text of an arbitrary passage, say Psalmi 131:2
#
# First the id of the chunk (i.e. the Text-Fabric node number):

# In[18]:


verse_id = frame[
    (frame.otype == "verse")
    & (frame.book == "Psalmi")
    & (frame.chapter == 131)
    & (frame.verse == 2)
].nd.iloc[0]
print(verse_id)


# Now the word ids of that verse:

# In[19]:


words = frame[(frame.otype == "word") & (frame.in_verse == verse_id)]
print(words.nd.values)


# And, finally, the text of those words.

# In[20]:


text = words.g_word_utf8 + words.trailer_utf8
print(("".join(text)).replace("\u05C3", "\u05C3\n"))


# Now let us organize this in a few functions: functions that return the
# verse or chapter object given a passage, and one that returns the text of
# the words in a given object.
# In[21]:


def object2text(nd: int) -> str:
    """Return the text of all words contained in the object with node ``nd``.

    The type of the object is looked up in ``frame``; its words are then
    selected through the column ``in_<otype>`` (``in_verse`` for a verse).
    A newline is inserted after every sof pasuq (U+05C3), and literal
    backslash-n sequences in the data become real newlines.

    NOTE(review): presumably only object types that have a matching
    ``in_<otype>`` column in ``frame`` are supported - confirm.

    Raises:
        IndexError: if no row of ``frame`` has node number ``nd``.
    """
    otype = frame[frame.nd == nd].otype.iloc[0]
    inelement = "in_" + otype
    words = frame[(frame.otype == "word") & (frame[inelement] == nd)]
    text = words.g_word_utf8 + words.trailer_utf8
    return ("".join(text)).replace("\u05C3", "\u05C3\n").replace("\\n", "\n")


def verse2object(book: str, chapter: int, verse: int):
    """Return the node number of the verse ``book`` ``chapter``:``verse``.

    Raises:
        IndexError: if ``frame`` has no such verse row.
    """
    return frame[
        (frame.otype == "verse")
        & (frame.book == book)
        & (frame.chapter == chapter)
        & (frame.verse == verse)
    ].nd.iloc[0]


def verse2text(book: str, chapter: int, verse: int) -> str:
    """Return the text of the verse ``book`` ``chapter``:``verse``."""
    return object2text(verse2object(book, chapter, verse))


def chapter2object(book: str, chapter: int):
    """Return the node number of chapter ``chapter`` of ``book``.

    Raises:
        IndexError: if ``frame`` has no such chapter row.
    """
    return frame[
        (frame.otype == "chapter")
        & (frame.book == book)
        & (frame.chapter == chapter)
    ].nd.iloc[0]


def chapter2text(book: str, chapter: int) -> str:
    """Return the text of chapter ``chapter`` of ``book``."""
    return object2text(chapter2object(book, chapter))


# In[22]:


print(verse2text("Psalmi", 131, 2))


# In[23]:


print(chapter2text("Psalmi", 131))


# # Bi-grams
#
# We make a column of verse-bound bi-grams of lexemes.
# The two lexemes are separated by an underscore `_`.
# The last word of a verse has no successor within its verse,
# so its bi-gram ends with the underscore.

# In[24]:


# Select the word rows once instead of re-filtering the frame per column.
wordRows = frame[frame.otype == "word"]
vsCur = wordRows.in_verse
# shift(-1) moves every value one row up, so row i sees the value of row i+1,
# i.e. the verse/lexeme of the *following* word.
vsNext = vsCur.shift(-1)
lex = wordRows.lex_utf8
lexNext = lex.shift(-1)


# In[25]:


# A word is the last one in its verse when the following word belongs to a
# different verse; for the very last word the shifted value is missing, which
# also compares as different.
lastInVs = vsCur != vsNext
lexNext[lastInVs] = ""


# In[26]:


bigram = [f"{cur}_{nxt}" for cur, nxt in zip(lex, lexNext)]


# In[27]:


bigram[10_000:10_030]


# In[ ]: