#!/usr/bin/env python
# coding: utf-8
#
#
#
#
#
# # Import BHSA data into Pandas
#
# This notebook contains the Pandas instructions to load the
# [Pandas export](export.ipynb) of the BHSA.
#
# We then perform some simple information extraction on the data.
# # How to get the Pandas file
#
# The direct download link is
# [data-2021.pd](https://github.com/ETCBC/bhsa/releases/download/v1.8/data-2021.pd)
#
# The pandas file is over 50 MB, a bit too large for GitHub without large file support.
# So I attached it to the
# [latest release](https://github.com/ETCBC/bhsa/releases/tag/v1.8).
#
# ## Reproduction
#
# If you want to do it yourself,
#
# * clone this repo
# * find the [export](export.ipynb) notebook
# * run it in Jupyterlab
# * pick up the newly generated file from the `/pandas` subdirectory.
# In[5]:
import os
import pandas as pd # pip3 install pandas
# # File locations
#
# We set up some variables for the location of the Pandas file and a location
# where we will save the full text of this corpus.
# In[6]:
# Version of the data set; it is part of both file names below.
VERSION = "2021"
# Directory holding the Pandas export, resolved against the working directory.
PANDAS_DIR = os.path.abspath("../pandas")
# Directory where the full text of the corpus will be saved.
TEXT_DIR = os.path.abspath(os.path.expanduser("~/Downloads/text"))
TABLE_FILE_PD = f"{PANDAS_DIR}/data-{VERSION}.pd"
TABLE_FILE_TXT = f"{TEXT_DIR}/data-{VERSION}.txt"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that could race with another process.
os.makedirs(TEXT_DIR, exist_ok=True)
# # Load the dataframe
# In[7]:
# Read the exported table from disk into a dataframe.
frame = pd.read_parquet(TABLE_FILE_PD, engine="pyarrow")
# Report the total number of cells in the table.
print(f"Done. Size={frame.size}")
# In[4]:
# Dimensions of the table as (rows, columns); a notebook shows this as cell output.
frame.shape
# In[5]:
# Preview the first 30 rows of the table.
frame.head(30)
# In[10]:
# Collect the column names in a plain Python list and display it.
columnList = frame.columns.tolist()
columnList
# # Books
# Let us extract some data.
# First a list of the book names.
# In[11]:
# Take the `book` column of the rows whose object type is "book".
books = frame.loc[frame.otype == "book", "book"]
# Print all book names on one line, separated by single spaces.
print(" ".join(map(str, books)))
# # Text
#
# Now the complete text of the whole Bible.
# In[12]:
# Keep only the word rows; each word contributes its own text followed by
# its trailer (the material between this word and the next).
words = frame.loc[frame.otype == "word"]
text = words.g_word_utf8 + words.trailer_utf8
# The text is Hebrew: name the encoding explicitly, because the platform
# default encoding may be unable to represent it.
with open(TABLE_FILE_TXT, "w", encoding="utf-8") as pt:
    # Start a new line after every sof pasuq (U+05C3) and turn the escaped
    # two-character sequence backslash-n into a real newline.
    pt.write(("".join(text)).replace("\u05C3", "\u05C3\n").replace("\\n", "\n"))
    pt.write("\n")
# In[13]:
# Show the first 10 lines of the text file, which is what the `head` command
# prints by default. Plain Python is used so that this also works outside
# IPython (where `get_ipython` is undefined) and on systems without `head`.
# errors="replace" keeps this preview from failing on undecodable bytes.
with open(TABLE_FILE_TXT, encoding="utf-8", errors="replace") as textFile:
    # zip stops after 10 lines or at the end of the file, whichever is first
    for lineNo, line in zip(range(10), textFile):
        print(line, end="")
# # Drill down to a passage
# Let us get the words from the first verse.
#
# How do we know the node of the first verse? See the end of the
# [bigTable](bigTable.ipynb) notebook.
# In[14]:
# Text-Fabric node number of the first verse (looked up in the bigTable notebook).
v1 = 1414389
# Is this really a verse?
# In[15]:
# Find the row of node `v1` and read its object type.
element = frame.loc[frame.nd == v1, "otype"].iloc[0]
element
# In[16]:
# Node numbers of the word rows that are contained in verse `v1`.
wordIds = frame.loc[(frame.otype == "word") & (frame.in_verse == v1), "nd"]
print(wordIds.values)
# Now the *text* of the first verse.
# In[17]:
# Select the word rows of verse `v1`.
words = frame.loc[(frame.otype == "word") & (frame.in_verse == v1)]
# Each word is followed by its trailer.
text = words["g_word_utf8"] + words["trailer_utf8"]
# New line after every U+05C3; escaped backslash-n becomes a real newline.
print("".join(text).replace("\u05C3", "\u05C3\n").replace("\\n", "\n"))
# Let us get the words and text of an arbitrary passage, say Psalmi 131:2
#
# First the id of the chunk (i.e. the Text-Fabric node number):
# In[18]:
# Node number of the single verse row matching book, chapter and verse.
verse_id = frame.loc[
    frame.otype.eq("verse")
    & frame.book.eq("Psalmi")
    & frame.chapter.eq(131)
    & frame.verse.eq(2),
    "nd",
].iloc[0]
print(verse_id)
# Now the word ids of that verse:
# In[19]:
# Word rows that are contained in the verse found above.
words = frame.loc[frame.otype.eq("word") & frame.in_verse.eq(verse_id)]
print(words["nd"].values)
# And, finally, the text of those words.
# In[20]:
# Each word is followed by its trailer.
text = words.g_word_utf8 + words.trailer_utf8
# Render the text the same way as everywhere else in this notebook: new line
# after every U+05C3, and escaped backslash-n turned into a real newline
# (this second step was missing here, unlike in `object2text`).
print(("".join(text)).replace("\u05C3", "\u05C3\n").replace("\\n", "\n"))
# Now let us organize this in a few functions: ones that return the verse or chapter object given a passage, and one that returns the text of the words in a given object.
# In[21]:
def object2text(nd):
    """Return the text of all words contained in the object with node `nd`.

    The object type of the node is looked up in the global `frame`; the words
    are the word rows whose `in_<type>` column equals `nd`. A newline is
    inserted after every U+05C3, and escaped backslash-n sequences become
    real newlines.

    Raises IndexError when no row of `frame` has node number `nd`.
    """
    node_type = frame.loc[frame.nd == nd, "otype"].iloc[0]
    container_column = "in_" + node_type
    is_member = (frame.otype == "word") & (frame[container_column] == nd)
    members = frame[is_member]
    pieces = members.g_word_utf8 + members.trailer_utf8
    joined = "".join(pieces)
    return joined.replace("\u05C3", "\u05C3\n").replace("\\n", "\n")
def verse2object(book, chapter, verse):
    """Return the node number of the verse at `book` `chapter`:`verse`.

    The lookup is done in the global `frame`. When several rows match, the
    first one is used. Raises IndexError when no verse row matches.
    """
    is_verse = frame.otype == "verse"
    same_book = frame.book == book
    same_chapter = frame.chapter == chapter
    same_verse = frame.verse == verse
    matches = frame[is_verse & same_book & same_chapter & same_verse]
    return matches.nd.iloc[0]
def verse2text(book, chapter, verse):
    """Return the text of the verse at `book` `chapter`:`verse`."""
    verse_node = verse2object(book, chapter, verse)
    return object2text(verse_node)
def chapter2object(book, chapter):
    """Return the node number of chapter `chapter` of `book`.

    The lookup is done in the global `frame`. When several rows match, the
    first one is used. Raises IndexError when no chapter row matches.
    """
    is_chapter = frame.otype == "chapter"
    same_book = frame.book == book
    same_chapter = frame.chapter == chapter
    matches = frame[is_chapter & same_book & same_chapter]
    return matches.nd.iloc[0]
def chapter2text(book, chapter):
    """Return the text of chapter `chapter` of `book`."""
    chapter_node = chapter2object(book, chapter)
    return object2text(chapter_node)
# In[22]:
# Text of a single verse, looked up by passage.
print(verse2text(book="Psalmi", chapter=131, verse=2))
# In[23]:
# Text of the whole chapter that contains that verse.
print(chapter2text(book="Psalmi", chapter=131))
# # Bi-grams
#
# We make a column of verse-bound bi-grams of lexemes. The two lexemes are separated by an underscore `_`.
# In[24]:
# Work on the word rows only; select them once instead of once per column.
wordRows = frame[frame.otype == "word"]
# A bigram joins a word with the word that FOLLOWS it. shift(-1) moves the
# values one row up, so row i sees the value of row i + 1. (shift(1) would
# pair every word with its predecessor and give the bigrams in reversed order.)
vsPrev = wordRows.in_verse  # verse of the first word of each pair
vsNext = wordRows.in_verse.shift(-1)  # verse of the second word of each pair
lex = wordRows.lex_utf8
lexNext = wordRows.lex_utf8.shift(-1)
# In[25]:
# The last word of a verse has no successor inside that verse, so its second
# slot is left empty. For the very last word the shifted value is missing,
# which also compares as unequal.
lastInVs = vsPrev != vsNext
lexNext[lastInVs] = ""
# In[26]:
# One entry per word: "<lexeme>_<next lexeme>", or "<lexeme>_" at a verse end.
bigram = [f"{first}_{second}" for first, second in zip(lex, lexNext)]
# In[27]:
# Show a sample of 30 consecutive bigrams, starting at position 10,000.
bigram[10_000:10_030]
# In[ ]: