This notebook contains the R instructions to load the bigTables export of the BHSA and save it in the much more compact .rds format.
We then perform some simple information extraction on the data. For comparison, the same information extraction has been done with Pandas, in bigTablesP.
Note that we have to ignore quotes and comment signs!
First we load the big text file with all information. This will take 3 minutes or so.
# Read the tab-separated BHSA export into a data frame.
# Quote and comment processing are switched off so that such characters
# stay part of the data; as.is = TRUE suppresses conversion to factors.
bhsa = read.table(
  file = "../_temp/2017/r/bhsa2017.txt",
  header = TRUE,
  sep = "\t",
  quote = "",
  comment.char = "",
  as.is = TRUE
)
# Report the number of rows and columns that were read.
dim(bhsa)
Now we save it in the compact .rds format.
# Persist the data frame in R's serialized .rds format.
saveRDS(bhsa, file = "../_temp/2017/r/bhsa2017.rds")
We load the data again, now from the compact representation. Much quicker. Still 40 seconds.
# Restore the data frame from the .rds file.
bhsa = readRDS("../_temp/2017/r/bhsa2017.rds")
# Show its dimensions, then its first 30 rows.
dim(bhsa)
head(bhsa, 30)
n | otype | in.subphrase | in.phrase_atom | in.phrase | in.clause_atom | in.clause | in.sentence_atom | in.sentence | in.half_verse | ⋯ | txt | typ | uvf | vbe | vbs | verse | voc_lex | voc_lex_utf8 | vs | vt |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
426585 | book | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | |||||||||
426624 | chapter | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | |||||||||
1414190 | verse | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | 1 | |||||||||
1172209 | sentence | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | |||||||||
1235920 | sentence_atom | NA | NA | NA | NA | NA | NA | 1172209 | NA | ⋯ | NA | |||||||||
427553 | clause | NA | NA | NA | NA | NA | 1235920 | 1172209 | NA | ⋯ | ? | xQtX | NA | |||||||
515654 | clause_atom | NA | NA | NA | NA | 427553 | 1235920 | 1172209 | NA | ⋯ | xQtX | NA | ||||||||
606323 | half_verse | NA | NA | NA | 515654 | 427553 | 1235920 | 1172209 | NA | ⋯ | NA | |||||||||
651503 | phrase | NA | NA | NA | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | PP | NA | ||||||||
904690 | phrase_atom | NA | NA | 651503 | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | PP | NA | ||||||||
1437403 | lex | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | B.: | בְּ | |||||||
1 | word | NA | 904690 | 651503 | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | absent | n/a | n/a | NA | NA | NA | ||||
1437404 | lex | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | R;>CIJT | רֵאשִׁית | |||||||
2 | word | NA | 904690 | 651503 | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | absent | n/a | n/a | NA | NA | NA | ||||
1437405 | lex | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | BR> | ברא | |||||||
651504 | phrase | NA | NA | NA | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | VP | NA | ||||||||
904691 | phrase_atom | NA | NA | 651504 | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | VP | NA | ||||||||
3 | word | NA | 904691 | 651504 | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | absent | absent | NA | qal | perf | |||||
1437406 | lex | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | >:ELOHIJM | אֱלֹהִים | |||||||
651505 | phrase | NA | NA | NA | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | NP | NA | ||||||||
904692 | phrase_atom | NA | NA | 651505 | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | NP | NA | ||||||||
4 | word | NA | 904692 | 651505 | 515654 | 427553 | 1235920 | 1172209 | 606323 | ⋯ | absent | n/a | n/a | NA | NA | NA | ||||
606324 | half_verse | NA | NA | NA | 515654 | 427553 | 1235920 | 1172209 | NA | ⋯ | NA | |||||||||
651506 | phrase | NA | NA | NA | 515654 | 427553 | 1235920 | 1172209 | 606324 | ⋯ | PP | NA | ||||||||
904693 | phrase_atom | NA | NA | 651506 | 515654 | 427553 | 1235920 | 1172209 | 606324 | ⋯ | PP | NA | ||||||||
1300406 | subphrase | NA | 904693 | 651506 | 515654 | 427553 | 1235920 | 1172209 | 606324 | ⋯ | NA | |||||||||
1437407 | lex | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | >;T | אֵת | |||||||
5 | word | 1300406 | 904693 | 651506 | 515654 | 427553 | 1235920 | 1172209 | 606324 | ⋯ | absent | n/a | n/a | NA | NA | NA | ||||
1437408 | lex | NA | NA | NA | NA | NA | NA | NA | NA | ⋯ | NA | HA | הַ | |||||||
6 | word | 1300406 | 904693 | 651506 | 515654 | 427553 | 1235920 | 1172209 | 606324 | ⋯ | absent | n/a | n/a | NA | NA | NA |
Let us extract some data. First a list of the book names.
# Take the book name from every row whose object type is "book".
books = with(bhsa, book[otype == "book"])
# Show all names as one space-separated string.
paste(books, collapse = " ")
Now the complete text of the whole bible.
# Row positions of all word objects, in data-frame order.
words = which(bhsa$otype == "word")
# Glue every word directly to its trailer and concatenate everything into
# one string. A line break is inserted after the first sof pasuq (׃) of a
# trailer, presumably so that each verse ends a line.
text = paste0(
  bhsa$g_word_utf8[words],
  sub("׃", "׃\n", bhsa$trailer_utf8[words]),
  collapse = ""
)
# Write the resulting string to a plain text file.
write(text, file = "../_temp/2017/r/plainTextFromR.txt")
Let us get the node ids of the words in the first verse:
# Node ids of the word rows whose containing verse is node 1414190.
wordIds = with(bhsa, n[otype == "word" & in.verse == 1414190])
wordIds
Now the text of the first verse.
# Row positions whose node id is one of the collected word ids.
words = which(is.element(bhsa$n, wordIds))
# Pair each word with its trailer (paste's default single-space separator),
# concatenate the pairs, and add a line break after every sof pasuq (׃).
gsub(
  pattern = "׃",
  replacement = "׃\n",
  x = paste(bhsa$g_word_utf8[words], bhsa$trailer_utf8[words], collapse = "")
)
Let us get the words and text of an arbitrary passage, say Psalmi 131:2
# Node id of the verse object for Psalmi 131:2.
verseId = with(
  bhsa,
  n[otype == "verse" & book == "Psalmi" & chapter == 131 & verse == 2]
)
verseId
# Node ids of the words contained in that verse.
wordIds = with(bhsa, n[otype == "word" & in.verse == verseId])
wordIds
# Row positions of those words.
words = which(is.element(bhsa$n, wordIds))
# Pair each word with its trailer (paste's default single-space separator),
# concatenate the pairs, and add a line break after every sof pasuq (׃).
gsub(
  pattern = "׃",
  replacement = "׃\n",
  x = paste(bhsa$g_word_utf8[words], bhsa$trailer_utf8[words], collapse = "")
)
Now let us organize this in two functions: one that returns the verse object given a passage, and one that prints the texts of the words in a given object.
# Return the text of all words contained in the object with node id `n`.
#
# The object's type (bhsa$otype) selects the containment column
# `in.<otype>`; every word row whose value in that column equals `n`
# contributes its g_word_utf8 and trailer_utf8. A line break is added
# after each sof pasuq (׃).
#
# n: node id of an object, as found in bhsa$n.
# Returns a single string; it is empty when `n` is not found or when
# bhsa has no `in.<otype>` column.
object2text = function(n) {
  otype = bhsa$otype[bhsa$n == n]
  # Look the containment column up by name instead of building R source
  # text and evaluating it. A missing column yields NULL, which leads to
  # an empty selection below.
  container = bhsa[[paste0('in.', otype)]]
  # which() drops rows whose containment value is NA.
  # Assumes node ids in bhsa$n are unique, so selecting rows directly is
  # equivalent to collecting word ids and matching them back - TODO confirm.
  words = which(bhsa$otype == 'word' & container == n)
  # NOTE(review): paste() uses its default sep = ' ' here, so a space
  # separates each word from its trailer, whereas the full-text export
  # earlier in this file joins them without a separator. Kept as is so
  # the output does not change - confirm whether the space is intended.
  joined = paste(
    bhsa$g_word_utf8[words],
    bhsa$trailer_utf8[words],
    collapse=''
  )
  gsub('׃', '׃\n', joined)
}
# Look up the node id of a verse object.
#
# book, chapter, verse: values compared against the columns of the same
# names in bhsa.
# Returns the bhsa$n entries of the rows of type 'verse' that match.
verse2object = function(book, chapter, verse) {
  isVerse = bhsa$otype == 'verse'
  inPassage = bhsa$book == book & bhsa$chapter == chapter & bhsa$verse == verse
  bhsa$n[isVerse & inPassage]
}
# Return the text of a verse given as book, chapter and verse: the verse
# object is looked up with verse2object and rendered with object2text.
verse2text = function(book, chapter, verse) {
  verseNode = verse2object(book, chapter, verse)
  object2text(verseNode)
}
# Look up the node id of a chapter object.
#
# book, chapter: values compared against the columns of the same names
# in bhsa.
# Returns the bhsa$n entries of the rows of type 'chapter' that match.
chapter2object = function(book, chapter) {
  isChapter = bhsa$otype == 'chapter'
  inPassage = bhsa$book == book & bhsa$chapter == chapter
  bhsa$n[isChapter & inPassage]
}
# Return the text of a chapter given as book and chapter: the chapter
# object is looked up with chapter2object and rendered with object2text.
chapter2text = function(book, chapter) {
  chapterNode = chapter2object(book, chapter)
  object2text(chapterNode)
}
# Print the text of Psalmi 131:2.
cat(verse2text(book = "Psalmi", chapter = 131, verse = 2))
אִם ־לֹ֤א שִׁוִּ֨יתִי ׀ וְ דֹומַ֗מְתִּי נַ֫פְשִׁ֥י כְּ֭ גָמֻל עֲלֵ֣י אִמֹּ֑ו כַּ גָּמֻ֖ל עָלַ֣י נַפְשִֽׁי ׃
# Print the text of Psalmi 131.
cat(chapter2text(book = "Psalmi", chapter = 131))
שִׁ֥יר הַֽ מַּֽעֲלֹ֗ות לְ דָ֫וִ֥ד יְהוָ֤ה ׀ לֹא ־גָבַ֣הּ לִ֭בִּי וְ לֹא ־רָמ֣וּ עֵינַ֑י וְ לֹֽא ־הִלַּ֓כְתִּי ׀ בִּ גְדֹלֹ֖ות וּ בְ נִפְלָאֹ֣ות מִמֶּֽנִּי ׃ אִם ־לֹ֤א שִׁוִּ֨יתִי ׀ וְ דֹומַ֗מְתִּי נַ֫פְשִׁ֥י כְּ֭ גָמֻל עֲלֵ֣י אִמֹּ֑ו כַּ גָּמֻ֖ל עָלַ֣י נַפְשִֽׁי ׃ יַחֵ֣ל יִ֝שְׂרָאֵל אֶל ־יְהוָ֑ה מֵֽ֝ עַתָּ֗ה וְ עַד ־עֹולָֽם ׃
We make a column of verse-bound bi-grams of lexemes. The two lexemes are separated by an underscore (`_`).
# Verse-bound lexeme bigrams: one entry per word, "<lexeme>_<next lexeme>".
# The second part is empty when the word is the last one of its verse.

# Verse of each word's successor (all words but the first) and verse of
# each word that has a successor (all words but the last). Both vectors
# have one element fewer than there are words and are aligned pairwise.
vsNext = bhsa$in.verse[bhsa$otype=='word'][-1]
# head(x, -1) drops the final word. length(bhsa) must not be used for
# this: on a data frame it is the number of columns, not of words.
vsPrev = head(bhsa$in.verse[bhsa$otype=='word'], -1)

lex = bhsa$g_lex_utf8[bhsa$otype=='word']
# Successor lexeme of every word, padded with '' for the final word so
# that lexNext is as long as lex and paste() does not recycle it.
lexNext = c(lex, '')[-1]

# TRUE where a word and its successor lie in different verses.
lastInVs = vsPrev != vsNext
# which() gives positions; a logical index one element shorter than
# lexNext would be recycled onto the final entry.
lexNext[which(lastInVs)] = ''

bigram = paste(
  lex,
  lexNext,
  sep='_'
)
head(bigram, n=30)
# Peek at the first two entries of the aligned verse vectors.
vsNext[1:2]
vsPrev[1:2]