%load_ext autoreload
%autoreload 2
import collections
from tf.app import use
from tf.dataset import modify
from tf.core.files import initTree
# Root directory under which the repositories are cloned locally.
BASE = "~/github"
# Organization that owns both the source and the target repository.
ORG = "ETCBC"
# Source repository: the complete BHSA dataset.
REPO = "bhsa"
# Target repository: the minimal dataset produced by this notebook.
REPO_MIN = "bhsa-min"
# Path inside a repository where the TF data lives.
RELATIVE = "/tf"
# Data version; it is the last component of the data directories.
VERSION = "2021"
We remove a large part of the node type hierarchy: all nodes of the following types are deleted.
# Node types that are left out of the minimal dataset.
deleteTypes = [
    "lex",
    "subphrase",
    "phrase_atom",
    "clause_atom",
    "sentence_atom",
    "half_verse",
]
We remove a large number of relatively obscure features.
# Features that are left out of the minimal dataset.
# First the translated book names (book@en is not in this list),
# then the remaining features, in alphabetical order.
deleteFeatures = [
    f"book@{lang}"
    for lang in (
        "am ar bn da de el es fa fr he hi id ja ko la "
        "nl pa pt ru sw syc tr ur yo zh"
    ).split()
] + [
    "dist",
    "dist_unit",
    "distributional_parent",
    "freq_occ",
    "functional_parent",
    "g_lex",
    "g_lex_utf8",
    "g_nme",
    "g_nme_utf8",
    "g_pfm",
    "g_pfm_utf8",
    "g_prs",
    "g_prs_utf8",
    "g_uvf",
    "g_uvf_utf8",
    "g_vbe",
    "g_vbe_utf8",
    "g_vbs",
    "g_vbs_utf8",
    "is_root",
    "kq_hybrid",
    "kq_hybrid_utf8",
    "languageISO",
    "lex0",
    "lexeme_count",
    "mother_object_type",
    "number",
    "omap@2017-2021",
    "omap@c-2021",
    "rank_lex",
    "rank_occ",
    "suffix_gender",
    "suffix_number",
    "suffix_person",
    "voc_lex",
    "voc_lex_utf8",
]
We replace pseudo None values (the strings "n/a", "none", "NA", "absent" and "unknown") by real None values in all remaining features.
For this we load the original BHSA:
# Load the complete BHSA from the local clone; its features are the input
# for the pseudo None value scan and for the footprint comparison.
Aorig = use(f"{ORG}/{REPO}:clone", checkout="clone")
Locating corpus resources ...
Name | # of nodes | # slots/node | % coverage |
---|---|---|---|
book | 39 | 10938.21 | 100 |
chapter | 929 | 459.19 | 100 |
lex | 9230 | 46.22 | 100 |
verse | 23213 | 18.38 | 100 |
half_verse | 45179 | 9.44 | 100 |
sentence | 63717 | 6.70 | 100 |
sentence_atom | 64514 | 6.61 | 100 |
clause | 88131 | 4.84 | 100 |
clause_atom | 90704 | 4.70 | 100 |
phrase | 253203 | 1.68 | 100 |
phrase_atom | 267532 | 1.59 | 100 |
subphrase | 113850 | 1.42 | 38 |
word | 426590 | 1.00 | 100 |
# Strings that act as placeholders for "no value" in the original features.
noneValues = {"n/a", "none", "NA", "absent", "unknown"}
# Set form of the features to delete, for fast membership tests.
deleteFeaturesSet = set(deleteFeatures)

# Maps a feature name to {node: None} for every node that has a pseudo None
# value in that feature; this is the input for `modify(addFeatures=...)`.
modifiedFeatures = {}
# Counts how often each pseudo None string occurs over all scanned features.
nNones = collections.Counter()

for feat in Aorig.api.Fall():
    # Leave otype alone, and do not scan features that will be deleted anyway.
    if feat == "otype" or feat in deleteFeaturesSet:
        continue

    newData = {}
    for n, v in Aorig.api.Fs(feat).items():
        if v in noneValues:
            newData[n] = None
            nNones[v] += 1

    # Only features that really contain pseudo None values are reported
    # and recorded.
    nData = len(newData)
    if nData:
        print(f"{feat:<20} {nData:>7} pseudo None values")
        modifiedFeatures[feat] = newData

print(f"TOTAL: {sum(nNones.values())} pseudo None values, distributed as follows:")
# Most frequent placeholder first; ties are broken alphabetically.
for v, n in sorted(nNones.items(), key=lambda x: (-x[1], x[0])):
    print(f"{v:<12} {n:>7}x")
det 280219 pseudo None values gn 225693 pseudo None values ls 385975 pseudo None values nme 245354 pseudo None values nu 188676 pseudo None values pfm 381594 pseudo None values prs 381432 pseudo None values prs_gn 390964 pseudo None values prs_nu 381432 pseudo None values prs_ps 381432 pseudo None values ps 365071 pseudo None values rela 630059 pseudo None values st 245354 pseudo None values uvf 423044 pseudo None values vbe 352880 pseudo None values vbs 411184 pseudo None values vs 352880 pseudo None values vt 352880 pseudo None values TOTAL: 6376123 pseudo None values, distributed as follows: NA 3713865x n/a 1392890x absent 802598x none 385975x unknown 80795x
We remove the text formats that we can no longer furnish with the thinned feature set.
# Metadata overrides for the otext configuration feature:
# a new dataset name, and a None value for each text format that is removed.
featureMeta = {
    "otext": {
        "dataset": "BHSA-min",
        "datasetName": "Biblia Hebraica Stuttgartensia Amstelodamensis (minimalistic)",
        # dict.fromkeys gives every listed key the value None
        **dict.fromkeys(
            (
                "fmt:lex-default",
                "fmt:lex-orig-full",
                "fmt:lex-orig-plain",
                "fmt:lex-trans-full",
                "fmt:lex-trans-plain",
            )
        ),
    },
}
We clean the target location.
# Source and target data directories share one layout and differ only in the
# repository name.
bhsaLocation, bhsaMinLocation = (
    f"{BASE}/{ORG}/{repo}{RELATIVE}/{VERSION}" for repo in (REPO, REPO_MIN)
)
# Start from a clean target directory.
initTree(bhsaMinLocation, fresh=True)
This was all the preparation. Now we are going to run the modification.
# Produce the minimal dataset: read the complete BHSA from bhsaLocation and
# write the modified dataset to bhsaMinLocation.
modify(
    bhsaLocation,
    bhsaMinLocation,
    # node features in which pseudo None values are replaced by real None values
    addFeatures=dict(nodeFeatures=modifiedFeatures),
    deleteFeatures=deleteFeatures,
    deleteTypes=deleteTypes,
    # new dataset name; the lex text formats are set to None
    featureMeta=featureMeta,
    silent="terse",
)
| WARNING: Missing for text API: features: g_lex, g_lex_utf8, voc_lex_utf8 0.01s Feature overview: 109 for nodes; 6 for edges; 1 configs; 9 computed
True
# Load the freshly generated minimal dataset from the local clone.
A = use(f"{ORG}/{REPO_MIN}:clone", checkout="clone")
Locating corpus resources ...
| 0.37s T otype from ~/github/ETCBC/bhsa-min/tf/2021 | 6.02s T oslots from ~/github/ETCBC/bhsa-min/tf/2021 | 0.01s T qere from ~/github/ETCBC/bhsa-min/tf/2021 | 1.00s T g_cons_utf8 from ~/github/ETCBC/bhsa-min/tf/2021 | 0.83s T trailer from ~/github/ETCBC/bhsa-min/tf/2021 | 0.00s T qere_trailer_utf8 from ~/github/ETCBC/bhsa-min/tf/2021 | 0.99s T g_cons from ~/github/ETCBC/bhsa-min/tf/2021 | 0.04s T chapter from ~/github/ETCBC/bhsa-min/tf/2021 | 0.05s T book from ~/github/ETCBC/bhsa-min/tf/2021 | 0.04s T verse from ~/github/ETCBC/bhsa-min/tf/2021 | 0.00s T book@en from ~/github/ETCBC/bhsa-min/tf/2021 | 0.83s T trailer_utf8 from ~/github/ETCBC/bhsa-min/tf/2021 | 0.01s T qere_utf8 from ~/github/ETCBC/bhsa-min/tf/2021 | 1.13s T g_word_utf8 from ~/github/ETCBC/bhsa-min/tf/2021 | 0.00s T qere_trailer from ~/github/ETCBC/bhsa-min/tf/2021 | 1.08s T g_word from ~/github/ETCBC/bhsa-min/tf/2021 | | 0.11s C __levels__ from otype, oslots, otext | | 3.05s C __order__ from otype, oslots, __levels__ | | 0.15s C __rank__ from otype, __order__ | | 4.92s C __levUp__ from otype, oslots, __rank__ | | 3.58s C __levDown__ from otype, __levUp__, __rank__ | | 0.41s C __characters__ from otext | | 2.00s C __boundary__ from otype, oslots, __rank__ | | 0.06s C __sections__ from otype, oslots, otext, __levUp__, __levels__, book, chapter, verse | 0.00s T code from ~/github/ETCBC/bhsa-min/tf/2021 | 0.27s T det from ~/github/ETCBC/bhsa-min/tf/2021 | 0.18s T domain from ~/github/ETCBC/bhsa-min/tf/2021 | 0.84s T freq_lex from ~/github/ETCBC/bhsa-min/tf/2021 | 0.54s T function from ~/github/ETCBC/bhsa-min/tf/2021 | 0.95s T gloss from ~/github/ETCBC/bhsa-min/tf/2021 | 0.44s T gn from ~/github/ETCBC/bhsa-min/tf/2021 | 0.00s T instruction from ~/github/ETCBC/bhsa-min/tf/2021 | 0.18s T kind from ~/github/ETCBC/bhsa-min/tf/2021 | 0.06s T label from ~/github/ETCBC/bhsa-min/tf/2021 | 0.88s T language from ~/github/ETCBC/bhsa-min/tf/2021 | 0.93s T lex from ~/github/ETCBC/bhsa-min/tf/2021 | 0.96s T lex_utf8 
from ~/github/ETCBC/bhsa-min/tf/2021 | 0.10s T ls from ~/github/ETCBC/bhsa-min/tf/2021 | 0.09s T mother from ~/github/ETCBC/bhsa-min/tf/2021 | 0.09s T nametype from ~/github/ETCBC/bhsa-min/tf/2021 | 0.40s T nme from ~/github/ETCBC/bhsa-min/tf/2021 | 0.56s T nu from ~/github/ETCBC/bhsa-min/tf/2021 | 0.00s T pargr from ~/github/ETCBC/bhsa-min/tf/2021 | 0.91s T pdp from ~/github/ETCBC/bhsa-min/tf/2021 | 0.11s T pfm from ~/github/ETCBC/bhsa-min/tf/2021 | 0.11s T prs from ~/github/ETCBC/bhsa-min/tf/2021 | 0.09s T prs_gn from ~/github/ETCBC/bhsa-min/tf/2021 | 0.11s T prs_nu from ~/github/ETCBC/bhsa-min/tf/2021 | 0.11s T prs_ps from ~/github/ETCBC/bhsa-min/tf/2021 | 0.15s T ps from ~/github/ETCBC/bhsa-min/tf/2021 | 0.05s T rela from ~/github/ETCBC/bhsa-min/tf/2021 | 0.18s T root from ~/github/ETCBC/bhsa-min/tf/2021 | 0.91s T sp from ~/github/ETCBC/bhsa-min/tf/2021 | 0.42s T st from ~/github/ETCBC/bhsa-min/tf/2021 | 0.00s T tab from ~/github/ETCBC/bhsa-min/tf/2021 | 0.18s T txt from ~/github/ETCBC/bhsa-min/tf/2021 | 0.71s T typ from ~/github/ETCBC/bhsa-min/tf/2021 | 0.01s T uvf from ~/github/ETCBC/bhsa-min/tf/2021 | 0.17s T vbe from ~/github/ETCBC/bhsa-min/tf/2021 | 0.04s T vbs from ~/github/ETCBC/bhsa-min/tf/2021 | 0.18s T vs from ~/github/ETCBC/bhsa-min/tf/2021 | 0.18s T vt from ~/github/ETCBC/bhsa-min/tf/2021
Name | # of nodes | # slots/node | % coverage |
---|---|---|---|
book | 39 | 10938.21 | 100 |
chapter | 929 | 459.19 | 100 |
verse | 23213 | 18.38 | 100 |
sentence | 63717 | 6.70 | 100 |
clause | 88131 | 4.84 | 100 |
phrase | 253203 | 1.68 | 100 |
word | 426590 | 1.00 | 100 |
# Take the sentence node at index 45 as an example and display it,
# as a visual check of the minimal dataset.
s = A.api.F.otype.s("sentence")[45]
A.pretty(s, multiFeatures=False)
# Show the size in bytes of each feature of the minimal dataset.
A.footprint()
feature | members | size in bytes |
---|---|---|
levUp | 855,822 | 90,952,232 |
oslots | 3 | 48,010,736 |
boundary | 2 | 45,968,624 |
levDown | 429,232 | 45,465,556 |
g_word_utf8 | 426,590 | 43,108,023 |
g_word | 426,590 | 38,366,975 |
g_cons_utf8 | 426,590 | 35,235,354 |
g_cons | 426,590 | 34,229,846 |
lex_utf8 | 426,590 | 33,511,928 |
lex | 426,590 | 33,390,506 |
gloss | 426,590 | 33,261,517 |
freq_lex | 426,590 | 32,920,796 |
trailer_utf8 | 426,590 | 32,917,176 |
pdp | 426,590 | 32,916,861 |
sp | 426,590 | 32,916,861 |
trailer | 426,590 | 32,916,804 |
language | 426,590 | 32,916,231 |
order | 855,822 | 30,809,632 |
typ | 341,334 | 20,046,107 |
function | 253,203 | 17,577,069 |
nu | 237,914 | 17,147,593 |
gn | 200,897 | 16,111,064 |
nme | 181,236 | 15,561,372 |
st | 181,236 | 15,560,606 |
det | 113,892 | 8,432,040 |
txt | 88,131 | 7,717,905 |
domain | 88,131 | 7,710,828 |
kind | 88,131 | 7,710,781 |
mother | 21,403 | 6,124,096 |
vs | 73,710 | 4,686,725 |
vbe | 73,710 | 4,686,321 |
vt | 73,710 | 4,685,832 |
root | 72,051 | 4,678,358 |
ps | 61,519 | 4,344,213 |
prs | 45,158 | 3,886,968 |
prs_ps | 45,158 | 3,886,105 |
prs_nu | 45,158 | 3,886,054 |
pfm | 44,996 | 3,881,866 |
rank | 855,822 | 3,624,652 |
otype | 4 | 3,434,407 |
label | 23,213 | 3,330,331 |
ls | 40,615 | 2,448,762 |
prs_gn | 35,626 | 2,308,428 |
nametype | 35,506 | 2,305,527 |
chapter | 24,142 | 1,990,976 |
book | 24,181 | 1,990,047 |
verse | 23,213 | 1,965,692 |
sections | 2 | 1,703,884 |
rela | 21,403 | 1,189,832 |
vbs | 15,406 | 1,021,735 |
qere_utf8 | 1,892 | 263,785 |
uvf | 3,546 | 247,082 |
qere | 1,892 | 200,631 |
qere_trailer_utf8 | 1,892 | 127,126 |
qere_trailer | 1,892 | 127,037 |
characters | 6 | 42,824 |
book@en | 39 | 4,454 |
levels | 7 | 1,519 |
code | 0 | 64 |
instruction | 0 | 64 |
pargr | 0 | 64 |
tab | 0 | 64 |
TOTAL | 11,127,528 | 916,466,548 |
For comparison, we show the footprint of the complete BHSA:
# Show the size in bytes of each feature of the complete BHSA.
Aorig.footprint()
feature | members | size in bytes |
---|---|---|
levUp | 1,446,831 | 549,464,428 |
levDown | 1,020,241 | 136,906,600 |
oslots | 3 | 121,886,820 |
boundary | 2 | 107,077,088 |
number | 1,254,391 | 99,760,440 |
rela | 722,716 | 62,180,387 |
typ | 699,570 | 61,534,048 |
order | 1,446,831 | 52,085,956 |
mother | 182,269 | 50,752,708 |
freq_lex | 435,820 | 41,925,288 |
g_word_utf8 | 426,590 | 41,387,515 |
g_word | 426,590 | 38,366,975 |
phono | 426,590 | 38,354,715 |
rank_lex | 435,820 | 36,117,916 |
det | 520,735 | 35,552,335 |
g_cons_utf8 | 426,590 | 34,967,885 |
g_lex_utf8 | 426,590 | 34,738,775 |
g_cons | 426,590 | 34,229,846 |
lex_utf8 | 435,820 | 34,190,000 |
g_lex | 426,590 | 34,079,807 |
voc_lex_utf8 | 435,820 | 33,881,594 |
lex | 435,820 | 33,648,946 |
voc_lex | 435,820 | 33,624,242 |
gloss | 435,820 | 33,519,957 |
sp | 435,820 | 33,175,301 |
language | 435,820 | 33,174,671 |
ls | 426,975 | 32,927,695 |
vs | 426,590 | 32,917,488 |
prs | 426,590 | 32,917,243 |
nme | 426,590 | 32,917,143 |
trailer_utf8 | 426,590 | 32,917,109 |
vbe | 426,590 | 32,917,085 |
pdp | 426,590 | 32,916,861 |
trailer | 426,590 | 32,916,804 |
vbs | 426,590 | 32,916,682 |
pfm | 426,590 | 32,916,677 |
vt | 426,590 | 32,916,595 |
uvf | 426,590 | 32,916,425 |
nu | 426,590 | 32,916,380 |
ps | 426,590 | 32,916,380 |
gn | 426,590 | 32,916,327 |
prs_gn | 426,590 | 32,916,327 |
prs_ps | 426,590 | 32,916,324 |
phono_trailer | 426,590 | 32,916,322 |
st | 426,590 | 32,916,321 |
prs_nu | 426,590 | 32,916,273 |
function | 253,203 | 17,577,069 |
code | 90,704 | 8,877,720 |
otype | 4 | 8,162,830 |
pargr | 90,704 | 7,977,817 |
tab | 90,704 | 7,783,508 |
txt | 88,131 | 7,717,905 |
domain | 88,131 | 7,710,828 |
rank | 1,446,831 | 6,149,120 |
label | 68,392 | 5,906,221 |
crossref | 3,783 | 2,812,004 |
nametype | 38,117 | 2,378,635 |
chapter | 24,142 | 1,990,976 |
book | 24,181 | 1,990,047 |
verse | 23,213 | 1,965,692 |
sections | 2 | 1,704,976 |
qere_utf8 | 1,892 | 241,179 |
qere | 1,892 | 200,631 |
qere_trailer_utf8 | 1,892 | 127,115 |
qere_trailer | 1,892 | 127,037 |
characters | 12 | 76,835 |
book@am | 39 | 5,940 |
book@bn | 39 | 5,748 |
book@ru | 39 | 5,720 |
book@el | 39 | 5,716 |
book@hi | 39 | 5,676 |
book@pa | 39 | 5,654 |
book@fa | 39 | 5,648 |
book@ur | 39 | 5,628 |
book@syc | 39 | 5,616 |
book@he | 39 | 5,570 |
book@ar | 39 | 5,564 |
book@ja | 39 | 5,472 |
book@ko | 39 | 5,382 |
book@zh | 39 | 5,325 |
book@es | 39 | 4,877 |
book@pt | 39 | 4,861 |
book@tr | 39 | 4,814 |
book@fr | 39 | 4,767 |
book@yo | 39 | 4,742 |
book@da | 39 | 4,557 |
book@de | 39 | 4,511 |
book@nl | 39 | 4,496 |
book@sw | 39 | 4,474 |
book@en | 39 | 4,454 |
book@id | 39 | 4,442 |
book@la | 39 | 4,439 |
levels | 13 | 2,830 |
TOTAL | 25,073,133 | 2,596,543,772 |
The total feature footprint is reduced from 2.6 GB to 0.9 GB.
Make a zip file to release
# Zip the minimal dataset, and also the complete BHSA so that the sizes of
# the two zip files can be compared.
A.zipAll()
Aorig.zipAll()
Data to be zipped:
WARNING: no local release info found.
Maybe you have to do go to this repo and do `git pull --tags` We'll fetch the local commit info anyway. OK app (v?? ca8267) : ~/github/ETCBC/bhsa-min/app
WARNING: no local release info found.
Maybe you have to do go to this repo and do `git pull --tags` We'll fetch the local commit info anyway. OK main data (v?? ca8267) : ~/github/ETCBC/bhsa-min/tf/2021 Writing zip file ... Result: ~/Downloads/github/ETCBC/bhsa-min/complete.zip Data to be zipped: OK app (v1.8 157309) : ~/github/ETCBC/bhsa/app OK main data (v1.8 157309) : ~/github/ETCBC/bhsa/tf/2021 OK module phono (v2.1 bd97bc) : ~/github/ETCBC/phono/tf/2021 OK module parallels (v2.1 cf333f) : ~/github/ETCBC/parallels/tf/2021 Writing zip file ... Result: ~/Downloads/github/ETCBC/bhsa/complete.zip
!ls -l ~/Downloads/github/ETCBC/bhsa-min/complete.zip
!ls -l ~/Downloads/github/ETCBC/bhsa/complete.zip
-rw-r--r-- 1 me staff 12580582 May 9 14:31 /Users/me/Downloads/github/ETCBC/bhsa-min/complete.zip -rw-r--r-- 1 me staff 33954656 May 9 14:31 /Users/me/Downloads/github/ETCBC/bhsa/complete.zip
Press `i` twice to quit the browser: this interrupts the notebook kernel, which stops the TF web server.
!tf
This is Text-Fabric 11.4.9 Starting new kernel listening on 14907 Loading data for ETCBC/bhsa-min. Please wait ... Setting up TF kernel for ETCBC/bhsa-min **Locating corpus resources ...** Using app in ~/github/ETCBC/bhsa-min/app: repo clone offline under ~/github (local github) Using data in ~/github/ETCBC/bhsa-min/tf/2021: repo clone offline under ~/github (local github) TF setup done. Starting new webserver listening on 24907 WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead. * Running on http://localhost:24907 Press CTRL+C to quit Opening ETCBC/bhsa-min in browser Press <Ctrl+C> to stop the TF browser Kernel listening at port 14907 127.0.0.1 - - [09/May/2023 14:33:34] "GET / HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/base.css HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/display.css HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/highlight.css HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/fonts.css HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/index.css HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/fontawesome.css HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/jquery.js HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/tf3.0.js HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/icon.png HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/huc.png HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /data/static/logo.png HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/fonts/fa-solid-900.woff2 HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/fonts/SILEOT.woff HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "GET /server/static/fonts/fa-regular-400.woff2 HTTP/1.1" 200 - 127.0.0.1 - - 
[09/May/2023 14:33:34] "GET /server/static/favicon.ico HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:34] "POST /passage HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:38] "POST /passage HTTP/1.1" 200 - 127.0.0.1 - - [09/May/2023 14:33:40] "POST /passage/3 HTTP/1.1" 200 - ^C keyboard interrupt! TF web server has stopped TF kernel has stopped