#!/usr/bin/env python
# coding: utf-8
# # Getting started
#
# It is assumed that you have read
# [start](start.ipynb)
# and followed the installation instructions there.
# # Corpus
#
# In this tutorial we work with one corpus:
#
# * `dss` Dead Sea Scrolls
# # First acquaintance
#
# We just want to grasp what the corpus is about and how we can find our way in the data.
#
# Open a terminal or command prompt and enter the following command:
#
# ``` sh
# text-fabric dss
# ```
#
# You will see a lot happening before your browser starts up and shows you an interface to the corpus.
# Text-Fabric needs an app to deal with the corpus-specific things.
# It downloads/finds/caches the latest version of the **app**:
#
# ```
# Using TF-app in /Users/dirk/text-fabric-data/annotation/app-dss/code:
# rv0.6=#304d66fd7eab50bbe4de8505c24d8b3eca30b1f1 (latest release)
# ```
#
# It downloads/finds/caches the latest version of the **data**:
#
# ```
# Using data in /Users/dirk/text-fabric-data/etcbc/dss/tf/0.6:
# rv0.6=#9b52e40a8a36391b60807357fa94343c510bdee0 (latest release)
# ```
#
# The data is preprocessed in order to speed up typical Text-Fabric operations,
# and the result is cached on your computer.
# Preprocessing costs time, so the next time you use this corpus on this machine, the startup will be much quicker.
#
# ```
# TF setup done.
# ```
#
# Then the app goes on to act as a local web server, serving the corpus that has just been downloaded.
# It opens your browser for you and loads the corpus page:
#
# ```
# * Running on http://localhost:8107/ (Press CTRL+C to quit)
# Opening dss in browser
# Listening at port 18987
# ```
#
# # Help!
#
# Indeed, that is what you need. Click the vertical `Help` tab.
#
# From there, click around a little bit. Don't read closely, just note the kinds of information that are presented to you.
#
# Later on, it will make more sense!
# # Browsing
#
# First we browse our data. Click the browse button.
#
# Then, in the table of *documents* (scrolls), click on a fragment of scroll `1QSb`.
# Now you're looking at a fragment of a scroll: the writing in Hebrew characters without vowel signs.
#
# Now click the *Options* tab and select the `layout-orig-unicode` format to see the same fragment in a layout that indicates the status
# of the pieces of writing.
#
# You can click a triangle to see how a line is broken down.
# # Searching
#
# In this corpus a lot of attention is paid to the uncertainty of signs and to whether they have been corrected, either in antiquity or
# in more modern times.
#
# Also, the corpus is marked up with part-of-speech for each word.
#
# So we can, for example, search for *verbs* that have an uncertain or corrected or removed consonant in them.
#
# ```
# word sp=verb
# sign type=cons
# /with/
# .. unc=1|2|3|4
# /or/
# .. cor=1|2|3
# /or/
# .. rem=1|2
# /or/
# .. rec=1
# /-/
# ```
#
# In English:
#
# search all `word`s that contain a `sign` with feature `type`
# having value `cons` (consonant) where at least one of the following holds for
# that sign:
#
# * the feature `unc` has value `1` or `2` or `3` or `4`
# * the feature `cor` has value `1` or `2` or `3`
# * the feature `rem` has value `1` or `2`
# * the feature `rec` has value `1`
#
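# For example, dropping the quantifier gives a simpler template that matches only
# the first alternative, uncertain consonants (a sketch in the same query language;
# the indentation expresses that the sign lies inside the word):
#
# ```
# word sp=verb
#   sign type=cons unc=1|2|3|4
# ```
#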
# Words with multiple uncertain signs correspond to multiple results. We can condense the results in such a way that all results for the same word are shown as one result.
#
# Click the options tab, check *condense results*, and check *word* as the container into which you want to condense the results.
#
# You can expand results by clicking the triangle.
#
# You can see the result in context by clicking the browse icon.
#
# You can go back to the result list by clicking the results icon.
#
# # Computing
#
# This triggers other questions.
#
# For example: how many verbs are there in total, if 37344 of them already have uncertain signs?
# How is uncertainty distributed over the verbs?
# That is: how many verbs have how many uncertain, corrected, or removed signs?
#
# *This is a typical question where you want to leave the search mode and enter computing mode*.
#
# Let's find out.
#
# Extra information:
#
# * the features `unc`, `cor`, and `rem` have values 1, 2, 3, 4 that indicate the kind of uncertainty, correction, or removal.
# We simply take those values as the seriousness of the uncertainty:
# essentially, we sum up all values of these features for each sign.
# * the feature `rec` means that the sign is reconstructed. We consider that to be severely uncertain, and add a penalty of 10 for
# such signs. For example, a sign with `unc=2` that is also reconstructed counts as 2 + 10 = 12.
#
# Open your terminal and say
#
# ``` sh
# jupyter notebook
# ```
#
# Your browser starts up and presents you with a local computing environment where you can run Python programs.
#
# You see cells like the one below, where you can type programming statements and execute them by pressing `Shift+Enter`.
# In[ ]:
# First we load the Text-Fabric module, as follows:
# In[1]:
from tf.app import use
# Now we load the TF-app for the corpus `dss` and that app loads the corpus data.
#
# We give a name to the result of all that loading: `A`.
# In[3]:
A = use('ETCBC/dss', hoist=globals())
# Some bits are familiar from above, when you ran the `text-fabric` command in the terminal.
#
# Other bits are links to the documentation; they point to the same places as the links in the Text-Fabric browser.
#
# You see a list of all the data features that have been loaded.
#
# And a list of references to the API documentation, which tells you how you can use this data in your program statements.
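#
# To get a first taste of that API, we can ask for the text and part of speech of an
# arbitrary word. This is a quick sketch; the names `F`, `L`, and `T` have been hoisted
# into our namespace by the `use()` call above:
# In[ ]:
word = F.otype.s('word')[0]   # first word node in the corpus
(T.text(word), F.sp.v(word))  # its text and its part of speech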
# # Searching (revisited)
#
# We do the same search again, but now inside our program.
#
# That means that we can capture the results in a list for further processing.
# In[4]:
template = '''
word sp=verb
sign type=cons
/with/
.. unc=1|2|3|4
/or/
.. cor=1|2|3
/or/
.. rem=1|2
/or/
.. rec=1
/-/
'''
results = A.search(template)
# In a few seconds, we have all the results!
#
# Let's look at the first one:
# In[5]:
results[0]
# Each result is a list of node numbers: one for the
#
# 1. word
# 1. sign
#
# Here is the second one:
# In[6]:
results[1]
# And here the last one:
# In[7]:
results[-1]
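# To see these numbers as readable text, we can display the first few results in a
# table (a sketch; `A.table` is part of the Text-Fabric display API, and `end`
# limits how many results are shown):
# In[ ]:
A.table(results, end=3)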
# Now we are only interested in the words that we have encountered.
# We collect them in a set and sort them into a list:
# In[8]:
verbs = sorted({result[0] for result in results})
len(verbs)
# This corresponds exactly to the number of condensed results!
#
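# We can condense results in a program too. A minimal sketch, using the
# `condensed` and `condenseType` options of `A.show` from the Text-Fabric
# display API:
# In[ ]:
A.show(results, condensed=True, condenseType='word', end=3)
#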
# Now we get the number of verbs:
# In[9]:
len(F.sp.s('verb'))
# In English: take feature `sp` (part-of-speech), and collect all nodes that have value `verb` for this feature.
# Then take the length of this list.
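#
# To see how verbs compare with the other parts of speech, we can count all values
# of `sp` over all words (a sketch using Python's standard `collections.Counter`):
# In[ ]:
from collections import Counter

# tally the part of speech of every word node in the corpus
Counter(F.sp.v(w) for w in F.otype.s('word')).most_common()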
#
# Now we want to find out something for each result verb: what is the accumulated uncertainty of that verb?
# Some verbs have more consonants than others, so we divide by the number of consonants.
#
# We define a function that collects the uncertainty of a single sign:
# In[10]:
def getUncertainty(sign):
    # sum the seriousness values of the uncertainty features;
    # a reconstructed sign (rec) gets an extra penalty of 10
    return sum((
        F.unc.v(sign) or 0,
        F.cor.v(sign) or 0,
        F.rem.v(sign) or 0,
        10 if F.rec.v(sign) else 0,
    ))
# Let's see what this gives for the sign in the 1000th result:
# In[11]:
sign = results[999][1]
A.pretty(sign)
# In[12]:
unc = getUncertainty(sign)
print(unc)
# Another one:
# In[13]:
sign = results[12][1]
A.pretty(sign)
print(getUncertainty(sign))
# Now we define a function that gives us the uncertainty of a word.
# We collect the signs of the word,
# sum their uncertainties, and divide the sum by the number of signs in the word.
# In[14]:
def uncertainty(word):
    signs = L.d(word, otype='sign')  # go a Level down to signs and collect them in a list
    # average uncertainty per sign of this word
    return sum(getUncertainty(sign) for sign in signs) / len(signs)
# We compute the uncertainty of some verbs.
# In[15]:
verb = verbs[999]
A.pretty(verb)
# Now the computation:
# In[16]:
unc = uncertainty(verb)
print(unc)
# Another one:
# In[17]:
verb = verbs[12]
A.pretty(verb)
print(uncertainty(verb))
# We compute all word uncertainties and store them in a dictionary:
# In[18]:
verbUncertainty = {verb: uncertainty(verb) for verb in verbs}
len(verbUncertainty)
# What are the maximum and minimum uncertainties?
# In[19]:
max(verbUncertainty.values())
# In[20]:
min(verbUncertainty.values())
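# Before plotting, we can summarize the distribution numerically: the mean
# uncertainty, and how many verbs fall into each whole-number band of uncertainty
# (a sketch using only the standard library):
# In[ ]:
from collections import Counter
from statistics import mean

print(f"mean uncertainty: {mean(verbUncertainty.values()):.2f}")
# (band, number of verbs) pairs, sorted by band
sorted(Counter(int(u) for u in verbUncertainty.values()).items())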
# In order to visualize how many verbs there are at each level of uncertainty, we make a distribution plot,
# using the *seaborn* library.
# (You might need to install the Python package `seaborn`.)
# In[21]:
get_ipython().system('pip install seaborn')
# In[22]:
import seaborn as sns
# In[23]:
sns.set(color_codes=True)

# plot the distribution of uncertainty values
# (distplot is deprecated in newer seaborn versions; there, use histplot or displot)
sns.distplot(list(verbUncertainty.values()), axlabel="uncertainty")
# Let's single out the verbs with uncertainty greater than 9, but lower than 10, and inspect a few.
# In[24]:
verbHighUnc = [verb for (verb, unc) in verbUncertainty.items() if 9 < unc < 10]
len(verbHighUnc)
# In[25]:
A.show([[verb] for verb in verbHighUnc], fmt='layout-orig-full', condenseType='word')
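# If you want to process these results outside the notebook, the display API has an
# export helper. A minimal sketch, assuming `A.export` with its default settings
# (it writes a tab-separated file, `results.tsv`; see the API docs for details):
# In[ ]:
A.export([[verb] for verb in verbHighUnc])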
# In[ ]: