#!/usr/bin/env python
# coding: utf-8
# # Getting started
#
# It is assumed that you have read
# [start](start.ipynb)
# and followed the installation instructions there.
# # Corpus
#
# This is the corpus we will be working with:
#
# * `quran` the Qur'an
# # First acquaintance
#
# We just want to grasp what the corpus is about and how we can find our way in the data.
#
# Open a terminal or command prompt and say one of the following
#
# ```text-fabric quran```
#
# Wait and see a lot happening before your browser starts up and shows you an interface on the corpus:
# Text-Fabric needs an app to deal with the corpus-specific things.
# It downloads/finds/caches the latest version of the **app**:
#
# ```
# Using TF-app in /Users/dirk/text-fabric-data/annotation/app-quran/code:
# #c55d75da760bfdc6ae272b3ade9629fe34d059ce (latest commit)
# ```
#
# It downloads/finds/caches the latest version of the **data**:
#
# ```
# Using data in /Users/dirk/text-fabric-data/q-ran/quran/tf/0.4:
# rv0.5=#60bd6788dadb13974e89df55cde7687c0593e65f (latest release)
# ```
#
# The data is preprocessed in order to speed up typical Text-Fabric operations.
# The result is cached on your computer.
# Preprocessing costs time. Next time you use this corpus on this machine, the startup time is much quicker.
#
# ```
# TF setup done.
# ```
#
# Then the app goes on to act as a local webserver serving the corpus that has just been downloaded
# and it will open your browser for you and load the corpus page
#
# ```
# * Running on http://localhost:8105/ (Press CTRL+C to quit)
# Opening quran in browser
# Listening at port 18985
# ```
#
# # Help!
#
# Indeed, that is what you need. Click the vertical `Help` tab.
#
# From there, click around a little bit. Don't read closely, just note the kinds of information that are presented to you.
#
# Later on, it will make more sense!
# # Browsing
#
# First we browse our data. Click the browse button.
#
#
#
# and then, in the table of *documents* (suras), click on one.
#
#
#
# Now you're looking at the ayas of a sura: the text in Arabic unicode characters.
#
#
# Now click the *Options* tab and select the `text-trans-full` format to see the sura in ascii transcription.
#
#
# You can click a triangle to see how a line is broken down:
#
#
# # Searching
#
# An aya is a verse in a sura.
# Let's find all the ayas that contain a verb followed by the word for Allah.
#
# Enter this query in the search pad and press the search icon above it.
#
# ```
# aya
# word pos=verb
# <: word pos=noun
# posx=proper
# root=Alh
# ```
#
#
# In English:
#
# search all `aya`s that contain a `word` and a `word` where:
#
# * `<:` the second `word` follows immediately on the first `word`
# * the first `word` has value `verb` for feature `pos` (part-of-speech)
# * the second `word` has
# * value `noun` for feature `pos`
# * value `proper` for feature `posx` (subcategorisation of part-of-speech)
# * value `Alh` for feature `root` (basic word form, more fundamental than lemma)
# You can expand results by clicking the triangle.
#
# You can see the result in context by clicking the browse icon.
#
# You can go back to the result list by clicking the results icon.
#
#
# This corpus has a feature for ayas in which an English translation is given: `translation@en`.
#
# We can trigger the display of these translations by mentioning the feature in the query without posing additional constraints:
#
# ```
# aya translation@en*
# word pos=verb
# <: word pos=noun
# posx=proper
# root=Alh
# ```
#
# The `*` means: any value or even no value.
#
#
# # Computing
#
# We have found verbs associated with Allah.
#
# The question comes to mind: are there verbs that are associated with Allah only in this way?
#
# Let us look for verbs, followed by nouns, and for each verb, count how many different nouns occur in that position.
#
# *This is a typical question where you want to leave the search mode and enter computing mode*.
#
# Let's do that!
# If you have followed the installation instructions, you are set.
# Go to the browser window that opened when you gave the command `jupyter notebook` in your terminal.
#
# Then continue reading, and, ... executing.
#
# You can execute a cell by putting your cursor inside it and pressing `Shift Enter`.
# First we load the Text-Fabric module, as follows:
# In[1]:
import collections
import seaborn as sns
from tf.app import use
# Now we load the TF-app for the corpus `quran` and that app loads the corpus data.
#
# We give a name to the result of all that loading: `A`.
# In[2]:
# Load the TF-app and the corpus data from a local clone of the data repository.
# `hoist=globals()` injects the TF API members (F, Fs, L, T, ...) into the
# global namespace, so below we can write `F.root.v(...)` instead of
# `A.api.F.root.v(...)`.
A = use("quran:clone", checkout="clone", hoist=globals())
# Alternative: let Text-Fabric download/cache the data from GitHub instead:
# A = use('quran', hoist=globals())
# Some bits are familiar from above, when you ran the `text-fabric` command in the terminal.
#
# Other bits are links to the documentation, they point to the same places as the links on the Text-Fabric browser.
#
# You see a list of all the data features that have been loaded.
#
# And a list of references to the API documentation, which tells you how you can use this data in your program statements.
# # Searching (revisited)
#
# We do the same search again, but now inside our program.
#
# That means that we can capture the results in a list for further processing.
# In[3]:
# Template query: ayas that contain a verb immediately followed (`<:`)
# by a proper noun whose root is `Alh` (Allah).
# NOTE(review): TF search templates are indentation-sensitive; the feature
# lines below appear flat — confirm the template's indentation survived the
# notebook-to-script conversion.
results = A.search(
    """
aya
word pos=verb
<: word pos=noun
posx=proper
root=Alh
"""
)
# In less than a second, we have all the results!
#
# Let's look at the first one:
# In[4]:
# Peek at the first result.
results[0]
# Each result is a sequence of node numbers, one per atom in the query:
#
# 1. aya
# 1. word (the verb)
# 1. word (the noun)
# Here is the second one:
# In[5]:
results[1]
# And here the last one:
# In[6]:
results[-1]
# Now we modify the query to get all pairs of a verb immediately followed by a noun.
# In[7]:
# Same query shape as before, but without the extra constraints on the noun:
# every verb immediately followed by a noun, in any aya.
results = A.search(
    """
aya
word pos=verb
<: word pos=noun
"""
)
# We are going to make buckets for the root of each verb found, and in those buckets we throw the roots of all nouns found after them.
# We also count the number of occurrences of each noun root in the buckets.
# In[9]:
# Maps each verb root to a Counter of the noun roots that follow that verb.
buckets = collections.defaultdict(collections.Counter)
# In[10]:
# F is the feature API hoisted into globals() by `use(...)` above;
# F.root.v(node) yields the value of the `root` feature for a word node.
for (aya, verb, noun) in results:
    buckets[F.root.v(verb)][F.root.v(noun)] += 1
# How many buckets do we have?
# In[11]:
len(buckets)
# How many nouns are there in each bucket?
# In[12]:
# The bucket keys are not needed here, so iterate over .values() directly
# instead of unpacking (and discarding) the keys from .items().
min(len(nouns) for nouns in buckets.values())
# In[13]:
max(len(nouns) for nouns in buckets.values())
# Let's get a picture of the distribution.
# In[15]:
sns.set(color_codes=True)
# `sns.distplot` was deprecated in seaborn 0.11 and removed in 0.14;
# `histplot` with `kde=True` is the modern equivalent (histogram plus a
# kernel-density curve). `histplot` has no `axlabel` parameter, so the
# x-axis label is set on the returned Axes instead.
ax = sns.histplot([len(nouns) for nouns in buckets.values()], kde=True)
ax.set_xlabel("number of nouns")
# We collect the buckets of length 1 in two sets: those with Allah in the bucket and those with another word:
# In[16]:
# Partition the length-1 buckets into two sets, keyed on whether the single
# noun root in the bucket is `Alh` or some other root.
buckets1A = set()
buckets1N = set()
for verb_root, nouns in buckets.items():
    if len(nouns) != 1:
        continue
    (buckets1A if "Alh" in nouns else buckets1N).add(verb_root)
print(f"with Alh: {len(buckets1A):>3}")
print(f"without Alh: {len(buckets1N):>3}")
# Finally we show the roots of the verbs that are associated only with `Alh`:
# In[17]:
sorted(buckets1A)
# And now we want to find them back in the text and show the translations of the ayas they contain.
#
# We compute a query out of the `buckets1A` contents, and run it.
# In[18]:
# Build an alternation pattern (root1|root2|...) for use as a feature value
# in the next search template.
condition = "|".join(buckets1A)
condition
# In[19]:
# The f-string interpolates the alternation of verb roots computed above
# into the `root=` constraint of the verb.
query = f"""
aya
word pos=verb
root={condition}
<: word pos=noun
"""
# In[20]:
print(query)
# In[21]:
results = A.search(query)
# Now we can show the results quite easily:
# In[22]:
# `end=10` limits the table to the first 10 results.
A.table(results, end=10)
# We check the results in transcription
# In[23]:
A.table(results, end=10, fmt="text-trans-full")
# Even better, we have a transcription that only shows the roots of the words:
# In[24]:
A.table(results, end=10, fmt="root-trans-full")
# But we wanted the translations of the relevant ayas. Here they come:
# In[25]:
# `Fs` looks up a feature object by name; it is needed here because
# `translation@en` is not a valid Python identifier, so the usual
# `F.<feature>` attribute access cannot be used.
for (aya, verb, noun) in results:
    print(Fs("translation@en").v(aya))
    print("-----")