#!/usr/bin/env python
# coding: utf-8

# # Getting started
#
# It is assumed that you have read
# [start](start.ipynb)
# and followed the installation instructions there.

# # Corpus
#
# This is about
#
# * `quran` the Qur'an

# # First acquaintance
#
# We just want to grasp what the corpus is about and how we can find our way in the data.
#
# Open a terminal or command prompt and give the following command:
#
# ```
# text-fabric quran
# ```
#
# Wait and see a lot happening before your browser starts up and shows you an interface on the corpus.
#
# Text-Fabric needs an app to deal with the corpus-specific things.
# It downloads/finds/caches the latest version of the **app**:
#
# ```
# Using TF-app in /Users/dirk/text-fabric-data/annotation/app-quran/code:
#     #c55d75da760bfdc6ae272b3ade9629fe34d059ce (latest commit)
# ```
#
# It downloads/finds/caches the latest version of the **data**:
#
# ```
# Using data in /Users/dirk/text-fabric-data/q-ran/quran/tf/0.4:
#     rv0.5=#60bd6788dadb13974e89df55cde7687c0593e65f (latest release)
# ```
#
# The data is preprocessed in order to speed up typical Text-Fabric operations.
# The result is cached on your computer.
# Preprocessing costs time, but the next time you use this corpus on this machine, the startup is much quicker.
#
# ```
# TF setup done.
# ```
#
# Then the app goes on to act as a local webserver serving the corpus that has just been downloaded,
# and it opens your browser for you and loads the corpus page:
#
# ```
#  * Running on http://localhost:8105/ (Press CTRL+C to quit)
# Opening quran in browser
# Listening at port 18985
# ```

# # Help!
#
# Indeed, that is what you need. Click the vertical `Help` tab.
#
# From there, click around a little bit. Don't read closely, just note the kinds of information that are presented to you.
#
# Later on, it will make more sense!

# # Browsing
#
# First we browse our data. Click the browse button,
# and then, in the table of *documents* (suras), click on one.
#
# Now you're looking at the ayas of a sura: the text in Arabic unicode characters.
#
# Now click the *Options* tab and select the `text-trans-full` format to see the sura in ascii transcription.
#
# You can click a triangle to see how a line is broken down.

# # Searching
#
# An aya is a verse in a sura.
# Let's find all the ayas that contain a verb followed by the word for Allah.
#
# Enter this query in the search pad and press the search icon above it.
#
# ```
# aya
#   word pos=verb
#   <: word pos=noun
#      posx=proper
#      root=Alh
# ```
#
# In English:
#
# search all `aya`s that contain a `word` and a `word` where:
#
# * `<:` the second `word` follows immediately on the first `word`
# * the first `word` has value `verb` for feature `pos` (part-of-speech)
# * the second `word` has
#   * value `noun` for feature `pos`
#   * value `proper` for feature `posx` (subcategorisation of part-of-speech)
#   * value `Alh` for feature `root` (basic word form, more fundamental than lemma)
#
# You can expand results by clicking the triangle.
#
# You can see a result in context by clicking the browse icon.
#
# You can go back to the result list by clicking the results icon.
#
# This corpus has a feature for ayas in which an English translation is given: `translation@en`.
#
# We can trigger the display of these translations by mentioning the feature in the query without posing additional constraints:
#
# ```
# aya translation@en*
#   word pos=verb
#   <: word pos=noun
#      posx=proper
#      root=Alh
# ```
#
# The `*` means: any value or even no value.
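# A side note (our own sketch, not part of the tutorial text): `<:` is only one of the
# relational operators of Text-Fabric search templates. To the best of our understanding
# of the Text-Fabric documentation, `<` means "comes before" without the adjacency
# requirement, so a variant like the one below should also find verb/noun pairs that are
# separated by other words within the same aya:
#
# ```
# aya
#   word pos=verb
#   < word pos=noun
#     posx=proper
#     root=Alh
# ```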
# # Computing
#
# We have found verbs associated with Allah.
#
# The question comes to mind: are there verbs that are associated with Allah only in this way?
#
# Let us look for verbs followed by nouns, and for each verb, count how many different nouns occur in that position.
#
# *This is a typical question where you want to leave the search mode and enter computing mode.*
#
# Let's do that!
#
# If you have followed the installation instructions, you are all set.
# Go to the browser window that opened when you gave the command `jupyter notebook` in your terminal.
#
# Then continue reading, and, ... executing.
#
# You can execute a cell by putting your cursor inside it and pressing `Shift Enter`.
#
# First we load the Text-Fabric module, as follows:

# In[1]:

import collections

import seaborn as sns

from tf.app import use

# Now we load the TF-app for the corpus `quran` and that app loads the corpus data.
#
# We give a name to the result of all that loading: `A`.

# In[2]:

A = use("quran:clone", checkout="clone", hoist=globals())
# A = use('quran', hoist=globals())

# Some bits are familiar from above, when you ran the `text-fabric` command in the terminal.
#
# Other bits are links to the documentation; they point to the same places as the links in the Text-Fabric browser.
#
# You see a list of all the data features that have been loaded.
#
# And a list of references to the API documentation, which tells you how you can use this data in your program statements.

# # Searching (revisited)
#
# We do the same search again, but now inside our program.
#
# That means that we can capture the results in a list for further processing.

# In[3]:

results = A.search(
    """
aya
  word pos=verb
  <: word pos=noun
     posx=proper
     root=Alh
"""
)

# In less than a second, we have all the results!
#
# Let's look at the first one:

# In[4]:

results[0]

# Each result is a list of numbers: one for each of
#
# 1. an aya
# 1. a word
# 1. a word
#
# Here is the second one:

# In[5]:

results[1]

# And here is the last one:

# In[6]:

results[-1]

# Now we modify the query to get all pairs of a verb followed by a noun, proper or not.

# In[7]:

results = A.search(
    """
aya
  word pos=verb
  <: word pos=noun
"""
)

# We are going to make a bucket for the root of each verb found,
# and in those buckets we throw the roots of all nouns found after them.
# We also count the number of occurrences of each noun root in the buckets.

# In[9]:

buckets = collections.defaultdict(collections.Counter)

# In[10]:

# F is the feature API, hoisted into the global namespace by `use(...)` above;
# F.root.v(n) looks up the value of the `root` feature for node n.
for (aya, verb, noun) in results:
    buckets[F.root.v(verb)][F.root.v(noun)] += 1

# How many buckets do we have?

# In[11]:

len(buckets)

# How many nouns are there in each bucket?

# In[12]:

min(len(nouns) for (bucket, nouns) in buckets.items())

# In[13]:

max(len(nouns) for (bucket, nouns) in buckets.items())

# Let's get a picture of the distribution.

# In[15]:

sns.set(color_codes=True)
sns.distplot(list(len(nouns) for nouns in buckets.values()), axlabel="number of nouns")

# We collect the buckets of length 1 in two sets: those with Allah in the bucket and those with another word:

# In[16]:

buckets1A = {
    bucket for (bucket, nouns) in buckets.items() if len(nouns) == 1 and "Alh" in nouns
}
buckets1N = {
    bucket for (bucket, nouns) in buckets.items() if len(nouns) == 1 and "Alh" not in nouns
}
print(f"with Alh:    {len(buckets1A):>3}")
print(f"without Alh: {len(buckets1N):>3}")

# Finally we show the roots of the verbs that are associated only with `Alh`:

# In[17]:

sorted(buckets1A)

# And now we want to find them back in the text and show the translations of the ayas that contain them.
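# Before we do that, a quick peek inside the buckets may help.
# The following cell is our own illustrative sketch, not part of the original tutorial;
# it only uses `buckets` as built above and the standard `Counter.most_common()` method.

# In[ ]:

# For the five verb roots with the most distinct noun roots after them,
# show their three most frequent noun roots with occurrence counts.
for verbRoot in sorted(buckets, key=lambda b: len(buckets[b]), reverse=True)[0:5]:
    top = ", ".join(f"{noun} ({n}x)" for (noun, n) in buckets[verbRoot].most_common(3))
    print(f"{verbRoot}: {top}")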
# We compute a query out of the `buckets1A` contents, and run it.

# In[18]:

condition = "|".join(buckets1A)
condition

# In[19]:

query = f"""
aya
  word pos=verb root={condition}
  <: word pos=noun
"""

# In[20]:

print(query)

# In[21]:

results = A.search(query)

# Now we can show the results quite easily:

# In[22]:

A.table(results, end=10)

# We check the results in transcription:

# In[23]:

A.table(results, end=10, fmt="text-trans-full")

# Even better, we have a transcription that shows only the roots of the words:

# In[24]:

A.table(results, end=10, fmt="root-trans-full")

# But we wanted the translations of the relevant ayas. Here they come:

# In[25]:

# Fs() retrieves features whose names are not valid Python identifiers,
# such as `translation@en`.
for (aya, verb, noun) in results:
    print(Fs("translation@en").v(aya))
    print("-----")
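# The same aya can appear several times in `results`: once for every matching verb/noun pair in it.
# As a final touch we print each translation only once. This closing cell is our own sketch
# (not part of the original tutorial); it relies only on `results` and `Fs` as used above.

# In[ ]:

# Deduplicate the aya nodes while preserving their order of first appearance.
seen = set()
uniqueAyas = []
for (aya, verb, noun) in results:
    if aya not in seen:
        seen.add(aya)
        uniqueAyas.append(aya)

print(f"{len(results)} results in {len(uniqueAyas)} distinct ayas\n")
for aya in uniqueAyas:
    print(Fs("translation@en").v(aya))
    print("-----")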