#!/usr/bin/env python
# coding: utf-8

# # Notebook Example

# In[1]:

from lexicalrichness import LexicalRichness
import lexicalrichness

lexicalrichness.__version__


# In[2]:

# Enter your own text here if you prefer
text = """Measure of textual lexical diversity, computed as the mean length of sequential words in a text that maintains a minimum threshold TTR score. Iterates over words until the TTR score falls below a threshold, then increases the factor count by 1 and starts over. McCarthy and Jarvis (2010, pg. 385) recommend a factor threshold in the range of [0.660, 0.750]. (McCarthy 2005, McCarthy and Jarvis 2010)"""

# instantiate new text object (use the tokenizer=blobber argument to use the textblob tokenizer)
lex = LexicalRichness(text)


# ## Attributes

# In[3]:

# Get the list of words
list_of_words = lex.wordlist
print(list_of_words[:10], list_of_words[-10:])


# In[4]:

# Return word count (w).
lex.words


# In[5]:

# Return unique term count (t).
lex.terms


# **Type-token ratio** (TTR; Chotlos 1944, Templin 1957):
# $$
# TTR = \frac{t}{w}
# $$
# where $t$, or $t(w)$, is the number of unique terms as a function of the length of the text in words, $w$.

# In[6]:

# Return type-token ratio (TTR) of text.
lex.ttr


# **Root TTR** (RTTR; Guiraud 1954, 1960):
# $$
# RTTR = \frac{t}{\sqrt{w}}
# $$

# In[7]:

# Return root type-token ratio (RTTR) of text.
lex.rttr


# **Corrected TTR** (CTTR; Carroll 1964):
# $$
# CTTR = \frac{t}{\sqrt{2w}}
# $$

# In[8]:

# Return corrected type-token ratio (CTTR) of text.
lex.cttr


# **Herdan's C** (Herdan 1960, 1964):
# $$
# C = \frac{\log(t)}{\log(w)}
# $$

# In[9]:

# Return Herdan's C
lex.Herdan


# **Summer's index** (Summer 1966):
# $$
# Summer = \frac{\log\log(t)}{\log\log(w)}
# $$

# In[10]:

# Return Summer's index
lex.Summer


# **Dugast's index** (Dugast 1978):
# $$
# Dugast = \frac{\log(w)^2}{\log(w) - \log(t)}
# $$

# In[11]:

# Return Dugast's index
lex.Dugast


# **Maas's index** (Maas 1972):
# $$
# Maas = \frac{\log(w) - \log(t)}{\log(w)^2}
# $$

# In[12]:

# Return Maas's index
lex.Maas


# **Yule's K** (Yule 1944, Tweedie and Baayen 1998):
# $$
# K = 10^4 \times \left\{ \sum^{n_{\text{max}}}_{i=1} f(i, w) \left(\frac{i}{w}\right)^2 - \frac{1}{w} \right\}
# $$
# where $f(i, w)$ is the number of terms that occur $i$ times in a text of length $w$.

# In[13]:

# Return Yule's K
lex.yulek


# **Yule's I** (Yule 1944, Tweedie and Baayen 1998):
# $$
# I = \frac{t^2}{\sum^{n_{\text{max}}}_{i=1} i^2 f(i, w) - t}
# $$

# In[14]:

# Return Yule's I
lex.yulei


# **Herdan's Vm** (Herdan 1955, Tweedie and Baayen 1998):
# $$
# V_m = \sqrt{\sum^{n_{\text{max}}}_{i=1} f(i, w) \left(\frac{i}{w} \right)^2 - \frac{1}{w}}
# $$

# In[15]:

# Return Herdan's Vm
lex.herdanvm


# **Simpson's D** (Simpson 1949, Tweedie and Baayen 1998):
# $$
# D = \sum^{n_{\text{max}}}_{i=1} f(i, w) \, \frac{i}{w} \cdot \frac{i - 1}{w - 1}
# $$

# In[16]:

# Return Simpson's D
lex.simpsond


# ## Methods

# ### MSTTR: Mean segmental type-token ratio
#
# * Computed as the average of the TTR scores of successive segments of a text
# * Split the text into non-overlapping segments of length segment_window, compute the TTR of each segment, and divide the sum of these TTRs by the number of segments
# * (Johnson 1944)

# In[17]:

lex.msttr(
    segment_window=25  # size of each segment
)


# ### MATTR: Moving average type-token ratio
# * Computed from the TTRs of successive sliding windows over the text (a rough sketch of the idea follows below)
# * The MATTR score is the average of all the window TTRs
# * (Covington 2007, Covington and McFall 2010)
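# The cell below is a rough, hand-rolled sketch of the sliding-window idea,
# assuming windows that advance one word at a time; it is illustrative only,
# and the library's own MATTR implementation may differ in details such as
# tokenization and edge handling.

# In[ ]:

# Compute the TTR of every window of 25 consecutive words from lex.wordlist,
# then average the window TTRs (illustrative sketch, not the library's code).
words = lex.wordlist
window = 25
window_ttrs = [
    len(set(words[i:i + window])) / window  # unique terms / window size
    for i in range(len(words) - window + 1)
]
sum(window_ttrs) / len(window_ttrs)
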
# In[18]:

# Return moving average type-token ratio (MATTR).
lex.mattr(
    window_size=25  # size of each sliding window
)


# ### MTLD: Measure of textual lexical diversity
#
# * Computed as the mean length of sequential words in a text that maintains a minimum threshold TTR score
# * Iterates over words until the TTR score falls below the threshold, then increases the factor count by 1 and starts over
# * (McCarthy 2005, McCarthy and Jarvis 2010)

# In[19]:

lex.mtld(
    # Factor threshold for MTLD.
    # The algorithm starts a new segment when the running TTR falls below the threshold.
    threshold=0.72
)


# ### voc-D
# * vocd score of lexical diversity derived from a series of TTR samplings and curve fittings
# * *Step 1*: Take 100 random samples of 35 words from the text and compute the mean TTR over the 100 samples
# * *Step 2*: Repeat this procedure for samples of 36 words, 37 words, and so on, up to ntokens (recommended and default: 50). This yields an array of mean TTR values, one for each sample size from ntoken=35 to ntoken=50
# * *Step 3*: Fit a curve to the empirical TTR-versus-sample-size values. The value of D that provides the best fit is the vocd score
# * *Step 4*: Repeat steps 1 to 3 a number of times (default=3) and average the resulting Ds; this average is the returned value

# In[20]:

lex.vocd(
    ntokens=50,         # Maximum token/word size of the random samplings
    within_sample=100,  # Number of samples per token size
    iterations=3,       # Number of times to repeat steps 1 to 3 before averaging
    seed=42             # Seed for reproducibility
)


# In[21]:

# Return voc-D with the default arguments.
lex.vocd()


# ### voc-D plot utility
# * Utility to plot the empirical voc-D curve and the best-fitting line

# In[22]:

lex.vocd_fig(
    ntokens=50,         # Maximum token/word size of the random samplings
    within_sample=100,  # Number of samples per token size
    seed=42,            # Seed for reproducibility
    savepath="images/vocd.png",
)


# ### HD-D
# * Hypergeometric distribution diversity (HD-D) score
# * (McCarthy and Jarvis 2007)

# In[23]:

lex.hdd(
    draws=42  # Number of random draws in the hypergeometric distribution
)
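# As a quick sanity check, the simplest measures above can be recomputed by
# hand from the attributes lex.words and lex.terms, using the formulas given
# in this notebook. This is an illustrative sketch, not the library's
# internal code.

# In[ ]:

import math

manual_ttr = lex.terms / lex.words                    # TTR = t / w
manual_rttr = lex.terms / math.sqrt(lex.words)        # RTTR = t / sqrt(w)
manual_c = math.log(lex.terms) / math.log(lex.words)  # Herdan's C = log(t) / log(w)

print(manual_ttr, lex.ttr)
print(manual_rttr, lex.rttr)
print(manual_c, lex.Herdan)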