#!/usr/bin/env python
# coding: utf-8

# In[15]:


get_ipython().run_cell_magic('capture', '', '%load_ext autoreload\n%autoreload 2\n%cd ..\nimport statnlpbook.tokenization as tok\n')


# # Tokenization
# 
# * Identify the **words** in a string of characters.
# * Improve the input **representation** in the [structured prediction recipe](structured_prediction.ipynb).

# In Python you can tokenize a text via `split`:

# In[45]:


text = """Mr. Bob Dobolina is thinkin' of a master plan.
Why doesn't he quit?"""
text.split(" ")


# Why is this suboptimal?

# Python allows users to construct tokenizers using
# 
# ### Regular Expressions
# 
# that define **patterns** at which to split tokens.

# A **regular expression** is a compact definition of a **set** of (character) sequences.
# 
# Examples:
# * `"Mr."`: the set containing only `"Mr."`
# * `" |\n|!!!"`: the set containing the sequences `" "`, `"\n"` and `"!!!"`
# * `"[abc]"`: the set containing only the characters `a`, `b` and `c`
# * `"\s"`: the set of all whitespace characters
# * `"1+"`: the set of all sequences of at least one `"1"`
# * etc.

# In[53]:


import re
re.compile(r'\s').split(text)


# Problems:
# * Bad treatment of punctuation.
# * It is easier to **define a token** than a gap.

# Let us use `findall` instead:

# In[57]:


re.compile(r'\w+|[.?]').findall(text)


# Problems:
# * "Mr." is split into two tokens, but should be a single token.
# * Apostrophes are lost ("thinkin'", "doesn't").
# 
# Both are fixed below ...

# In[58]:


re.compile(r"Mr\.|[\w']+|[.?]").findall(text)  # "Mr\." keeps the abbreviation together; [\w']+ keeps apostrophes


# ## Learning to Tokenize?
# 
# * For English, simple pattern matching is often sufficient.
# * In other languages (e.g. Japanese), words are not separated by whitespace.

# In[20]:


jap = "今日もしないといけない。"


# Try lexicon-based tokenization ...

# In[60]:


re.compile('もし|今日|も|しない|と|けない').findall(jap)


# Tokenization is equally complex in certain English domains (e.g. biomedical text).

# In[61]:


bio = """We developed a nanocarrier system of herceptin-conjugated nanoparticles of d-alpha-tocopheryl-co-poly(ethylene glycol) 1000 succinate (TPGS)-cisplatin prodrug ..."""


# * d-alpha-tocopheryl-co-poly is **one** token
# * (TPGS)-cisplatin are **five** tokens:
#     * (
#     * TPGS
#     * )
#     * -
#     * cisplatin

# In[23]:


re.compile(r'\s').split(bio)[:15]


# Solution: treat tokenization as a **statistical NLP problem** (and as structured prediction)!
# * [classification](doc_classify.ipynb)
# * [sequence labelling](sequence_labelling.ipynb)

# # Sentence Segmentation
# 
# * Many NLP tools work sentence-by-sentence.
# * Often trivial after tokenization: split sentences at sentence-ending punctuation tokens.

# In[63]:


tokens = re.compile(r"Mr\.|[\w']+|[.?]").findall(text)  # try different regular expressions
tok.sentence_segment(re.compile(r'\.'), tokens)


# What do you do with transcribed speech?

# # Background Reading
# 
# * Jurafsky & Martin, Speech and Language Processing: Chapter 2, Regular Expressions and Automata.
# * Manning, Raghavan & Schuetze, Introduction to Information Retrieval: [Tokenization](http://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html)
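# As an illustration of the desired behaviour on the biomedical example above, one
# hand-written token-defining pattern (a rough sketch, not a pattern from the course
# materials) keeps hyphenated chemical names together while splitting off parentheses
# and stand-alone hyphens:

# In[ ]:


# \w+(?:-\w+)*  : words, optionally joined by internal hyphens ("d-alpha-tocopheryl-co-poly")
# [()]          : parentheses as separate tokens
# [-.,;]        : stand-alone hyphens and punctuation as separate tokens
re.compile(r'\w+(?:-\w+)*|[()]|[-.,;]').findall(bio)


# This yields "d-alpha-tocopheryl-co-poly" as one token and "(TPGS)-cisplatin" as the
# five tokens listed above, but such hand-crafted patterns quickly become brittle,
# which is another argument for learned tokenization.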
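# The `tok.sentence_segment` helper used above comes from the course's `statnlpbook`
# package, whose implementation is not shown in this notebook. Purely as a stand-in
# sketch (an assumption about its behaviour, not the package's actual code), a minimal
# version could start a new sentence after every token that matches the given pattern:

# In[ ]:


def sentence_segment(match_regex, tokens):
    """Split a flat token list into sentences; a new sentence starts after
    every token that fully matches `match_regex` (sketch only)."""
    sentences = []
    current = []
    for token in tokens:
        current.append(token)
        if match_regex.fullmatch(token):
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences


# With the pattern `\.`, a new sentence starts after each "." token.
sentence_segment(re.compile(r'\.'), tokens)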