#!/usr/bin/env python
# coding: utf-8

# In[15]:


get_ipython().run_cell_magic('capture', '', '%load_ext autoreload\n%autoreload 2\n%cd ..\nimport statnlpbook.tokenization as tok\n')


# # Tokenization
# 
# * Identify the **words** in a string of characters.
# * Improve the input **representation** in the [structured prediction recipe](structured_prediction.ipynb).

# In Python you can tokenize a text via `split`:

# In[45]:


text = """Mr. Bob Dobolina is thinkin' of a master plan.
Why doesn't he quit?"""
text.split(" ")


# Why is this suboptimal?

# Python allows users to construct tokenizers using
# 
# ### Regular Expressions
# 
# that define **patterns** at which to split tokens.

# A **regular expression** is a compact definition of a **set** of (character) sequences.
# 
# Examples:
# * `"Mr."`: the set containing only `"Mr."`
# * `" |\n|!!!"`: the set containing the sequences `" "`, `"\n"` and `"!!!"`
# * `"[abc]"`: the set containing only the characters `a`, `b` and `c`
# * `"\s"`: the set of all whitespace characters
# * `"1+"`: the set of all sequences of at least one `"1"`
# * etc.

# In[53]:


import re
re.compile(r'\s').split(text)


# Problems:
# * Bad treatment of punctuation.
# * It is easier to **define a token** than a gap.

# Let us use `findall` instead:

# In[57]:


re.compile(r'\w+|[.?]').findall(text)


# Problems:
# * "Mr." is split into two tokens, but should be a single token.
# * Apostrophes are lost ("thinkin'", "doesn't").
# 
# Both are fixed below ...

# In[58]:


re.compile(r"Mr\.|[\w']+|[.?]").findall(text)  # "Mr\." keeps the abbreviation together; [\w']+ keeps apostrophes


# ## Learning to Tokenize?
# 
# * For English, simple pattern matching is often sufficient.
# * In other languages (e.g. Japanese), words are not separated by whitespace.

# In[20]:


jap = "今日もしないといけない。"


# Try lexicon-based tokenization ...

# In[60]:


re.compile('もし|今日|も|しない|と|けない').findall(jap)


# Tokenization is equally complex in certain English domains (e.g. biomedical text).

# In[61]:


bio = """We developed a nanocarrier system of herceptin-conjugated nanoparticles of d-alpha-tocopheryl-co-poly(ethylene glycol) 1000 succinate (TPGS)-cisplatin prodrug ..."""


# * d-alpha-tocopheryl-co-poly is **one** token
# * (TPGS)-cisplatin are **five** tokens:
#     * (
#     * TPGS
#     * )
#     * -
#     * cisplatin

# In[23]:


re.compile(r'\s').split(bio)[:15]


# Solution: treat tokenization as a **statistical NLP problem** (and as structured prediction)!
# * [classification](doc_classify.ipynb)
# * [sequence labelling](sequence_labelling.ipynb)

# # Sentence Segmentation
# 
# * Many NLP tools work sentence-by-sentence.
# * Often trivial after tokenization: split sentences at sentence-ending punctuation tokens.

# In[63]:


tokens = re.compile(r"Mr\.|[\w']+|[.?]").findall(text)  # try different regular expressions
tok.sentence_segment(re.compile(r'\.'), tokens)


# What do you do with transcribed speech?

# # Background Reading
# 
# * Jurafsky & Martin, Speech and Language Processing: Chapter 2, Regular Expressions and Automata.
# * Manning, Raghavan & Schuetze, Introduction to Information Retrieval: [Tokenization](http://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html)
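# As an illustration of the desired behaviour on the biomedical example above, one
# hand-written token-defining pattern (a rough sketch, not a pattern from the course
# materials) keeps hyphenated chemical names together while splitting off parentheses
# and stand-alone hyphens:

# In[ ]:


# \w+(?:-\w+)*  : words, optionally joined by internal hyphens ("d-alpha-tocopheryl-co-poly")
# [()]          : parentheses as separate tokens
# [-.,;]        : stand-alone hyphens and punctuation as separate tokens
re.compile(r'\w+(?:-\w+)*|[()]|[-.,;]').findall(bio)


# This yields "d-alpha-tocopheryl-co-poly" as one token and "(TPGS)-cisplatin" as the
# five tokens listed above, but such hand-crafted patterns quickly become brittle,
# which is another argument for learned tokenization.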
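# The `tok.sentence_segment` helper used above comes from the course's `statnlpbook`
# package, whose implementation is not shown in this notebook. Purely as a stand-in
# sketch (an assumption about its behaviour, not the package's actual code), a minimal
# version could start a new sentence after every token that matches the given pattern:

# In[ ]:


def sentence_segment(match_regex, tokens):
    """Split a flat token list into sentences; a new sentence starts after
    every token that fully matches `match_regex` (sketch only)."""
    sentences = []
    current = []
    for token in tokens:
        current.append(token)
        if match_regex.fullmatch(token):
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences


# With the pattern `\.`, a new sentence starts after each "." token.
sentence_segment(re.compile(r'\.'), tokens)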