#!/usr/bin/env python
# coding: utf-8

# # 日本語NLP@janome/spaCy/Python
# 
# ## Installation
# 
# ```bash:install-japanese-nlp-spacy.sh
# virtualenv .env
# source .env/bin/activate
# pip install -U janome jupyter 
# pip install -U Cython git+https://github.com/explosion/spaCy
# pip freeze > .env/requirements.txt
# ```
# 
# `pip install spacy`だとちょっと古く、日本語サポート`spacy.ja`が入っていなかったので、上記のように`pip install -U git+https://github.com/explosion/spaCy`で最新版をダウンロードし、ビルドしましょう。Cloud9が約30分ビルドに励んでくれました。
# 
# ## Run Jupyter Notebook on Cloud9 IDE
# spaCyが無事インストールできたら、Jupyter Notebookを起動しましょう。
# Cloud9なら：
# 
# ```sh:start-jupyter-c9.sh
# jupyter notebook --port $PORT --ip $IP --no-browser
# ```
# 

# In[14]:


# Build a single janome Tokenizer at module level; tokenize() below reuses
# this one instance for every call.
from janome.tokenizer import Tokenizer
tokenizer = Tokenizer()


# In[15]:


def tokenize(text):
    """Print the janome tokens of ``text``, one per line.

    Runs the module-level ``tokenizer`` over ``text`` and prints every token
    it produces, in order.  Nothing is returned.
    """
    morphemes = tokenizer.tokenize(text)
    for morpheme in morphemes:
        print(morpheme)


# In[16]:


# Sample sentences fed to both tokenizers in the cells below.
text1 = "すもももももももものうち。"
text2 = "庭には鶏が2羽いる。"
text3 = "にわにはニワトリがにわいる。"
# All three samples joined back to back, with no separator between them.
texts = "".join((text1, text2, text3))


# In[17]:


# Print the janome tokens of the first sample sentence.
tokenize(text1)


# In[18]:


# Same for the second sample sentence.
tokenize(text2)


# In[19]:


# Same for the third sample, which is written in hiragana and katakana only.
tokenize(text3)


# In[20]:


# An all-hiragana variant with a different word order and no final punctuation.
tokenize('にわにはにわにわとりがいる')


# In[21]:


# Create the spaCy Japanese language object; spacy_parse() below calls it
# directly on raw text.
# NOTE(review): `spacy.ja` is the module path of the spaCy build this notebook
# was written against; later spaCy releases moved language classes under
# `spacy.lang` -- confirm the import path for the installed version.
from spacy.ja import Japanese
parser = Japanese()


# In[22]:


def print_token(token):
    """Print a separator line followed by three attributes of ``token``.

    The attributes are read from ``token`` one at a time, each immediately
    before its line is printed:

    * ``orth_``  -- printed after the label ``value:``
    * ``lemma_`` -- printed after ``lemma:`` (the root form of the word)
    * ``shape_`` -- printed after ``shape:`` (capitalization and punctuation)

    Nothing is returned.
    """
    labelled_attrs = (
        ("value:", "orth_"),
        ("lemma:", "lemma_"),
        ("shape:", "shape_"),
    )
    print("==========================")
    for label, attr_name in labelled_attrs:
        # Look the attribute up lazily so lines are emitted in the same order
        # as the attributes are accessed.
        print(label, getattr(token, attr_name))

def spacy_parse(text):
    """Parse ``text`` with the module-level ``parser`` and print the result.

    First prints the list of every token's ``orth_`` string, then calls
    ``print_token`` on each token in order.  Nothing is returned.
    """
    doc = parser(text)
    print([tok.orth_ for tok in doc])
    for tok in doc:
        print_token(tok)


# In[23]:


# Parse the first sample sentence with spaCy and print each token's details.
spacy_parse(text1)


# In[24]:


# Same for the second sample sentence.
spacy_parse(text2)


# In[25]:


# Same for the third sample sentence.
spacy_parse(text3)  # author's note: the result here is not quite satisfactory


# In[26]:


# Finally, parse the three samples concatenated into one string.
spacy_parse(texts)


# ## NLTK?
# 
# NLTKも有力らしいですが、まだ試していません。
# 
# ```sh:install-nlp-nltk-python.sh
# pip install -U nltk requests
# ```
# 
# ## Links
# 
# - [explosion/spaCy, NLP with Python and Cython](https://github.com/explosion/spaCy)
# - [mocobeta/janome, Japanese morphological analysis engine](https://github.com/mocobeta/janome)
# - [nbviewer, a simple way to share Jupyter Notebooks](http://nbviewer.jupyter.org/)
# 

# In[ ]: