#!/usr/bin/env python
# coding: utf-8

# # **자연어 처리 바이블**
# - **[정규표현식 시작하기](https://wikidocs.net/4308) | [정규표현식 고급편](https://wikidocs.net/4309)**
# 
# ```bash
# pip install konlpy
# pip install pororo
# bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
# ```
# ## **1 구문분석**
# ### **Tokenizer**

# In[1]:


from konlpy.tag import Kkma
from konlpy.tag import Okt
from konlpy.tag import Mecab
# Run the MeCab tagger on a sample sentence. This is a bare expression, so
# the result is only echoed in an interactive session (notebook/REPL); when
# the file runs as a script the value is discarded.
Mecab().pos("아이들이 케이크를 먹었다")


# In[2]:


from pororo import Pororo
# Ask Pororo which models exist for the "collocation" task. The return value
# is neither bound nor printed, so the value itself is not displayed.
Pororo.available_models("collocation")
# Build a Korean ("ko") named-entity-recognition task object.
ner = Pororo(task="ner", lang="ko")
# Apply it to a sample sentence; as a bare expression the result is only
# echoed in an interactive session.
ner("마이클 제프리 조던(영어: Michael Jeffrey Jordan, 1963년 2월 17일 ~ )농구선수이다.")


# ## **2 NLTK 를 활용한 구조 구문분석**
# ### **01 NLTK 패키지를 활용한 규칙기반 구조 구문분석**

# In[3]:


import nltk
# Toy context-free grammar for the example sentence used in the next cell.
# Its terminals are individual morphemes ('아이', '들', '이', ...), so the
# parser must be fed a list of morphemes rather than whitespace-split words.
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> NN XSN JK | NN JK
VP -> NP VP | VV EP EF
NN -> '아이' | '케이크'
XSN -> '들'
JK -> '이' | '를'
VV -> '먹'
EP -> '었'
EF -> '다'
""")
# Chart parser driven by the grammar above.
parser = nltk.ChartParser(grammar)
# Bare expression: only echoed in an interactive session.
parser


# In[4]:


from konlpy.tag import Mecab

# Sentence to parse with the rule-based grammar.
texts = "아이들이 케이크를 먹었다"

# Tag the sentence with MeCab, then keep only the first element of every
# tagged item so the parser receives a plain list of morpheme strings.
tokens = Mecab().pos(texts)
sentence = [tagged[0] for tagged in tokens]
print(", ".join(sentence))

# Print every parse tree the chart parser finds for the morpheme sequence.
for tree in parser.parse(sentence):
    print(tree)


# ### **02 Spacy 를 이용한 의존 구문 분석**
# ```bash
# ! pip install spacy
# ! python -m spacy download en_core_web_sm
# ```
# - Spacy 모델은 문장을 token들로 구성된 document로 처리한다.
# - 각 token에는 품사, 의존 관계, 개체명 정보 등이 태깅된다.
#     - token.text: token 문자열
#     - token.dep_: token과 token의 지배소 간의 의존 관계 유형
#     - token.head: 지배소 token
# 
# 

# In[5]:


import spacy
# English multi-task statistical model
nlp = spacy.load('en_core_web_sm')
doc = nlp('The fat cat sat on the mat')
# For every token print its text, its dependency label, and the text of its
# head token.
for token in doc:
    print(token.text, token.dep_, token.head.text)


# In[6]:


from spacy import displacy
# Render the dependency parse of `doc`.
# Works in Jupyter, Colab, and similar notebook environments.
displacy.render(doc, style='dep', jupyter=True)


# ## **3 단어 의미 중의성 문제의 해결**
# ### **01 Lesk 알고리즘을 이용한 단어 중의성 해소**

# In[7]:


import nltk
# Download the NLTK data packages 'wordnet', 'punkt' and 'stopwords' before
# importing the corpus readers and tokenizer below.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import wordnet 
from nltk import word_tokenize
from nltk.corpus import stopwords
import sys


# In[8]:


def disambiguate(word, sentence, stopwords):
    """Pick the WordNet sense of *word* that best fits *sentence* (Lesk).

    The signature of every sense (see ``tokenized_gloss``) is compared with
    the tokens of *sentence* through ``compute_overlap``; the sense with the
    largest overlap wins.

    Args:
        word: Word to disambiguate.
        sentence: Sentence that supplies the context for *word*.
        stopwords: Collection of words removed from a signature before the
            overlap is counted.

    Returns:
        The best-matching synset. If no sense overlaps the context at all,
        the first synset listed by WordNet is returned. ``None`` when
        WordNet has no synsets for *word*.
    """
    word_senses = wordnet.synsets(word)
    if not word_senses:
        # Unknown word: indexing [0] below would raise IndexError.
        return None

    # Fallback: assume the first listed sense is the most frequent one.
    best_sense = word_senses[0]
    max_overlap = 0
    context = set(word_tokenize(sentence))
    for sense in word_senses:
        signature = tokenized_gloss(sense)
        overlap = compute_overlap(signature, context, stopwords)
        # Strict ">" keeps the earliest sense on ties.
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    return best_sense


# In[9]:


def tokenized_gloss(sense):
    """Return the set of tokens found in a sense's definition and examples.

    Args:
        sense: Object exposing ``definition()`` (a string) and ``examples()``
            (an iterable of strings), e.g. a WordNet synset.

    Returns:
        A set with every token of the definition and of each example.
    """
    tokens = set(word_tokenize(sense.definition()))
    for example in sense.examples():
        # set.union() returns a new set and leaves `tokens` untouched, which
        # silently dropped the example tokens; update() merges in place.
        tokens.update(word_tokenize(example))
    return tokens


# In[10]:


def compute_overlap(signature, context, stopwords):
    """Count the signature words, stop words excluded, that occur in context.

    Args:
        signature: Set of tokens describing one sense.
        context: Tokens of the sentence being disambiguated.
        stopwords: Words removed from the signature before comparing.

    Returns:
        Number of non-stop-word signature tokens also present in *context*.
    """
    content_words = signature.difference(stopwords)
    shared = content_words.intersection(context)
    return len(shared)


# In[11]:


# English stop words shipped with NLTK (e.g. i, my, they).
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to
# a plain set, so the line cannot run a second time without re-importing.
stopwords = set(stopwords.words('english'))

# Target word and the sentence that gives it context.
word = 'eat'
sentence = "They eat a meal"
context = set(word_tokenize(sentence))
print("Word :", word)

# Inspect the second WordNet sense of the word by hand.
syn = wordnet.synsets(word)[1]
print("Sense :", syn.name())
print("Definition :", syn.definition())
print("Sentence :", sentence)

# Signature of that sense and its overlap with the sentence tokens,
# followed by the sense chosen by the full algorithm.
signature = tokenized_gloss(syn)
print(signature)
print(compute_overlap(signature, context, stopwords))
print("Best sense: ", disambiguate(word, sentence, stopwords))


# ## **4 NLTK를 이용한 개체명 인식**
# ### **01 NLTK를 이용한 개체명 인식**

# In[12]:


# -*- coding:utf-8 -*-
import nltk
# Download the NLTK data packages named below before running the
# tokenize / POS-tag / NE-chunk cells that follow.
nltk.download('punkt')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')


# In[13]:


# Sample English sentence for the named-entity walkthrough.
sentence = "Prime Minister Boris Johnson had previously said the UK would leave by 31 October."
# Step 1: split the sentence into tokens.
tokens = nltk.word_tokenize(sentence)
print(tokens)


# In[14]:


# Step 2: part-of-speech tag the tokens.
tagged = nltk.pos_tag(tokens)
print(tagged)


# In[15]:


# Step 3: run NLTK's named-entity chunker over the tagged tokens.
entities = nltk.chunk.ne_chunk(tagged)
print(entities)


# ## **5 N-Gram 언어 모델로 문장 생성하기**
# ### **01 NLTK 를 이용한 N-Gram 생성**

# In[22]:


from konlpy.tag import Mecab
from nltk.util import ngrams

texts = "마이클 제프리 조던(Michael Jeffrey Jordan) 농구선수이다."

# Tag the text with MeCab and render every tagged item as one "/"-joined
# string.
tokens = ["/".join(tagged) for tagged in Mecab().pos(texts)]

# Turn the tokens into N-grams; the second argument of ngrams() is N.
trigram = ngrams(tokens, 3)
# Bare expression: materialises the trigrams, echoed only interactively.
list(trigram)


# In[23]:


# Padding adds tokens that mark the start ("<s>") and end ("</s>") of the
# sentence to the input data.
bigram = ngrams(
    tokens, 2,
    pad_left=True, left_pad_symbol="<s>",
    pad_right=True, right_pad_symbol="</s>"
)
print("bigrams with padding: ")
# Print every padded bigram on its own line.
for b in bigram:
    print(b)


# In[25]:


# Read the downloaded dataset and keep only the text column, leaving out the
# index and label columns.
# NOTE: the file is opened through codecs, but its whole content is loaded
# with a single f.read() call.
import codecs

with codecs.open("data/ratings_train.txt", encoding='utf-8') as f:
    # Split into lines (newlines removed), skip the header row, then split
    # every remaining line into its tab-separated fields.
    data = [record.split('\t') for record in f.read().splitlines()[1:]]

# The second field of every row is the text; the printed count is the number
# of sentences (the original notebook reports 150,000).
docs = [fields[1] for fields in data]
print(f"데이터셋: {data[:10]}\n텍스트 데이터: {docs[:5]}\n문장 개수: {len(docs)}")


# In[28]:


from konlpy.tag import Mecab
from tqdm import tqdm

# Collect the padded bigrams of every tokenized document in one flat list.
# Each document is first analysed with MeCab and rendered as "/"-joined
# strings, the same tokenisation as the single-sentence N-gram example.
# Iterating the raw string instead would produce character-level bigrams.
mecab = Mecab()  # build the tagger once rather than once per document
sentences = []
for text in tqdm(docs):
    tokens = ["/".join(tagged) for tagged in mecab.pos(text)]
    bigram = ngrams(tokens, 2, pad_left=True, pad_right=True, left_pad_symbol="<s>", right_pad_symbol="</s>")
    sentences.extend(bigram)
print(sentences[:5])