#!/usr/bin/env python
# coding: utf-8
#
# # **Naive Bayes for Korean Text**
# A Naver movie review sentiment model built with nltk [nltk book](https://www.nltk.org/book/ch06.html) | [nltk How to](http://www.nltk.org/howto/classify.html)
# 1. 0 : negative review
# 1. 1 : positive review
#
# ## **1 Preprocessing the NAVER Movie Review Data**
# 1. https://github.com/e9t/nsmc
# 1. https://www.nltk.org/book/ch06.html
# In[1]:
# ! cat ../data/ratings_test.txt | head -n 10
# In[2]:
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    data = data[1:]  # drop the header row
    # keep a random 5% subset; randint is inclusive on both ends,
    # so the upper bound must be len(data) - 1
    from random import randint
    random_data = [data[randint(0, len(data) - 1)] for _ in range(len(data) // 20)]
    return random_data
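# A minimal alternative sketch: random.sample draws the 5% subset without replacement, so no review is picked twice (the randint version can repeat rows). The read_data_sampled name is hypothetical, not part of the original notebook.
# In[ ]:
import random

def read_data_sampled(filename, fraction=20):
    # hypothetical variant of read_data: sample 1/fraction of rows without replacement
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    data = data[1:]  # drop the header row
    return random.sample(data, len(data) // fraction)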
# In[3]:
train_data = read_data('../data/ratings_train.txt')
test_data = read_data('../data/ratings_test.txt')
print('Train_data ({})\nsample : {}\nTest_data ({})\nsample : {}'.format(
len(train_data), train_data[:3],
len(test_data), test_data[:3]))
# In[4]:
get_ipython().run_cell_magic('time', '', "from konlpy.tag import Twitter\ntwitter = Twitter()\n\ndef tokenize(doc):\n result = ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]\n return result \n\ntrain_docs = [(tokenize(row[1]), row[2]) for row in train_data]\ntest_docs = [(tokenize(row[1]), row[2]) for row in test_data]\n\nfrom pprint import pprint\npprint(train_docs[:2])\n")
# In[5]:
tokens = [t for d in train_docs
          for t in d[0]]
print("Total tokens : {}\nSample : {}".format(
    len(tokens), tokens[:5]))
#
# ## **2 Building a Classification Model with nltk**
# https://github.com/e9t/nsmc
# In[6]:
import nltk
text = nltk.Text(tokens, name='Naver Reviews')
print("number of Token : {} \nunique Token : {}\n".format(
len(text.tokens), len(set(text.tokens))))
pprint(text.vocab().most_common(20))
# In[7]:
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import rc
rc('font', family=['NanumGothic', 'Malgun Gothic'])  # Korean-capable fonts (Linux / Windows)
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 5))
text.plot(50)  # frequency plot of the 50 most common tokens
#
# ## **3 Improving the Model's Accuracy and Generalization**
# Select the 4,000 most frequent tokens as the feature vocabulary
# In[8]:
# Build the feature word list
selected_words = [f[0] for f in text.vocab().most_common(4000)]
selected_words[:5]
# In[9]:
# Mission 1
# Save the selected_words object to '../data/selected.words'
import pickle
with open('../data/selected.words', 'wb') as f:
    pickle.dump(selected_words, f)
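# A short sketch of the reverse step: reload the pickled word list to confirm the save (the selected_words_loaded name is an assumption, not from the original notebook).
# In[ ]:
with open('../data/selected.words', 'rb') as f:
    selected_words_loaded = pickle.load(f)  # assumed name for the reloaded list
print(len(selected_words_loaded), selected_words_loaded[:5])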
# In[10]:
get_ipython().run_cell_magic('time', '', "def term_exists(doc):\n return {'exists({})'.format(word): (word in set(doc)) for word in selected_words}\n\ntrain_xy = [(term_exists(d), c) for d, c in train_docs]\ntest_xy = [(term_exists(d), c) for d, c in test_docs]\n")
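# Materializing 4,000 boolean features per document gets memory-heavy on the full corpus. nltk provides nltk.classify.apply_features, which builds featuresets lazily; a sketch using the same term_exists and docs as above (the *_lazy names are assumptions):
# In[ ]:
from nltk.classify import apply_features
# featuresets are computed on access instead of being stored up front
train_lazy = apply_features(term_exists, train_docs, labeled=True)
test_lazy = apply_features(term_exists, test_docs, labeled=True)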
# In[11]:
# Convert each review's text into a featureset over the selected words
line_num = 130
print("Sentence {} featureset size : {}\n\nSample : {}".format(
    line_num, len(train_xy[line_num][0]),
    train_xy[line_num][0]))
# In[12]:
get_ipython().run_cell_magic('time', '', 'classifiers = nltk.NaiveBayesClassifier.train(train_xy)\n')
# In[13]:
# Mission 2
# Save the classifiers object to '../data/classifiers.model'
import pickle
with open('../data/classifiers.model', 'wb') as f:
    pickle.dump(classifiers, f)
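# A minimal sketch of reusing the saved model later without retraining (the loaded_model name is an assumption):
# In[ ]:
with open('../data/classifiers.model', 'rb') as f:
    loaded_model = pickle.load(f)  # assumed name for the reloaded classifier
print(loaded_model.labels())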
#
# ## **4 Evaluating the Model**
# Accuracy
# 1. 0 : negative review
# 1. 1 : positive review
# In[14]:
classifiers.labels()
# In[15]:
classifiers.show_most_informative_features(20)
# In[16]:
get_ipython().run_cell_magic('time', '', "# Test 데이터로 Model의 정확도 측정\n'네이버 영화리뷰 모델의 Accuracy : {}'.format(\n nltk.classify.accuracy(classifiers, test_xy))\n")
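# Accuracy alone hides which class the model confuses. A sketch of a per-class breakdown with nltk.ConfusionMatrix, using classify_many over the test featuresets (ref/pred are assumed names):
# In[ ]:
ref = [label for _, label in test_xy]                                     # gold labels
pred = classifiers.classify_many([features for features, _ in test_xy])  # predictions
print(nltk.ConfusionMatrix(ref, pred))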
#
# ## **5 Using the Model**
# 1. 0 : negative review
# 1. 1 : positive review
# In[17]:
review = """졸잼 굿 최고입니다 최고"""
# In[18]:
# Tokenize the review text into tagged tokens
review = tokenize(review)
review
# In[19]:
# Check which selected_words features the review actually contains
review = term_exists(review)
for k, v in review.items():
    if v:
        print("{} = {}".format(k, v))
# In[20]:
result = classifiers.classify(review)  # classify with the trained model
if result == '1':
    print('positive review')
else:
    print('negative review')
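# Beyond the hard 0/1 label, prob_classify exposes the class probabilities behind the decision; a short sketch on the same featureset (dist is an assumed name):
# In[ ]:
dist = classifiers.prob_classify(review)  # probability distribution over labels
for label in classifiers.labels():
    print('P({}) = {:.3f}'.format(label, dist.prob(label)))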