# ! cat ./data/ratings_test.txt | head -n 10
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    data = data[1:]  # drop the header row
    # randomly sample 5% of the rows to keep tokenizing and training fast;
    # the sample changes between runs unless the random seed is fixed
    from random import randint
    random_data = [data[randint(0, len(data) - 1)] for _ in range(len(data) // 20)]
    return random_data
train_data = read_data('./data/ratings_train.txt')
test_data = read_data('./data/ratings_test.txt')
print('Train_data ({})\nsample : {}\nTest_data ({})\nsample : {}'.format(
    len(train_data), train_data[:3],
    len(test_data), test_data[:3]))
%%time
from konlpy.tag import Twitter  # Twitter was renamed to Okt in newer konlpy releases
twitter = Twitter()

def tokenize(doc):
    # POS-tag with normalization and stemming, keeping each token as "morpheme/tag"
    result = ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
    return result
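# Quick sanity check of the tokenizer (a minimal sketch; the sample sentence is
# illustrative and not from the dataset -- the exact tags depend on the konlpy version)
print(tokenize('정말 재미있는 영화였다'))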
train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
test_docs = [(tokenize(row[1]), row[2]) for row in test_data]
from pprint import pprint
pprint(train_docs[:2])
tokens = [t for d in train_docs
          for t in d[0]]
print("Token Total : {}\nSample : {}".format(
    len(tokens), tokens[:5]))
import nltk
text = nltk.Text(tokens, name='네이버리뷰')
print("number of Token : {} \nunique Token : {}\n".format(
len(text.tokens), len(set(text.tokens))))
pprint(text.vocab().most_common( "=Quiz!=" ))
%matplotlib inline
from matplotlib import rc
rc('font', family=['NanumGothic','Malgun Gothic'])
import matplotlib.pyplot as plt
plt.figure(figsize=(16,5))
text.plot( "=Quiz!=" )
# Extract the word list used as lookup features (the most common tokens)
selected_words = [f[0] for f in text.vocab().most_common("=Quiz!=")]
selected_words[:5]
# Mission 1
# Save the selected_words object to './data/selected.words'
import pickle
# pickle.dump(selected_words, open('./data/selected.words', 'wb'))
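# One way to solve Mission 1 (a minimal sketch using pickle; the file path follows
# the Mission 1 comment above, and the context manager just ensures the file is closed):
with open('./data/selected.words', 'wb') as f:
    pickle.dump(selected_words, f)
with open('./data/selected.words', 'rb') as f:
    selected_words = pickle.load(f)  # reload to confirm the round trip works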
%%time
def term_exists(doc):
    # boolean bag-of-words features: does each selected word appear in this document?
    doc_set = set(doc)
    return {'exists({})'.format(word): (word in doc_set) for word in selected_words}
train_xy = [(term_exists(d), c) for d, c in train_docs]
test_xy = [(term_exists(d), c) for d, c in test_docs]
# Each review's text has been converted into tokens (features) the classifier can analyze
line_num = 130
print("Number of converted tokens for review #{} : {}\n\nSample : {}".format(
    line_num,
    len(train_xy[line_num][0]),
    train_xy[line_num][0]))
%%time
classifiers = nltk.NaiveBayesClassifier.train(train_xy)
# Mission 2
# Save the classifiers object to './data/classifiers.model'
import pickle
# pickle.dump(classifiers, open('./data/classifiers.model', 'wb'))
%%time
import pickle
# Save the trained model object
# Search on Google: "Python classifier save"
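# One way to solve Mission 2 (a minimal sketch using pickle; the path matches the
# Mission 2 comment above -- joblib would work the same way for larger objects):
with open('./data/classifiers.model', 'wb') as f:
    pickle.dump(classifiers, f)
with open('./data/classifiers.model', 'rb') as f:
    classifiers = pickle.load(f)  # reload to verify the saved model is usable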
classifiers.labels()
classifiers.show_most_informative_features(20)
%%time
# Measure the model's accuracy on the test data
'Accuracy of the Naver movie review model : {}'.format(
    nltk.classify.accuracy(classifiers, test_xy))
review = """졸잼 굿 최고입니다 최고"""
# Turn the review text into tagged tokens
review = tokenize(review)
review
# Check which selected_words appear among the tagged tokens
review = term_exists(review)
for k, v in review.items():
    if v:
        print("{} = {}".format(k, v))
result = classifiers.classify(review)  # run the classification model on the review features
if result == '1':
    print('Positive review')
else:
    print('Negative review')
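# A small end-to-end helper (a sketch, not part of the original notebook): it chains
# tokenize() -> term_exists() -> classify() so any raw review string can be scored.
# The label strings '1'/'0' follow the label column of the ratings files used above.
def predict_review(text):
    features = term_exists(tokenize(text))
    return 'Positive review' if classifiers.classify(features) == '1' else 'Negative review'

print(predict_review('졸잼 굿 최고입니다 최고'))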