#!/usr/bin/env python
# coding: utf-8
#
# # **Naive Bayes for Korean Text**
# A Naver movie review sentiment model built with nltk [nltk book](https://www.nltk.org/book/ch06.html) | [nltk How to](http://www.nltk.org/howto/classify.html)
# 1. 0 : negative review
# 1. 1 : positive review
#
# ## **1 Preprocessing the NAVER Movie Review Data**
# 1. https://github.com/e9t/nsmc
# 1. https://www.nltk.org/book/ch06.html
# In[1]:
# ! cat ../data/ratings_test.txt | head -n 10
# In[2]:
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    data = data[1:]  # drop the header row
    # keep a random 5% subset; randint is inclusive on both ends,
    # so the upper bound must be len(data) - 1
    from random import randint
    random_data = [data[randint(0, len(data) - 1)] for _ in range(len(data) // 20)]
    return random_data
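# A minimal alternative sketch: random.sample draws the 5% subset without replacement, so no review is picked twice (the randint version can repeat rows). The read_data_sampled name is hypothetical, not part of the original notebook.
# In[ ]:
import random

def read_data_sampled(filename, fraction=20):
    # hypothetical variant of read_data: sample 1/fraction of rows without replacement
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    data = data[1:]  # drop the header row
    return random.sample(data, len(data) // fraction)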
# In[3]:
train_data = read_data('../data/ratings_train.txt')
test_data = read_data('../data/ratings_test.txt')
print('Train_data ({})\nsample : {}\nTest_data ({})\nsample : {}'.format(
len(train_data), train_data[:3],
len(test_data), test_data[:3]))
# In[4]:
get_ipython().run_cell_magic('time', '', "from konlpy.tag import Twitter\ntwitter = Twitter()\n\ndef tokenize(doc):\n result = ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]\n return result \n\ntrain_docs = [(tokenize(row[1]), row[2]) for row in train_data]\ntest_docs = [(tokenize(row[1]), row[2]) for row in test_data]\n\nfrom pprint import pprint\npprint(train_docs[:2])\n")
# In[5]:
tokens = [t for d in train_docs
          for t in d[0]]
print("Total tokens : {}\nSample : {}".format(
    len(tokens), tokens[:5]))
#
# ## **2 Building a Classification Model with nltk**
# https://github.com/e9t/nsmc
# In[6]:
import nltk
text = nltk.Text(tokens, name='Naver Reviews')
print("number of Token : {} \nunique Token : {}\n".format(
len(text.tokens), len(set(text.tokens))))
pprint(text.vocab().most_common(20))
# In[7]:
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib import rc
rc('font', family=['NanumGothic', 'Malgun Gothic'])  # Korean-capable fonts (Linux / Windows)
import matplotlib.pyplot as plt
plt.figure(figsize=(16, 5))
text.plot(50)  # frequency plot of the 50 most common tokens
#
# ## **3 Improving the Model's Accuracy and Generalization**
# Select the 4,000 most frequent tokens as the feature vocabulary
# In[8]:
# Build the feature word list
selected_words = [f[0] for f in text.vocab().most_common(4000)]
selected_words[:5]
# In[9]:
# Mission 1
# Save the selected_words object to '../data/selected.words'
import pickle
with open('../data/selected.words', 'wb') as f:
    pickle.dump(selected_words, f)
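# A short sketch of the reverse step: reload the pickled word list to confirm the save (the selected_words_loaded name is an assumption, not from the original notebook).
# In[ ]:
with open('../data/selected.words', 'rb') as f:
    selected_words_loaded = pickle.load(f)  # assumed name for the reloaded list
print(len(selected_words_loaded), selected_words_loaded[:5])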
# In[10]:
get_ipython().run_cell_magic('time', '', "def term_exists(doc):\n return {'exists({})'.format(word): (word in set(doc)) for word in selected_words}\n\ntrain_xy = [(term_exists(d), c) for d, c in train_docs]\ntest_xy = [(term_exists(d), c) for d, c in test_docs]\n")
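# Materializing 4,000 boolean features per document gets memory-heavy on the full corpus. nltk provides nltk.classify.apply_features, which builds featuresets lazily; a sketch using the same term_exists and docs as above (the *_lazy names are assumptions):
# In[ ]:
from nltk.classify import apply_features
# featuresets are computed on access instead of being stored up front
train_lazy = apply_features(term_exists, train_docs, labeled=True)
test_lazy = apply_features(term_exists, test_docs, labeled=True)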
# In[11]:
# Convert each review's text into a featureset over the selected words
line_num = 130
print("Sentence {} featureset size : {}\n\nSample : {}".format(
    line_num, len(train_xy[line_num][0]),
    train_xy[line_num][0]))
# In[12]:
get_ipython().run_cell_magic('time', '', 'classifiers = nltk.NaiveBayesClassifier.train(train_xy)\n')
# In[13]:
# Mission 2
# Save the classifiers object to '../data/classifiers.model'
import pickle
with open('../data/classifiers.model', 'wb') as f:
    pickle.dump(classifiers, f)
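# A minimal sketch of reusing the saved model later without retraining (the loaded_model name is an assumption):
# In[ ]:
with open('../data/classifiers.model', 'rb') as f:
    loaded_model = pickle.load(f)  # assumed name for the reloaded classifier
print(loaded_model.labels())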
#
# ## **4 Evaluating the Model**
# Accuracy
# 1. 0 : negative review
# 1. 1 : positive review
# In[14]:
classifiers.labels()
# In[15]:
classifiers.show_most_informative_features(20)
# In[16]:
get_ipython().run_cell_magic('time', '', "# Test 데이터로 Model의 정확도 측정\n'네이버 영화리뷰 모델의 Accuracy : {}'.format(\n nltk.classify.accuracy(classifiers, test_xy))\n")
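# Accuracy alone hides which class the model confuses. A sketch of a per-class breakdown with nltk.ConfusionMatrix, using classify_many over the test featuresets (ref/pred are assumed names):
# In[ ]:
ref = [label for _, label in test_xy]                                     # gold labels
pred = classifiers.classify_many([features for features, _ in test_xy])  # predictions
print(nltk.ConfusionMatrix(ref, pred))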
#
# ## **5 Using the Model**
# 1. 0 : negative review
# 1. 1 : positive review
# In[17]:
review = """졸잼 굿 최고입니다 최고"""
# In[18]:
# Tokenize the review text into tagged tokens
review = tokenize(review)
review
# In[19]:
# Check which selected_words features the review actually contains
review = term_exists(review)
for k, v in review.items():
    if v:
        print("{} = {}".format(k, v))
# In[20]:
result = classifiers.classify(review)  # classify with the trained model
if result == '1':
    print('positive review')
else:
    print('negative review')
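# Beyond the hard 0/1 label, prob_classify exposes the class probabilities behind the decision; a short sketch on the same featureset (dist is an assumed name):
# In[ ]:
dist = classifiers.prob_classify(review)  # probability distribution over labels
for label in classifiers.labels():
    print('P({}) = {:.3f}'.format(label, dist.prob(label)))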