#!/usr/bin/env python
# coding: utf-8
#
# # **gensim | doc2vec**
#
# ## **1 Using Naver Movie Review Sentences**
# Building a word model from Naver movie reviews
# In[1]:
from konlpy.tag import Okt  # Twitter was renamed Okt in KoNLPy 0.4.5+
twitter = Okt()
# Load the Naver movie-review data (randomly sample 1/10 of the rows)
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    from random import randint
    # index 0 is the header row, so sample from 1 .. len(data)-1
    random_data = [data[randint(1, len(data) - 1)] for no in range(len(data) // 10)]
    return random_data
# Attach part-of-speech information to each Korean token
def tokenize(doc):
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
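# A quick illustration of the tokenizer output (a sketch; the exact morphemes
# and tags depend on the installed KoNLPy version, so the result shown in the
# comment is illustrative only):
print(tokenize('재미있는 영화였다'))
# e.g. ['재미있다/Adjective', '영화/Noun', '이다/Verb']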
# In[2]:
get_ipython().run_cell_magic('time', '', "from collections import namedtuple\ntrain_data = read_data('../data/ratings_train.txt')\ntrain_docs = [(tokenize(row[1]), row[2]) for row in train_data[1:]]\nTaggedDocument = namedtuple('TaggedDocument', 'words tags')\ntagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]\n")
# In[3]:
from pprint import pprint
pprint(tagged_train_docs[0])
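# A quick sanity check (a sketch): each tag should be the label string '0' or
# '1', so counting the tags shows how balanced the random sample is.
from collections import Counter
print(len(tagged_train_docs))                            # sampled document count
print(Counter(doc.tags[0] for doc in tagged_train_docs)) # label distribution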
#
# ## **2 doc2vec Parameter Settings and Training**
# **[Detailed parameter settings](http://hero4earth.com/blog/projects/2018/01/21/naver_movie_review/)**
# In[4]:
get_ipython().run_cell_magic('time', '', 'from gensim.models import doc2vec\n# doc_vectorizer = doc2vec.Doc2Vec(\n#     dm = 0,            # PV-DBOW / default 1\n#     dbow_words = 1,    # train w2v simultaneously with DBOW d2v / default 0\n#     window = 8,        # distance between the predicted word and context words\n#     vector_size = 300, # vector size\n#     alpha = 0.025,     # learning-rate\n#     seed = 1234,\n#     min_count = 20,    # ignore words with lower frequency\n#     min_alpha = 0.025, # minimum learning-rate\n#     workers = 4,       # multi cpu\n#     hs = 1,            # hierarchical softmax / default 0\n#     negative = 10,     # negative sampling / default 5\n# )\n\n# Define the Doc2Vec model\ndoc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)\n\n# Build the vocabulary from the tagged documents\ndoc_vectorizer.build_vocab(tagged_train_docs)\n\n# Train with a manually decayed learning rate\nfor epoch in range(10):\n    doc_vectorizer.train(tagged_train_docs,\n                         total_examples=doc_vectorizer.corpus_count,\n                         epochs=doc_vectorizer.epochs)\n    doc_vectorizer.alpha -= 0.002\n    doc_vectorizer.min_alpha = doc_vectorizer.alpha\n\n# Save the trained model to disk\ndoc_vectorizer.save(\'../data/doc2vec.model\')\nprint("doc2vec Model Saved")\n')
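# Note: recent gensim releases recommend a single train() call with a fixed
# epoch count instead of the manual alpha-decay loop above. A minimal sketch
# of the one-shot alternative (an assumption, not the notebook's method;
# results may differ slightly):
# alt_vectorizer = doc2vec.Doc2Vec(tagged_train_docs, vector_size=300, window=8,
#                                  min_count=20, workers=4, epochs=10, seed=1234)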
#
# ## **3 Using the doc2vec Model**
# In[5]:
# Clear the namespace so the following cells prove the model reloads from disk
get_ipython().run_line_magic('reset', '')
# In[6]:
# Confirm the namespace is now empty
get_ipython().run_line_magic('who', '')
# In[7]:
from gensim.models import doc2vec
from pprint import pprint
doc_vectorizer = doc2vec.Doc2Vec.load('../data/doc2vec.model')
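# A quick check that the saved model reloaded as configured (a sketch;
# vector_size was set to 300 during training above):
print(doc_vectorizer.vector_size)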
# In[8]:
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))
# In[9]:
doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')
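# wv.similarity() is the cosine similarity of the two word vectors; a minimal
# sketch verifying the same value by hand with numpy:
import numpy as np
a = doc_vectorizer.wv['공포/Noun']
b = doc_vectorizer.wv['ㅋㅋ/KoreanParticle']
print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))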
# In[10]:
pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'],
                                      negative=['남자/Noun']))
# In[11]:
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]
# In[12]:
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()
# In[13]:
doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]
# In[14]:
doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()
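# infer_vector() starts from a random initialization, so repeated calls return
# slightly different vectors. A sketch comparing the two inferred documents by
# cosine similarity (numpy only, no extra gensim API assumptions):
import numpy as np
v1 = doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])
v2 = doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))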