#!/usr/bin/env python
# coding: utf-8

# # **gensim | doc2vec**

# ## **1 Using Naver Review Sentences**
# Building a word model from Naver movie reviews

# In[1]:

from konlpy.tag import Twitter  # renamed to Okt in newer KoNLPy versions
twitter = Twitter()

# Load the Naver movie-review data (randomly sample one tenth of the rows)
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    from random import randint
    # randint() is inclusive on both ends, so cap the index at len(data) - 1;
    # index 0 (the header row) is excluded from sampling
    random_data = [data[randint(1, len(data) - 1)] for _ in range(len(data) // 10)]
    return random_data

# Append part-of-speech information to each Korean token
def tokenize(doc):
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]

# In[2]:

# read_data() already drops the header row, so iterate over train_data directly
get_ipython().run_cell_magic('time', '', "from collections import namedtuple\ntrain_data = read_data('../data/ratings_train.txt')\ntrain_docs = [(tokenize(row[1]), row[2]) for row in train_data]\nTaggedDocument = namedtuple('TaggedDocument', 'words tags')\ntagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]\n")

# In[3]:

from pprint import pprint
pprint(tagged_train_docs[0])
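# For orientation, a minimal self-contained sketch of the structure fed to
# Doc2Vec: each review becomes a TaggedDocument whose words are the 'token/POS'
# strings produced by tokenize() and whose single tag is the review's label
# ('1' marks a positive review in the NSMC data). The tokens below are
# illustrative examples, not tokenizer output.

from collections import namedtuple

TaggedDocument = namedtuple('TaggedDocument', 'words tags')
sample_doc = TaggedDocument(words=['픽사/Noun', '최고/Noun', '명작/Noun'],  # illustrative tokens
                            tags=['1'])                                      # positive label
print(sample_doc.words)  # ['픽사/Noun', '최고/Noun', '명작/Noun']
print(sample_doc.tags)   # ['1']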

# ## **2 Setting doc2vec Parameters and Training**
# **[Detailed parameter settings](http://hero4earth.com/blog/projects/2018/01/21/naver_movie_review/)**

# In[4]:

get_ipython().run_cell_magic('time', '', 'from gensim.models import doc2vec\n\n# Full parameter reference:\n# doc_vectorizer = doc2vec.Doc2Vec(\n#     dm = 0,            # PV-DBOW / default 1\n#     dbow_words = 1,    # train word vectors alongside DBOW doc vectors / default 0\n#     window = 8,        # distance between the predicted word and context words\n#     vector_size = 300, # vector size\n#     alpha = 0.025,     # learning rate\n#     seed = 1234,\n#     min_count = 20,    # ignore words with lower frequency\n#     min_alpha = 0.025, # minimum learning rate\n#     workers = 4,       # number of worker threads\n#     hs = 1,            # hierarchical softmax / default 0\n#     negative = 10,     # negative sampling / default 5\n# )\n\n# Define the Doc2Vec model\ndoc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)\n\n# Build the vocabulary from the tagged training documents\ndoc_vectorizer.build_vocab(tagged_train_docs)\n\n# Train in 10 passes, manually decaying the learning rate after each pass\nfor epoch in range(10):\n    doc_vectorizer.train(tagged_train_docs,\n                         total_examples=doc_vectorizer.corpus_count,\n                         epochs=doc_vectorizer.epochs)\n    doc_vectorizer.alpha -= 0.002\n    doc_vectorizer.min_alpha = doc_vectorizer.alpha\n\n# Save the trained model\ndoc_vectorizer.save(\'../data/doc2vec.model\')\nprint("doc2vec Model Saved")\n')
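# The loop above follows the older gensim idiom of calling train() repeatedly
# while decaying alpha by hand. A minimal alternative sketch, assuming
# gensim >= 3.4 (and tagged_train_docs from cell In[2] above), lets a single
# train() call decay the learning rate from alpha down to min_alpha internally:

from gensim.models import doc2vec

model = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.005,
                        seed=1234, epochs=10)
model.build_vocab(tagged_train_docs)
model.train(tagged_train_docs,
            total_examples=model.corpus_count,
            epochs=model.epochs)  # alpha decays linearly to min_alpha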

# ## **3 Using the doc2vec Model**

# In[5]:

get_ipython().run_line_magic('reset', '')

# In[6]:

get_ipython().run_line_magic('who', '')

# In[7]:

from gensim.models import doc2vec
from pprint import pprint
doc_vectorizer = doc2vec.Doc2Vec.load('../data/doc2vec.model')

# In[8]:

# Words most similar to '공포/Noun' (fear)
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))

# In[9]:

# Cosine similarity between two word vectors
doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')

# In[10]:

# Analogy query: 여자 (woman) + 공포 (fear) - 남자 (man)
pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'],
                                      negative=['남자/Noun']))

# In[11]:

# Infer a document vector for an unseen token list (first 10 dimensions)
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]

# In[12]:

doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()

# In[13]:

doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]

# In[14]:

doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()
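# infer_vector() returns a 300-dimensional numpy array, so two reviews can be
# compared directly. A small follow-up sketch computing cosine similarity with
# numpy (already a gensim dependency), using the same token lists as above;
# note that inference is stochastic, so the value varies slightly between runs.

import numpy as np

vec_a = doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])
vec_b = doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])
cosine = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
print('cosine similarity: %.4f' % cosine)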