# KoNLPy's Twitter morphological analyzer (renamed Okt in recent KoNLPy releases)
from konlpy.tag import Twitter
twitter = Twitter()
# Load the Naver movie review data (randomly sample 1/10 of the rows)
def read_data(filename):
    from random import randint
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    # skip the header row (index 0) and stay within bounds (randint is inclusive)
    random_data = [data[randint(1, len(data) - 1)] for _ in range(len(data) // 10)]
    return random_data
# Attach part-of-speech tags to each Korean token, e.g. '공포/Noun'
def tokenize(doc):
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
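# Quick check of the tokenizer output (illustrative only; the exact tokens and
# tags depend on the installed KoNLPy version and its dictionary):
tokenize('픽사 최고의 명작')   # e.g. ['픽사/Noun', '최고/Noun', '명작/Noun', ...]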
%%time
from collections import namedtuple
train_data = read_data('./data/ratings_train.txt')
# read_data() already skips the header row, so use every sampled row;
# each row is [id, document, label] -> keep (tokens, label) pairs
train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
from pprint import pprint
pprint(tagged_train_docs[0])
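# Expected shape of the output (the actual review varies with the random sample):
# TaggedDocument(words=['좋다/Adjective', '영화/Noun', ...], tags=['1'])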
%%time
from gensim.models import doc2vec
# Full hyperparameter set, kept for reference:
# doc_vectorizer = doc2vec.Doc2Vec(
#     dm=0,             # PV-DBOW / default 1
#     dbow_words=1,     # train word vectors alongside DBOW doc vectors / default 0
#     window=8,         # distance between the predicted word and context words
#     vector_size=300,  # dimensionality of the vectors
#     alpha=0.025,      # initial learning rate
#     seed=1234,
#     min_count=20,     # ignore words with a lower frequency
#     min_alpha=0.025,  # minimum learning rate
#     workers=4,        # number of worker threads (multi-CPU)
#     hs=1,             # hierarchical softmax / default 0
#     negative=10,      # negative sampling / default 5
# )
# Define the Doc2Vec model
doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)
# Build the vocabulary from the tagged training documents
doc_vectorizer.build_vocab(tagged_train_docs)
# Train for 10 passes, decaying the learning rate manually after each pass
# (one epoch per train() call so the manual decay actually takes effect)
for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs,
                         total_examples=doc_vectorizer.corpus_count,
                         epochs=1)
    doc_vectorizer.alpha -= 0.002                    # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the rate within this pass
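# A small sanity check after training (a sketch; attribute access assumes
# gensim 4.x, where the word vectors live under model.wv):
print(len(doc_vectorizer.wv), 'words in the vocabulary')
print(doc_vectorizer.vector_size, 'dimensions per vector')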
# Save the trained model to disk
doc_vectorizer.save('./data/doc2vec.model')
%reset -f   # clear the notebook namespace (-f skips the confirmation prompt)
%who        # confirm that no variables remain
from gensim.models import doc2vec
from pprint import pprint
doc_vectorizer = doc2vec.Doc2Vec.load('./data/doc2vec.model')
# Words most similar to '공포/Noun' in the learned word-vector space
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))
doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')
# Analogy query: terms close to '여자' + '공포' - '남자'
pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'],
                                      negative=['남자/Noun']))
# Infer document vectors for unseen token lists
# (inference is stochastic, so the exact values vary between runs)
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()
doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]
doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()
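# A minimal follow-up sketch (not in the original pipeline): use the inferred
# document vectors as features for a sentiment classifier. Assumes
# scikit-learn is available and that tagged_train_docs from the training step
# is rebuilt (or that this cell runs before %reset); names are illustrative.
from sklearn.linear_model import LogisticRegression

train_X = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
train_y = [doc.tags[0] for doc in tagged_train_docs]

classifier = LogisticRegression(max_iter=1000)
classifier.fit(train_X, train_y)
print(classifier.score(train_X, train_y))  # training accuracy only, as a rough sanity check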