from konlpy.tag import Twitter   # KoNLPy's Twitter morphological analyzer (renamed Okt in newer KoNLPy releases)
twitter = Twitter()
# Load the Naver movie review data (randomly sample about 1/10 of the rows)
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    from random import randint
    # index 0 is the header row, so sample from indices 1 .. len(data)-1
    random_data = [data[randint(1, len(data) - 1)] for _ in range(len(data) // 10)]
    return random_data
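# Quick sanity check (illustrative, not part of the original notebook): each row of
# the tab-separated ratings file is [id, review text, label], and read_data returns
# roughly a 1/10 random sample of those rows.
sample = read_data('../data/ratings_train.txt')
print(len(sample))
print(sample[0])   # e.g. ['<id>', '<review text>', '0' or '1']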
# Attach part-of-speech information to each Korean token
def tokenize(doc):
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
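# Example of the token format produced by tokenize (illustrative sentence, not from
# the dataset): each morpheme is joined with its part-of-speech tag,
# giving tokens such as '영화/Noun' or '재미있다/Adjective'.
print(tokenize('정말 재미있는 영화였다'))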
%%time
from collections import namedtuple
train_data = read_data('../data/ratings_train.txt')
# row[1] is the review text, row[2] is the sentiment label; the header row was
# already excluded in read_data, so iterate over every sampled row
train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
CPU times: user 24.6 s, sys: 335 ms, total: 24.9 s
Wall time: 17.5 s
from pprint import pprint
pprint(tagged_train_docs[0])
TaggedDocument(words=['제이슨/Noun', '스타덤/Noun', '의/Josa', '그/Noun', '흔하다/Adjective', '액션/Noun', '하나/Noun', '화끈/Noun', '하다/Verb', '보이다/Verb', '못/VerbPrefix', '하다/Verb', '.../Punctuation', '조연/Noun', '들/Suffix', '스토리/Noun', '도/Josa', '엉/Exclamation', '성하/Noun', '게/Josa', '마무리/Noun', '되다/Verb', ',/Punctuation', '실망/Noun', '가득하다/Adjective', '영화/Noun'], tags=['0'])
%%time
from gensim.models import doc2vec
# doc_vectorizer = doc2vec.Doc2Vec(
#     dm = 0,            # PV-DBOW / default 1
#     dbow_words = 1,    # train word vectors alongside the DBOW doc vectors / default 0
#     window = 8,        # distance between the predicted word and context words
#     vector_size = 300, # dimensionality of the vectors
#     alpha = 0.025,     # initial learning rate
#     seed = 1234,
#     min_count = 20,    # ignore words with a frequency lower than this
#     min_alpha = 0.025, # minimum learning rate
#     workers = 4,       # number of worker threads (multi-CPU)
#     hs = 1,            # hierarchical softmax / default 0
#     negative = 10,     # negative sampling / default 5
# )
# Define the Doc2Vec model
doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)
# Build the model's vocabulary from the tagged documents
doc_vectorizer.build_vocab(tagged_train_docs)
for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs,
                         total_examples=doc_vectorizer.corpus_count,
                         epochs=doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002                     # decay the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha   # keep the rate fixed within this pass
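# Note (not in the original notebook): every pass through the loop above already runs
# doc_vectorizer.epochs internal epochs, so the loop performs 10 x that many passes
# with a manually decayed learning rate. In recent gensim versions an equivalent,
# simpler alternative is a single train() call that handles the learning-rate
# schedule internally, e.g.:
# doc_vectorizer.train(tagged_train_docs,
#                      total_examples=doc_vectorizer.corpus_count,
#                      epochs=10)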
# Save the trained model to disk
doc_vectorizer.save('../data/doc2vec.model')
print("doc2vec Model Saved")
doc2vec Model Saved
CPU times: user 55.9 s, sys: 3.72 s, total: 59.6 s
Wall time: 30.2 s
%reset
Once deleted, variables cannot be recovered. Proceed (y/[n])? y
%who
Interactive namespace is empty.
from gensim.models import doc2vec
from pprint import pprint
doc_vectorizer = doc2vec.Doc2Vec.load('../data/doc2vec.model')
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))
[('코미디/Noun', 0.4376811385154724),
 ('스릴러/Noun', 0.43532365560531616),
 ('스럽지도/Josa', 0.39870721101760864),
 ('박진/Noun', 0.39721059799194336),
 ('장르/Noun', 0.389506995677948),
 ('액션영화/Noun', 0.3805934488773346),
 ('블랙/Noun', 0.3729945719242096),
 ('종교/Noun', 0.3727499842643738),
 ('롭고/Josa', 0.3712288737297058),
 ('풍자/Noun', 0.3630199432373047)]
doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')
-0.012412064
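# What wv.similarity computes, spelled out as a minimal sketch: the cosine
# similarity between the two word vectors.
import numpy as np
v1 = doc_vectorizer.wv['공포/Noun']
v2 = doc_vectorizer.wv['ㅋㅋ/KoreanParticle']
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))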
pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'],
                                       negative=['남자/Noun']))
[('스럽지도/Josa', 0.384132444858551),
 ('장르/Noun', 0.3558204174041748),
 ('코믹/Noun', 0.3339052200317383),
 ('스릴러/Noun', 0.3245369791984558),
 ('고어/Noun', 0.3172786235809326),
 ('명분/Noun', 0.3150975704193115),
 ('히어로/Noun', 0.31317320466041565),
 ('로맨스/Noun', 0.3098544478416443),
 ('복선/Noun', 0.29699862003326416),
 ('신파/Noun', 0.2888486981391907)]
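# A rough manual equivalent of the analogy query above (a sketch, assuming the
# standard gensim KeyedVectors API): add the positive vectors, subtract the
# negative one, and look up the nearest words. Unlike most_similar, this does
# not exclude the query words themselves from the results.
import numpy as np
query = (doc_vectorizer.wv['여자/Noun'] + doc_vectorizer.wv['공포/Noun']
         - doc_vectorizer.wv['남자/Noun'])
pprint(doc_vectorizer.wv.similar_by_vector(query, topn=10))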
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]
array([ 1.99763617e-03, 1.71703286e-03, -2.20827432e-03, 3.30096926e-03, 3.01561877e-03, -1.26834167e-03, 1.21280085e-02, -1.89038850e-02, 3.29405302e-05, 2.55695544e-03], dtype=float32)
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()
-0.15552977
doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]
array([-0.03374508, -0.00827203, 0.0110028 , -0.00999906, -0.01591366, 0.00750665, -0.00079125, 0.00492844, -0.00993689, -0.01647354], dtype=float32)
doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()
0.22114044
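# A possible next step (a sketch, not part of the original notebook): turn each
# review into a fixed-size vector with infer_vector and fit a sentiment classifier
# on the '0'/'1' labels. Assumes tagged_train_docs from the earlier cells has been
# rebuilt, since %reset above cleared the namespace.
from sklearn.linear_model import LogisticRegression

train_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
train_y = [doc.tags[0] for doc in tagged_train_docs]

classifier = LogisticRegression()
classifier.fit(train_x, train_y)
print(classifier.score(train_x, train_y))   # training accuracy, as a rough sanity check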