from konlpy.tag import Twitter   # KoNLPy's Twitter morphological analyzer (renamed Okt in newer KoNLPy releases)
twitter = Twitter()
# Load the Naver movie review data (randomly sample about 1/10 of the rows)
def read_data(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
    from random import randint
    # index 0 is the header row, so sample from indices 1 .. len(data)-1
    random_data = [data[randint(1, len(data) - 1)] for _ in range(len(data) // 10)]
    return random_data
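# Quick sanity check (illustrative, not part of the original notebook): each row of
# the tab-separated ratings file is [id, review text, label], and read_data returns
# roughly a 1/10 random sample of those rows.
sample = read_data('../data/ratings_train.txt')
print(len(sample))
print(sample[0])   # e.g. ['<id>', '<review text>', '0' or '1']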
# Attach part-of-speech information to each Korean token
def tokenize(doc):
    return ['/'.join(t) for t in twitter.pos(doc, norm=True, stem=True)]
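# Example of the token format produced by tokenize (illustrative sentence, not from
# the dataset): each morpheme is joined with its part-of-speech tag,
# giving tokens such as '영화/Noun' or '재미있다/Adjective'.
print(tokenize('정말 재미있는 영화였다'))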
%%time
from collections import namedtuple
train_data = read_data('../data/ratings_train.txt')
# row[1] is the review text, row[2] is the sentiment label; the header row was
# already excluded in read_data, so iterate over every sampled row
train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
TaggedDocument = namedtuple('TaggedDocument', 'words tags')
tagged_train_docs = [TaggedDocument(d, [c]) for d, c in train_docs]
CPU times: user 24.6 s, sys: 335 ms, total: 24.9 s
Wall time: 17.5 s
from pprint import pprint
pprint(tagged_train_docs[0])
TaggedDocument(words=['제이슨/Noun', '스타덤/Noun', '의/Josa', '그/Noun', '흔하다/Adjective', '액션/Noun', '하나/Noun', '화끈/Noun', '하다/Verb', '보이다/Verb', '못/VerbPrefix', '하다/Verb', '.../Punctuation', '조연/Noun', '들/Suffix', '스토리/Noun', '도/Josa', '엉/Exclamation', '성하/Noun', '게/Josa', '마무리/Noun', '되다/Verb', ',/Punctuation', '실망/Noun', '가득하다/Adjective', '영화/Noun'], tags=['0'])
%%time
from gensim.models import doc2vec
# doc_vectorizer = doc2vec.Doc2Vec(
#     dm = 0,            # PV-DBOW / default 1
#     dbow_words = 1,    # train word vectors alongside the DBOW doc vectors / default 0
#     window = 8,        # distance between the predicted word and context words
#     vector_size = 300, # dimensionality of the vectors
#     alpha = 0.025,     # initial learning rate
#     seed = 1234,
#     min_count = 20,    # ignore words with a frequency lower than this
#     min_alpha = 0.025, # minimum learning rate
#     workers = 4,       # number of worker threads (multi-CPU)
#     hs = 1,            # hierarchical softmax / default 0
#     negative = 10,     # negative sampling / default 5
# )
# Define the Doc2Vec model
doc_vectorizer = doc2vec.Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.025, seed=1234)
# Build the model's vocabulary from the tagged documents
doc_vectorizer.build_vocab(tagged_train_docs)
for epoch in range(10):
    doc_vectorizer.train(tagged_train_docs,
                         total_examples=doc_vectorizer.corpus_count,
                         epochs=doc_vectorizer.epochs)
    doc_vectorizer.alpha -= 0.002                     # decay the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha   # keep the rate fixed within this pass
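# Note (not in the original notebook): every pass through the loop above already runs
# doc_vectorizer.epochs internal epochs, so the loop performs 10 x that many passes
# with a manually decayed learning rate. In recent gensim versions an equivalent,
# simpler alternative is a single train() call that handles the learning-rate
# schedule internally, e.g.:
# doc_vectorizer.train(tagged_train_docs,
#                      total_examples=doc_vectorizer.corpus_count,
#                      epochs=10)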
# Save the trained model to disk
doc_vectorizer.save('../data/doc2vec.model')
print("doc2vec Model Saved")
doc2vec Model Saved
CPU times: user 55.9 s, sys: 3.72 s, total: 59.6 s
Wall time: 30.2 s
%reset
Once deleted, variables cannot be recovered. Proceed (y/[n])? y
%who
Interactive namespace is empty.
from gensim.models import doc2vec
from pprint import pprint
doc_vectorizer = doc2vec.Doc2Vec.load('../data/doc2vec.model')
pprint(doc_vectorizer.wv.most_similar('공포/Noun'))
[('코미디/Noun', 0.4376811385154724),
 ('스릴러/Noun', 0.43532365560531616),
 ('스럽지도/Josa', 0.39870721101760864),
 ('박진/Noun', 0.39721059799194336),
 ('장르/Noun', 0.389506995677948),
 ('액션영화/Noun', 0.3805934488773346),
 ('블랙/Noun', 0.3729945719242096),
 ('종교/Noun', 0.3727499842643738),
 ('롭고/Josa', 0.3712288737297058),
 ('풍자/Noun', 0.3630199432373047)]
doc_vectorizer.wv.similarity('공포/Noun', 'ㅋㅋ/KoreanParticle')
-0.012412064
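# What wv.similarity computes, spelled out as a minimal sketch: the cosine
# similarity between the two word vectors.
import numpy as np
v1 = doc_vectorizer.wv['공포/Noun']
v2 = doc_vectorizer.wv['ㅋㅋ/KoreanParticle']
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))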
pprint(doc_vectorizer.wv.most_similar(positive=['여자/Noun', '공포/Noun'],
                                       negative=['남자/Noun']))
[('스럽지도/Josa', 0.384132444858551),
 ('장르/Noun', 0.3558204174041748),
 ('코믹/Noun', 0.3339052200317383),
 ('스릴러/Noun', 0.3245369791984558),
 ('고어/Noun', 0.3172786235809326),
 ('명분/Noun', 0.3150975704193115),
 ('히어로/Noun', 0.31317320466041565),
 ('로맨스/Noun', 0.3098544478416443),
 ('복선/Noun', 0.29699862003326416),
 ('신파/Noun', 0.2888486981391907)]
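# A rough manual equivalent of the analogy query above (a sketch, assuming the
# standard gensim KeyedVectors API): add the positive vectors, subtract the
# negative one, and look up the nearest words. Unlike most_similar, this does
# not exclude the query words themselves from the results.
import numpy as np
query = (doc_vectorizer.wv['여자/Noun'] + doc_vectorizer.wv['공포/Noun']
         - doc_vectorizer.wv['남자/Noun'])
pprint(doc_vectorizer.wv.similar_by_vector(query, topn=10))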
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun'])[:10]
array([ 1.99763617e-03, 1.71703286e-03, -2.20827432e-03, 3.30096926e-03, 3.01561877e-03, -1.26834167e-03, 1.21280085e-02, -1.89038850e-02, 3.29405302e-05, 2.55695544e-03], dtype=float32)
doc_vectorizer.infer_vector(['픽사/Noun', '최고/Noun', '명작/Noun']).sum()
-0.15552977
doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun'])[:10]
array([-0.03374508, -0.00827203, 0.0110028 , -0.00999906, -0.01591366, 0.00750665, -0.00079125, 0.00492844, -0.00993689, -0.01647354], dtype=float32)
doc_vectorizer.infer_vector(['호러/Noun', '여자/Noun', '공포/Noun']).sum()
0.22114044
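# A possible next step (a sketch, not part of the original notebook): turn each
# review into a fixed-size vector with infer_vector and fit a sentiment classifier
# on the '0'/'1' labels. Assumes tagged_train_docs from the earlier cells has been
# rebuilt, since %reset above cleared the namespace.
from sklearn.linear_model import LogisticRegression

train_x = [doc_vectorizer.infer_vector(doc.words) for doc in tagged_train_docs]
train_y = [doc.tags[0] for doc in tagged_train_docs]

classifier = LogisticRegression()
classifier.fit(train_x, train_y)
print(classifier.score(train_x, train_y))   # training accuracy, as a rough sanity check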