# Load a previously trained Word2Vec model and run a few sanity queries
from gensim.models import Word2Vec
w2v_model = Word2Vec.load('../backup/model.bin')
w2v_model.wv.most_similar(positive=['woman','king'], negative=['man'])
[('chryses', 0.6371129751205444), ('priest', 0.6282371282577515), ('nymph', 0.6165897250175476), ('thanks', 0.6120550632476807), ('dishonored', 0.6062030792236328), ('narrate', 0.605045735836029), ('angered', 0.6038438677787781), ('chieftains', 0.6015218496322632), ('appease', 0.6003137826919556), ('akhilleus', 0.6002722978591919)]
w2v_model.wv.similarity('woman','man')
0.3998656
# sorted(w2v_model.wv.vocab.keys(), reverse=False)[:14]
len(w2v_model.wv.vocab.keys())
11098
w2v_model.wv.most_similar('stark')
[('principals', 0.9933313131332397), ('threatening', 0.9859204292297363), ('distorting', 0.9620187878608704), ('freeport', 0.9174789190292358), ('stood', 0.8492887616157532), ('extend', 0.8167773485183716), ('douglas', 0.767949104309082), ('1858', 0.7635020017623901), ('conspiracy', 0.762062668800354), ('values', 0.7606658339500427)]
import logging
import pandas as pd
from time import time
# Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s",
                    datefmt='%H:%M:%S', level=logging.INFO)
We start with the data preprocessing.
df = pd.read_csv('../backup/simpsons_dataset.csv')
print(df.shape)
# Remove NaN/null rows
print(df.isnull().sum())
df = df.dropna().reset_index(drop=True)
print(df.shape)
print(df.isnull().sum())
df.head(3)
(158314, 2)
raw_character_text    17814
spoken_words          26459
dtype: int64
(131853, 2)
raw_character_text    0
spoken_words          0
dtype: int64
|   | raw_character_text | spoken_words |
|---|---|---|
| 0 | Miss Hoover | No, actually, it was a little of both. Sometim... |
| 1 | Lisa Simpson | Where's Mr. Bergstrom? |
| 2 | Miss Hoover | I don't know. Although I'd sure like to talk t... |
import re, spacy
def cleaning(doc):
    # Lemmatize and drop stopwords
    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:  # keep only sentences with more than two tokens
        return ' '.join(txt)
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower()
                  for row in df['spoken_words'])
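The regex keeps only letters and apostrophes and collapses everything else into single spaces; a quick check on a made-up line (hypothetical input) shows the effect:
# Sanity-check the cleaning regex on an invented line
print(re.sub("[^A-Za-z']+", ' ', "D'oh! That's it -- 100% done.").lower())
# -> "d'oh that's it done "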
# Use spaCy's pipeline to lemmatize and remove stopwords.
from tqdm import tqdm
nlp = spacy.load('en', disable=['ner', 'parser'])  # keep only what is needed for lemmas and stopword flags
txt = [cleaning(doc) for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1))]
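Note that both the 'en' shortcut and the n_threads argument are deprecated in spaCy 2.1+ and removed in 3.x; an equivalent call for current versions (assuming en_core_web_sm has been downloaded) would be:
# spaCy 3.x equivalent: named model package, n_process instead of n_threads
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
txt = [cleaning(doc) for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=5000, n_process=-1))]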
# Drop empty rows and display the cleaned table
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
print(df_clean.shape)
df_clean.head(3)
131853it [01:07, 1961.82it/s]
(85960, 1)
|   | clean |
|---|---|
| 0 | actually little disease magazine news show nat... |
| 2 | know sure like talk touch lesson plan teach |
| 3 | life worth live |
"mr_burns", "bart_simpson" 과 같은 bi-gram 모델을 만듭니다.
sent = [row.split() for row in df_clean['clean']]
sent[:3]
[['actually', 'little', 'disease', 'magazine', 'news', 'show', 'natural', 'think'], ['know', 'sure', 'like', 'talk', 'touch', 'lesson', 'plan', 'teach'], ['life', 'worth', 'live']]
# Phrases() can be fed the tokenized sentences (lists of strings) directly
# Phraser() is used because it greatly reduces the memory footprint of the full Phrases() model
from gensim.models.phrases import Phrases, Phraser
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences
INFO - 15:35:52: collecting all words and their counts
INFO - 15:35:52: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 15:35:52: PROGRESS: at sentence #10000, processed 63561 words and 52716 word types
INFO - 15:35:52: PROGRESS: at sentence #20000, processed 130949 words and 99637 word types
INFO - 15:35:52: PROGRESS: at sentence #30000, processed 192972 words and 138212 word types
INFO - 15:35:52: PROGRESS: at sentence #40000, processed 249845 words and 172230 word types
INFO - 15:35:52: PROGRESS: at sentence #50000, processed 311277 words and 208051 word types
INFO - 15:35:52: PROGRESS: at sentence #60000, processed 373597 words and 243068 word types
INFO - 15:35:53: PROGRESS: at sentence #70000, processed 436446 words and 278001 word types
INFO - 15:35:53: PROGRESS: at sentence #80000, processed 497916 words and 311099 word types
INFO - 15:35:53: collected 329869 word types from a corpus of 537147 words (unigram + bigrams) and 85960 sentences
INFO - 15:35:53: using 329869 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 15:35:53: source_vocab length 329869
INFO - 15:35:56: Phraser built with 126 phrasegrams
<gensim.interfaces.TransformedCorpus at 0x7f495b4efba8>
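A quick way to see the Phraser in action is to transform a hand-made token list; whether this particular pair is among the 126 learned phrasegrams is an assumption here:
# Transform a made-up sentence; pairs that co-occurred often enough are merged into single tokens
print(bigram[['homer', 'simpson', 'eat', 'donut']])
# expected (if the pair was learned): ['homer_simpson', 'eat', 'donut']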
This is mainly a sanity check of the effectiveness of the lemmatization, stopword removal, and bigram detection.
from collections import defaultdict
word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1
print(len(word_freq))
sorted(word_freq, key=word_freq.get, reverse=True)[:10]
# " // ".join(word_freq.keys())
29643
['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']
We train with the word2vec implementation built into gensim, split into three steps so that each can be monitored separately.
Word2Vec():
In this first step, we set up the model's parameters one by one.
We do not supply the sentences argument, so the model is deliberately left uninitialized.
.build_vocab():
Here we build the vocabulary from the sequence of sentences, which initializes the model.
The logging lets us follow the progress and, more importantly, the effect of min_count and sample on the vocabulary.
We found that sample in particular has a large influence on the performance of the model.
.train():
Finally, trains the model.
The logging here is mainly useful for monitoring, making sure that no worker thread finishes instantaneously.
The individual parameters are as follows.
import multiprocessing
# Count the number of CPU cores available on this machine
cores = multiprocessing.cpu_count()
cores
4
from gensim.models import Word2Vec
w2v_model = Word2Vec(min_count=20,    # ignore words that appear fewer than 20 times
                     window=2, size=300, sample=6e-5,
                     alpha=0.03, min_alpha=0.0007,  # learning rate decays linearly from 0.03 to 0.0007
                     negative=20, workers=cores-1)  # 20 negative samples; leave one core for the OS
# Building the Vocabulary Table
# Word2Vec requires us to build the vocabulary table
# (simply digesting all the words and filtering out the unique words, and doing some basic counts on them)
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
INFO - 15:35:58: collecting all words and their counts
INFO - 15:35:58: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 15:35:58: PROGRESS: at sentence #10000, processed 61706 words, keeping 9491 word types
INFO - 15:35:59: PROGRESS: at sentence #20000, processed 127342 words, keeping 14373 word types
INFO - 15:35:59: PROGRESS: at sentence #30000, processed 187807 words, keeping 17431 word types
INFO - 15:35:59: PROGRESS: at sentence #40000, processed 243316 words, keeping 20124 word types
INFO - 15:35:59: PROGRESS: at sentence #50000, processed 303167 words, keeping 22558 word types
INFO - 15:35:59: PROGRESS: at sentence #60000, processed 363915 words, keeping 24804 word types
INFO - 15:36:00: PROGRESS: at sentence #70000, processed 425375 words, keeping 26960 word types
INFO - 15:36:00: PROGRESS: at sentence #80000, processed 485514 words, keeping 28777 word types
INFO - 15:36:00: collected 29643 word types from a corpus of 523645 raw words and 85960 sentences
INFO - 15:36:00: Loading a fresh vocabulary
INFO - 15:36:00: effective_min_count=20 retains 3315 unique words (11% of original 29643, drops 26328)
INFO - 15:36:00: effective_min_count=20 leaves 437848 word corpus (83% of original 523645, drops 85797)
INFO - 15:36:00: deleting the raw counts dictionary of 29643 items
INFO - 15:36:00: sample=6e-05 downsamples 1204 most-common words
INFO - 15:36:00: downsampling leaves estimated 199419 word corpus (45.5% of prior 437848)
INFO - 15:36:00: estimated required memory for 3315 words and 300 dimensions: 9613500 bytes
INFO - 15:36:00: resetting layer weights
Time to build vocab: 0.04 mins
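Since sample was said above to matter a lot, its effect can be inspected directly after build_vocab: in gensim 3.x each retained vocab entry carries a sample_int attribute, the word's keep probability scaled to 2**32. A minimal sketch, assuming the words below survived min_count:
# Inspect the downsampling applied to a few frequent words (gensim 3.x vocab API)
for word in ['oh', 'like', 'homer']:
    entry = w2v_model.wv.vocab[word]
    print(word, entry.count, round(entry.sample_int / 2**32, 4))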
# Training of the model (Parameters of the training)
# total_examples = int : Count of sentences
# epochs = int : Number of iterations (epochs) over the corpus - [10, 20, 30]
t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
INFO - 15:36:00: training model with 3 workers on 3315 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 15:36:01: EPOCH 1 - PROGRESS: at 39.39% examples, 78203 words/s, in_qsize 1, out_qsize 0
INFO - 15:36:02: EPOCH 1 - PROGRESS: at 86.15% examples, 84963 words/s, in_qsize 1, out_qsize 0
INFO - 15:36:02: worker thread finished; awaiting finish of 2 more threads
INFO - 15:36:02: worker thread finished; awaiting finish of 1 more threads
INFO - 15:36:02: worker thread finished; awaiting finish of 0 more threads
INFO - 15:36:02: EPOCH - 1 : training on 523645 raw words (199755 effective words) took 2.3s, 86796 effective words/s
[... epochs 2-29 elided; each epoch ran at roughly 83k-92k effective words/s ...]
INFO - 15:37:07: EPOCH 30 - PROGRESS: at 43.69% examples, 84277 words/s, in_qsize 0, out_qsize 0
INFO - 15:37:08: EPOCH 30 - PROGRESS: at 88.13% examples, 84638 words/s, in_qsize 0, out_qsize 0
INFO - 15:37:08: worker thread finished; awaiting finish of 2 more threads
INFO - 15:37:08: worker thread finished; awaiting finish of 1 more threads
INFO - 15:37:08: worker thread finished; awaiting finish of 0 more threads
INFO - 15:37:08: EPOCH - 30 : training on 523645 raw words (199395 effective words) took 2.3s, 86538 effective words/s
INFO - 15:37:08: training on a 15709350 raw words (5981554 effective words) took 68.1s, 87845 effective words/s
Time to train the model: 1.13 mins
# If no further training is planned, calling init_sims() precomputes the L2-norms,
# which makes the model much more memory-efficient:
w2v_model.init_sims(replace=True)
INFO - 15:37:08: precomputing L2-norms of word weight vectors
The model trained above can now be saved and reused.
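A minimal sketch for persisting and reloading it (the filename is an assumption, mirroring the backup folder used at the top of this notebook):
# Save the trained model to disk and load it back later (path is hypothetical)
w2v_model.save('../backup/simpsons_w2v.model')
w2v_model = Word2Vec.load('../backup/simpsons_w2v.model')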
# Look up the words most closely related to the main character, Homer
w2v_model.wv.most_similar(positive=["homer"])
# w2v_model.wv.most_similar(positive=["marge"])
# w2v_model.wv.most_similar(positive=["bart"])
[('depressed', 0.800049901008606), ('sweetheart', 0.7771680355072021), ('snuggle', 0.7697296142578125), ('marge', 0.7636249661445618), ('terrific', 0.7587988376617432), ('good_friend', 0.7575525045394897), ('gee', 0.7561341524124146), ('hammock', 0.7530875205993652), ('feel_well', 0.7504291534423828), ('becky', 0.7494775652885437)]
# Repeat for the bigram token homer_simpson
w2v_model.wv.most_similar(positive=["homer_simpson"])
[('united_state', 0.785035252571106), ('congratulation', 0.7744359374046326), ('select', 0.773761510848999), ('pleased', 0.7718425989151001), ('council', 0.7696194648742676), ('aboard', 0.7508918046951294), ('recent', 0.748868465423584), ('robert', 0.7483236789703369), ('governor', 0.7412874698638916), ('easily', 0.7393561601638794)]
# Measure the similarity between pairs of words.
# w2v_model.wv.similarity("moe_'s", 'tavern')
w2v_model.wv.similarity('maggie', 'baby')
0.72318304
w2v_model.wv.similarity('bart', 'nelson')
0.66021216
# Find the word that does not belong with the others.
# w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])
/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/models/keyedvectors.py:877: FutureWarning: arrays to stack must be passed as a "sequence" type such as list or tuple. Support for non-sequence iterables such as generators is deprecated as of NumPy 1.16 and will raise an error in the future.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
'nelson'
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)
[('see', 0.6492701768875122), ('admire', 0.6330732107162476), ('care', 0.627632737159729)]
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)
[('lisa', 0.7655842304229736), ('hearing', 0.6929087042808533), ('parent', 0.6803374290466309)]
For a more comprehensive check of the trained model, we turn to visualization.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
def tsnescatterplot(model, word, list_names):
    """Plot, with seaborn, the t-SNE reduction of the vectors of a query word,
    its most similar words, and an extra list of comparison words."""
    arrays = np.empty((0, 300), dtype='f')
    word_labels = [word]
    color_list = ['red']
    # adds the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)
    # gets list of most similar words
    close_words = model.wv.most_similar([word])
    # adds the vector for each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)
    # adds the vector for each of the words from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)
    # Reduces the dimensionality from 300 to 15 dimensions with PCA
    reduc = PCA(n_components=15).fit_transform(arrays)
    # Finds t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)
    # Sets everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})
    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)
    # Basic plot
    p1 = sns.regplot(data=df, x="x", y="y", fit_reg=False, marker="o",
                     scatter_kws={'s': 40, 'facecolors': df['color']})
    # Adds annotations one by one with a loop
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line], df['y'][line],
                ' ' + df["words"][line].title(),
                horizontalalignment='left',
                verticalalignment='bottom', size='medium',
                color=df['color'][line], weight='normal').set_size(15)
    plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)
    plt.title('t-SNE visualization for {}'.format(word.title()))
%matplotlib inline
tsnescatterplot(w2v_model, 'homer',
                ['dog', 'bird', 'ah', 'maude', 'bob', 'mel', 'apu', 'duff'])
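The same helper can also contrast a word with its least similar vocabulary entries; an illustrative call (not from the original run) feeds in the ten words returned by most_similar with only a negative term:
# Plot 'maggie' against its ten most dissimilar words (illustrative example)
tsnescatterplot(w2v_model, 'maggie',
                [t[0] for t in w2v_model.wv.most_similar(negative=["maggie"])])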