#!/usr/bin/env python
# coding: utf-8

# # **Advanced Feature Engineering**
# ## **1 NLP Algorithms with Gensim**
# Word2Vec
# 1. ! pip install gensim

# In[1]:

from gensim.models import Word2Vec

w2v_model = Word2Vec.load('../backup/model.bin')
w2v_model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

# In[2]:

w2v_model.wv.similarity('woman', 'man')

# In[3]:

# sorted(w2v_model.wv.vocab.keys(), reverse=False)[:14]
len(w2v_model.wv.vocab.keys())

# ## **2 Similarity Analysis Practice with Gensim**
# Word2Vec
# 1. ! pip install gensim
# 1. Measuring the similarity of **"Ice"** and **"Fire"** within the novels

# In[4]:

w2v_model.wv.most_similar('stark')
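# The analogy query above is vector arithmetic under the hood. Below is a minimal hand computation of the same idea;
# it assumes 'king', 'man', 'woman' and 'queen' are all in the loaded model's vocabulary, and it only approximates
# gensim's actual most_similar ranking (which normalizes vectors internally).

import numpy as np

# king - man + woman should land near queen if the analogy holds in this model
target = w2v_model.wv['king'] - w2v_model.wv['man'] + w2v_model.wv['woman']
queen = w2v_model.wv['queen']
print(np.dot(target, queen) / (np.linalg.norm(target) * np.linalg.norm(queen)))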

# # **Training and Analysis on The Simpsons Scripts**
# A tutorial on analyzing the meaning of words in sentences, using the scripts of Simpsons episodes.
# 1. **[Kaggle Tutorial](https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial)**
# 1. **[Simpson Script](https://www.kaggle.com/pierremegret/dialogue-lines-of-the-simpsons)**

# In[5]:

import logging
import pandas as pd
from time import time

# Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s",
                    datefmt='%H:%M:%S',
                    level=logging.INFO)

# ## **1 PreProcessing**
# Carry out the data preprocessing steps.

# In[6]:

df = pd.read_csv('../backup/simpsons_dataset.csv')
print(df.shape)

# Drop NaN / Null rows
print(df.isnull().sum())
df = df.dropna().reset_index(drop=True)
print(df.shape)
print(df.isnull().sum())
df.head(3)

# In[7]:

import re
import spacy
from tqdm import tqdm

def cleaning(doc):
    # Lemmatize and remove stopwords
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Keep only utterances that still have more than 2 tokens
    if len(txt) > 2:
        return ' '.join(txt)

# Keep only letters and apostrophes, and lowercase everything
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

# Use spaCy's pipeline to lemmatize and drop stopwords;
# NER and the parser are disabled since only tokens and lemmas are needed.
nlp = spacy.load('en', disable=['ner', 'parser'])
txt = [cleaning(doc) for doc in tqdm(nlp.pipe(brief_cleaning, batch_size=5000, n_threads=-1))]

# In[8]:

# Drop empty rows and duplicates, then show the preprocessed table
df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
print(df_clean.shape)
df_clean.head(3)

# ## **2 N-Gram Modeling**
# Build **bi-grams** such as **"mr_burns"** and **"bart_simpson"**.

# In[9]:

sent = [row.split() for row in df_clean['clean']]
sent[:3]

# In[10]:

# Phrases() accepts tokenized sentences (lists of words) directly
# Phraser() is used because it cuts down the memory footprint of Phrases()
from gensim.models.phrases import Phrases, Phraser

phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]
sentences
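# A quick hand check of the bigram transform can help before moving on. The token list below is a made-up example;
# whether 'mr' and 'burns' actually get merged into 'mr_burns' depends on the phrases detected above with min_count=30.

sample = ['hello', 'mr', 'burns', 'how', 'be', 'you']
print(bigram[sample])  # e.g. ['hello', 'mr_burns', 'how', 'be', 'you'] if that bigram was learned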
# ## **3 Counting Word Frequencies**
# Mainly a sanity check of the effectiveness of the lemmatization, removal of stopwords, and addition of bigrams.

# In[11]:

from collections import defaultdict

word_freq = defaultdict(int)
for sent in sentences:
    for i in sent:
        word_freq[i] += 1

print(len(word_freq))
sorted(word_freq, key=word_freq.get, reverse=True)[:10]
# " // ".join(word_freq.keys())

# ## **4 Training the W2V Model**
# We train with gensim's built-in **[word2vec](https://radimrehurek.com/gensim/models/word2vec.html)**. The work is split into 3 steps so that each step can be monitored.
# 1. Word2Vec():
#
#    In this first step, the model parameters are set one by one.
#    Since the sentences parameter is not supplied, the model is deliberately left uninitialized.
#
# 1. .build_vocab():
#
#    Here the model is initialized by building the vocabulary from the sequence of sentences.
#    With the logging we can follow the progress and, more importantly, the effect of min_count and sample on the word corpus.
#    sample in particular was found to have a large influence on the performance of the model.
#
# 1. .train():
#
#    Finally, trains the model.
#    The loggings here are mainly useful for monitoring, making sure that no threads are executed instantaneously.
#
# The individual parameters are as follows.
# 1. **min_count = int :** Ignores all words with total absolute frequency lower than this - (2, 100)
# 1. **window = int :** The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the right of our target - (2, 10)
# 1. **size = int :** Dimensionality of the feature vectors. - (50, 300)
# 1. **sample = float :** The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential. - (0, 1e-5)
# 1. **alpha = float :** The initial learning rate - (0.01, 0.05)
# 1. **min_alpha = float :** Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00
# 1. **negative = int :** If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)
# 1. **workers = int :** Use this many worker threads to train the model (= faster training with multicore machines)

# In[12]:

import multiprocessing

# Number of CPU cores available on this machine
cores = multiprocessing.cpu_count()
cores

# In[13]:

from gensim.models import Word2Vec

w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=cores - 1)

# Building the Vocabulary Table
# Word2Vec requires us to build the vocabulary table
# (simply digesting all the words and filtering out the unique words, and doing some basic counts on them)
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# In[14]:

# Training of the model (parameters of the training)
# total_examples = int : Count of sentences
# epochs = int : Number of iterations (epochs) over the corpus - [10, 20, 30]
t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

# In[15]:

# If no further training is planned, call init_sims(),
# which will make the model much more memory-efficient:
w2v_model.init_sims(replace=True)

# ## **5 Exploring the Trained W2V Model**
# The model trained above can be saved and reused, as sketched below.
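# The save step itself is not shown in the original cells; a minimal sketch follows, assuming
# '../backup/simpsons_w2v.model' as a writable target path (the file name is an illustrative assumption).

w2v_model.save('../backup/simpsons_w2v.model')               # persist the trained model
# w2v_model = Word2Vec.load('../backup/simpsons_w2v.model')  # reload it later for reuse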
# In[16]:

# Words most closely related to the main character Homer
w2v_model.wv.most_similar(positive=["homer"])
# w2v_model.wv.most_similar(positive=["marge"])
# w2v_model.wv.most_similar(positive=["bart"])

# In[17]:

# The same check against the bi-gram "homer_simpson"
w2v_model.wv.most_similar(positive=["homer_simpson"])

# In[18]:

# Measure the similarity between two words
# w2v_model.wv.similarity("moe_'s", 'tavern')
w2v_model.wv.similarity('maggie', 'baby')

# In[19]:

w2v_model.wv.similarity('bart', 'nelson')

# In[20]:

# Pick out the word that is least related to the others
# w2v_model.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])
w2v_model.wv.doesnt_match(["nelson", "bart", "milhouse"])

# In[21]:

w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

# In[22]:

w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

# ## **6 Visualization with t-SNE**
# Visualization offers a more comprehensive way to validate the trained model.

# In[23]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def tsnescatterplot(model, word, list_names):
    """Plot in seaborn the results from the t-SNE dimensionality reduction algorithm
    of the vectors of a query word, its list of most similar words, and a list of words."""
    arrays = np.empty((0, 300), dtype='f')
    word_labels = [word]
    color_list = ['red']

    # adds the vector of the query word
    arrays = np.append(arrays, model.wv.__getitem__([word]), axis=0)

    # gets list of most similar words
    close_words = model.wv.most_similar([word])

    # adds the vector for each of the closest words to the array
    for wrd_score in close_words:
        wrd_vector = model.wv.__getitem__([wrd_score[0]])
        word_labels.append(wrd_score[0])
        color_list.append('blue')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # adds the vector for each of the words from list_names to the array
    for wrd in list_names:
        wrd_vector = model.wv.__getitem__([wrd])
        word_labels.append(wrd)
        color_list.append('green')
        arrays = np.append(arrays, wrd_vector, axis=0)

    # Reduces the dimensionality from 300 to 15 dimensions with PCA
    reduc = PCA(n_components=15).fit_transform(arrays)

    # Finds t-SNE coordinates for 2 dimensions
    np.set_printoptions(suppress=True)
    Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc)

    # Sets everything up to plot
    df = pd.DataFrame({'x': [x for x in Y[:, 0]],
                       'y': [y for y in Y[:, 1]],
                       'words': word_labels,
                       'color': color_list})

    fig, _ = plt.subplots()
    fig.set_size_inches(9, 9)

    # Basic plot
    p1 = sns.regplot(data=df, x="x", y="y", fit_reg=False, marker="o",
                     scatter_kws={'s': 40, 'facecolors': df['color']})

    # Adds annotations one by one with a loop
    for line in range(0, df.shape[0]):
        p1.text(df["x"][line], df['y'][line], ' ' + df["words"][line].title(),
                horizontalalignment='left', verticalalignment='bottom',
                size='medium', color=df['color'][line], weight='normal').set_size(15)

    plt.xlim(Y[:, 0].min() - 50, Y[:, 0].max() + 50)
    plt.ylim(Y[:, 1].min() - 50, Y[:, 1].max() + 50)
    plt.title('t-SNE visualization for {}'.format(word.title()))

# In[24]:

get_ipython().run_line_magic('matplotlib', 'inline')
tsnescatterplot(w2v_model, 'homer', ['dog', 'bird', 'ah', 'maude', 'bob', 'mel', 'apu', 'duff'])
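# The same helper can also be used as a contrast check: plot a query word next to the 10 words the model ranks as
# *least* similar to it. The choice of 'maggie' below is only an illustrative example.

# Most similar words appear in blue, the 10 least similar words in green
tsnescatterplot(w2v_model, 'maggie',
                [t[0] for t in w2v_model.wv.most_similar(negative=["maggie"])])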