gensim | word2vec¶

1 데이터 전처리¶

In [1]:

import glob
from txtutil import txtnoun
# 2015 ~ 2018 지속가능 경영 보고서 Token을 수집
filelist = glob.glob('../data/kr-Report_201?.txt')
print(filelist)

# 불러온 Document 명사Token만 추출
skiplist = {'갤러시':'갤럭시', '가치창출':'가치창출'}
texts    = [txtnoun(file, skip=skiplist)  for file in filelist]
texts    = " ".join(texts)
texts[:300]

['../data/kr-Report_2016.txt', '../data/kr-Report_2017.txt', '../data/kr-Report_2015.txt', '../data/kr-Report_2018.txt']

Out[1]:

'삼성전자\n 지속가능경영보고서\n 삼성전자 지속가능경영보고서\n 삼성전자 지속가능경영 사람과 사회 환경 조화롭\n 공존 발전\n 가치 컬러 그래픽 조화롭 구성 심플 톤앤매너\n 표지 전달 또한 인재 기술 바탕 최고 제품\n 서비스 창출하 인류사회 공헌 경영이념 전개 사람 사회\n 환경 포괄 영역 삼성전자 활동 세분 라인 조합\n 완성 형태 움직임 통해 표현\n 보고서 작성 개요\n 삼성전자 경제 가치 환경 보호 사회 발전 지속가능경영 통해 세상 긍정\n 가치 제공 지속가능경영 활동 성과 공개 바탕\n 이해관계자 소통 위해 아홉 지속가능경영보고서 발간\n 보'

In [2]:

# 명사 Token 작업된 자료를 ssResport.txt 로 저장 
texts_file = '../data/ssResport.txt'
with open(texts_file, 'w', encoding='utf-8') as file:
    file.write(texts)

In [3]:

# ! cat ./data/ssResport.txt | head -n 10

2 Word 2 vec 객체 만들기¶

gensim

In [4]:

# ! pip3 install gensim

In [5]:

%%time
texts_file = '../data/ssResport.txt'

from gensim.models import word2vec
data  = word2vec.LineSentence(texts_file)
model = word2vec.Word2Vec(data, size=200, window=2, min_count=20, hs=1,
                          workers=4, iter=100, sg=1)
model.save("../data/ssReport.model")
print("model saved.")

model saved.
CPU times: user 1min 17s, sys: 426 ms, total: 1min 18s
Wall time: 31.1 s

3 저장된 객체 활용¶

gensim

In [6]:

%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y

In [7]:

%who

Interactive namespace is empty.

In [8]:

from gensim.models import word2vec
model = word2vec.Word2Vec.load('../data/ssReport.model')
len(model.wv.vocab.keys())

Out[8]:

In [9]:

list(model.wv.index2word)[:10]

Out[9]:

['삼성전자', '관리', '제품', '협력사', '임직원', '사업', '위해', '통해', '글로벌', '교육']

In [10]:

model.wv.most_similar(positive=['삼성전자'])

/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[10]:

[('생각', 0.2371642142534256),
 ('인사말', 0.22413358092308044),
 ('이니셔티브', 0.2232244610786438),
 ('지구', 0.22301766276359558),
 ('존중', 0.2171960026025772),
 ('공헌', 0.21072670817375183),
 ('가장', 0.21015311777591705),
 ('행동', 0.2073962390422821),
 ('제작', 0.20662453770637512),
 ('제조', 0.20571179687976837)]

In [11]:

model.wv.most_similar(negative=['삼성전자'])

/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[11]:

[('각주', 0.09539655596017838),
 ('주기', 0.090581014752388),
 ('인원', 0.06392117589712143),
 ('훈련', 0.0467417910695076),
 ('모바일', 0.04639029502868652),
 ('에어컨', 0.046074528247117996),
 ('처리', 0.0430106446146965),
 ('변화', 0.0425986647605896),
 ('기여', 0.04008210450410843),
 ('소비자', 0.03669456019997597)]

In [12]:

model.wv.most_similar(positive=['글로벌'])

/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[12]:

[('기업시민', 0.27174001932144165),
 ('책임경영', 0.24501878023147583),
 ('매출액', 0.23786215484142303),
 ('의식', 0.23527146875858307),
 ('디스플레이', 0.2316412329673767),
 ('세계', 0.2269509881734848),
 ('판매거점', 0.2236602008342743),
 ('인재양성', 0.21667678654193878),
 ('전세계', 0.21417926251888275),
 ('법인', 0.21227560937404633)]

In [13]:

model.wv.most_similar(negative=['글로벌'])

/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[13]:

[('각주', 0.14380551874637604),
 ('선택', 0.10329953581094742),
 ('가족', 0.08594916760921478),
 ('최종', 0.0808052197098732),
 ('검사', 0.08069653809070587),
 ('판단', 0.07758765667676926),
 ('설비', 0.07216480374336243),
 ('혜택', 0.06981527805328369),
 ('결정', 0.06863678246736526),
 ('개사', 0.05936558544635773)]

In [14]:

model.wv.most_similar(positive=['삼성전자','경영활동'], 
                      negative=['근무환경']) # 담당자, 직원

/home/markbaum/Python/nltk/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[14]:

[('가치창출', 0.275915265083313),
 ('기업', 0.2585913836956024),
 ('측면', 0.24071839451789856),
 ('전력', 0.23596522212028503),
 ('사업활동', 0.2355266958475113),
 ('생각', 0.23054496943950653),
 ('책임', 0.22358964383602142),
 ('천톤', 0.2161034494638443),
 ('행동', 0.21353504061698914),
 ('인식', 0.2054463028907776)]

04 Visulaization¶

gensim

In [15]:

list(model.wv.vocab.keys())[:10]

Out[15]:

['삼성전자', '지속가능경영보고서', '지속가능경영', '사회', '환경', '발전', '가치', '구성', '전달', '또한']

In [16]:

# model.wv.vocab : { word: object of numeric vector }
vocab  = list(model.wv.vocab)
X      = model[vocab]
X.shape

/home/markbaum/Python/nltk/lib/python3.6/site-packages/ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  This is separate from the ipykernel package so we can avoid doing imports until

Out[16]:

(927, 200)

In [21]:

%%time
from sklearn.manifold import TSNE
tsne   = TSNE(n_components = 2)
X_tsne = tsne.fit_transform(X)

CPU times: user 16.4 s, sys: 1.49 s, total: 17.9 s
Wall time: 17.9 s

In [22]:

import pandas as pd
df = pd.DataFrame(X_tsne, 
                  index = vocab, 
                  columns=['x', 'y'])
df.head()

Out[22]:

	x	y
삼성전자	-0.172935	-0.924926
지속가능경영보고서	-2.630885	-0.719760
지속가능경영	-2.043247	-0.660033
사회	-1.052006	-1.501886
환경	-0.137400	-0.759023

In [23]:

%matplotlib inline
from matplotlib import rc
rc('font', family=['NanumGothic','Malgun Gothic'])

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,12))
ax  = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
for word, pos in df.iterrows():
    ax.annotate(word, pos)
plt.grid(True)