! apt-get update
! apt-get install g++ openjdk-8-jdk
! pip3 install nltk konlpy wordcloud matplotlib gensim
! apt-get install fonts-nanum*
! apt-get install fontconfig
! fc-cache -fv
! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/
! rm -rf /content/.cache/matplotlib/*
# Location of the movie script and of the Korean font used for plot labels.
script_text = "https://raw.githubusercontent.com/YongBeomKim/nltk_rnd/master/data/movie_memories_of_murder_2003.txt"
font_file = "/usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/NanumGothicCoding.ttf"
# Local alternatives:
# script_text = "../data/movie_memories_of_murder_2003.txt"
# font_file = "../data/D2Coding.ttf"
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Make the Korean font the default family for every figure.
font_name = fm.FontProperties(fname=font_file, size=10).get_name()
plt.rc('font', family=font_name)
# Make matplotlib aware of the newly installed font.  `_rebuild` is a private
# helper that newer matplotlib releases no longer provide, so prefer the
# public `addfont` when it exists and only fall back to `_rebuild` otherwise.
if hasattr(fm.fontManager, "addfont"):
    fm.fontManager.addfont(font_file)
else:
    fm._rebuild()
# Draw minus signs with the ASCII hyphen instead of the Unicode minus.
mpl.rcParams['axes.unicode_minus'] = False
Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Release Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB] Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease Hit:10 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB] Hit:12 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB] Fetched 252 kB in 2s (131 kB/s) Reading package lists... Done Reading package lists... Done Building dependency tree Reading state information... Done g++ is already the newest version (4:7.3.0-3ubuntu2.1). openjdk-8-jdk is already the newest version (8u191-b12-2ubuntu0.18.04.1). The following package was automatically installed and is no longer required: libnvidia-common-410 Use 'apt autoremove' to remove it. 0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded. 
Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.2.5) Requirement already satisfied: konlpy in /usr/local/lib/python3.6/dist-packages (0.5.1) Requirement already satisfied: wordcloud in /usr/local/lib/python3.6/dist-packages (1.5.0) Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.0.3) Requirement already satisfied: gensim in /usr/local/lib/python3.6/dist-packages (3.6.0) Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.11.0) Requirement already satisfied: JPype1>=0.5.7 in /usr/local/lib/python3.6/dist-packages (from konlpy) (0.6.3) Requirement already satisfied: pillow in /usr/local/lib/python3.6/dist-packages (from wordcloud) (4.1.1) Requirement already satisfied: numpy>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from wordcloud) (1.14.6) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.5.3) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.0.1) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.3.1) Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.6/dist-packages (from gensim) (1.8.0) Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from gensim) (1.1.0) Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from pillow->wordcloud) (0.46) Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib) (40.8.0) Requirement already satisfied: boto>=2.32 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (2.49.0) Requirement already satisfied: bz2file in 
/usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (0.98) Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (2.18.4) Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (1.9.123) Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (2.6) Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (3.0.4) Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (1.22) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (2019.3.9) Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (0.9.4) Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (0.2.0) Requirement already satisfied: botocore<1.13.0,>=1.12.123 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (1.12.123) Requirement already satisfied: docutils>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.13.0,>=1.12.123->boto3->smart-open>=1.2.1->gensim) (0.14) Reading package lists... Done Building dependency tree Reading state information... Done Note, selecting 'fonts-nanum-eco' for glob 'fonts-nanum*' Note, selecting 'fonts-nanum' for glob 'fonts-nanum*' Note, selecting 'fonts-nanum-gothic-light' for glob 'fonts-nanum*' Note, selecting 'fonts-nanum-coding' for glob 'fonts-nanum*' Note, selecting 'fonts-nanum-extra' for glob 'fonts-nanum*' fonts-nanum is already the newest version (20170925-1). 
fonts-nanum-coding is already the newest version (2.5-1). fonts-nanum-eco is already the newest version (1.000-6). fonts-nanum-extra is already the newest version (20170925-1). The following package was automatically installed and is no longer required: libnvidia-common-410 Use 'apt autoremove' to remove it. 0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded. Reading package lists... Done Building dependency tree Reading state information... Done fontconfig is already the newest version (2.12.6-0ubuntu2). The following package was automatically installed and is no longer required: libnvidia-common-410 Use 'apt autoremove' to remove it. 0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded. /usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs /usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs /usr/share/fonts/truetype/dejavu: caching, new cache contents: 22 fonts, 0 dirs /usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs /usr/share/fonts/truetype/nanum: caching, new cache contents: 31 fonts, 0 dirs /usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs /root/.local/share/fonts: skipping, no such directory /root/.fonts: skipping, no such directory /var/cache/fontconfig: cleaning cache directory /root/.cache/fontconfig: not cleaning non-existent cache directory /root/.fontconfig: not cleaning non-existent cache directory fc-cache: succeeded
%matplotlib inline
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
def tsne_plot(model, figsize=(12,12)):
    """Project the model's word vectors to 2-D with t-SNE and plot them.

    Every word in ``model.wv.vocab`` is looked up in ``model.wv``, the
    vectors are reduced with ``TSNE(n_components=2)``, and each word is
    drawn as its own scatter point annotated with the word itself.  The
    figure is shown with ``plt.show()``.

    :param model: trained model exposing ``model.wv`` (vocabulary + vectors)
    :param figsize: figure size forwarded to ``plt.figure``
    :return: None
    """
    keyed_vectors = model.wv
    labels = list(keyed_vectors.vocab)
    # Index through ``wv``: ``model[word]`` is deprecated and raises a
    # DeprecationWarning ("use self.wv.__getitem__() instead").
    tokens = [keyed_vectors[word] for word in labels]

    tsne_model = TSNE(n_components=2)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=figsize)
    # One scatter call per word, exactly as before, each followed by its label.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), fontsize=15)
    plt.grid(True)
    plt.show()
# Split the text into lines, then
# extract the Korean nouns token by token
def txtnoun(sentences, skip=False, tags=('Noun',), stem=True, set_tokens=False,
            tagger=None):
    r"""
    Preprocess the text of the "Memories of Murder" script.

    The text is cut into lines, every character other than Hangul and
    spaces is removed, and each space-separated token is reduced to the
    concatenation of its morphemes whose part-of-speech tag is in *tags*.
    Extracted words of fewer than two characters are discarded.

    :param sentences: the whole text as a single string
    :param skip: mapping ``{token: replacement}``; a raw token, or its
        extracted form, that is one of the keys is replaced by the mapped
        value, in place within its line.  Any falsy value (``False``,
        ``None``, ``{}``) disables replacement.
    :param tags: part-of-speech tags, as produced by the tagger, to keep
    :param stem: forwarded to the tagger as ``pos(..., stem=stem)``
    :param set_tokens: when true, return a list of unique tokens instead
        of a string
    :param tagger: object with a ``pos(text, stem=...)`` method returning
        ``(morpheme, tag)`` pairs; defaults to a new ``konlpy.tag.Okt()``.
        Passing one lets callers reuse an existing tagger instance.
    :return: a ``list`` of unique tokens when *set_tokens* is true,
        otherwise a ``str`` in which every processed line ends with "\n"
        and lines are joined by a single space
    """
    import re

    if tagger is None:
        # Imported lazily so konlpy is only required when no tagger is given.
        from konlpy.tag import Okt
        tagger = Okt()

    replacements = skip if skip else {}

    # Everything except spaces, Hangul compatibility jamo and Hangul
    # syllables.  (A single ㄱ-힣 range would also span kana and hanja.)
    non_korean = re.compile(r'[^ ㄱ-ㅣ가-힣]+')

    def _extract(token):
        """Join the morphemes of *token* whose tag is listed in *tags*."""
        return "".join(
            morph for morph, tag in tagger.pos(token, stem=stem) if tag in tags
        )

    result = []
    # Break the text after every newline; a literal '|' also acts as a break.
    for content in sentences.replace('\n', '\n|').split('|'):
        texts = content.replace('\n', '')       # drop the line's newline
        cleaned = non_korean.sub('', texts)     # keep Hangul and spaces only
        sentence = []
        for token in cleaned.split(' '):
            word = token.strip()
            # The raw token is itself a replacement key
            if word in replacements:
                sentence.append(replacements[word])
                continue
            if not word:
                continue                        # nothing to tag
            noun = _extract(token)
            # The extracted form may be a replacement key as well
            if noun.strip() in replacements:
                sentence.append(replacements[noun.strip()])
            elif len(noun) > 1:
                sentence.append(noun)
        # Close every non-trivial line with '\n'
        if len("".join(sentence)) > 1:
            result.append(" ".join(sentence) + "\n")

    if set_tokens:
        from nltk.tokenize import word_tokenize
        return list(set(word_tokenize(" ".join(result))))
    return " ".join(result)
# Replacement table handed to txtnoun() as `skip`: each key (a name as it
# is written in the text) is replaced by the mapped full character name.
skips = {'두만':'박두만', '태윤':'서태윤', '용구':'조용구', '귀옥':'권귀옥',
'희봉':'구희봉', '동철':'신동철', '광호':'백광호', '병순':'조병순',
'해일':'박해일', '광호의':'백광호', '백광호의':'백광호'}
import requests
# Download the script.  The timeout keeps the cell from hanging forever and
# raise_for_status() stops an HTTP error page from being used as the script.
response = requests.get(script_text, timeout=30)
response.raise_for_status()
sentences = response.text
sentences[:300]
# Local-file alternative:
# with open(script_text, 'r') as f:
#     sentences = f.read()
# sentences[:300]
'박두만\n서태윤\n조용구\n권귀옥\n구희봉\n신동철\n백광호20대 초반. 정박아\n조병순30대 후반. 변태성향\n박해일20대 초반. 공장 노동자. ‘유력한 용의자’.\n선본 남자, 동네 양아치들... 등등\n30대 초반의 전직 간호조무사, 마을 ‘야매주사’ 여인.\n안송여중 1학년 학생\n소현의 단짝 친구\n박보희, 이향숙, 독고현순, 박명자, 안미선.\n화면 가득 한 남자 아이의 얼굴이 보여진다.\n쏟아지는 햇살 아래, 맑은 눈빛의 아이는 카메라 정면을 응시하고 있다.\n아이는 코스모스 위에 앉아있는 잠자리를 향해 살며시 손을 뻗는다.\n휙~ 잠자리가 날아가 '
# Reduce the raw script to its nouns (with the name replacements in `skips`)
# and store the result as UTF-8 text in script_file.
script_file = 'script.txt'
sentences = txtnoun(sentences, skip=skips, tags=['Noun'])
with open(script_file, mode='w', encoding='utf-8') as file:
    file.write(sentences)
%%time
# Train a Word2Vec model on the preprocessed script and save it to model_file.
# NOTE(review): `size` and `iter` are gensim 3.x keyword names (the install
# log above reports gensim 3.6.0) — confirm the names before upgrading gensim.
from gensim.models import word2vec
data = word2vec.LineSentence(script_file)
model = word2vec.Word2Vec(data, size=30, window=2, min_count=10,
hs=1, workers=4, iter=100, sg=1)
model_file = "script.model"
model.save(model_file)
CPU times: user 2.29 s, sys: 105 ms, total: 2.4 s Wall time: 2.46 s
# Load the saved, trained model file
from gensim.models import word2vec
model_file = "script.model"
model = word2vec.Word2Vec.load(model_file)
# Number of words in the model's vocabulary
len(model.wv.vocab.keys())
151
# Show every word in the model's vocabulary
model.wv.vocab.keys()
dict_keys(['박두만', '서태윤', '조용구', '권귀옥', '구희봉', '백광호', '박해일', '용의자', '남자', '동네', '소현', '이향숙', '박명자', '화면', '가득', '아이', '얼굴', '아래', '눈빛', '카메라', '고개', '멀리', '점점', '소리', '모습', '머리', '인상', '형사', '박두', '여자', '시체', '후레쉬', '순간', '위로', '거기', '뭔가', '주위', '브래지어', '너머', '순경', '바람', '시작', '표정', '마주', '서로', '시선', '보고', '음악', '취조실', '사무실', '계속', '사진', '바로', '부분', '사람', '갑자기', '분위기', '얘기', '저녁', '어디', '다시', '아저씨', '진짜', '정말', '새끼', '문득', '저기', '한번', '구석', '피해자', '시점', '책상', '주변', '사건', '현장', '여기', '지금', '간다', '참깨밭', '발자국', '하나', '발견', '근처', '반장', '듯이', '잔뜩', '다리', '스타킹', '설영', '자기', '가방', '조그만', '그냥', '그게', '그거', '잠시', '목소리', '가운데', '팬티', '임마', '끄덕', '그대로', '똑바로', '잠깐', '모두', '다른', '경찰', '코피', '휴지', '누군가', '운동화', '라디오', '권기옥', '자리', '장님', '우리', '생각', '버럭', '아무', '기억', '서서히', '서류', '훈련', '불빛', '의경', '자신', '당신', '전경', '범인', '이건', '신문', '방송', '조심스레', '빗줄기', '빗소리', '어둠속', '우산', '그림자', '빗물', '학교', '변소', '사이', '괴남자', '인부', '조병순', '언덕', '언덕집', '조각', '기차', '유전자', '총구'])
# No person names show up among the words related to '범인' (the culprit)...
model.wv.most_similar('범인', topn=10)
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int):
[('우리', 0.7175503969192505), ('얘기', 0.6892109513282776), ('변소', 0.6610735058784485), ('그거', 0.6090747714042664), ('그냥', 0.6030449271202087), ('거기', 0.5868494510650635), ('사건', 0.5515612959861755), ('당신', 0.5280258059501648), ('그게', 0.5069879293441772), ('여자', 0.5003743171691895)]
# One character shows up among the words closest to '현장' (the scene)
model.wv.most_similar('현장', topn=10)
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int):
[('사진', 0.7413114309310913), ('용의자', 0.6940706968307495), ('사건', 0.6851493120193481), ('참깨밭', 0.6236051321029663), ('이향숙', 0.6127980351448059), ('동네', 0.5867651104927063), ('얼굴', 0.5642785429954529), ('시점', 0.5474856495857239), ('서류', 0.5443352460861206), ('박명자', 0.5424180030822754)]
# Among the evidence closely tied to '현장' (the scene) and '이향숙', look for
# entries that get in the way.
# NOTE(review): the original comment named 백광호, but this query uses 이향숙.
model.wv.most_similar(['현장','이향숙'], topn=10)
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int):
[('참깨밭', 0.7690945863723755), ('사진', 0.7256079912185669), ('사건', 0.6779365539550781), ('발자국', 0.6444573998451233), ('근처', 0.6405833959579468), ('박명자', 0.6084452271461487), ('용의자', 0.5995357036590576), ('얼굴', 0.5953850746154785), ('바로', 0.5729771852493286), ('동네', 0.5670396685600281)]
# Among the evidence closely tied to '현장' (the scene) and '백광호', look for
# entries that get in the way
model.wv.most_similar(['현장', '백광호'], topn=10)
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int):
[('사진', 0.7666734457015991), ('얼굴', 0.7442331910133362), ('용의자', 0.7125645875930786), ('이향숙', 0.6383324265480042), ('참깨밭', 0.6042308807373047), ('사건', 0.6017242074012756), ('시점', 0.5852558016777039), ('조용구', 0.5626519322395325), ('발자국', 0.5593641996383667), ('이건', 0.5500349998474121)]
# Among the evidence closely tied to '현장' (the scene) and '백광호',
# '참깨밭' (the sesame field) keeps getting in the way.
# 백광호 is tightly linked to 참깨밭, so it is excluded from this query.
model.wv.most_similar(['현장','백광호'], negative=['참깨밭'], topn=15)
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int):
[('얼굴', 0.7027637362480164), ('용의자', 0.6668912172317505), ('똑바로', 0.6218545436859131), ('사진', 0.5886619687080383), ('아래', 0.5364269614219666), ('시점', 0.5297864079475403), ('구희봉', 0.5276623368263245), ('빗물', 0.5162099003791809), ('그대로', 0.492700457572937), ('조용구', 0.49174538254737854), ('눈빛', 0.49077358841896057), ('보고', 0.47895872592926025), ('서류', 0.4716593027114868), ('서태윤', 0.4564111828804016), ('인부', 0.4538172483444214)]
# Among the evidence closely tied to '현장' (the scene) and '백광호',
# '참깨밭' (the sesame field) keeps getting in the way.
# This query excludes both '참깨밭' and '이향숙'.
model.wv.most_similar(['현장','백광호'], negative=['참깨밭','이향숙'], topn=20)
/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`. if np.issubdtype(vec.dtype, np.int):
[('똑바로', 0.4826679229736328), ('아래', 0.43381986021995544), ('시작', 0.4186092019081116), ('보고', 0.4110272228717804), ('총구', 0.3819498121738434), ('그대로', 0.37507113814353943), ('서태윤', 0.362000435590744), ('구희봉', 0.35851922631263733), ('용의자', 0.3528374433517456), ('다리', 0.34275075793266296), ('서류', 0.32808437943458557), ('눈빛', 0.3054373860359192), ('얼굴', 0.3029182553291321), ('가방', 0.29995161294937134), ('조용구', 0.29772767424583435), ('시점', 0.2970396876335144), ('서서히', 0.29514655470848083), ('박해일', 0.2906399667263031), ('빗물', 0.28145352005958557), ('인부', 0.2799288332462311)]
# Collect the vocabulary and look up the vector of every word.
vocab = list(model.wv.vocab)
# Index through `wv`: `model[...]` is deprecated and raises a
# DeprecationWarning ("use self.wv.__getitem__() instead").
X = model.wv[vocab]
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
from sklearn.manifold import TSNE
# Reduce the word vectors in X to two t-SNE components
tsne = TSNE(n_components = 2)
X_tsne = tsne.fit_transform(X)
import pandas as pd
# One row per vocabulary word, with its 2-D coordinates in columns 'x' and 'y'
df = pd.DataFrame(X_tsne, index=vocab, columns=['x','y'])
df.head()
x | y | |
---|---|---|
박두만 | 0.697110 | -2.459593 |
서태윤 | 1.422914 | -1.743897 |
조용구 | 0.900567 | -2.790313 |
권귀옥 | -0.797934 | -4.018767 |
구희봉 | 1.042710 | -4.409641 |
%matplotlib inline
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
for word, pos in df.iterrows():
ax.annotate(word, pos, fontsize=15)
plt.grid(True)
# Draw the t-SNE word map of the model with the tsne_plot helper
tsne_plot(model)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead). if __name__ == '__main__':
다듬기
# People that appear in the model
# NOTE(review): `charator` looks like a misspelling of "character"; the name
# is left as-is because other cells may refer to it.
charator = ["박두만", "서태윤", "조용구", "권귀옥", "구희봉", "신동철", "백광호",
"조병순", "박해일", "박보희", "이향숙", "독고현순", "박명자", "안미선",
"반장", "소현", "범인", "형사", '괴남자', '순경','피해자', '권기옥','용의자']
# Place names that appear in the model
area = ['현장', '사무실', '취조실', '변소', '참깨밭', '빗줄기', '어둠속', '언덕집']
# Items that appear in the model
items = ['브래지어', '팬티', '우산', '운동화', '스타킹', '목소리', '불빛', '음악', '후레쉬',
'카메라', '라디오', '방송', '유전자', '가방', '코피', '휴지', '신문', '총구']