Notebook

# **Chapter 3 | word2vec** ## **1 데이터 전처리** [**word2vec matplotlib**](https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne)

In [1]:

! apt-get update
! apt-get install g++ openjdk-8-jdk 
! pip3  install  nltk konlpy wordcloud matplotlib gensim 

! apt-get install fonts-nanum*
! apt-get install fontconfig
! fc-cache -fv
! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/
! rm -rf /content/.cache/matplotlib/*

script_text = "https://raw.githubusercontent.com/YongBeomKim/nltk_rnd/master/data/movie_memories_of_murder_2003.txt"
font_file   = "/usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/NanumGothicCoding.ttf"
# script_text = "../data/movie_memories_of_murder_2003.txt"
# font_file = "../data/D2Coding.ttf"

import matplotlib as mpl        
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

font_name   = fm.FontProperties(fname=font_file, size=10).get_name()
plt.rc('font', family=font_name)
fm._rebuild()
mpl.rcParams['axes.unicode_minus'] = False

Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:10 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease
Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Fetched 252 kB in 2s (131 kB/s)
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
g++ is already the newest version (4:7.3.0-3ubuntu2.1).
openjdk-8-jdk is already the newest version (8u191-b12-2ubuntu0.18.04.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.
Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.2.5)
Requirement already satisfied: konlpy in /usr/local/lib/python3.6/dist-packages (0.5.1)
Requirement already satisfied: wordcloud in /usr/local/lib/python3.6/dist-packages (1.5.0)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.0.3)
Requirement already satisfied: gensim in /usr/local/lib/python3.6/dist-packages (3.6.0)
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.11.0)
Requirement already satisfied: JPype1>=0.5.7 in /usr/local/lib/python3.6/dist-packages (from konlpy) (0.6.3)
Requirement already satisfied: pillow in /usr/local/lib/python3.6/dist-packages (from wordcloud) (4.1.1)
Requirement already satisfied: numpy>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from wordcloud) (1.14.6)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.5.3)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.0.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.3.1)
Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.6/dist-packages (from gensim) (1.8.0)
Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from gensim) (1.1.0)
Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from pillow->wordcloud) (0.46)
Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib) (40.8.0)
Requirement already satisfied: boto>=2.32 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (2.49.0)
Requirement already satisfied: bz2file in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (0.98)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (2.18.4)
Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim) (1.9.123)
Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (2.6)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (3.0.4)
Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (1.22)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->smart-open>=1.2.1->gensim) (2019.3.9)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (0.9.4)
Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (0.2.0)
Requirement already satisfied: botocore<1.13.0,>=1.12.123 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim) (1.12.123)
Requirement already satisfied: docutils>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.13.0,>=1.12.123->boto3->smart-open>=1.2.1->gensim) (0.14)
Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'fonts-nanum-eco' for glob 'fonts-nanum*'
Note, selecting 'fonts-nanum' for glob 'fonts-nanum*'
Note, selecting 'fonts-nanum-gothic-light' for glob 'fonts-nanum*'
Note, selecting 'fonts-nanum-coding' for glob 'fonts-nanum*'
Note, selecting 'fonts-nanum-extra' for glob 'fonts-nanum*'
fonts-nanum is already the newest version (20170925-1).
fonts-nanum-coding is already the newest version (2.5-1).
fonts-nanum-eco is already the newest version (1.000-6).
fonts-nanum-extra is already the newest version (20170925-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
fontconfig is already the newest version (2.12.6-0ubuntu2).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.
/usr/share/fonts: caching, new cache contents: 0 fonts, 1 dirs
/usr/share/fonts/truetype: caching, new cache contents: 0 fonts, 3 dirs
/usr/share/fonts/truetype/dejavu: caching, new cache contents: 22 fonts, 0 dirs
/usr/share/fonts/truetype/liberation: caching, new cache contents: 16 fonts, 0 dirs
/usr/share/fonts/truetype/nanum: caching, new cache contents: 31 fonts, 0 dirs
/usr/local/share/fonts: caching, new cache contents: 0 fonts, 0 dirs
/root/.local/share/fonts: skipping, no such directory
/root/.fonts: skipping, no such directory
/var/cache/fontconfig: cleaning cache directory
/root/.cache/fontconfig: not cleaning non-existent cache directory
/root/.fontconfig: not cleaning non-existent cache directory
fc-cache: succeeded

In [0]:

%matplotlib inline
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def tsne_plot(model, figsize=(12,12)):
    "Creates and TSNE model and plots it"
    labels, tokens = [], []
    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)

    tsne_model = TSNE(n_components=2)
    new_values = tsne_model.fit_transform(tokens)

    x, y = [], []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=figsize) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy = (x[i], y[i]),
                     fontsize=15)
    plt.grid(True)
    plt.show()

In [0]:

# 텍스트를 줄단위로 끊어서 불러온뒤
# Token 단위로, 한글명사들을 추출한다
def txtnoun(sentences , skip=False, tags=['Noun'], stem=True, set_tokens=False):

    r"""
    살인의 추억 대본의 텍스트 전처리 작업을 진행합니다
    :param sentences: 단일한 Text String 데이터를 입력합니다
    :param skip: 분류된 Token 중 사용자가 원하는 형태로 변환된 내용을 출력
    :param tags: konlpy 로 분류된 품사중 추출하고자 하는 품사를 정의합니다
    :param stem: stemming 작업여부를 정의합니다.
    :param set_tokens: return 결과를 token list 객체로 출력할지를 정의합니다
    :return: set_tokens 내용에 따라 List, String 타입으로 출력합니다
    """
    
    import re
    from konlpy.tag import Okt
    twitter = Okt()
    result  = []
    sentences = sentences.replace('\n', '\n|')
    sentences = sentences.split('|')
    for content in sentences:
        texts      = content.replace('\n', '') # 해당줄의 줄바꿈 내용 제거
        tokenizer  = re.compile(r'[^ ㄱ-힣]+')  # 한글과 띄어쓰기를 제외한 모든 글자를 선택
        token_data = tokenizer.sub('', texts)  # 한글과 띄어쓰기를 제외한 모든 부분을 제거
        token_data = token_data.split(' ')
        sentence   = []

        for token in token_data:
            # skip 대상이 없을 떄
            if skip == False:
                chk_tok = twitter.pos(token, stem=stem)
                chk_tok = [temp[0]  for temp in chk_tok   if temp[1] in tags]
                ckeck   = "".join(chk_tok)
                if len(ckeck) > 1:
                    sentence.append(ckeck)

            # skip 내용이 있을 때
            else:
                if token.strip() in skip.keys():
                    result.append(skip[token.strip()])
                else:
                    chk_tok = twitter.pos(token, stem=stem)
                    chk_tok = [temp[0] for temp in chk_tok if temp[1] in tags]
                    ckeck   = "".join(chk_tok)

                    # 전처리가 끝난 결과가 skip에 해당여부 판단
                    if ckeck.strip() in skip.keys():
                        result.append(skip[ckeck.strip()])
                    elif len(ckeck) > 1:
                        sentence.append(ckeck)

        # 단락별 작업이 끝난 뒤 '\n'를 덧붙여서 작업을 종료
        temp = "".join(sentence)
        if len(temp) > 1:
            sentence = " ".join(sentence)
            sentence += "\n"
            result.append(sentence)

    if set_tokens == True:
        from nltk.tokenize import word_tokenize
        set_token = word_tokenize(" ".join(result))
        return list(set(set_token))

    else:
        return " ".join(result)

In [0]:

skips = {'두만':'박두만', '태윤':'서태윤', '용구':'조용구', '귀옥':'권귀옥', 
         '희봉':'구희봉', '동철':'신동철', '광호':'백광호', '병순':'조병순', 
         '해일':'박해일', '광호의':'백광호', '백광호의':'백광호'}

In [5]:

import requests
sentences = requests.get(script_text).text
sentences[:300]

# with open(script_text, 'r') as f:
#     sentences = f.read()
# sentences[:300]

Out[5]:

'박두만\n서태윤\n조용구\n권귀옥\n구희봉\n신동철\n백광호20대 초반. 정박아\n조병순30대 후반. 변태성향\n박해일20대 초반. 공장 노동자. ‘유력한 용의자’.\n선본 남자, 동네 양아치들... 등등\n30대 초반의 전직 간호조무사, 마을 ‘야매주사’ 여인.\n안송여중 1학년 학생\n소현의 단짝 친구\n박보희, 이향숙, 독고현순, 박명자, 안미선.\n화면 가득 한 남자 아이의 얼굴이 보여진다.\n쏟아지는 햇살 아래, 맑은 눈빛의 아이는 카메라 정면을 응시하고 있다.\n아이는 코스모스 위에 앉아있는 잠자리를 향해 살며시 손을 뻗는다.\n휙~ 잠자리가 날아가 '

In [0]:

sentences   = txtnoun(sentences, skip=skips, tags=['Noun'])
script_file = 'script.txt'
with open(script_file, 'w', encoding='utf-8') as file:
    file.write(sentences)

In [7]:

%%time
from gensim.models import word2vec
data  = word2vec.LineSentence(script_file)
model = word2vec.Word2Vec(data, size=30, window=2, min_count=10, 
                          hs=1, workers=4, iter=100, sg=1)
model_file = "script.model"
model.save(model_file)

CPU times: user 2.29 s, sys: 105 ms, total: 2.4 s
Wall time: 2.46 s

## **2 Word2Vec 모델의 활용** 모델을 활용하여 유력한 범인을 찾아보자!!

In [8]:

# 저장된 학습모델파일 불러오기
from gensim.models import word2vec
model_file = "script.model"
model = word2vec.Word2Vec.load(model_file)
len(model.wv.vocab.keys())

Out[8]:

In [9]:

model.wv.vocab.keys()

Out[9]:

dict_keys(['박두만', '서태윤', '조용구', '권귀옥', '구희봉', '백광호', '박해일', '용의자', '남자', '동네', '소현', '이향숙', '박명자', '화면', '가득', '아이', '얼굴', '아래', '눈빛', '카메라', '고개', '멀리', '점점', '소리', '모습', '머리', '인상', '형사', '박두', '여자', '시체', '후레쉬', '순간', '위로', '거기', '뭔가', '주위', '브래지어', '너머', '순경', '바람', '시작', '표정', '마주', '서로', '시선', '보고', '음악', '취조실', '사무실', '계속', '사진', '바로', '부분', '사람', '갑자기', '분위기', '얘기', '저녁', '어디', '다시', '아저씨', '진짜', '정말', '새끼', '문득', '저기', '한번', '구석', '피해자', '시점', '책상', '주변', '사건', '현장', '여기', '지금', '간다', '참깨밭', '발자국', '하나', '발견', '근처', '반장', '듯이', '잔뜩', '다리', '스타킹', '설영', '자기', '가방', '조그만', '그냥', '그게', '그거', '잠시', '목소리', '가운데', '팬티', '임마', '끄덕', '그대로', '똑바로', '잠깐', '모두', '다른', '경찰', '코피', '휴지', '누군가', '운동화', '라디오', '권기옥', '자리', '장님', '우리', '생각', '버럭', '아무', '기억', '서서히', '서류', '훈련', '불빛', '의경', '자신', '당신', '전경', '범인', '이건', '신문', '방송', '조심스레', '빗줄기', '빗소리', '어둠속', '우산', '그림자', '빗물', '학교', '변소', '사이', '괴남자', '인부', '조병순', '언덕', '언덕집', '조각', '기차', '유전자', '총구'])

4 Word2Vec 모델 내용 확인¶

모델을 활용하여 유력한 범인을 찾아보자!!

In [10]:

# 범인과 관련된 내용 중 사람이름이 안나옴...
model.wv.most_similar('범인', topn=10)

/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[10]:

[('우리', 0.7175503969192505),
 ('얘기', 0.6892109513282776),
 ('변소', 0.6610735058784485),
 ('그거', 0.6090747714042664),
 ('그냥', 0.6030449271202087),
 ('거기', 0.5868494510650635),
 ('사건', 0.5515612959861755),
 ('당신', 0.5280258059501648),
 ('그게', 0.5069879293441772),
 ('여자', 0.5003743171691895)]

In [11]:

# 현장과 가장 가깝게 등장한 인물이 1명 등장
model.wv.most_similar('현장', topn=10)

/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[11]:

[('사진', 0.7413114309310913),
 ('용의자', 0.6940706968307495),
 ('사건', 0.6851493120193481),
 ('참깨밭', 0.6236051321029663),
 ('이향숙', 0.6127980351448059),
 ('동네', 0.5867651104927063),
 ('얼굴', 0.5642785429954529),
 ('시점', 0.5474856495857239),
 ('서류', 0.5443352460861206),
 ('박명자', 0.5424180030822754)]

In [12]:

# 현장 과 백광호 와 밀접한 증거들 중에 방해가 되는 내용을 찾는다
model.wv.most_similar(['현장','이향숙'], topn=10)

/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[12]:

[('참깨밭', 0.7690945863723755),
 ('사진', 0.7256079912185669),
 ('사건', 0.6779365539550781),
 ('발자국', 0.6444573998451233),
 ('근처', 0.6405833959579468),
 ('박명자', 0.6084452271461487),
 ('용의자', 0.5995357036590576),
 ('얼굴', 0.5953850746154785),
 ('바로', 0.5729771852493286),
 ('동네', 0.5670396685600281)]

In [13]:

# 현장 과 백광호 와 밀접한 증거들 중에 방해가 되는 내용을 찾는다
model.wv.most_similar(['현장', '백광호'], topn=10)

/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[13]:

[('사진', 0.7666734457015991),
 ('얼굴', 0.7442331910133362),
 ('용의자', 0.7125645875930786),
 ('이향숙', 0.6383324265480042),
 ('참깨밭', 0.6042308807373047),
 ('사건', 0.6017242074012756),
 ('시점', 0.5852558016777039),
 ('조용구', 0.5626519322395325),
 ('발자국', 0.5593641996383667),
 ('이건', 0.5500349998474121)]

In [14]:

# 현장 과 백광호 와 밀접한 증거들 중에 '참깨밭' 이 계속 방해가 됨
# 참깨밭에 백광호가 밀접하게 연결되어 있어서 이를 제외한 분석이 필요
model.wv.most_similar(['현장','백광호'], negative=['참깨밭'], topn=15)

/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[14]:

[('얼굴', 0.7027637362480164),
 ('용의자', 0.6668912172317505),
 ('똑바로', 0.6218545436859131),
 ('사진', 0.5886619687080383),
 ('아래', 0.5364269614219666),
 ('시점', 0.5297864079475403),
 ('구희봉', 0.5276623368263245),
 ('빗물', 0.5162099003791809),
 ('그대로', 0.492700457572937),
 ('조용구', 0.49174538254737854),
 ('눈빛', 0.49077358841896057),
 ('보고', 0.47895872592926025),
 ('서류', 0.4716593027114868),
 ('서태윤', 0.4564111828804016),
 ('인부', 0.4538172483444214)]

In [15]:

# 현장 과 백광호 와 밀접한 증거들 중에 '참깨밭' 이 계속 방해가 됨
# 참깨밭에 백광호와 이향숙을 제외한 분석이 필요
model.wv.most_similar(['현장','백광호'], negative=['참깨밭','이향숙'], topn=20)

/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Out[15]:

[('똑바로', 0.4826679229736328),
 ('아래', 0.43381986021995544),
 ('시작', 0.4186092019081116),
 ('보고', 0.4110272228717804),
 ('총구', 0.3819498121738434),
 ('그대로', 0.37507113814353943),
 ('서태윤', 0.362000435590744),
 ('구희봉', 0.35851922631263733),
 ('용의자', 0.3528374433517456),
 ('다리', 0.34275075793266296),
 ('서류', 0.32808437943458557),
 ('눈빛', 0.3054373860359192),
 ('얼굴', 0.3029182553291321),
 ('가방', 0.29995161294937134),
 ('조용구', 0.29772767424583435),
 ('시점', 0.2970396876335144),
 ('서서히', 0.29514655470848083),
 ('박해일', 0.2906399667263031),
 ('빗물', 0.28145352005958557),
 ('인부', 0.2799288332462311)]

5 Visulaization¶

gensim

In [16]:

vocab = list(model.wv.vocab)
X     = model[vocab]

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).

In [0]:

from sklearn.manifold import TSNE
tsne   = TSNE(n_components = 2)
X_tsne = tsne.fit_transform(X)

In [18]:

import pandas as pd
df = pd.DataFrame(X_tsne, index=vocab, columns=['x','y'])
df.head()

Out[18]:

	x	y
박두만	0.697110	-2.459593
서태윤	1.422914	-1.743897
조용구	0.900567	-2.790313
권귀옥	-0.797934	-4.018767
구희봉	1.042710	-4.409641

In [19]:

%matplotlib inline
import matplotlib.pyplot as plt
fig  = plt.figure(figsize=(12,12))
ax   = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])
for word, pos in df.iterrows():
    ax.annotate(word, pos, fontsize=15)
plt.grid(True)

In [20]:

tsne_plot(model)

/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  if __name__ == '__main__':

6 결과의 해석 및 활용¶

유사관계, 반대관계로 출력된 모든 Token들이 가치가 있지는 않다
min_count, Vector 갯수 2가지 조건만 사용하여 모델을 생성한다
Word2Vec 도 문서를 분석하는 도구에 불과 (절대적 가치를 창출하는 기법 으로 오해 X)
해당 분야의 잘 정리된 Document로 학습한 Word2Vec 모델 에서 유의미 한 token 들의 관계 를 Template으로 잘 정리
분석대상 문서를 유사한 조건으로 word2vec 모델 을 만들고, 앞에서 정리된 Template와 비교하여 결과

다듬기

시나리오의 분석 경우
1. 결과물 중 인물의 Token 만 활용하여 분석
2. 결과물 중 증거물의 Token 만 활용하여 분석
3. 결과물 중 장소의 Token 만 활용하여 분석

In [0]:

# model 에 등장하는 인물들
charator = ["박두만", "서태윤", "조용구", "권귀옥", "구희봉", "신동철", "백광호",
            "조병순", "박해일", "박보희", "이향숙", "독고현순", "박명자", "안미선", 
            "반장", "소현", "범인", "형사", '괴남자', '순경','피해자', '권기옥','용의자']

In [0]:

# model 에 등장하는 장소명 들
area = ['현장', '사무실', '취조실', '변소', '참깨밭', '빗줄기', '어둠속', '언덕집']

In [0]:

# model 에 등장하는 Item 들
items = ['브래지어', '팬티', '우산', '운동화', '스타킹', '목소리', '불빛', '음악', '후레쉬', 
         '카메라', '라디오', '방송', '유전자', '가방', '코피', '휴지', '신문', '총구']