Building a document classification system with (spam mail) data
ham Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham Ok lar... Joking wif u oni...
spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question
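These sample lines come straight from the raw file, which stores the label (ham or spam), a tab, and then the message text. A quick way to confirm the format is to print the first few lines yourself (a minimal sketch, using the data/SMSSpamCollection path from this chapter):

with open('data/SMSSpamCollection') as file_handle:
    for no, line in enumerate(file_handle):
        if no >= 3: break          # show only the first three messages
        print(line.rstrip())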
import numpy as np

vocabulary = {}  # dictionary mapping each word to a unique index
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:           # read the file line by line
        splits = line.split()          # split each line on whitespace
        label = splits[0]              # the first token is the label
        text = splits[1:]
        for word in text:              # assign a unique index to each word
            lower = word.lower()
            if lower not in vocabulary:
                vocabulary[lower] = len(vocabulary)

# Print a summary of the vocabulary
print("Total number of words:", len(vocabulary))
for no, word in enumerate(vocabulary):
    if no < 5: print(no, word)
Total number of words: 13627
0 go
1 until
2 jurong
3 point,
4 crazy..
features = []  # list of term-frequency feature vectors
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:
        splits = line.split()
        feature = np.zeros(len(vocabulary))  # one slot per vocabulary word
        text = splits[1:]
        for word in text:                    # count each word occurrence
            lower = word.lower()
            feature[vocabulary[lower]] += 1
        # Divide the word counts by the total number of words in the document
        # to turn the vector into term-frequency features
        feature = feature / sum(feature)
        features.append(feature)

print("Number of term-frequency feature vectors:", len(features))
features[:5]
Number of term-frequency feature vectors: 5574
[array([0.05, 0.05, 0.05, ..., 0. , 0. , 0. ]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.])]
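Each feature vector is indexed by the word indices assigned earlier, so the frequency of a specific word in a specific message can be read off directly (a minimal sketch; 'go' received index 0 in the vocabulary built above):

print(features[0][vocabulary['go']])   # expected to print 0.05, matching the first entry shown above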
labels = []  # build the label list
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:  # read the file line by line
        splits = line.split()
        label = splits[0]
        # The first token (the label) becomes 1 for spam, 0 otherwise
        if label == 'spam': labels.append(1)
        else: labels.append(0)

print("Number of spam labels:", len(labels))
labels[:15]
Number of spam labels: 5574
[0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0]
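Because the labels are encoded as 0 and 1, their sum gives the number of spam messages, which is a quick way to check how imbalanced the dataset is (a minimal sketch):

print("spam:", sum(labels), "ham:", len(labels) - sum(labels))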
# Based on the analysis above, build the dataset for spam classification
spam_header = 'spam\t'
no_spam_header = 'ham\t'
documents, labels = [], []
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:  # strip the label from each line to get the documents
        if line.startswith(spam_header):
            labels.append(1)
            documents.append(line[len(spam_header):])
        elif line.startswith(no_spam_header):
            labels.append(0)
            documents.append(line[len(no_spam_header):])

# Turn word-count features into term-frequency features
# (without idf, TfidfTransformer produces plain term frequencies)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
vectorizer = CountVectorizer()                     # word-count features
term_counts = vectorizer.fit_transform(documents)  # word counts per document
vocabulary = vectorizer.get_feature_names()
tf_transformer = TfidfTransformer(use_idf=False).fit(term_counts)
features = tf_transformer.transform(term_counts)

import pickle  # save the processed data to a file
with open('data/processed.pickle', 'wb') as file_handle:
    pickle.dump((vocabulary, features, labels), file_handle)
import pickle, warnings
warnings.simplefilter("ignore", category=FutureWarning)

with open('data/processed.pickle', 'rb') as file_handle:
    vocabulary, features, labels = pickle.load(file_handle)

total_number = len(labels)
middle_index = total_number // 2  # first 50% for training, the rest for testing

# Split off the training data and train the classifier
train_features = features[:middle_index, :]
train_labels = labels[:middle_index]

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(train_features, train_labels)

# Split off the test data
test_features = features[middle_index:, :]
test_labels = labels[middle_index:]

print("train accuracy: {:.5f}\ntest accuracy : {:.5f}".format(
    classifier.score(train_features, train_labels),
    classifier.score(test_features, test_labels)))
train accuracy: 0.97309
test accuracy : 0.96125
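Accuracy alone can hide poor performance on the minority (spam) class, so it may be worth also printing precision and recall for the test split (a minimal sketch using scikit-learn's classification_report; the predictions variable is introduced here only for illustration):

from sklearn.metrics import classification_report

predictions = classifier.predict(test_features)
print(classification_report(test_labels, predictions, target_names=['ham', 'spam']))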
# Find out which words contributed most to the classification decision
weights, pairs = classifier.coef_[0, :], []
for index, value in enumerate(weights):
    pairs.append((abs(value), vocabulary[index]))

# Sort the pairs by score in descending order
pairs.sort(key=lambda x: x[0], reverse=True)
for pair in pairs[:10]:
    print("score {:.4f} word: {}".format(pair[0], pair[1]))
score 4.3649 word: txt
score 4.0988 word: call
score 3.3730 word: free
score 2.6237 word: text
score 2.5961 word: to
score 2.4842 word: uk
score 2.4744 word: www
score 2.4241 word: stop
score 2.4017 word: claim
score 2.1648 word: 150p
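The loop above uses absolute values, so it does not show whether a word pushes a message toward spam or toward ham; keeping the sign of each coefficient makes the direction visible (a minimal sketch; with this label encoding, positive coefficients favor the spam class):

signed_pairs = sorted(zip(classifier.coef_[0, :], vocabulary), reverse=True)
print("most spam-like:", [word for value, word in signed_pairs[:5]])
print("most ham-like :", [word for value, word in signed_pairs[-5:]])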
Implement an LDA model to extract topics from the text messages
spam_header, no_spam_header = 'spam\t', 'ham\t'
documents = []
# This time we only extract the documents, without labels
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:
        if line.startswith(spam_header):
            documents.append(line[len(spam_header):])
        elif line.startswith(no_spam_header):
            documents.append(line[len(no_spam_header):])

# Build the word features
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
warnings.simplefilter("ignore", category=PendingDeprecationWarning)
vectorizer = CountVectorizer(stop_words='english', max_features=2000)
term_counts = vectorizer.fit_transform(documents)
vocabulary = vectorizer.get_feature_names()

# LDA works on raw word counts, so the CountVectorizer output is used directly
from sklearn.decomposition import LatentDirichletAllocation
topic_model = LatentDirichletAllocation(n_components=10)
topic_model.fit(term_counts)  # train the topic model
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method='batch',
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=None,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)
# Print the learned topics one by one
topics = topic_model.components_
for topic_id, weights in enumerate(topics):
    print('topic %d' % topic_id, end=': ')
    pairs = [(abs(value), vocabulary[term_id]) for term_id, value in enumerate(weights)]
    pairs.sort(key=lambda x: x[0], reverse=True)
    for pair in pairs[:10]:
        print(pair[1], end=',')
    print()
topic 0: gt,lt,know,don,oh,let,money,like,make,sent,
topic 1: day,good,dear,did,like,time,today,pls,morning,ur,
topic 2: just,lol,tone,week,ur,like,txt,number,new,stop,
topic 3: free,send,txt,ur,www,mobile,reply,stop,text,phone,
topic 4: come,home,da,said,pick,wait,buy,happy,way,ask,
topic 5: got,wat,time,say,come,dun,haha,need,soon,yeah,
topic 6: ll,sorry,later,just,leave,talk,aight,text,ok,place,
topic 7: love,hi,just,babe,hope,good,night,ur,miss,ya,
topic 8: ok,just,work,feel,dont,im,want,getting,waiting,really,
topic 9: lor,going,ok,urgent,prize,sleep,meet,trying,yup,dont,
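The fitted model can also assign a topic distribution to an individual message through transform, which shows which topic dominates a given document (a minimal sketch; index 0 simply picks the first message as an example):

doc_topics = topic_model.transform(term_counts[0])   # shape (1, 10): one weight per topic
print(doc_topics.round(3))
print("dominant topic:", doc_topics.argmax())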
Note: newer NLTK versions deprecate the Stanford taggers used below in favor of
nltk.tag.corenlp.CoreNLPPOSTagger or nltk.tag.corenlp.CoreNLPNERTagger.

# The document to analyze
text = """One day in November 2016, the two authors of this book,
Seungyeon and Youngjoo, had a coffee at Red Rock cafe,
which is a very popular place in Mountain View."""
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

STANFORD_POS_MODEL_PATH = 'data/stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger'
STANFORD_POS_JAR_PATH = 'data/stanford-postagger-2018-10-16/stanford-postagger-3.9.2.jar'
pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)

tokens = word_tokenize(text)
print(tokens, '\n\n', pos_tagger.tag(tokens))

# Keep only the verbs and nouns
noun_and_verbs = [token[0] for token in pos_tagger.tag(tokens)
                  if token[1].startswith('V') or token[1].startswith('N')]
print("\nExtracted verbs and nouns: ", ', '.join(noun_and_verbs))
/home/markbaum/Python/python/lib/python3.6/site-packages/nltk/tag/stanford.py:149: DeprecationWarning: The StanfordTokenizer will be deprecated in version 3.2.5. Please use nltk.tag.corenlp.CoreNLPPOSTagger or nltk.tag.corenlp.CoreNLPNERTagger instead. super(StanfordPOSTagger, self).__init__(*args, **kwargs)
['One', 'day', 'in', 'November', '2016', ',', 'the', 'two', 'authors', 'of', 'this', 'book', ',', 'Seungyeon', 'and', 'Youngjoo', ',', 'had', 'a', 'coffee', 'at', 'Red', 'Rock', 'cafe', ',', 'which', 'is', 'a', 'very', 'popular', 'place', 'in', 'Mountain', 'View', '.']

[('One', 'CD'), ('day', 'NN'), ('in', 'IN'), ('November', 'NNP'), ('2016', 'CD'), (',', ','), ('the', 'DT'), ('two', 'CD'), ('authors', 'NNS'), ('of', 'IN'), ('this', 'DT'), ('book', 'NN'), (',', ','), ('Seungyeon', 'NNP'), ('and', 'CC'), ('Youngjoo', 'NNP'), (',', ','), ('had', 'VBD'), ('a', 'DT'), ('coffee', 'NN'), ('at', 'IN'), ('Red', 'NNP'), ('Rock', 'NNP'), ('cafe', 'NN'), (',', ','), ('which', 'WDT'), ('is', 'VBZ'), ('a', 'DT'), ('very', 'RB'), ('popular', 'JJ'), ('place', 'NN'), ('in', 'IN'), ('Mountain', 'NNP'), ('View', 'NNP'), ('.', '.')]

Extracted verbs and nouns: day, November, authors, book, Seungyeon, Youngjoo, had, coffee, Red, Rock, cafe, is, place, Mountain, View
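If running the Java-based Stanford tagger is inconvenient, NLTK's built-in averaged-perceptron tagger produces output in the same (token, tag) format, though the tags will not be identical; this is an alternative sketch, not the approach used above, and it assumes the averaged_perceptron_tagger resource has been downloaded:

import nltk

nltk.download('averaged_perceptron_tagger')   # one-time download of the tagger model
print(nltk.pos_tag(word_tokenize(text)))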
The same note applies here: newer NLTK versions use nltk.tag.corenlp.CoreNLPPOSTagger
or nltk.tag.corenlp.CoreNLPNERTagger instead of the tagger shown below.

# The document to analyze
text = """One day in November 2016, the two authors of this book,
Seungyeon and Youngjoo, had a coffee at Red Rock cafe,
which is a very popular place in Mountain View."""
STANFORD_NER_CLASSIFIER_PATH = 'data/stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz'
STANFORD_NER_JAR_PATH = 'data/stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

ner_tagger = StanfordNERTagger(STANFORD_NER_CLASSIFIER_PATH, STANFORD_NER_JAR_PATH)
tokens = word_tokenize(text)
print(ner_tagger.tag(tokens))

# Print only the words tagged as locations
all_locations = [token[0] for token in ner_tagger.tag(tokens)
                 if token[1] == 'LOCATION']
print(', '.join(all_locations))
/home/markbaum/Python/python/lib/python3.6/site-packages/nltk/tag/stanford.py:183: DeprecationWarning: The StanfordTokenizer will be deprecated in version 3.2.5. Please use nltk.tag.corenlp.CoreNLPPOSTagger or nltk.tag.corenlp.CoreNLPNERTagger instead. super(StanfordNERTagger, self).__init__(*args, **kwargs)
[('One', 'O'), ('day', 'O'), ('in', 'O'), ('November', 'DATE'), ('2016', 'DATE'), (',', 'O'), ('the', 'O'), ('two', 'O'), ('authors', 'O'), ('of', 'O'), ('this', 'O'), ('book', 'O'), (',', 'O'), ('Seungyeon', 'PERSON'), ('and', 'O'), ('Youngjoo', 'PERSON'), (',', 'O'), ('had', 'O'), ('a', 'O'), ('coffee', 'O'), ('at', 'O'), ('Red', 'ORGANIZATION'), ('Rock', 'ORGANIZATION'), ('cafe', 'O'), (',', 'O'), ('which', 'O'), ('is', 'O'), ('a', 'O'), ('very', 'O'), ('popular', 'O'), ('place', 'O'), ('in', 'O'), ('Mountain', 'O'), ('View', 'O'), ('.', 'O')]
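The tagger labels every token separately, so multi-word names such as 'Red Rock' come back as two tuples; consecutive tokens that share the same non-O tag can be merged into single entities (a minimal sketch over the output shown above; ner_tags is introduced only for illustration):

from itertools import groupby

ner_tags = ner_tagger.tag(tokens)
entities = [(' '.join(tok for tok, _ in group), tag)
            for tag, group in groupby(ner_tags, key=lambda x: x[1]) if tag != 'O']
print(entities)   # e.g. [('November 2016', 'DATE'), ('Seungyeon', 'PERSON'), ...]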
from konlpy.tag import Mecab
from gensim.models import Word2Vec
import sys, time, glob, unicodedata
# Model hyperparameters
WINDOW = 5
EMBEDDING_SIZE = 200
BATCH_SIZE = 10000
ITER = 10
# Read a preprocessed Korean Wikipedia file. (Mecab dictionary: https://bitbucket.org/eunjeon/mecab-ko-dic)
def read_text(fin):
    corpus_li = []
    mecab = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic')
    for line in open(fin):
        # Normalize to NFKC with unicodedata.normalize to clean up
        # broken characters.
        line = unicodedata.normalize('NFKC', line)
        try:
            # Add lines whose first character is a digit to the corpus.
            _ = int(line[0])
            corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
        except ValueError:
            # Add lines whose first character is Hangul to the corpus.
            if ord('가') <= ord(line[0]) <= ord('힣'):
                corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
            else:
                pass
    print('# of lines in corpus', len(corpus_li))
    return corpus_li
def train_word2vec(corpus_li, fout_model):
    # Train word2vec on the corpus produced by read_text.
    # Word2Vec expects each sentence as a list of tokens, so the space-joined
    # noun strings are split before being passed in.
    sentences = [doc.split() for doc in corpus_li]
    model = Word2Vec(sentences, sg=1, size=EMBEDDING_SIZE, window=WINDOW,
                     min_count=5, workers=3, batch_words=BATCH_SIZE, iter=ITER)
    model.init_sims(replace=True)  # free the memory used by the raw vectors
    model.save(fout_model)
    return model
# # Glob pattern for reading the preprocessed files at once
# input_pattern = 'file_location/korean_wiki/kowiki-latest-pages-articles.xml-88.txt'
# fin_li = glob.glob(input_pattern)
# for fin in fin_li:
#     corpus_li = read_text(fin)
#     # Train the model
#     model = train_word2vec(corpus_li, 'file_location/korean_wiki/test_model.txt')
# print(model.most_similar('프랑스', topn=20))
# print(model.most_similar(positive=['한국', '파리'], negative=['서울']))
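Once saved, the model can be reloaded later without repeating the preprocessing step (a minimal sketch, commented out like the block above because it assumes the trained model file from train_word2vec exists):

# model = Word2Vec.load('file_location/korean_wiki/test_model.txt')   # reload the saved model
# print(model.most_similar('프랑스', topn=5))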