Building a document classification system with (spam mail) data
ham Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham Ok lar... Joking wif u oni...
spam Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question
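These sample lines come straight from the raw file, which stores the label (ham or spam), a tab, and then the message text. A quick way to confirm the format is to print the first few lines yourself (a minimal sketch, using the data/SMSSpamCollection path from this chapter):

with open('data/SMSSpamCollection') as file_handle:
    for no, line in enumerate(file_handle):
        if no >= 3: break          # show only the first three messages
        print(line.rstrip())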
import numpy as np

vocabulary = {}  # dictionary mapping each word to a unique index
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:           # read the file line by line
        splits = line.split()          # split each line on whitespace
        label = splits[0]              # the first token is the label
        text = splits[1:]
        for word in text:              # assign a unique index to each word
            lower = word.lower()
            if lower not in vocabulary:
                vocabulary[lower] = len(vocabulary)

# Print a summary of the vocabulary
print("Total number of words:", len(vocabulary))
for no, word in enumerate(vocabulary):
    if no < 5: print(no, word)
Total number of words: 13627
0 go
1 until
2 jurong
3 point,
4 crazy..
features = []  # list of term-frequency feature vectors
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:
        splits = line.split()
        feature = np.zeros(len(vocabulary))  # one slot per vocabulary word
        text = splits[1:]
        for word in text:                    # count each word occurrence
            lower = word.lower()
            feature[vocabulary[lower]] += 1
        # Divide the word counts by the total number of words in the document
        # to turn the vector into term-frequency features
        feature = feature / sum(feature)
        features.append(feature)

print("Number of term-frequency feature vectors:", len(features))
features[:5]
Number of term-frequency feature vectors: 5574
[array([0.05, 0.05, 0.05, ..., 0. , 0. , 0. ]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.])]
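Each feature vector is indexed by the word indices assigned earlier, so the frequency of a specific word in a specific message can be read off directly (a minimal sketch; 'go' received index 0 in the vocabulary built above):

print(features[0][vocabulary['go']])   # expected to print 0.05, matching the first entry shown above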
labels = []  # build the label list
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:  # read the file line by line
        splits = line.split()
        label = splits[0]
        # The first token (the label) becomes 1 for spam, 0 otherwise
        if label == 'spam': labels.append(1)
        else: labels.append(0)

print("Number of spam labels:", len(labels))
labels[:15]
Number of spam labels: 5574
[0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0]
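Because the labels are encoded as 0 and 1, their sum gives the number of spam messages, which is a quick way to check how imbalanced the dataset is (a minimal sketch):

print("spam:", sum(labels), "ham:", len(labels) - sum(labels))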
# Based on the analysis above, build the dataset for spam classification
spam_header = 'spam\t'
no_spam_header = 'ham\t'
documents, labels = [], []
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:  # strip the label from each line to get the documents
        if line.startswith(spam_header):
            labels.append(1)
            documents.append(line[len(spam_header):])
        elif line.startswith(no_spam_header):
            labels.append(0)
            documents.append(line[len(no_spam_header):])

# Turn word-count features into term-frequency features
# (without idf, TfidfTransformer produces plain term frequencies)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
vectorizer = CountVectorizer()                     # word-count features
term_counts = vectorizer.fit_transform(documents)  # word counts per document
vocabulary = vectorizer.get_feature_names()
tf_transformer = TfidfTransformer(use_idf=False).fit(term_counts)
features = tf_transformer.transform(term_counts)

import pickle  # save the processed data to a file
with open('data/processed.pickle', 'wb') as file_handle:
    pickle.dump((vocabulary, features, labels), file_handle)
import pickle, warnings
warnings.simplefilter("ignore", category=FutureWarning)

with open('data/processed.pickle', 'rb') as file_handle:
    vocabulary, features, labels = pickle.load(file_handle)

total_number = len(labels)
middle_index = total_number // 2  # first 50% for training, the rest for testing

# Split off the training data and train the classifier
train_features = features[:middle_index, :]
train_labels = labels[:middle_index]

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(train_features, train_labels)

# Split off the test data
test_features = features[middle_index:, :]
test_labels = labels[middle_index:]

print("train accuracy: {:.5f}\ntest accuracy : {:.5f}".format(
    classifier.score(train_features, train_labels),
    classifier.score(test_features, test_labels)))
train accuracy: 0.97309
test accuracy : 0.96125
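Accuracy alone can hide poor performance on the minority (spam) class, so it may be worth also printing precision and recall for the test split (a minimal sketch using scikit-learn's classification_report; the predictions variable is introduced here only for illustration):

from sklearn.metrics import classification_report

predictions = classifier.predict(test_features)
print(classification_report(test_labels, predictions, target_names=['ham', 'spam']))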
# Find out which words contributed most to the classification decision
weights, pairs = classifier.coef_[0, :], []
for index, value in enumerate(weights):
    pairs.append((abs(value), vocabulary[index]))

# Sort the pairs by score in descending order
pairs.sort(key=lambda x: x[0], reverse=True)
for pair in pairs[:10]:
    print("score {:.4f} word: {}".format(pair[0], pair[1]))
score 4.3649 word: txt
score 4.0988 word: call
score 3.3730 word: free
score 2.6237 word: text
score 2.5961 word: to
score 2.4842 word: uk
score 2.4744 word: www
score 2.4241 word: stop
score 2.4017 word: claim
score 2.1648 word: 150p
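The loop above uses absolute values, so it does not show whether a word pushes a message toward spam or toward ham; keeping the sign of each coefficient makes the direction visible (a minimal sketch; with this label encoding, positive coefficients favor the spam class):

signed_pairs = sorted(zip(classifier.coef_[0, :], vocabulary), reverse=True)
print("most spam-like:", [word for value, word in signed_pairs[:5]])
print("most ham-like :", [word for value, word in signed_pairs[-5:]])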
Implement an LDA model to extract topics from the text messages
spam_header, no_spam_header = 'spam\t', 'ham\t'
documents = []
# This time we only extract the documents, without labels
with open('data/SMSSpamCollection') as file_handle:
    for line in file_handle:
        if line.startswith(spam_header):
            documents.append(line[len(spam_header):])
        elif line.startswith(no_spam_header):
            documents.append(line[len(no_spam_header):])

# Build the word features
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
warnings.simplefilter("ignore", category=PendingDeprecationWarning)
vectorizer = CountVectorizer(stop_words='english', max_features=2000)
term_counts = vectorizer.fit_transform(documents)
vocabulary = vectorizer.get_feature_names()

# LDA works on raw word counts, so the CountVectorizer output is used directly
from sklearn.decomposition import LatentDirichletAllocation
topic_model = LatentDirichletAllocation(n_components=10)
topic_model.fit(term_counts)  # train the topic model
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method='batch',
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=None,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)
# Print the learned topics one by one
topics = topic_model.components_
for topic_id, weights in enumerate(topics):
    print('topic %d' % topic_id, end=': ')
    pairs = [(abs(value), vocabulary[term_id]) for term_id, value in enumerate(weights)]
    pairs.sort(key=lambda x: x[0], reverse=True)
    for pair in pairs[:10]:
        print(pair[1], end=',')
    print()
topic 0: gt,lt,know,don,oh,let,money,like,make,sent,
topic 1: day,good,dear,did,like,time,today,pls,morning,ur,
topic 2: just,lol,tone,week,ur,like,txt,number,new,stop,
topic 3: free,send,txt,ur,www,mobile,reply,stop,text,phone,
topic 4: come,home,da,said,pick,wait,buy,happy,way,ask,
topic 5: got,wat,time,say,come,dun,haha,need,soon,yeah,
topic 6: ll,sorry,later,just,leave,talk,aight,text,ok,place,
topic 7: love,hi,just,babe,hope,good,night,ur,miss,ya,
topic 8: ok,just,work,feel,dont,im,want,getting,waiting,really,
topic 9: lor,going,ok,urgent,prize,sleep,meet,trying,yup,dont,
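The fitted model can also assign a topic distribution to an individual message through transform, which shows which topic dominates a given document (a minimal sketch; index 0 simply picks the first message as an example):

doc_topics = topic_model.transform(term_counts[0])   # shape (1, 10): one weight per topic
print(doc_topics.round(3))
print("dominant topic:", doc_topics.argmax())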
Note: newer NLTK versions deprecate the Stanford taggers used below in favor of
nltk.tag.corenlp.CoreNLPPOSTagger or nltk.tag.corenlp.CoreNLPNERTagger.

# The document to analyze
text = """One day in November 2016, the two authors of this book,
Seungyeon and Youngjoo, had a coffee at Red Rock cafe,
which is a very popular place in Mountain View."""
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import word_tokenize

STANFORD_POS_MODEL_PATH = 'data/stanford-postagger-2018-10-16/models/english-bidirectional-distsim.tagger'
STANFORD_POS_JAR_PATH = 'data/stanford-postagger-2018-10-16/stanford-postagger-3.9.2.jar'
pos_tagger = StanfordPOSTagger(STANFORD_POS_MODEL_PATH, STANFORD_POS_JAR_PATH)

tokens = word_tokenize(text)
print(tokens, '\n\n', pos_tagger.tag(tokens))

# Keep only the verbs and nouns
noun_and_verbs = [token[0] for token in pos_tagger.tag(tokens)
                  if token[1].startswith('V') or token[1].startswith('N')]
print("\nExtracted verbs and nouns: ", ', '.join(noun_and_verbs))
/home/markbaum/Python/python/lib/python3.6/site-packages/nltk/tag/stanford.py:149: DeprecationWarning: The StanfordTokenizer will be deprecated in version 3.2.5. Please use nltk.tag.corenlp.CoreNLPPOSTagger or nltk.tag.corenlp.CoreNLPNERTagger instead. super(StanfordPOSTagger, self).__init__(*args, **kwargs)
['One', 'day', 'in', 'November', '2016', ',', 'the', 'two', 'authors', 'of', 'this', 'book', ',', 'Seungyeon', 'and', 'Youngjoo', ',', 'had', 'a', 'coffee', 'at', 'Red', 'Rock', 'cafe', ',', 'which', 'is', 'a', 'very', 'popular', 'place', 'in', 'Mountain', 'View', '.']

[('One', 'CD'), ('day', 'NN'), ('in', 'IN'), ('November', 'NNP'), ('2016', 'CD'), (',', ','), ('the', 'DT'), ('two', 'CD'), ('authors', 'NNS'), ('of', 'IN'), ('this', 'DT'), ('book', 'NN'), (',', ','), ('Seungyeon', 'NNP'), ('and', 'CC'), ('Youngjoo', 'NNP'), (',', ','), ('had', 'VBD'), ('a', 'DT'), ('coffee', 'NN'), ('at', 'IN'), ('Red', 'NNP'), ('Rock', 'NNP'), ('cafe', 'NN'), (',', ','), ('which', 'WDT'), ('is', 'VBZ'), ('a', 'DT'), ('very', 'RB'), ('popular', 'JJ'), ('place', 'NN'), ('in', 'IN'), ('Mountain', 'NNP'), ('View', 'NNP'), ('.', '.')]

Extracted verbs and nouns: day, November, authors, book, Seungyeon, Youngjoo, had, coffee, Red, Rock, cafe, is, place, Mountain, View
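If running the Java-based Stanford tagger is inconvenient, NLTK's built-in averaged-perceptron tagger produces output in the same (token, tag) format, though the tags will not be identical; this is an alternative sketch, not the approach used above, and it assumes the averaged_perceptron_tagger resource has been downloaded:

import nltk

nltk.download('averaged_perceptron_tagger')   # one-time download of the tagger model
print(nltk.pos_tag(word_tokenize(text)))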
The same note applies here: newer NLTK versions use nltk.tag.corenlp.CoreNLPPOSTagger
or nltk.tag.corenlp.CoreNLPNERTagger instead of the tagger shown below.

# The document to analyze
text = """One day in November 2016, the two authors of this book,
Seungyeon and Youngjoo, had a coffee at Red Rock cafe,
which is a very popular place in Mountain View."""
STANFORD_NER_CLASSIFIER_PATH = 'data/stanford-ner-2018-10-16/classifiers/english.muc.7class.distsim.crf.ser.gz'
STANFORD_NER_JAR_PATH = 'data/stanford-ner-2018-10-16/stanford-ner-3.9.2.jar'

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

ner_tagger = StanfordNERTagger(STANFORD_NER_CLASSIFIER_PATH, STANFORD_NER_JAR_PATH)
tokens = word_tokenize(text)
print(ner_tagger.tag(tokens))

# Print only the words tagged as locations
all_locations = [token[0] for token in ner_tagger.tag(tokens)
                 if token[1] == 'LOCATION']
print(', '.join(all_locations))
/home/markbaum/Python/python/lib/python3.6/site-packages/nltk/tag/stanford.py:183: DeprecationWarning: The StanfordTokenizer will be deprecated in version 3.2.5. Please use nltk.tag.corenlp.CoreNLPPOSTagger or nltk.tag.corenlp.CoreNLPNERTagger instead. super(StanfordNERTagger, self).__init__(*args, **kwargs)
[('One', 'O'), ('day', 'O'), ('in', 'O'), ('November', 'DATE'), ('2016', 'DATE'), (',', 'O'), ('the', 'O'), ('two', 'O'), ('authors', 'O'), ('of', 'O'), ('this', 'O'), ('book', 'O'), (',', 'O'), ('Seungyeon', 'PERSON'), ('and', 'O'), ('Youngjoo', 'PERSON'), (',', 'O'), ('had', 'O'), ('a', 'O'), ('coffee', 'O'), ('at', 'O'), ('Red', 'ORGANIZATION'), ('Rock', 'ORGANIZATION'), ('cafe', 'O'), (',', 'O'), ('which', 'O'), ('is', 'O'), ('a', 'O'), ('very', 'O'), ('popular', 'O'), ('place', 'O'), ('in', 'O'), ('Mountain', 'O'), ('View', 'O'), ('.', 'O')]
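The tagger labels every token separately, so multi-word names such as 'Red Rock' come back as two tuples; consecutive tokens that share the same non-O tag can be merged into single entities (a minimal sketch over the output shown above; ner_tags is introduced only for illustration):

from itertools import groupby

ner_tags = ner_tagger.tag(tokens)
entities = [(' '.join(tok for tok, _ in group), tag)
            for tag, group in groupby(ner_tags, key=lambda x: x[1]) if tag != 'O']
print(entities)   # e.g. [('November 2016', 'DATE'), ('Seungyeon', 'PERSON'), ...]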
from konlpy.tag import Mecab
from gensim.models import Word2Vec
import sys, time, glob, unicodedata
# Model hyperparameters
WINDOW = 5
EMBEDDING_SIZE = 200
BATCH_SIZE = 10000
ITER = 10
# Read a preprocessed Korean Wikipedia file. (Mecab dictionary: https://bitbucket.org/eunjeon/mecab-ko-dic)
def read_text(fin):
    corpus_li = []
    mecab = Mecab(dicpath='/usr/local/lib/mecab/dic/mecab-ko-dic')
    for line in open(fin):
        # Normalize to NFKC with unicodedata.normalize to clean up
        # broken characters.
        line = unicodedata.normalize('NFKC', line)
        try:
            # Add lines whose first character is a digit to the corpus.
            _ = int(line[0])
            corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
        except ValueError:
            # Add lines whose first character is Hangul to the corpus.
            if ord('가') <= ord(line[0]) <= ord('힣'):
                corpus_li.append(' '.join(mecab.nouns(line)) + '\n')
            else:
                pass
    print('# of lines in corpus', len(corpus_li))
    return corpus_li
def train_word2vec(corpus_li, fout_model):
    # Train word2vec on the corpus produced by read_text.
    # Word2Vec expects each sentence as a list of tokens, so the space-joined
    # noun strings are split before being passed in.
    sentences = [doc.split() for doc in corpus_li]
    model = Word2Vec(sentences, sg=1, size=EMBEDDING_SIZE, window=WINDOW,
                     min_count=5, workers=3, batch_words=BATCH_SIZE, iter=ITER)
    model.init_sims(replace=True)  # free the memory used by the raw vectors
    model.save(fout_model)
    return model
# # Glob pattern for reading the preprocessed files at once
# input_pattern = 'file_location/korean_wiki/kowiki-latest-pages-articles.xml-88.txt'
# fin_li = glob.glob(input_pattern)
# for fin in fin_li:
#     corpus_li = read_text(fin)
#     # Train the model
#     model = train_word2vec(corpus_li, 'file_location/korean_wiki/test_model.txt')
# print(model.most_similar('프랑스', topn=20))
# print(model.most_similar(positive=['한국', '파리'], negative=['서울']))
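Once saved, the model can be reloaded later without repeating the preprocessing step (a minimal sketch, commented out like the block above because it assumes the trained model file from train_word2vec exists):

# model = Word2Vec.load('file_location/korean_wiki/test_model.txt')   # reload the saved model
# print(model.most_similar('프랑스', topn=5))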