# Step 1 - load the document that was converted from the PDF report.
filename = '../data/kr-Report_2018.txt'
with open(filename, 'r', encoding='utf-8') as file_obj:
    texts = file_obj.read()
# Notebook preview of the raw text.
texts[:300]

# Re-extract the document keeping only nouns/adjectives/verbs, stemmed.
# (The raw read above is intentionally discarded — it was only a preview.)
from txtutil import txtnoun
texts = txtnoun(filename, tags=["Noun", "Adjective", "Verb"], stem=True)
texts[:300]

# Convert the processed document string into a list of tokens.
from nltk.tokenize import word_tokenize
texts = word_tokenize(texts)
texts[:8]
# Collect every yearly report (kr-Report_201X.txt) so their token sets
# can be intersected.
from glob import glob
filelist = glob('../data/kr-Report_201?.txt')
filelist

# Build the stopword list as the intersection of tokens across all files:
# a token survives only if it appears in every yearly report.
stopword_list = []
for file in filelist:
    token_list = txtnoun(file, tags=["Noun", "Adjective", "Verb"], set_tokens=True)
    if not stopword_list:
        # First file seeds the candidate list.
        stopword_list = token_list
    else:
        # Convert the running result to a set once per file: O(1) membership
        # instead of an O(n) list scan per token. token_list order is kept.
        previous = set(stopword_list)
        stopword_list = [token for token in token_list
                         if token in previous]
    print("{}로 필터링 된 StopWord 갯수 : {}".format(file, len(stopword_list)))
# Filter the tokens using the stopword list.
# Hoist the list into a set once: O(1) membership per token instead of
# an O(n) scan of stopword_list for every element of texts.
stopwords = set(stopword_list)
texts = [text for text in texts
         if text not in stopwords]

# Use pandas to show the highest-frequency tokens.
import pandas as pd
from nltk import FreqDist
freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False)
freqtxt[:25]
# Sanity-check how the KoNLPy Twitter morphological analyzer tags single
# words with stemming enabled (bare expressions display output in the notebook).
# NOTE(review): konlpy.tag.Twitter was renamed to Okt (konlpy >= 0.4.5);
# Twitter still works but is deprecated — consider migrating to Okt.
from konlpy.tag import Twitter
twitter = Twitter()
twitter.pos('가치창출', stem=True)  # "value creation" compound
twitter.pos('갤럭시', stem=True)  # "Galaxy" (product name)
twitter.pos('갤러시', stem=True)  # presumably a misspelling of the above — checks analyzer behavior on OOV input
# Render the filtered tokens as a word cloud.
from wordcloud import WordCloud
# D2Coding font is required so Korean glyphs render instead of boxes.
wcloud = WordCloud('../data/D2Coding.ttf',
                   relative_scaling = 0.2,
                   background_color = 'white').generate(" ".join(texts))
wcloud
# IPython magic — this file is a notebook export and is not runnable as a
# plain .py script while this line is present.
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,12))
plt.imshow(wcloud, interpolation='bilinear')
plt.axis("off")