# Step 1 - load the document that was converted from the PDF report.
filename = '../data/kr-Report_2018.txt'
with open(filename, 'r', encoding='utf-8') as file_obj:
    texts = file_obj.read()
# Notebook preview of the raw text.
texts[:300]

# Re-extract the document keeping only nouns/adjectives/verbs, stemmed.
# (The raw read above is intentionally discarded — it was only a preview.)
from txtutil import txtnoun
texts = txtnoun(filename, tags=["Noun", "Adjective", "Verb"], stem=True)
texts[:300]

# Convert the processed document string into a list of tokens.
from nltk.tokenize import word_tokenize
texts = word_tokenize(texts)
texts[:8]
# Collect every yearly report (kr-Report_201X.txt) so their token sets
# can be intersected.
from glob import glob
filelist = glob('../data/kr-Report_201?.txt')
filelist

# Build the stopword list as the intersection of tokens across all files:
# a token survives only if it appears in every yearly report.
stopword_list = []
for file in filelist:
    token_list = txtnoun(file, tags=["Noun", "Adjective", "Verb"], set_tokens=True)
    if not stopword_list:
        # First file seeds the candidate list.
        stopword_list = token_list
    else:
        # Convert the running result to a set once per file: O(1) membership
        # instead of an O(n) list scan per token. token_list order is kept.
        previous = set(stopword_list)
        stopword_list = [token for token in token_list
                         if token in previous]
    print("{}로 필터링 된 StopWord 갯수 : {}".format(file, len(stopword_list)))
# Filter the tokens using the stopword list.
# Hoist the list into a set once: O(1) membership per token instead of
# an O(n) scan of stopword_list for every element of texts.
stopwords = set(stopword_list)
texts = [text for text in texts
         if text not in stopwords]

# Use pandas to show the highest-frequency tokens.
import pandas as pd
from nltk import FreqDist
freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False)
freqtxt[:25]
# Sanity-check how the KoNLPy Twitter morphological analyzer tags single
# words with stemming enabled (bare expressions display output in the notebook).
# NOTE(review): konlpy.tag.Twitter was renamed to Okt (konlpy >= 0.4.5);
# Twitter still works but is deprecated — consider migrating to Okt.
from konlpy.tag import Twitter
twitter = Twitter()
twitter.pos('가치창출', stem=True)  # "value creation" compound
twitter.pos('갤럭시', stem=True)  # "Galaxy" (product name)
twitter.pos('갤러시', stem=True)  # presumably a misspelling of the above — checks analyzer behavior on OOV input
# Render the filtered tokens as a word cloud.
from wordcloud import WordCloud
# D2Coding font is required so Korean glyphs render instead of boxes.
wcloud = WordCloud('../data/D2Coding.ttf',
                   relative_scaling = 0.2,
                   background_color = 'white').generate(" ".join(texts))
wcloud
# IPython magic — this file is a notebook export and is not runnable as a
# plain .py script while this line is present.
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(12,12))
plt.imshow(wcloud, interpolation='bilinear')
plt.axis("off")