#!/usr/bin/env python
# coding: utf-8
#
# # **Mini Project One**
# Stop Words
#
# ## **1 text 문서에서 token 추출하기**
# Document 에서 한글 추출하기
# In[ ]:
# Step 1 - load the document that was converted from the source PDF
filename = '../data/kr-Report_2018.txt'
with open(filename, 'r', encoding='utf-8') as file_obj:
    texts = file_obj.read()
# Peek at the first 300 characters (notebook cell output)
texts[:300]
# In[ ]:
# Re-extract the document keeping only nouns/adjectives/verbs (stemmed),
# using the project's morphological helper.
from txtutil import txtnoun
pos_tags = ["Noun", "Adjective", "Verb"]
texts = txtnoun(filename, tags=pos_tags, stem=True)
# Peek at the first 300 characters (notebook cell output)
texts[:300]
# In[ ]:
# Split the document string into a token list
from nltk.tokenize import word_tokenize
texts = list(word_tokenize(texts))
# Inspect the first few tokens (notebook cell output)
texts[:8]
#
# ## **2 StopWord 데이터 만들기**
# **stopword_list** : 2015, 2016, 2017, 2018년 모두 존재하는 단어목록
# In[ ]:
# Collect every yearly report file matching the 201x naming pattern
from glob import glob
pattern = '../data/kr-Report_201?.txt'
filelist = glob(pattern)
# Show the matched paths (notebook cell output)
filelist
# In[ ]:
# Build the stop-word list: tokens that occur in EVERY yearly report.
# The running list is intersected with each file's token set in turn.
stopword_list = []
for file in filelist:
    # set_tokens=True asks the helper for the file's unique tokens
    token_list = txtnoun(file, tags=["Noun", "Adjective", "Verb"], set_tokens=True)
    if not stopword_list:
        # First file seeds the candidate list
        stopword_list = token_list
    else:
        # Set membership is O(1); testing against the list was O(n) per
        # token, i.e. O(n^2) per file. Order of token_list is preserved.
        seen = set(stopword_list)
        stopword_list = [token for token in token_list
                         if token in seen]
    print("{}로 필터링 된 StopWord 갯수 : {}".format(file, len(stopword_list)))
#
# ## **3 추출한 StopWord 로 Token 필터링**
# stopword 를 사용하여 필터링
# In[ ]:
# Drop every token that appears in all yearly reports (treated as stop words).
# Converting the stop-word list to a set once makes each membership test O(1)
# instead of scanning the list per token (O(n^2) overall).
stopwords = set(stopword_list)
texts = [text for text in texts
         if text not in stopwords]

# Rank the remaining tokens by frequency with pandas.
# collections.Counter is the stdlib equivalent of nltk's FreqDist
# (FreqDist subclasses Counter), so the counts are identical.
from collections import Counter
import pandas as pd
freqtxt = pd.Series(Counter(texts)).sort_values(ascending=False)
# Show the 25 most frequent tokens (notebook cell output)
freqtxt[:25]
#
# ## **3 Konlpy 의 단점들**
# 오타/ 비정형 텍스트의 처리
# In[ ]:
# Morphological analysis demo with KoNLPy's Twitter tagger.
# NOTE(review): Twitter was renamed to Okt in konlpy >= 0.5 — Twitter still
# works but emits a deprecation warning; consider migrating. TODO confirm
# the installed konlpy version.
from konlpy.tag import Twitter
twitter = Twitter()
# A well-formed compound noun — tagged as expected
twitter.pos('가치창출', stem=True)
# In[ ]:
# A correctly spelled loanword ("Galaxy")
twitter.pos('갤럭시', stem=True)
# In[ ]:
# Deliberate misspelling of the word above — shows how the tagger
# degrades on typos / non-standard text (the section's point)
twitter.pos('갤러시', stem=True)
#
# ## **4 WordCloud 출력**
# visualization
# In[ ]:
# Render a word cloud from the filtered token list
from wordcloud import WordCloud
joined_text = " ".join(texts)
wcloud = WordCloud(
    '../data/D2Coding.ttf',
    relative_scaling=0.2,
    background_color='white',
).generate(joined_text)
# Notebook cell output: the WordCloud object
wcloud
# In[ ]:
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
# Display the rendered word cloud; hide the axis ticks/frame
fig, ax = plt.subplots(figsize=(12, 12))
ax.imshow(wcloud, interpolation='bilinear')
ax.axis("off")