#!/usr/bin/env python
# coding: utf-8

# # **Mini Project One**
# Stop Words

# ## **1 text 문서에서 token 추출하기** # Document 에서 한글 추출하기 # In[ ]: # Step 1 - pdf 에서 변환한 Document 불러오기 filename = '../data/kr-Report_2018.txt' with open(filename, 'r', encoding='utf-8') as f: texts = f.read() texts[:300] # In[ ]: from txtutil import txtnoun texts = txtnoun(filename, tags=["Noun", "Adjective", "Verb"], stem=True) texts[:300] # In[ ]: # Document 문서를 Token List 객체로 변환하기 from nltk.tokenize import word_tokenize texts = word_tokenize(texts) texts[:8] #

# ## 2. Building the stop-word data
# **stopword_list**: words present in ALL of the 2015, 2016, 2017 and 2018 reports.

from glob import glob
filelist = glob('../data/kr-Report_201?.txt')
filelist  # notebook cell: show the matched files

stopword_list = []
for file in filelist:
    # set_tokens=True: txtnoun returns the document's token collection
    # (presumably already de-duplicated — TODO confirm in txtutil).
    token_list = txtnoun(file, tags=["Noun", "Adjective", "Verb"], set_tokens=True)
    if len(stopword_list) == 0:
        # First file seeds the candidate list.
        stopword_list = token_list
    else:
        # Keep only words seen in every file so far. Set intersection is
        # O(min(n, m)) instead of the original O(n*m) per-token list scan;
        # downstream code only uses membership and len(), so order is irrelevant.
        stopword_list = set(token_list) & set(stopword_list)
    print("{}로 필터링 된 StopWord 갯수 : {}".format(file, len(stopword_list)))

# ## 3. Filtering tokens with the extracted stop words

# Hoist the stop words into a set once: O(1) membership per token instead of
# an O(n) list scan for every token in the document.
stopword_set = set(stopword_list)
texts = [text for text in texts if text not in stopword_set]

# Use pandas to display the highest-frequency tokens.
import pandas as pd
from nltk import FreqDist
freqtxt = pd.Series(dict(FreqDist(texts))).sort_values(ascending=False)
freqtxt[:25]  # notebook cell: top-25 tokens by frequency

# ## 4. Weaknesses of KoNLPy
# handling of typos / unstructured text

# NOTE(review): `Twitter` was renamed `Okt` in konlpy >= 0.5; the old name
# still works but emits a deprecation warning — kept here for compatibility.
from konlpy.tag import Twitter
twitter = Twitter()
twitter.pos('가치창출', stem=True)

twitter.pos('갤럭시', stem=True)

# Deliberate typo of the word above — shows how the tagger mishandles it.
twitter.pos('갤러시', stem=True)

# ## 5. WordCloud output
# visualization

# Render the filtered tokens as a word cloud; the .ttf font is required for
# Korean glyphs to display correctly.
from wordcloud import WordCloud
wcloud = WordCloud('../data/D2Coding.ttf',
                   relative_scaling=0.2,
                   background_color='white').generate(" ".join(texts))
wcloud  # notebook cell: show the WordCloud object repr

# Display the image inline with matplotlib (IPython magic).
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 12))
plt.imshow(wcloud, interpolation='bilinear')
plt.axis("off")