# The first task is to build the Docs list (the corpus token list)
from txtutil import tf_idf
# Sanity check: score '갤럭시' against a toy document and corpus
# (the document is tokenized here, matching how tf_idf is called below)
tf_idf('갤럭시', '갤럭시 노트 신제품 출시'.split(),
       ['갤럭시', '갤럭시', '노트', '신제품', '출시', '출시'])
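# txtutil is the author's helper module and is not shown here. As a rough
# sketch only, a tf_idf(term, doc_tokens, corpus_tokens) helper of this
# shape might compute the term frequency in the document times an inverse
# frequency estimated from the pooled corpus token list:
import math

def tf_idf_sketch(term, doc_tokens, corpus_tokens):
    tf = doc_tokens.count(term) / len(doc_tokens)   # term frequency in the doc
    df = max(corpus_tokens.count(term), 1)          # occurrences in the corpus
    idf = math.log(len(corpus_tokens) / df)         # rarer in corpus -> higher
    return tf * idf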
import re
from glob import glob
filelist = glob('./data/kr-Report_201?.txt')
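# txtnoun below also comes from txtutil. A minimal, hypothetical sketch of
# what it might do, assuming KoNLPy's Okt tagger: read the file, apply the
# skip replacement dict, keep only nouns, and return them space-joined so
# word_tokenize() can re-split them.
from konlpy.tag import Okt

def txtnoun_sketch(filename, skip=None, encoding='utf-8'):
    skip = skip or {}
    with open(filename, 'r', encoding=encoding) as f:
        text = f.read()
    for wrong, right in skip.items():
        text = text.replace(wrong, right)   # normalize typos / compound nouns
    return ' '.join(Okt().nouns(text))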
# Filter the tokens using a stopword list
# stopwords.txt : words that appear in all of the 2015-2018 reports
# f = open('./data/stopwords.txt', 'r', encoding='utf-8')
# stopwords = f.read(); f.close()
# stopwords = stopwords.split(' ')
# stopwords[:10]
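# stopwords.txt itself could be rebuilt as the set intersection of the
# yearly vocabularies (a hypothetical sketch; txtnoun, word_tokenize, and
# skips are defined just below):
# year_vocab = [set(word_tokenize(txtnoun(f, skip=skips))) for f in filelist]
# stopwords = sorted(set.intersection(*year_vocab))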
%%time
docs_tokens = []
# Replacement map for txtnoun: fix the typo '갤러시' -> '갤럭시' and keep
# the compound '가치창출' intact as a single token
skips = {'갤러시': '갤럭시', '가치창출': '가치창출'}
from txtutil import txtnoun
from nltk.tokenize import word_tokenize
for file in filelist:
    texts = txtnoun(file, skip=skips)
    tokens = word_tokenize(texts)
    # keep only tokens longer than two characters
    tokens = [token for token in tokens
              if len(token) > 2]
              # if (len(token) > 2) and (token not in stopwords)]
    docs_tokens += tokens
from nltk import FreqDist
import pandas as pd
pd.Series(FreqDist(docs_tokens)).sort_values(ascending=False)[:5]
# Target document for the analysis: the 2018 report
texts = txtnoun('./data/kr-Report_2018.txt', skip=skips)
tokens = word_tokenize(texts)
tokens = [token for token in tokens
          if len(token) > 2]
          # if (len(token) > 2) and (token not in stopwords)]
tokens[:7]
%%time
from txtutil import tf_idf
token_set = list(set(tokens))
result_dict = {}
for txt in token_set:
    result_dict[txt] = tf_idf(txt, tokens, docs_tokens)
print('Calculation is done.')
# TF-IDF scores for the 2018 report
# Display the computed TF-IDF results with Pandas
import pandas as pd
tfidf = pd.Series(result_dict)
tfidf.sort_values(ascending=False)[:20]
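# The scored Series can be persisted for later steps, e.g. (the output
# path here is an assumption):
# tfidf.sort_values(ascending=False).to_csv('./data/tfidf_2018.csv')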