# Load the document data: 2017 year-end summary report
with open('./data/News2017.txt', 'r', encoding='utf-8') as f:
texts = f.read()
texts = texts.lower()
texts[:300]
# Extract English tokens only
from nltk.tokenize import RegexpTokenizer
re_capt = RegexpTokenizer(r'[a-z]\w+')  # tokens starting with a letter; the text is already lowercased
tokens = re_capt.tokenize(texts)
document = " ".join(tokens)
document[:300]
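# A quick check of what the pattern keeps (the sample sentence below is made
# up for illustration): a token must start with a lowercase letter and be at
# least two characters, so numbers, punctuation, and one-letter words drop out.
re_capt.tokenize("the 2017 report, a great year!")
# -> ['the', 'report', 'great', 'year']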
# Compute the frequency of the extracted tokens
from nltk import FreqDist
import pandas as pd
token_freq = FreqDist(tokens)
token_freq = pd.Series(token_freq).sort_values(ascending=False)
token_freq[:10]
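# Optional visualization (assumes matplotlib is installed): a bar chart of the
# 20 most frequent tokens makes the skewed frequency distribution easy to see.
import matplotlib.pyplot as plt

token_freq[:20].plot(kind='bar', figsize=(10, 4), title='Top 20 tokens by frequency')
plt.tight_layout()
plt.show()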
# ! pip3 install scikit-learn
# ! pip3 install scipy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(stop_words='english')  # 'english' is the built-in stop-word list
transformed = tfidf_vec.fit_transform(raw_documents=[document])
transformed = transformed.toarray()  # dense ndarray for easy indexing
transformed
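# Worth noting: with a single-document corpus and scikit-learn's defaults
# (smooth_idf=True, norm='l2'), idf(t) = ln((1+1)/(1+1)) + 1 = 1 for every
# term, so the TF-IDF row is just the L2-normalized term counts. A minimal
# sanity check under that assumption:
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer(stop_words='english')
counts = count_vec.fit_transform([document]).toarray().astype(float)
counts /= np.linalg.norm(counts)   # L2-normalize the raw counts
np.allclose(counts, transformed)   # True: identical to the TF-IDF row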
# Map each column index back to its vocabulary term
index_value = {idx: term for term, idx in tfidf_vec.vocabulary_.items()}
# Pair every term with its TF-IDF weight (one row, since the corpus is a single document)
fully_indexed = {index_value[column]: value for row in transformed
                 for (column, value) in enumerate(row)}
token_tfidf = pd.Series(fully_indexed).sort_values(ascending=False)
token_tfidf[:10]
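# A shorter route to the same Series (requires scikit-learn >= 1.0; the
# variable names below are illustrative): get_feature_names_out() returns the
# terms in column order, so the manual index mapping above is not needed.
terms = tfidf_vec.get_feature_names_out()
token_tfidf_alt = pd.Series(transformed[0], index=terms).sort_values(ascending=False)
token_tfidf_alt[:10]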