#!/usr/bin/env python
# coding: utf-8
#
# # **tf-idf**
#
# ## **1 Document 자료를 불러오기**
# sklearn을 활용한 tf-idf 계산
# [**연간 기업결과 리포트**](https://news.samsung.com/global/samsung-electronics-announces-fourth-quarter-and-fy-2017-results)
# In[1]:
# Load the source document: the 2017 annual results report, normalized
# to lowercase for case-insensitive token counting.
with open('../data/News2017.txt', 'r', encoding='utf-8') as fp:
    texts = fp.read().lower()
texts[:300]  # notebook cell output: preview the first 300 characters
# In[2]:
from nltk.tokenize import RegexpTokenizer

# Keep only tokens that start with a lowercase ASCII letter followed by
# at least one more word character; digits-first tokens and punctuation
# are discarded.  NOTE(review): the 2-char minimum also drops one-letter
# words such as "a" — confirm that is intended.
re_capt = RegexpTokenizer(r'[a-z]\w+')
tokens = re_capt.tokenize(texts)
document = " ".join(tokens)  # re-join into one cleaned document string
document[:300]               # notebook cell output: preview
# In[3]:
# Count the frequency of each extracted token.
# collections.Counter (stdlib) produces the same token->count mapping as
# nltk.FreqDist (which subclasses Counter), without the nltk dependency.
from collections import Counter
import pandas as pd

token_freq = pd.Series(Counter(tokens)).sort_values(ascending=False)
token_freq[:10]  # notebook cell output: the 10 most frequent tokens
#
# ## **2 sklearn 을 활용한 tf idf 계산**
# sklearn의 기본 데이터를 활용하여 tf-idf 결과값 출력
# In[4]:
# ! pip3 install scikit-learn
# In[5]:
# ! pip3 install scipy
# In[6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit tf-idf over the corpus with English stop words removed.
# NOTE(review): the corpus here is a single document, so the idf factor
# is constant and the ranking reduces to plain term frequency — confirm
# this is the tutorial's intent.
tfidf_vec = TfidfVectorizer(stop_words='english')
transformed = tfidf_vec.fit_transform(raw_documents=[document])
# .toarray() returns a plain 2-D ndarray directly; the previous
# np.array(transformed.todense()) round-tripped through the deprecated
# np.matrix type.
transformed = transformed.toarray()
transformed  # notebook cell output: dense (n_docs, n_terms) weight matrix
# In[7]:
# Invert the fitted vocabulary: column index -> term.
index_value = {idx: term for term, idx in tfidf_vec.vocabulary_.items()}

# Map every term to its tf-idf weight (one pass over each document row;
# with multiple rows, later rows overwrite earlier ones, as before).
fully_indexed = {}
for row in transformed:
    for column, value in enumerate(row):
        fully_indexed[index_value[column]] = value

# Rank terms by tf-idf weight, highest first.
token_tfidf = pd.Series(fully_indexed).sort_values(ascending=False)
token_tfidf[:10]  # notebook cell output: top-10 weighted terms