#!/usr/bin/env python # coding: utf-8 #

# # **tf-idf**
#
# ## **1 Load the document data**
# Compute tf-idf using sklearn.
# [**Annual earnings report**](https://news.samsung.com/global/samsung-electronics-announces-fourth-quarter-and-fy-2017-results)

# In[1]:
# Load the document: the 2017 annual results report.
# Lowercasing up front so the tokenizer pattern below ([a-z]...) matches everything.
with open('../data/News2017.txt', 'r', encoding='utf-8') as f:
    texts = f.read()
texts = texts.lower()
texts[:300]  # notebook display: preview the raw text

# In[2]:
# Extract English-only tokens.
from nltk.tokenize import RegexpTokenizer

# NOTE(review): the pattern requires a leading lowercase letter and at least
# two characters, so single-letter words ("a", "i") and tokens starting with
# a digit are dropped — confirm this is intentional.
re_capt = RegexpTokenizer(r'[a-z]\w+')
tokens = re_capt.tokenize(texts)
document = " ".join(tokens)
document[:300]  # notebook display: preview the cleaned document

# In[3]:
# Count the frequency of the extracted tokens.
from nltk import FreqDist
import pandas as pd

token_freq = FreqDist(tokens)
token_freq = pd.Series(token_freq).sort_values(ascending=False)
token_freq[:10]  # notebook display: top-10 most frequent tokens

# ## **2 Compute tf-idf using sklearn**
# Output tf-idf scores for the document with sklearn's TfidfVectorizer.

# In[4]:
# ! pip3 install sklearn

# In[5]:
# ! pip3 install scipy

# In[6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): only a single document is passed, so idf is constant across
# terms and the scores reduce to (normalized) term frequency.
tfidf_vec = TfidfVectorizer(stop_words='english')
transformed = tfidf_vec.fit_transform(raw_documents=[document])
# .toarray() returns an ndarray directly; .todense() yields the deprecated
# np.matrix, which the original then had to re-wrap with np.array().
transformed = transformed.toarray()
transformed  # notebook display: dense tf-idf matrix (1 row x vocab columns)

# In[7]:
# Invert the vocabulary mapping: column index -> term.
index_value = {idx: term for term, idx in tfidf_vec.vocabulary_.items()}
# Map every column's score back to its term. With one document there is a
# single row; with multiple rows later rows would overwrite earlier ones.
fully_indexed = {
    index_value[column]: value
    for row in transformed
    for column, value in enumerate(row)
}
token_tfidf = pd.Series(fully_indexed).sort_values(ascending=False)
token_tfidf[:10]  # notebook display: top-10 terms by tf-idf score