# Install the modules required for the analysis
# (on PyPI the package is named scikit-learn, not sklearn)
# !pip3 install scikit-learn nltk

import nltk
nltk.download('punkt')
nltk.download('stopwords')

news_texts = "https://raw.githubusercontent.com/YongBeomKim/nltk_basic/master/data/News.txt"

# List the languages available in the NLTK stopwords corpus
from nltk.corpus import stopwords
stopwords.ensure_loaded()
print(stopwords.fileids())

stop_eng = stopwords.words("english")
print(stop_eng[:8])

# Preprocess the English sample sentence to lowercase
texts = 'I like such a Wonderful Snow Ice Cream'
texts = texts.lower()
print(texts)

# Split the sentence into word tokens
from nltk import word_tokenize
tokens = word_tokenize(texts)
print(tokens)

# Drop English stopwords from the token list
tokens = [word for word in tokens if word not in stop_eng]
print(tokens)

# news_texts is a URL, so fetch it with requests rather than open():
# with open(news_texts, 'r') as f:
#     texts = f.read()
#     texts = texts.lower()
import requests
texts = requests.get(news_texts).text.lower()
print(texts[:300])

# Keep only English tokens: words starting with a lowercase letter
# (this drops numbers and punctuation)
import re
tokenizer = re.compile(r'[a-z]\w+')
tokens = tokenizer.findall(texts)
document = " ".join(tokens)
print(document[:300])

# Compute TF-IDF weights for the document
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(stop_words='english')
transformed = tfidf_vec.fit_transform(raw_documents=[document])
transformed = transformed.toarray()   # sparse matrix -> dense ndarray
print(transformed)

# Invert vocabulary_ ({term: column}) to {column: term}, then build
# a {term: tf-idf weight} dictionary for the document
index_value = {column: term for term, column in tfidf_vec.vocabulary_.items()}
fully_indexed = {index_value[column]: value
                 for row in transformed
                 for column, value in enumerate(row)}
print(fully_indexed)
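
# As a cross-check, the same {term: weight} mapping can be built from the
# vectorizer's own feature-name API instead of inverting vocabulary_ by hand.
# A minimal sketch, assuming scikit-learn >= 1.0 (where get_feature_names_out
# is available); both constructions should agree.
terms = tfidf_vec.get_feature_names_out()
weights = transformed[0]            # a single document, so take the first row
by_feature = dict(zip(terms, weights))
print(by_feature == fully_indexed)  # expected: True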
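
# With only one document, the smoothed IDF factor is the same for every term,
# so the weights effectively rank words by normalized frequency. A minimal
# sketch of listing the highest-weighted terms; top_n is an illustrative
# parameter, not part of the original notebook.
top_n = 10
top_terms = sorted(fully_indexed.items(), key=lambda kv: kv[1], reverse=True)
for term, weight in top_terms[:top_n]:
    print(f"{term:<15s} {weight:.4f}")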