Stop Words

1 불용어 처리

Stop Words

In [ ]:

# Lowercase the English sample sentence as a preprocessing step.
texts = 'I like such a Wonderful Snow Ice Cream'.lower()
texts

In [ ]:

# Split the lowercased text into word-level tokens.
from nltk import word_tokenize
tokens = word_tokenize(texts, language="english")
tokens

In [ ]:

# One-time setup: uncomment to download the NLTK stopwords corpus.
# import nltk
# nltk.download('stopwords')

In [ ]:

# List the languages for which the stopwords corpus provides a word list.
# Uses the public fileids() accessor instead of an uncalled
# `stopwords.ensure_loaded` attribute access followed by a lookup of the
# private `_fileids` entry in the object's __dict__.
from nltk.corpus import stopwords
stopwords.fileids()

In [ ]:

# Preview the stop-word list by displaying every 18th entry.
# NOTE(review): "=Quiz!=" looks like an exercise placeholder; presumably it
# must be replaced with a language id available in the stopwords corpus
# before this cell can run -- confirm.
from nltk.corpus import stopwords
every_nth = 18
stopwords.words("=Quiz!=")[::every_nth]

In [ ]:

# Remove stop words from the token list.
# The stop-word list is fetched once and stored as a set. The previous
# version called stopwords.words() inside the comprehension condition, so
# the list was rebuilt and scanned linearly for every single token.
# NOTE(review): "=Quiz!=" looks like an exercise placeholder; presumably it
# must be replaced with a language id available in the stopwords corpus
# before this cell can run -- confirm.
stop_word_set = set(stopwords.words("=Quiz!="))
tokens = [word for word in tokens if word not in stop_word_set]
print(tokens)

2 한글의 불용어 처리

인터넷에 공개되어 있는 한국어 불용어 100개 자료를 사용합니다. (txt 파일에는 idf 값까지 포함되어 있습니다.)

In [ ]:

# Load the Korean stop-word text file.
# The context manager guarantees the file handle is closed even if
# reading raises; `f` and `s` remain defined afterwards as before.
with open('./data/stopword_kr.txt', 'r', encoding='utf-8') as f:
    s = f.read()

# Each line is tab-separated; keep at most the first three fields per line.
# Blank lines (e.g. the empty string left after a trailing newline) are
# skipped so they do not turn into bogus [''] entries.
stop_words = [line.split('\t')[:3] for line in s.split('\n') if line.strip()]
stop_words[:10]