#!/usr/bin/env python
# coding: utf-8
#
# # **N-gram 의 활용**
#
# ## **1 N-gram 생성하기**
# In[1]:
# Load the raw document text as-is.
# BUG FIX: the body of the `with` block was not indented, which is a
# syntax (IndentationError) in Python — `texts = f.read()` must sit
# inside the context manager so the file handle is still open.
with open('../data/kr-Report_2018.txt', 'r', encoding='utf-8') as f:
    texts = f.read()
# Preview the first 300 characters (notebook-style cell output).
texts[:300]
# In[2]:
# Keep only the Korean noun tokens from the document.
# A small typo-normalization map is passed through `skip=` so that
# known misspellings are folded into their canonical forms.
from txtutil import txtnoun

typo_fixes = {'갤러시': '갤럭시', '가치창출의': '가치창출'}
texts = txtnoun('../data/kr-Report_2018.txt', skip=typo_fixes)
# Preview the first 300 characters of the cleaned text.
texts[:300]
# In[3]:
# Split the noun-only text into individual word tokens.
from nltk.tokenize import word_tokenize

tokens = word_tokenize(texts)
# Peek at the first few tokens (notebook-style cell output).
tokens[:7]
# In[4]:
# Build 3-gram tuples from the token stream.
from nltk.util import ngrams

# IDIOM FIX: a pass-through comprehension ([x for x in it]) is a
# redundant copy of the iterable (ruff PERF402) — list() materializes
# the n-gram generator directly and more clearly.
texts_sample = list(ngrams(tokens, 3))
print(len(texts_sample))
# Show the first five trigrams (notebook-style cell output).
texts_sample[:5]
#
# ## **2 Bi-Gram 을 대상으로 한 PMI**
# **Pointwise Mutual Information** 를 측정하여 최상위 우도 10개 값을 추출
# In[5]:
# Collect bigram collocation statistics over the token stream.
from nltk import collocations

bigram_finder = collocations.BigramCollocationFinder.from_words(tokens)
bigram_finder
# In[6]:
# Score the bigrams by pointwise mutual information (PMI) and
# report the ten highest-ranked pairs.
bigram_measures = collocations.BigramAssocMeasures()
bigram_finder.nbest(bigram_measures.pmi, 10)
#
# ## **3 Tri-Gram 을 대상으로 한 PMI**
# In[7]:
# Collect trigram collocation statistics over the same token stream.
trigram_finder = collocations.TrigramCollocationFinder.from_words(tokens)
trigram_finder
# In[8]:
# Score the trigrams by PMI and report the top ten.
trigram_measures = collocations.TrigramAssocMeasures()
trigram_finder.nbest(trigram_measures.pmi, 10)