#!/usr/bin/env python
# coding: utf-8
#
# # **N-gram 의 활용**
#
# ## **1 N-gram 생성하기**
# In[1]:
# Load the raw document text as-is.
# BUG FIX: the body of the `with` block was not indented, which is a
# syntax (IndentationError) in Python — `texts = f.read()` must sit
# inside the context manager so the file handle is still open.
with open('../data/kr-Report_2018.txt', 'r', encoding='utf-8') as f:
    texts = f.read()
# Preview the first 300 characters (notebook-style cell output).
texts[:300]
# In[2]:
# Keep only the Korean noun tokens from the document.
# A small typo-normalization map is passed through `skip=` so that
# known misspellings are folded into their canonical forms.
from txtutil import txtnoun

typo_fixes = {'갤러시': '갤럭시', '가치창출의': '가치창출'}
texts = txtnoun('../data/kr-Report_2018.txt', skip=typo_fixes)
# Preview the first 300 characters of the cleaned text.
texts[:300]
# In[3]:
# Split the noun-only text into individual word tokens.
from nltk.tokenize import word_tokenize

tokens = word_tokenize(texts)
# Peek at the first few tokens (notebook-style cell output).
tokens[:7]
# In[4]:
# Build 3-gram tuples from the token stream.
from nltk.util import ngrams

# IDIOM FIX: a pass-through comprehension ([x for x in it]) is a
# redundant copy of the iterable (ruff PERF402) — list() materializes
# the n-gram generator directly and more clearly.
texts_sample = list(ngrams(tokens, 3))
print(len(texts_sample))
# Show the first five trigrams (notebook-style cell output).
texts_sample[:5]
#
# ## **2 Bi-Gram 을 대상으로 한 PMI**
# **Pointwise Mutual Information** 를 측정하여 최상위 우도 10개 값을 추출
# In[5]:
# Collect bigram collocation statistics over the token stream.
from nltk import collocations

bigram_finder = collocations.BigramCollocationFinder.from_words(tokens)
bigram_finder
# In[6]:
# Score the bigrams by pointwise mutual information (PMI) and
# report the ten highest-ranked pairs.
bigram_measures = collocations.BigramAssocMeasures()
bigram_finder.nbest(bigram_measures.pmi, 10)
#
# ## **3 Tri-Gram 을 대상으로 한 PMI**
# In[7]:
# Collect trigram collocation statistics over the same token stream.
trigram_finder = collocations.TrigramCollocationFinder.from_words(tokens)
trigram_finder
# In[8]:
# Score the trigrams by PMI and report the top ten.
trigram_measures = collocations.TrigramAssocMeasures()
trigram_finder.nbest(trigram_measures.pmi, 10)