#!/usr/bin/env python
# coding: utf-8

# # **Append**
# 통계적 문법접근방법 빠르고 간단하게 살펴보기

# ## **1 Parsing Tree**
# 문법태그를 활용한 문법구조 생성하기

# In[1]:


# %%time
# text = '여배우 박민영은 높은 싱크로를 보여줬다'
# from konlpy.tag import Twitter
# twitter = Twitter()
# words = twitter.pos(text, stem=True)
# print(words)


# In[2]:


# from nltk import RegexpParser
# grammar = """
# NP: {<N.*>*<Suffix>?}   # 명사구를 정의한다
# VP: {<V.*>*}            # 동사구를 정의한다
# AP: {<A.*>*}            # 형용사구를 정의한다 """
# parser = RegexpParser(grammar)
# parser


# In[3]:


# chunks = parser.parse(words)
# chunks


# In[4]:


# text_tree = [list(txt) for txt in chunks.subtrees()]
# from pprint import pprint
# pprint(text_tree[1:])

# ## **2. A Taste of CFG Analysis**
# Work with an example grammar whose structure follows predefined rule types.

# In[5]:


# Pull in the sample grammar object that nltk.grammar exposes as `toy_pcfg2`.
from nltk.grammar import toy_pcfg2

# Keep it under the module-level name `grammar` and dump it to stdout.
grammar = toy_pcfg2
print(grammar)


# In[6]:


# # A taste of Earley chart parsing (disabled)
# import nltk
# nltk.parse.featurechart.demo( print_times = False,
#                               print_grammar = True,
#                               parser = nltk.parse.featurechart.FeatureChartParser,
#                               sent = 'I saw a dog' )

# ## **3 Semantic Analysis of Nouns/Verbs/Adjectives with WordNet**
# Look inside a **SynSet** (the collection of WordNet nodes for the same word).
#
# NOTE: the lookups below are bare expression statements. Their values are
# not bound or printed here, so they are only visible when an interactive
# front end echoes them.

# In[7]:


from nltk.corpus import wordnet as wn

# Every synset WordNet holds for the word 'dog'.
wn.synsets('dog')


# In[8]:


# Example sentences attached to the synset 'frump.n.01'.
wn.synset('frump.n.01').examples()


# In[9]:


# Dictionary definition attached to the same synset.
wn.synset('frump.n.01').definition()

# ## **3 Using SynSets**
# Tell word senses apart with WordNet.
#
# NOTE: the bare `answer` / `answer.definition()` expression statements below
# are not bound or printed, so their values are only visible when an
# interactive front end echoes them.

# In[10]:


# pywsd works on top of the WordNet DB shipped with the base NLTK modules.
#
# Install pywsd through the running interpreter's own pip. The original
# `get_ipython().system(' pip3 install pywsd')` only works inside IPython
# (`get_ipython` is undefined in a plain script) and `pip3` may belong to a
# different Python than the one executing this file. The call is best-effort:
# a failed install is not raised here; the pywsd import below would then fail.
import subprocess
import sys

subprocess.run([sys.executable, "-m", "pip", "install", "pywsd"])


# In[11]:


sent = 'He act like a real dog'
ambiguous = 'dog'

# Imported only after the install step above so a fresh environment works.
from pywsd.lesk import simple_lesk

# Pick a sense for `ambiguous` given the sentence as context.
answer = simple_lesk(sent, ambiguous)
answer


# In[12]:


answer.definition()


# In[13]:


# Same target word in a different sentence, for comparison with In[11].
sent = 'He looks like dirty dog'
ambiguous = 'dog'
answer = simple_lesk(sent, ambiguous)
answer


# In[14]:


answer.definition()

# ## **4 NLTK 객체 활용**
# nltk 객체를 활용하여 작업을 효율적으로 수행한다

# ### **01 nltk 객체 정의하기**
# Token List 객체를 생성한 뒤, 이를 활용하여 nltk 객체를 만든다

# In[15]:


# # 삼성전자 지속가능경영 보고서
# skipword = ['갤러시', '가치창출']
# from txtutil import txtnoun
# from nltk.tokenize import word_tokenize
# texts = txtnoun("../data/kr-Report_2018.txt", skip=skipword)
# tokens = word_tokenize(texts)
# tokens[:5]


# In[16]:


# # nltk Token 객체를 활용한 다양한 메소드를 제공
# import nltk
# ss_nltk = nltk.Text(tokens, name='2018지속성장')
# ss_nltk


# ### **02 nltk 객체 활용하기**
# 내부 메서드를 활용한다

# In[17]:


# # 객체의 이름을 출력
# ss_nltk.name


# In[18]:


# # Token 과 연어관계에 있는 단어목록
# ss_nltk.collocations(num=30, window_size=2)


# In[19]:


# # Token의 주변에 등장하는 단어들
# ss_nltk.common_contexts(['책임경영'])


# In[20]:


# # 인접하여 위치하는 Token 을 출력
# ss_nltk.concordance('책임경영', lines=2)


# In[21]:


# ss_nltk.concordance_list('책임경영')[1]


# In[22]:


# # Token 의 빈도값 출력
# ss_nltk.count('책임경영')


# In[23]:


# %matplotlib inline
# from matplotlib import rc
# rc('font', family=['NanumGothic','Malgun Gothic'])
# # 해당 단어별 출현빈도 비교출력
# ss_nltk.dispersion_plot(['책임경영', '경영진', '갤럭시', '갤러시', '업사이클링'])


# In[24]:


# # 객체의 빈도를 Matplot linechart 로 출력
# ss_nltk.plot(10)


# In[25]:


# # ko.readability('biline')
# ss_nltk.similar('삼성전자',num=3)


# ### **03 nltk.vocab() 객체 활용하기**
# Token 객체들 다루기

# In[26]:


# # Token의 출현빈도 상위객체 출력
# # ko.tokens(['초등학교', '저학년'])
# ss_nltk.vocab().most_common(10)


# In[27]:


# list(ss_nltk.vocab().keys())[:5]


# In[28]:


# list(ss_nltk.vocab().values())[:5]


# In[29]:


# ss_nltk.vocab().freq('삼성전자')