A quick and simple look at the statistical grammar approach
# %%time
# text = '여배우 박민영은 높은 싱크로를 보여줬다'
# from konlpy.tag import Twitter
# twitter = Twitter()
# words = twitter.pos(text, stem=True)
# print(words)
# from nltk import RegexpParser
# grammar = """
# NP: {<N.*>*<Suffix>?}  # define noun phrases
# VP: {<V.*>*}           # define verb phrases
# AP: {<A.*>*}           # define adjective phrases
# """
# parser = RegexpParser(grammar)
# parser
# chunks = parser.parse(words)
# chunks
# text_tree = [list(txt) for txt in chunks.subtrees()]
# from pprint import pprint
# pprint(text_tree[1:])
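Since the KoNLPy cell above is commented out (it needs a Java runtime and the Twitter tagger), here is a minimal self-contained chunking sketch on pre-tagged English tokens; the Penn Treebank tags and the sample sentence are illustrative assumptions, not part of the original example.
from nltk import RegexpParser
# a hand-tagged sentence, standing in for the tagger output
tagged = [('the', 'DT'), ('little', 'JJ'), ('dog', 'NN'),
          ('barked', 'VBD'), ('loudly', 'RB')]
grammar = """
NP: {<DT>?<JJ>*<NN.*>+}  # noun phrase: optional determiner, adjectives, nouns
VP: {<VB.*><RB>?}        # verb phrase: verb plus optional adverb
"""
parser = RegexpParser(grammar)
print(parser.parse(tagged))
# (S (NP the/DT little/JJ dog/NN) (VP barked/VBD loudly/RB))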
from nltk.grammar import toy_pcfg2
grammar = toy_pcfg2
print(grammar)
Grammar with 23 productions (start state = S)
    S -> NP VP [1.0]
    VP -> V NP [0.59]
    VP -> V [0.4]
    VP -> VP PP [0.01]
    NP -> Det N [0.41]
    NP -> Name [0.28]
    NP -> NP PP [0.31]
    PP -> P NP [1.0]
    V -> 'saw' [0.21]
    V -> 'ate' [0.51]
    V -> 'ran' [0.28]
    N -> 'boy' [0.11]
    N -> 'cookie' [0.12]
    N -> 'table' [0.13]
    N -> 'telescope' [0.14]
    N -> 'hill' [0.5]
    Name -> 'Jack' [0.52]
    Name -> 'Bob' [0.48]
    P -> 'with' [0.61]
    P -> 'under' [0.39]
    Det -> 'the' [0.41]
    Det -> 'a' [0.31]
    Det -> 'my' [0.28]
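The production weights become useful once we ask for the most likely parse; a minimal sketch with NLTK's ViterbiParser (the sentence is made up from words in this grammar's lexicon):
from nltk.parse import ViterbiParser
viterbi = ViterbiParser(grammar)
# every token must appear in the grammar's lexicon, or parsing fails
for tree in viterbi.parse(['Jack', 'saw', 'Bob', 'with', 'my', 'telescope']):
    print(tree)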
# # A taste of Earley chart parsing
# import nltk
# nltk.parse.featurechart.demo(print_times=False,
#                              print_grammar=True,
#                              parser=nltk.parse.featurechart.FeatureChartParser,
#                              sent='I saw a dog')
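A runnable counterpart to the demo above: load one of the feature-based grammars bundled with NLTK's data (assumes nltk.download('book_grammars') has been run) and chart-parse a short sentence it covers.
import nltk
# load_parser builds a FeatureChartParser for .fcfg grammars by default
cp = nltk.load_parser('grammars/book_grammars/feat0.fcfg')
for tree in cp.parse('Kim likes children'.split()):
    print(tree)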
from nltk.corpus import wordnet as wn
wn.synsets('dog')
[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
wn.synset('frump.n.01').examples()
['she got a reputation as a frump', "she's a real dog"]
wn.synset('frump.n.01').definition()
'a dull unattractive unpleasant girl or woman'
# Uses the WordNet DB bundled with NLTK's core modules
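WordNet also encodes relations between synsets, not just glosses; a few more lookups that work with the same wn import:
dog = wn.synset('dog.n.01')
dog.hypernyms()       # more general synsets, e.g. canine.n.02
dog.lemma_names()     # the synonym lemmas grouped under this synset
dog.path_similarity(wn.synset('cat.n.01'))  # path-based similarity in (0, 1]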
! pip3 install pywsd
Requirement already satisfied: pywsd in /home/markbaum/Python/nltk/lib/python3.6/site-packages (1.1.7)
Requirement already satisfied: pandas in /home/markbaum/Python/nltk/lib/python3.6/site-packages (from pywsd) (0.23.3)
Requirement already satisfied: nltk in /home/markbaum/Python/nltk/lib/python3.6/site-packages (from pywsd) (3.3)
Requirement already satisfied: numpy in /home/markbaum/Python/nltk/lib/python3.6/site-packages (from pywsd) (1.14.5)
Requirement already satisfied: pytz>=2011k in /home/markbaum/Python/nltk/lib/python3.6/site-packages (from pandas->pywsd) (2018.5)
Requirement already satisfied: python-dateutil>=2.5.0 in /home/markbaum/Python/nltk/lib/python3.6/site-packages (from pandas->pywsd) (2.7.3)
Requirement already satisfied: six in /home/markbaum/Python/nltk/lib/python3.6/site-packages (from nltk->pywsd) (1.11.0)
sent = 'He act like a real dog'
ambiguous = 'dog'
from pywsd.lesk import simple_lesk
answer = simple_lesk(sent, ambiguous)
answer
Warming up PyWSD (takes ~10 secs)... took 2.9580390453338623 secs.
Synset('frump.n.01')
answer.definition()
'a dull unattractive unpleasant girl or woman'
sent = 'He looks like dirty dog'
ambiguous = 'dog'
answer = simple_lesk(sent, ambiguous)
answer
Synset('cad.n.01')
answer.definition()
'someone who is morally reprehensible'
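NLTK itself ships a basic Lesk implementation, so the same disambiguation can be tried without pywsd; the chosen synset may differ from simple_lesk, since the two use different overlap heuristics.
from nltk import word_tokenize
from nltk.wsd import lesk
answer = lesk(word_tokenize('He looks like dirty dog'), 'dog')
answer.definition()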
Create a token list, then use it to build an nltk.Text object
# # Samsung Electronics sustainability report
# skipword = ['갤러시', '가치창출']
# from txtutil import txtnoun
# from nltk.tokenize import word_tokenize
# texts = txtnoun("../data/kr-Report_2018.txt", skip=skipword)
# tokens = word_tokenize(texts)
# tokens[:5]
# # The nltk.Text object provides a variety of methods over the tokens
# import nltk
# ss_nltk = nltk.Text(tokens, name='2018지속성장')
# ss_nltk
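The kr-Report_2018.txt file and the txtutil helper are local to the author's machine, so here is a self-contained sketch of the same step using the Gutenberg corpus bundled with NLTK (assumes nltk.download('gutenberg')); the name emma is illustrative.
import nltk
from nltk.corpus import gutenberg
emma = nltk.Text(gutenberg.words('austen-emma.txt'), name='Emma')
emma
<Text: Emma>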
Using the object's built-in methods
# # Print the object's name
# ss_nltk.name
# # Words that form collocations with the tokens
# ss_nltk.collocations(num=30, window_size=2)
# # Words that appear in the contexts around a token
# ss_nltk.common_contexts(['책임경영'])
# # Print tokens that appear in adjacent positions
# ss_nltk.concordance('책임경영', lines=2)
# ss_nltk.concordance_list('책임경영')[1]
# # Print a token's frequency
# ss_nltk.count('책임경영')
# %matplotlib inline
# from matplotlib import rc
# rc('font', family=['NanumGothic','Malgun Gothic'])
# # Compare where and how often the given words occur
# ss_nltk.dispersion_plot(['책임경영', '경영진', '갤럭시', '갤러시', '업사이클링'])
# # Plot the token frequencies as a Matplotlib line chart
# ss_nltk.plot(10)
# ss_nltk.similar('삼성전자', num=3)
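The same methods work on any nltk.Text; a self-contained sketch trying a few of them on the Gutenberg text from the sketch above:
import nltk
from nltk.corpus import gutenberg
emma = nltk.Text(gutenberg.words('austen-emma.txt'), name='Emma')
emma.collocations(num=5)           # frequent collocation pairs
emma.concordance('Emma', lines=2)  # keyword-in-context lines
emma.count('Emma')                 # raw token frequency
emma.similar('Emma', num=3)        # words appearing in similar contexts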
Working with token objects
# # Print the tokens with the highest occurrence frequency
# ss_nltk.vocab().most_common(10)
# list(ss_nltk.vocab().keys())[:5]
# list(ss_nltk.vocab().values())[:5]
# ss_nltk.vocab().freq('삼성전자')
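vocab() returns an nltk.FreqDist, so every FreqDist method is available; a tiny self-contained sketch of the calls used above:
from nltk import FreqDist
fd = FreqDist(['dog', 'cat', 'dog', 'bird', 'dog'])
fd.most_common(2)   # [('dog', 3), ('cat', 1)]
fd['dog']           # absolute count: 3
fd.freq('dog')      # relative frequency: 3 / 5 = 0.6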