! pip3 install nltk pywsd symspellpy
import nltk
nltk.download('treebank')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.2.5)
Requirement already satisfied: pywsd in /usr/local/lib/python3.6/dist-packages (1.1.7)
Collecting symspellpy
  Downloading https://files.pythonhosted.org/packages/4c/d5/9cf41f05a30f205c00489e3d37639c348349ba6f8d0e1005f26dc9a9ac60/symspellpy-6.3.8-py3-none-any.whl
Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.11.0)
Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages (from pywsd) (0.22.0)
Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from pywsd) (1.14.6)
Requirement already satisfied: python-dateutil>=2 in /usr/local/lib/python3.6/dist-packages (from pandas->pywsd) (2.5.3)
Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas->pywsd) (2018.9)
Installing collected packages: symspellpy
Successfully installed symspellpy-6.3.8
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
# Chomsky-style CFG grammar rules (NLTK's toy PCFG example)
from nltk.grammar import toy_pcfg2
grammar = toy_pcfg2
print(grammar)
Grammar with 23 productions (start state = S)
    S -> NP VP [1.0]
    VP -> V NP [0.59]
    VP -> V [0.4]
    VP -> VP PP [0.01]
    NP -> Det N [0.41]
    NP -> Name [0.28]
    NP -> NP PP [0.31]
    PP -> P NP [1.0]
    V -> 'saw' [0.21]
    V -> 'ate' [0.51]
    V -> 'ran' [0.28]
    N -> 'boy' [0.11]
    N -> 'cookie' [0.12]
    N -> 'table' [0.13]
    N -> 'telescope' [0.14]
    N -> 'hill' [0.5]
    Name -> 'Jack' [0.52]
    Name -> 'Bob' [0.48]
    P -> 'with' [0.61]
    P -> 'under' [0.39]
    Det -> 'the' [0.41]
    Det -> 'a' [0.31]
    Det -> 'my' [0.28]
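To see the PCFG in action, a probabilistic parser can be run over a sentence built only from the grammar's terminals. Below is a minimal sketch using NLTK's ViterbiParser; the example sentence is an assumption, not taken from the original text.
from nltk.parse import ViterbiParser

# Viterbi parsing returns the single most probable tree under the PCFG.
parser = ViterbiParser(grammar)
tokens = 'Jack saw Bob with my cookie'.split()   # uses only terminals defined in toy_pcfg2

for tree in parser.parse(tokens):
    print(tree)   # parse tree annotated with its probability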
# Semantic-relation analysis using WordNet data
from nltk.corpus import wordnet as wn
word = 'bank'
print("Synset list : \n{}\n\nbank.n.02 뜻과 예제 : \n{}\n{}\n\ntrust.v.01 뜻과 예제 :\n{}\n{}".format(
wn.synsets(word),
wn.synset('bank.n.02').definition(), wn.synset('bank.n.02').examples(),
wn.synset('trust.v.01').definition(), wn.synset('trust.v.01').examples()))
Synset list : 
[Synset('bank.n.01'), Synset('depository_financial_institution.n.01'), Synset('bank.n.03'), Synset('bank.n.04'), Synset('bank.n.05'), Synset('bank.n.06'), Synset('bank.n.07'), Synset('savings_bank.n.02'), Synset('bank.n.09'), Synset('bank.n.10'), Synset('bank.v.01'), Synset('bank.v.02'), Synset('bank.v.03'), Synset('bank.v.04'), Synset('bank.v.05'), Synset('deposit.v.02'), Synset('bank.v.07'), Synset('trust.v.01')]

bank.n.02 definition and examples : 
a financial institution that accepts deposits and channels the money into lending activities
['he cashed a check at the bank', 'that bank holds the mortgage on my home']

trust.v.01 definition and examples :
have confidence or faith in
['We can trust in God', 'Rely on your friends', 'bank on your good education', "I swear by my grandmother's recipes"]
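Beyond definitions and examples, WordNet also exposes lexical relations such as hypernyms, lemma names, and path-based similarity between senses. A minimal sketch; the two synset choices below are illustrative.
bank_inst = wn.synset('bank.n.02')        # the financial-institution sense
river_bank = wn.synset('bank.n.01')       # sloping land beside a body of water

print(bank_inst.hypernyms())                    # more general concepts above this sense
print(bank_inst.lemma_names())                  # synonymous lemmas in the same synset
print(bank_inst.path_similarity(river_bank))    # path-based similarity between the two senses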
Next, the sense of a word within a sentence is resolved and printed using a statistical method.
# In-sentence word sense disambiguation using pywsd
sent = 'I went to the bank to deposit my money'
# sent = 'I bank my money'
ambiguous = 'bank'
from pywsd.lesk import simple_lesk
answer = simple_lesk(sent, ambiguous)
answer.definition()
Warming up PyWSD (takes ~10 secs)... took 4.01186728477478 secs.
'a financial institution that accepts deposits and channels the money into lending activities'
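For comparison, NLTK ships its own Lesk implementation in nltk.wsd; a minimal sketch applying it to the same sentence. The POS hint 'n' is an assumption used to restrict the candidates to noun senses.
from nltk.tokenize import word_tokenize
from nltk.wsd import lesk

# NLTK's Lesk takes a pre-tokenized context rather than a raw string.
nltk_answer = lesk(word_tokenize(sent), ambiguous, 'n')
print(nltk_answer, ':', nltk_answer.definition())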
Using per-word frequency information, the words are re-segmented.
from symspellpy.symspellpy import SymSpell
def spellCheck(dict_file, sentence):
    # parameters used to initialise SymSpell (edit-distance limit and prefix length)
    max_edit_dist, prefix_length = 0, 7
    # column positions of the term and its frequency count in the dictionary file
    term_index, count_index = 0, 1
    sym_spell = SymSpell(max_edit_dist, prefix_length)
    if not sym_spell.load_dictionary(dict_file, term_index, count_index):
        print("The dictionary file could not be loaded"); return
    # re-insert spaces into the unsegmented string using word-frequency information
    result = sym_spell.word_segmentation(sentence)
    print("Corrected result: {}\nTotal edit distance: {}".format(
        result.corrected_string,
        result.distance_sum))
text = "thequickbrownfoxjumpsoverthelazydog"
spellCheck("frequency_dictionary_en_82_765.txt", sentence=text)
Corrected result: the quick brown fox jumps over the lazy dog
Total edit distance: 8
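word_segmentation only re-inserts spaces; to correct misspelled words themselves, SymSpell's lookup can be used with a nonzero edit distance. A minimal sketch, assuming the same frequency dictionary file is available; the misspelling "memebers" is an illustrative input.
from symspellpy.symspellpy import SymSpell, Verbosity

# Build a speller that allows up to 2 edits per word.
speller = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
speller.load_dictionary("frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)

# Print the closest dictionary candidates with their edit distance and frequency.
for suggestion in speller.lookup("memebers", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.distance, suggestion.count)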