import nltk
from nltk import CFG
from nltk.tree import Tree
# Part 1: Printing parse results with the basic NLTK modules
def define_grammar_parse_result():
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)
    sent = "I shot an elephant".split()
    parser = nltk.ChartParser(grammar)
    trees = parser.parse(sent)
    for tree in trees:
        print(tree)

print("\n--------Parsing result as per defined grammar-------")
define_grammar_parse_result()
--------Parsing result as per defined grammar-------
(S (NP I) (VP (V shot) (NP (Det an) (N elephant))))
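The same grammar also illustrates structural ambiguity: parsing "I shot an elephant in my pajamas" returns two trees, one attaching the PP to the VP and one to the NP. A minimal sketch (the grammar is repeated so the snippet runs on its own; ambiguous_grammar is just an illustrative name):

import nltk

ambiguous_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
sent = "I shot an elephant in my pajamas".split()
for tree in nltk.ChartParser(ambiguous_grammar).parse(sent):
    print(tree)   # expect two trees: PP attached to the VP vs. to the NP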
# Part 2: Drawing a parse tree with the Tree class
def draw_parser_tree():
    dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])])
    dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])])
    vp = Tree('vp', [Tree('v', ['chased']), dp2])
    tree = Tree('s', [dp1, vp])
    print(tree)
    print(tree.pformat_latex_qtree())
    tree.pretty_print()

print("\n--------Drawing Parse Tree-------")
draw_parser_tree()
--------Drawing Parse Tree-------
(s (dp (d the) (np dog)) (vp (v chased) (dp (d the) (np cat))))
\Tree [.s [.dp [.d the ] [.np dog ] ] [.vp [.v chased ] [.dp [.d the ] [.np cat ] ] ] ]
              s
      ________|_____
     |              vp
     |         _____|___
     dp       |         dp
  ___|___     |      ___|___
 d       np   v     d       np
 |       |    |     |       |
the     dog chased the     cat
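Trees can also be rebuilt directly from the bracketed notation that print(tree) emits, using Tree.fromstring. A short sketch:

from nltk.tree import Tree

tree = Tree.fromstring('(s (dp (d the) (np dog)) (vp (v chased) (dp (d the) (np cat))))')
print(tree.label())      # 's'
print(tree.leaves())     # ['the', 'dog', 'chased', 'the', 'cat']
tree.pretty_print()      # same ASCII rendering as above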
# from pycorenlp import StanfordCoreNLP
# from collections import defaultdict
# Part 3: Visualization using "pycorenlp"
# After installing pycorenlp, stanford-corenlp-full-* must also be downloaded from the Stanford CoreNLP website.
# def stanford_parsing_result():
#     text = """ I shot an elephant. The dog chased the cat. School go to boy. """
#     nlp = StanfordCoreNLP('http://localhost:9000')
#     res = nlp.annotate(text, properties={
#         'annotators': 'tokenize,ssplit,pos,depparse,parse',
#         'outputFormat': 'json'
#     })
#     print(res['sentences'][0]['parse'])
#     print(res['sentences'][2]['parse'])
# Raises "Exception: Check whether you have started the CoreNLP server e.g. ..." if the server is not running.
# print("\n--------Stanford Parser result------")
# stanford_parsing_result()
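If a CoreNLP server is actually available, the code above can be wrapped with a guard so a missing server fails gracefully. This is only a sketch, assuming the default URL and port used above (not verified here):

# Sketch only: requires pycorenlp and a CoreNLP server listening on localhost:9000.
from pycorenlp import StanfordCoreNLP

def stanford_parsing_result():
    text = """ I shot an elephant. The dog chased the cat. School go to boy. """
    nlp = StanfordCoreNLP('http://localhost:9000')
    try:
        res = nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit,pos,depparse,parse',
            'outputFormat': 'json'})
    except Exception as err:
        # pycorenlp raises a plain Exception when nothing is listening on the URL
        print('CoreNLP server not reachable:', err)
        return
    print(res['sentences'][0]['parse'])
    print(res['sentences'][2]['parse'])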
Implementing NER (Named Entity Recognition)
import spacy
nlp = spacy.load('en')
doc = nlp(u'London is a big city in the United Kingdom.')
print ("-------Example 1 ------")
for _ in doc.ents:
print("{} : {}".format(_.label_, _.text))
-------Example 1 ------
GPE : London
GPE : the United Kingdom
doc1 = nlp(u'While in France, Christine Lagarde discussed short-term stimulus efforts in a '
           u'recent interview on 5:00 P.M. with the Wall Street Journal')
print("-------Example 2 ------")
for ent in doc1.ents:
    print("{} : {}".format(ent.label_, ent.text))
-------Example 2 ------
GPE : France
PERSON : Christine Lagarde
TIME : 5:00 P.M.
ORG : the Wall Street Journal
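The label codes (GPE, ORG, TIME, ...) can be expanded into human-readable descriptions with spacy.explain. A small self-contained sketch:

import spacy

nlp = spacy.load('en')
doc1 = nlp(u'While in France, Christine Lagarde discussed short-term stimulus efforts in a '
           u'recent interview on 5:00 P.M. with the Wall Street Journal')
for ent in doc1.ents:
    # spacy.explain maps a label code such as 'GPE' to a short description
    print("{:25s} {:8s} {}".format(ent.text, ent.label_, spacy.explain(ent.label_)))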
Building a Bag of Words with sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
ngram_vectorizer.get_feature_names() == ([' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp'])
counts.toarray().astype(int)
array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])
ngram_vectorizer
CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(2, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)
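For comparison, switching the analyzer to word level gives an ordinary word-count bag of words. A quick sketch on an illustrative toy corpus:

from sklearn.feature_extraction.text import CountVectorizer

corpus = ['the dog chased the cat', 'the cat sat on the mat']   # illustrative toy corpus
word_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1)
counts = word_vectorizer.fit_transform(corpus)
print(word_vectorizer.get_feature_names())   # unigrams and bigrams
print(counts.toarray())                      # one count row per document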
text = 'tf idf, short form of term frequency, inverse document frequency'
text2 = 'is a numerical statistic that is intended to reflect how important'
text3 = 'a word is to a document in a collection or corpus'
from textblob import TextBlob
blob = TextBlob(text)
blob2 = TextBlob(text2)
blob3 = TextBlob(text3)
bloblist = [blob, blob2, blob3]
bloblist
[TextBlob("tf idf, short form of term frequency, inverse document frequency"), TextBlob("is a numerical statistic that is intended to reflect how important"), TextBlob("a word is to a document in a collection or corpus")]
import math

def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def idf(word, bloblist):
    # n_containing (add-1 smoothing so the count never reaches zero)
    x = 1 + sum(1 for blob in bloblist if word in blob)
    return math.log(len(bloblist) / (x if x else 1))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)

tf_score = tf('short', blob)
idf_score = idf('short', bloblist)
tfidf_score = tfidf('short', blob, bloblist)
print("""tf : {:.7}\nidf : {:.7}\ntf*idf : {:.7}""".format(
    str(tf_score), str(idf_score), str(tfidf_score)))
tf : 0.1
idf : 0.40546
tf*idf : 0.04054
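The same three functions can score and rank every word of a document. A short sketch reusing tf, tfidf, blob, and bloblist from above:

# Sketch: tf-idf for every word of the first document, highest scores first.
scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
for word, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print("{:12s} {:.5f}".format(word, score))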
# Load a single document and build tokens from it
import nltk, string, os
from collections import Counter

# Preprocessing: lowercasing, removing punctuation, etc.
def get_tokens(file):
    with open(file, 'r') as shakes:
        text = shakes.read()
    lowers = text.lower()                                                          # lowercase
    no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))   # strip punctuation
    tokens = no_punctuation.split(' ')                                             # split the text into tokens
    return tokens
tokens = get_tokens('./data/shakes1.txt')
count = Counter(tokens)
count.most_common(5)
[('and', 48), ('the', 33), ('to', 29), ('i', 26), ('of', 25)]
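Splitting on single spaces leaves newlines and stray whitespace attached to tokens; an alternative sketch that tokenizes with nltk.word_tokenize instead (the 'punkt' model must be downloaded):

import string
import nltk
from collections import Counter

def get_tokens_nltk(file):
    # Sketch: same preprocessing as get_tokens above, but tokenized with nltk.word_tokenize
    with open(file, 'r') as shakes:
        text = shakes.read().lower()
    no_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return nltk.word_tokenize(no_punctuation)

tokens_nltk = get_tokens_nltk('./data/shakes1.txt')
print(Counter(tokens_nltk).most_common(5))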
# Token normalization using nltk stopwords
from nltk.corpus import stopwords
filtered = [w for w in tokens if not w in stopwords.words('english')]
count = Counter(filtered)
count.most_common(5)
[('thy', 11), ('go', 7), ('love', 7), ('would', 5), ('thou', 5)]
# Stemming with the PorterStemmer
from nltk.stem.porter import PorterStemmer

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)
count = Counter(stemmed)
count.most_common(5)
[('thi', 11), ('go', 7), ('love', 7), ('would', 5), ('natur', 5)]
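The Porter stemmer is aggressive ('thy' becomes 'thi', 'nature' becomes 'natur'); WordNet lemmatization is a gentler alternative, sketched below (the 'wordnet' corpus may need to be downloaded first):

from collections import Counter
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()                            # may require nltk.download('wordnet')
lemmatized = [lemmatizer.lemmatize(w) for w in filtered]    # reuses 'filtered' from above
Counter(lemmatized).most_common(5)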
from glob import glob

fileList = glob("./data/shake*.txt")
token_dict = {}
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

for file in fileList:
    shakes = open(file, 'r')
    text = shakes.read()
    lowers = text.lower()
    no_punctuation = lowers.translate(str.maketrans('', '', string.punctuation))   # strip punctuation
    token_dict[file] = no_punctuation
# token_dict
# this can take some time
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())
sample_str = 'this sentence has unseen text such as computer but also king lord juliet'
response = tfidf.transform([sample_str])
response.data
/home/markbaum/Python/nltk/lib/python3.6/site-packages/sklearn/feature_extraction/text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['abov', 'afterward', 'alon', 'alreadi', 'alway', 'ani', 'anoth', 'anyon', 'anyth', 'anywher', 'becam', 'becaus', 'becom', 'befor', 'besid', 'cri', 'describ', 'dure', 'els', 'elsewher', 'empti', 'everi', 'everyon', 'everyth', 'everywher', 'fifti', 'formerli', 'forti', 'ha', 'henc', 'hereaft', 'herebi', 'hi', 'howev', 'hundr', 'inde', 'latterli', 'mani', 'meanwhil', 'moreov', 'mostli', 'nobodi', 'noon', 'noth', 'nowher', 'onc', 'onli', 'otherwis', 'ourselv', 'perhap', 'pleas', 'seriou', 'sever', 'sinc', 'sincer', 'sixti', 'someon', 'someth', 'sometim', 'somewher', 'themselv', 'thenc', 'thereaft', 'therebi', 'therefor', 'thi', 'thu', 'togeth', 'twelv', 'twenti', 'veri', 'wa', 'whatev', 'whenc', 'whenev', 'wherea', 'whereaft', 'wherebi', 'wherev', 'whi', 'yourselv'] not in stop_words. 'stop_words.' % sorted(inconsistent))
array([0.34618161, 0.66338461, 0.66338461])
feature_names = tfidf.get_feature_names()
for col in response.nonzero()[1]:
    print(feature_names[col], ' - ', response[0, col])
thi  -  0.34618161159873423
lord  -  0.6633846138519129
king  -  0.6633846138519129
import numpy as np
feature_array = np.array(tfidf.get_feature_names())
tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]
top_n_tri = feature_array[tfidf_sorting][: 3]
top_n_forth = feature_array[tfidf_sorting][: 4]
top_n_tri, top_n_forth
(array(['king', 'lord', 'thi'], dtype='<U12'), array(['king', 'lord', 'thi', 'youth'], dtype='<U12'))
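The full tf-idf matrix tfs can also be inspected as a DataFrame with one row per file. A sketch, assuming token_dict preserved the insertion order used by fit_transform (Python 3.6+):

import pandas as pd

tfidf_df = pd.DataFrame(tfs.toarray(),
                        index=list(token_dict.keys()),
                        columns=tfidf.get_feature_names())
tfidf_df.iloc[:, :5]   # first five term columns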
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
df = pd.DataFrame([['rick','young'],['phil','old']], columns=['name','age-group'])
df
|   | name | age-group |
|---|------|-----------|
| 0 | rick | young     |
| 1 | phil | old       |
# One-hot encoding with pandas
pd.get_dummies(df)
|   | name_phil | name_rick | age-group_old | age-group_young |
|---|-----------|-----------|---------------|-----------------|
| 0 | 0         | 1         | 0             | 1               |
| 1 | 1         | 0         | 1             | 0               |
# One-hot encoding with scikit-learn's DictVectorizer
X = pd.DataFrame({'income': [100000,110000,90000,30000,14000,50000],
'country':['US', 'CAN', 'US', 'CAN', 'MEX', 'US'],
'race':['White', 'Black', 'Latino', 'White', 'White', 'Black']})
v = DictVectorizer()
qualitative_features = ['country']
X_qual = v.fit_transform(X[qualitative_features].to_dict('records'))
v.vocabulary_, "\n", X_qual.toarray()
({'country=US': 2, 'country=CAN': 0, 'country=MEX': 1},
 '\n',
 array([[0., 0., 1.],
        [1., 0., 0.],
        [0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]))
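To train a model on both the numeric and the encoded categorical columns, the two blocks can be stacked side by side. A sketch; X_full is an illustrative name:

import numpy as np

# Stack the numeric income column next to the one-hot country columns.
X_full = np.hstack([X[['income']].values, X_qual.toarray()])
X_full.shape   # (6, 4): income plus three country indicators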