! apt-get update ! apt-get install g++ openjdk-8-jdk ! pip3 install nltk konlpy import nltk nltk.download('punkt') nltk.download('tagsets') nltk.download('averaged_perceptron_tagger') text_eng = " Don't hesitate to ask questions" text_kor = """삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. 홍보:유관순 031-478-2311 010-8888-9999. 삼성 페이지 https://www.samsung.com/sec/index.html""" text_kor from nltk import sent_tokenize, word_tokenize, FreqDist sent_tokenize(text_kor) tokens = word_tokenize(text_kor) tokens dict(FreqDist(tokens)) text_kor import re tokenizer = re.compile(r'[가-힣]+') tokenizer.findall(text_kor) tokenizer = re.compile(r'[0-9]{3}-[0-9]{3,4}-[0-9]{4}') tokenizer.findall(text_kor) tokenizer = re.compile(r'\d{3}-\d{3,4}-\d{4}') tokenizer.findall(text_kor) tokenizer = re.compile(r'[^ 가-힣]+') tokenizer.findall(text_kor) tokenizer.sub("", text_kor)#.split(" ") tokenizer = re.compile(r'https://[w]{3}.[A-z]+.[./A-z]+') tokenizer.findall(text_kor) text_eng = text_eng.lower() text_eng from nltk.tokenize import TreebankWordTokenizer tokenizer = TreebankWordTokenizer() token = tokenizer.tokenize(text_eng) token from nltk import pos_tag pos_tag(token) import nltk.help as nltk_help nltk_help.upenn_tagset('PRP') # 대명사 nltk_help.upenn_tagset('JJ') # 형용사 from konlpy.tag import Okt twitter = Okt() # Stemming text = "파이썬을 활용하여 자연어 분석 특강입니다" print(twitter.pos(text, stem="true")) print(twitter.pos(text)) %%time from konlpy.tag import Kkma kkma = Kkma() print(kkma.pos(text)) %%time from konlpy.tag import Hannanum han = Hannanum() print(han.pos(text))