! apt-get update
! apt-get install g++ openjdk-8-jdk
! pip3 install nltk konlpy
import nltk
# Fetch the NLTK data packages used later in the notebook: the Punkt
# tokenizer models, the tag-set help files and the averaged-perceptron tagger.
for resource in ('punkt', 'tagsets', 'averaged_perceptron_tagger'):
    nltk.download(resource)
# Sample inputs: a short English sentence and a Korean passage that mixes
# Hangul, Latin text, two phone numbers and a URL.
text_eng = " Don't hesitate to ask questions"
text_kor = """삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다.
홍보:유관순 031-478-2311 010-8888-9999.
삼성 페이지 https://www.samsung.com/sec/index.html"""
Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,609 B] Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Release Get:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release [564 B] Get:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release.gpg [801 B] Get:7 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB] Hit:8 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB] Hit:11 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease Get:13 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Packages [6,819 B] Get:14 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB] Get:15 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [363 kB] Get:16 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [956 kB] Get:17 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [161 kB] Get:18 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [727 kB] Fetched 2,470 kB in 3s (850 kB/s) Reading package lists... Done Reading package lists... Done Building dependency tree Reading state information... Done g++ is already the newest version (4:7.3.0-3ubuntu2.1). g++ set to manually installed. The following package was automatically installed and is no longer required: libnvidia-common-410 Use 'apt autoremove' to remove it. 
The following additional packages will be installed: fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jre x11-utils Suggested packages: openjdk-8-demo openjdk-8-source visualvm icedtea-8-plugin mesa-utils The following NEW packages will be installed: fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java libatk-wrapper-java-jni libxxf86dga1 openjdk-8-jdk openjdk-8-jre x11-utils 0 upgraded, 8 newly installed, 0 to remove and 6 not upgraded. Need to get 4,771 kB of archives. After this operation, 13.1 MB of additional disk space will be used. Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB] Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-core all 2.37-1 [1,041 kB] Get:3 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-dejavu-extra all 2.37-1 [1,953 kB] Get:4 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB] Get:5 http://archive.ubuntu.com/ubuntu bionic/main amd64 libatk-wrapper-java all 0.33.3-20ubuntu0.1 [34.7 kB] Get:6 http://archive.ubuntu.com/ubuntu bionic/main amd64 libatk-wrapper-java-jni amd64 0.33.3-20ubuntu0.1 [28.3 kB] Get:7 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jre amd64 8u191-b12-2ubuntu0.18.04.1 [69.7 kB] Get:8 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 openjdk-8-jdk amd64 8u191-b12-2ubuntu0.18.04.1 [1,435 kB] Fetched 4,771 kB in 2s (3,041 kB/s) Selecting previously unselected package libxxf86dga1:amd64. (Reading database ... 131294 files and directories currently installed.) Preparing to unpack .../0-libxxf86dga1_2%3a1.1.4-1_amd64.deb ... Unpacking libxxf86dga1:amd64 (2:1.1.4-1) ... Selecting previously unselected package fonts-dejavu-core. Preparing to unpack .../1-fonts-dejavu-core_2.37-1_all.deb ... Unpacking fonts-dejavu-core (2.37-1) ... Selecting previously unselected package fonts-dejavu-extra. 
Preparing to unpack .../2-fonts-dejavu-extra_2.37-1_all.deb ... Unpacking fonts-dejavu-extra (2.37-1) ... Selecting previously unselected package x11-utils. Preparing to unpack .../3-x11-utils_7.7+3build1_amd64.deb ... Unpacking x11-utils (7.7+3build1) ... Selecting previously unselected package libatk-wrapper-java. Preparing to unpack .../4-libatk-wrapper-java_0.33.3-20ubuntu0.1_all.deb ... Unpacking libatk-wrapper-java (0.33.3-20ubuntu0.1) ... Selecting previously unselected package libatk-wrapper-java-jni:amd64. Preparing to unpack .../5-libatk-wrapper-java-jni_0.33.3-20ubuntu0.1_amd64.deb ... Unpacking libatk-wrapper-java-jni:amd64 (0.33.3-20ubuntu0.1) ... Selecting previously unselected package openjdk-8-jre:amd64. Preparing to unpack .../6-openjdk-8-jre_8u191-b12-2ubuntu0.18.04.1_amd64.deb ... Unpacking openjdk-8-jre:amd64 (8u191-b12-2ubuntu0.18.04.1) ... Selecting previously unselected package openjdk-8-jdk:amd64. Preparing to unpack .../7-openjdk-8-jdk_8u191-b12-2ubuntu0.18.04.1_amd64.deb ... Unpacking openjdk-8-jdk:amd64 (8u191-b12-2ubuntu0.18.04.1) ... Processing triggers for mime-support (3.60ubuntu1) ... Setting up fonts-dejavu-core (2.37-1) ... Setting up libxxf86dga1:amd64 (2:1.1.4-1) ... Processing triggers for libc-bin (2.27-3ubuntu1) ... Processing triggers for man-db (2.8.3-2ubuntu0.1) ... Setting up fonts-dejavu-extra (2.37-1) ... Processing triggers for hicolor-icon-theme (0.17-2) ... Processing triggers for fontconfig (2.12.6-0ubuntu2) ... Setting up x11-utils (7.7+3build1) ... Setting up libatk-wrapper-java (0.33.3-20ubuntu0.1) ... Setting up libatk-wrapper-java-jni:amd64 (0.33.3-20ubuntu0.1) ... Setting up openjdk-8-jre:amd64 (8u191-b12-2ubuntu0.18.04.1) ... update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/policytool to provide /usr/bin/policytool (policytool) in auto mode Setting up openjdk-8-jdk:amd64 (8u191-b12-2ubuntu0.18.04.1) ... 
update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/appletviewer to provide /usr/bin/appletviewer (appletviewer) in auto mode update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/bin/jconsole to provide /usr/bin/jconsole (jconsole) in auto mode Processing triggers for libc-bin (2.27-3ubuntu1) ... Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (3.2.5) Collecting konlpy Downloading https://files.pythonhosted.org/packages/e5/3d/4e983cd98d87b50b2ab0387d73fa946f745aa8164e8888a714d5129f9765/konlpy-0.5.1-py2.py3-none-any.whl (19.4MB) 100% |████████████████████████████████| 19.4MB 1.9MB/s Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk) (1.11.0) Collecting JPype1>=0.5.7 (from konlpy) Downloading https://files.pythonhosted.org/packages/c4/4b/60a3e63d51714d4d7ef1b1efdf84315d118a0a80a5b085bb52a7e2428cdc/JPype1-0.6.3.tar.gz (168kB) 100% |████████████████████████████████| 174kB 29.0MB/s Building wheels for collected packages: JPype1 Building wheel for JPype1 (setup.py) ... done Stored in directory: /root/.cache/pip/wheels/0e/2b/e8/c0b818ac4b3d35104d35e48cdc7afe27fc06ea277feed2831a Successfully built JPype1 Installing collected packages: JPype1, konlpy Successfully installed JPype1-0.6.3 konlpy-0.5.1 [nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip. [nltk_data] Downloading package tagsets to /root/nltk_data... [nltk_data] Unzipping help/tagsets.zip. [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /root/nltk_data... [nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
# Echo the raw Korean sample so the embedded newlines are visible.
text_kor
'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \n홍보:유관순 031-478-2311 010-8888-9999.\n삼성 페이지 https://www.samsung.com/sec/index.html'
from nltk import sent_tokenize, word_tokenize, FreqDist
# Split the Korean sample into sentences.
sent_tokenize(text_kor)
['삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다.', '홍보:유관순 031-478-2311 010-8888-9999.', '삼성 페이지 https://www.samsung.com/sec/index.html']
# Word-level tokenization of the Korean sample; the list is reused for the
# frequency count further down.
tokens = word_tokenize(text_kor)
tokens
['삼성', '갤럭시', '(', 'GalaxyNote', ')', '노트의', '신형을', '홍보', '합니다', '.', '홍보', ':', '유관순', '031-478-2311', '010-8888-9999', '.', '삼성', '페이지', 'https', ':', '//www.samsung.com/sec/index.html']
# Count the occurrences of each token and show the counts as a plain dict.
dict(FreqDist(tokens))
{'(': 1, ')': 1, '.': 2, '//www.samsung.com/sec/index.html': 1, '010-8888-9999': 1, '031-478-2311': 1, ':': 2, 'GalaxyNote': 1, 'https': 1, '갤럭시': 1, '노트의': 1, '삼성': 2, '신형을': 1, '유관순': 1, '페이지': 1, '합니다': 1, '홍보': 2}
# Echo the sample text again as a reference for the regex examples below.
text_kor
'삼성 갤럭시(GalaxyNote)노트의 신형을 홍보 합니다. \n홍보:유관순 031-478-2311 010-8888-9999.\n삼성 페이지 https://www.samsung.com/sec/index.html'
import re
# Match runs of precomposed Hangul syllables (U+AC00 through U+D7A3), which
# drops Latin letters, digits and punctuation.
tokenizer = re.compile(r'[가-힣]+')
tokenizer.findall(text_kor)
['삼성', '갤럭시', '노트의', '신형을', '홍보', '합니다', '홍보', '유관순', '삼성', '페이지']
# Phone numbers: 3 digits, then 3 or 4 digits, then 4 digits, joined by hyphens.
tokenizer = re.compile(r'[0-9]{3}-[0-9]{3,4}-[0-9]{4}')
tokenizer.findall(text_kor)
['031-478-2311', '010-8888-9999']
# Same phone-number pattern using the \d shorthand. For str patterns in
# Python 3, \d also accepts non-ASCII Unicode decimal digits, unlike [0-9].
tokenizer = re.compile(r'\d{3}-\d{3,4}-\d{4}')
tokenizer.findall(text_kor)
['031-478-2311', '010-8888-9999']
# Negated class: runs of characters that are neither a space nor a Hangul
# syllable. Newlines are not excluded, so they are matched as well.
tokenizer = re.compile(r'[^ 가-힣]+')
tokenizer.findall(text_kor)
['(GalaxyNote)', '.', '\n', ':', '031-478-2311', '010-8888-9999.\n', 'https://www.samsung.com/sec/index.html']
# Delete everything the pattern matches, leaving only Hangul and spaces.
# Nothing is inserted in place of a match, so words separated only by
# punctuation end up joined together.
tokenizer.sub("", text_kor)#.split(" ")
'삼성 갤럭시노트의 신형을 홍보 합니다 홍보유관순 삼성 페이지 '
# Match URLs of the form https://www.<host>.<rest>.
# The dots are escaped so they match only a literal '.'; unescaped they match
# any character. The character classes are spelled out because the range
# 'A-z' also covers the ASCII punctuation between 'Z' and 'a' ([ \ ] ^ _ `).
# Digits, '-', '_' and '~' are accepted so ordinary hosts and paths are not
# cut short.
tokenizer = re.compile(r'https://www\.[A-Za-z0-9-]+\.[A-Za-z0-9._~/-]+')
# Extract every URL from the Korean sample text.
tokenizer.findall(text_kor)
['https://www.samsung.com/sec/index.html']
# Lower-case the English sentence before tokenizing it.
text_eng = text_eng.lower()
text_eng
" don't hesitate to ask questions"
from nltk.tokenize import TreebankWordTokenizer
# Rebind `tokenizer` from the regex above to NLTK's Treebank word tokenizer.
# It splits contractions: "don't" becomes the two tokens 'do' and "n't".
tokenizer = TreebankWordTokenizer()
token = tokenizer.tokenize(text_eng)
token
['do', "n't", 'hesitate', 'to', 'ask', 'questions']
from nltk import pos_tag
# Attach a part-of-speech tag to each English token.
pos_tag(token)
[('do', 'VBP'), ("n't", 'RB'), ('hesitate', 'VB'), ('to', 'TO'), ('ask', 'VB'), ('questions', 'NNS')]
import nltk.help as nltk_help
nltk_help.upenn_tagset('PRP') # pronoun
PRP: pronoun, personal hers herself him himself hisself it itself me myself one oneself ours ourselves ownself self she thee theirs them themselves they thou thy us
nltk_help.upenn_tagset('JJ') # adjective
JJ: adjective or numeral, ordinal third ill-mannered pre-war regrettable oiled calamitous first separable ectoplasmic battery-powered participatory fourth still-to-be-named multilingual multi-disciplinary ...
from konlpy.tag import Okt
# Okt tagger; the name `twitter` is kept because later cells refer to it.
twitter = Okt()
# Stemming: with stem=True, inflected forms are reduced to their dictionary
# form in the tagged output.
text = "파이썬을 활용하여 자연어 분석 특강입니다"
# `stem` is a boolean flag, so pass a real bool rather than the string "true".
print(twitter.pos(text, stem=True))
[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하다', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('이다', 'Adjective')]
# Same sentence without stemming, for comparison: inflected forms are kept as written.
print(twitter.pos(text))
[('파이썬', 'Noun'), ('을', 'Josa'), ('활용', 'Noun'), ('하여', 'Verb'), ('자연어', 'Noun'), ('분석', 'Noun'), ('특강', 'Noun'), ('입니다', 'Adjective')]
%%time
# Tag the same sentence with the Kkma analyzer; %%time (which must stay on the
# first line of the cell) reports how long the whole cell takes.
from konlpy.tag import Kkma
kkma = Kkma()
print(kkma.pos(text))
[('파이', 'NNG'), ('썰', 'VV'), ('ㄴ', 'ETD'), ('을', 'NNG'), ('활용', 'NNG'), ('하', 'XSV'), ('여', 'ECS'), ('자연어', 'NNG'), ('분석', 'NNG'), ('특강', 'NNG'), ('이', 'VCP'), ('ㅂ니다', 'EFN')] CPU times: user 17.1 s, sys: 584 ms, total: 17.7 s Wall time: 9.41 s
%%time
# Tag the same sentence with the Hannanum analyzer; %%time (which must stay on
# the first line of the cell) reports how long the whole cell takes.
from konlpy.tag import Hannanum
han = Hannanum()
print(han.pos(text))
[('파이썬', 'N'), ('을', 'J'), ('활용', 'N'), ('하', 'X'), ('어', 'E'), ('자연어', 'N'), ('분석', 'N'), ('특강', 'N'), ('이', 'J'), ('ㅂ니다', 'E')] CPU times: user 4.28 s, sys: 69.4 ms, total: 4.35 s Wall time: 2.2 s