! apt-get update
! apt-get install g++ openjdk-8-jdk 
! pip3  install  nltk konlpy wordcloud matplotlib gensim 

! apt-get install fonts-nanum*
! apt-get install fontconfig
! fc-cache -fv
! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/
! rm -rf /content/.cache/matplotlib/*

# Inputs for the analysis: the transcript to analyse and a Korean-capable
# TrueType font handed to WordCloud.
speech_text = "https://raw.githubusercontent.com/YongBeomKim/nltk_rnd/master/data/pyongyang_fin.txt"
# Point at the system font directory the Nanum fonts are installed into,
# rather than at a copy under a python3.6-specific matplotlib directory:
# WordCloud only needs a path to a .ttf file, and this location does not
# depend on the Python version of the runtime.
font_file   = "/usr/share/fonts/truetype/nanum/NanumGothicCoding.ttf"
# Alternatives for a local (non-Colab) run:
# speech_text = "../data/pyongyang_fin.txt"
# font_file   = "../data/D2Coding.ttf"

import nltk

# Fetch NLTK's "punkt" tokenizer data (the resource behind
# nltk.tokenize.word_tokenize, imported below).
nltk.download("punkt")

import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk import FreqDist
from nltk.tokenize import word_tokenize

# for Colab: download the transcript over HTTP.
import requests
# Bound the wait with a timeout and fail loudly on an HTTP error; otherwise
# a hung connection blocks the notebook forever and a 404/500 error page
# would silently be analysed as if it were the transcript.
response = requests.get(speech_text, timeout=30)
response.raise_for_status()
texts = response.text
texts[:100]

# for LocalHost: read the transcript from a local file instead.
# with open(speech_text, 'r') as f:
#     texts = f.read()
# texts[:100]

%matplotlib inline
wcloud = WordCloud(font_file).generate(texts)
plt.figure(figsize=(12,12))
plt.imshow(wcloud)
plt.axis("off")

from collections import Counter

# Frequency of whitespace-separated tokens in the raw transcript; show the
# ten most frequent, then peek at the first 200 characters of the text.
dictionary = Counter(texts.split())
dictionary.most_common(10)

texts[:200]

# Extract only the nouns using KoNLPy's Okt (formerly "Twitter") tagger.
from konlpy.tag import Okt

twitter = Okt()
# pos() yields (word, tag) pairs; stem=True is passed through to the tagger.
tokens = twitter.pos(texts, stem=True)
tokens_noun = [word for word, tag in tokens if tag == "Noun"]

# Re-join the nouns into one space-separated string for WordCloud/Counter.
texts_noun = " ".join(tokens_noun)
texts_noun[:300]

%matplotlib inline
plt.figure(figsize=(12,12))
wcloud = WordCloud(font_file).generate(texts_noun)
plt.imshow(wcloud)
plt.axis("off")

# Token frequency results: the ten most frequent nouns.
from collections import Counter

dictionary = Counter(texts_noun.split())
dictionary.most_common(10)