# ! apt-get update
# ! apt-get install g++ openjdk-8-jdk
# ! pip3 install nltk konlpy wordcloud matplotlib gensim
# ! apt-get install fonts-nanum*
# ! apt-get install fontconfig
# ! fc-cache -fv
# ! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/
# ! rm -rf /content/.cache/matplotlib/*
# speech_text = "https://raw.githubusercontent.com/YongBeomKim/nltk_rnd/master/data/pyongyang_fin.txt"
# font_file = "/usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/NanumGothicCoding.ttf"
# Local input paths (the commented-out assignments just above are the
# Colab alternatives for the same two names).
# Text file that is opened and read into `texts` further down.
speech_text = "../data/pyongyang_fin.txt"
# TrueType font file handed to WordCloud when rendering; presumably chosen
# because it contains Hangul glyphs -- TODO confirm.
font_file = "../data/D2Coding.ttf"
# import nltk
# nltk.download('punkt')
import pandas as pd
from nltk import FreqDist
from nltk.tokenize import word_tokenize
# for Colab
# import requests
# texts = requests.get(speech_text).text
# texts[:100]
# for LocalHost
# Load the whole speech into memory as a single string.
# The encoding is stated explicitly because the corpus is Korean text: without
# it, open() decodes with the platform's locale encoding (for example cp949 on
# Korean Windows), which can raise UnicodeDecodeError or silently garble the
# text. NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open(speech_text, "r", encoding="utf-8") as f:
    texts = f.read()
texts[:100]  # notebook preview: first 100 characters of the loaded text
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Word cloud of the raw speech text, before any morphological filtering.
# font_file is passed as WordCloud's font_path so the words are drawn with
# that font.
wcloud = WordCloud(
    font_path=font_file,
).generate(texts)
plt.figure(figsize=(12, 12))
plt.imshow(wcloud)
plt.axis("off")  # hide the axes; only the rendered image is of interest
from collections import Counter
# Count every whitespace-delimited token of the raw text.
dictionary = Counter()
dictionary.update(texts.split())
# Notebook output: all (token, count) pairs, most frequent first.
dictionary.most_common()
# Extract only the nouns using KoNLPy's Okt tagger (the class KoNLPy
# previously exposed as Twitter, hence the variable name).
from konlpy.tag import Okt
twitter = Okt()
# pos() is consumed below as a sequence of (word, tag) pairs; stem=True is
# forwarded to the tagger.
tokens = twitter.pos(texts, stem=True)
# Keep the surface form of every token whose tag is exactly "Noun".
tokens_noun = [word for word, tag in tokens if tag == "Noun"]
# Re-join with single spaces so the result can be fed to WordCloud and
# str.split() like ordinary text.
texts_noun = " ".join(tokens_noun)
texts_noun[:300]  # notebook preview: first 300 characters
%matplotlib inline
# Word cloud of the noun-only text, drawn with the font in font_file.
wcloud = WordCloud(font_path=font_file).generate(texts_noun)
plt.figure(figsize=(12, 12))
plt.imshow(wcloud)
plt.axis("off")  # hide the axes; only the rendered image is of interest
# Token frequency results: count each noun and show the ten most frequent.
from collections import Counter
dictionary = Counter(texts_noun.split())
# most_common(10) returns the same ten leading (token, count) pairs as
# slicing the full ranking.
dictionary.most_common(10)