! apt-get update ! apt-get install g++ openjdk-8-jdk ! pip3 install nltk konlpy wordcloud matplotlib gensim ! apt-get install fonts-nanum* ! apt-get install fontconfig ! fc-cache -fv ! cp /usr/share/fonts/truetype/nanum/Nanum* /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/ ! rm -rf /content/.cache/matplotlib/* speech_text = "https://raw.githubusercontent.com/YongBeomKim/nltk_rnd/master/data/pyongyang_fin.txt" font_file = "/usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/NanumGothicCoding.ttf" # speech_text = "../data/pyongyang_fin.txt" # font_file = "../data/D2Coding.ttf" import nltk nltk.download('punkt') import pandas as pd import matplotlib.pyplot as plt from wordcloud import WordCloud from nltk import FreqDist from nltk.tokenize import word_tokenize # for Colab import requests texts = requests.get(speech_text).text texts[:100] # for LocalHost # with open(speech_text, 'r') as f: # texts = f.read() # texts[:100] %matplotlib inline wcloud = WordCloud(font_file).generate(texts) plt.figure(figsize=(12,12)) plt.imshow(wcloud) plt.axis("off") from collections import Counter dictionary = Counter(texts.split()) dictionary.most_common()[:10] texts[:200] # Twitter 모듈을 활용하여 명사만 추출 from konlpy.tag import Okt twitter = Okt() tokens = twitter.pos(texts, stem=True) tokens_noun = [token[0] for token in tokens if token[1] == "Noun"] texts_noun = " ".join(tokens_noun) texts_noun[:300] %matplotlib inline plt.figure(figsize=(12,12)) wcloud = WordCloud(font_file).generate(texts_noun) plt.imshow(wcloud) plt.axis("off") # Token 빈도결과값 from collections import Counter dictionary = Counter(texts_noun.split()) dictionary.most_common()[:10]