Analyzing the Newsgroups Dataset
Natural language analysis with NLTK
# Stemming: extract the word stem
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
porter_stemmer.stem('machines')
'machin'
# Lemmatization: recover the base form of a token
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('machines')
'machine'
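The two normalizers differ most on inflected verbs: the Porter stemmer strips suffixes by rule, while WordNetLemmatizer treats its input as a noun unless told otherwise. A minimal sketch contrasting them (pos='v' is a standard lemmatize argument):
# Contrast stemming and lemmatization on a verb form
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem('running'))                   # 'run' (rule-based suffix stripping)
print(lemmatizer.lemmatize('running'))           # 'running' (treated as a noun by default)
print(lemmatizer.lemmatize('running', pos='v'))  # 'run' (treated as a verb)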
# Fetch the newsgroups data (about 14 MB on first download)
# Newsgroups are labeled 0-19 (a list of 20 categories)
from sklearn.datasets import fetch_20newsgroups
groups = fetch_20newsgroups(data_home='data/news/')
groups['target_names']
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
# Inspect the groups Bunch object (a dict-like container, not JSON)
for key in groups.keys():
    print(key, ':', type(groups[key]))
groups.keys()
data : <class 'list'>
filenames : <class 'numpy.ndarray'>
target_names : <class 'list'>
target : <class 'numpy.ndarray'>
DESCR : <class 'str'>
dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
# Each post's newsgroup is encoded as an integer class label
# np.unique returns the distinct labels, sorted and de-duplicated
import numpy as np
print(groups.target)
np.unique(groups.target)
[7 4 4 ... 3 1 8]
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
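np.unique confirms all 20 labels are present; to see how many posts each class holds, a quick sketch (not in the original) using np.bincount:
# Count posts per class; the training split is roughly balanced
counts = np.bincount(groups.target)
for name, count in zip(groups.target_names, counts):
    print('{:<25s} {:>4d}'.format(name, count))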
Inspecting post 0
# Look at the content of a sample post
print("News Group 포함된 자료갯수: {:,} 개\n\n0번 샘플보기: \n{}".format(
len(groups.data), groups.data[0]))
Number of posts in the newsgroups data: 11,314

Sample post 0:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----
# Check which newsgroup category post 0 belongs to
groups.target_names[groups.target[0]]
'rec.autos'
%matplotlib inline
# Suppress the FutureWarning raised inside seaborn's internals
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Visualize how the posts are distributed across the 20 newsgroups
# The 11,314 posts are spread fairly evenly over the categories
import matplotlib.pyplot as plt
import seaborn as sns
sns.distplot(groups.target)
plt.grid()
MatplotlibDeprecationWarning: The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.
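distplot itself is deprecated in later seaborn releases; on seaborn 0.11+ the same picture can be drawn with histplot. A sketch, assuming a newer environment than the one used here:
# Modern replacement for the deprecated distplot call above
sns.histplot(groups.target, bins=20)
plt.xlabel('Newsgroup index')
plt.ylabel('Number of posts')
plt.show()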
Examining CountVectorizer() parameters
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Keep only the 500 most frequent words
cv = CountVectorizer(stop_words="english", max_features=500)
transformed = cv.fit_transform(groups.data)
print(cv.get_feature_names()[:100])
# The first 100 feature names (alphabetical order): numbers and symbol-like tokens with little discriminative power slip in
['00', '000', '0d', '0t', '10', '100', '11', '12', '13', '14', '145', '15', '16', '17', '18', '19', '1993', '1d9', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '34u', '35', '40', '45', '50', '55', '80', '92', '93', '__', '___', 'a86', 'able', 'ac', 'access', 'actually', 'address', 'ago', 'agree', 'al', 'american', 'andrew', 'answer', 'anybody', 'apple', 'application', 'apr', 'april', 'area', 'argument', 'armenian', 'armenians', 'article', 'ask', 'asked', 'att', 'au', 'available', 'away', 'ax', 'b8f', 'bad', 'based', 'believe', 'berkeley', 'best', 'better', 'bible', 'big', 'bike', 'bit', 'black', 'board', 'body', 'book', 'box', 'buy', 'ca', 'california', 'called', 'came', 'canada', 'car', 'card', 'care', 'case', 'cause']
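One way to drop those low-value numeric tokens at the vectorizer level (a sketch, not part of the original flow) is a custom token_pattern that only admits alphabetic tokens of two or more letters:
# Restrict tokens to purely alphabetic strings of length >= 2
cv_alpha = CountVectorizer(stop_words="english", max_features=500,
                           token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
cv_alpha.fit(groups.data)
print(cv_alpha.get_feature_names()[:20])  # no '00', '0d', '34u', ... entries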
# Histogram of the log total counts of the 500 extracted words
sns.distplot(np.log(transformed.toarray().sum(axis=0)))
plt.xlabel('Log Count')
plt.ylabel('Frequency')
plt.title('Distribution Plot of 500 Word Counts')
plt.grid(); plt.show()
# str.isalpha(), used below to filter for alphabetic tokens
'names'.isalpha()
True
%%time
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
# Keep only purely alphabetic tokens
def letters_only(astr):
    for c in astr:
        if not c.isalpha():
            return False
    return True
# Lemmatize each token, dropping non-alphabetic tokens and person names
all_names = set(names.words())
lemmatizer = WordNetLemmatizer()
cleaned = [' '.join([lemmatizer.lemmatize(word.lower())
                     for word in post.split()
                     if letters_only(word) and word not in all_names])
           for post in groups.data]
# reuse cv from above: CountVectorizer(stop_words="english", max_features=500)
transformed = cv.fit_transform(cleaned)
print(cv.get_feature_names()[:10])
len(names.words())
['able', 'accept', 'access', 'according', 'act', 'action', 'actually', 'add', 'address', 'ago']
CPU times: user 12.6 s, sys: 30.8 ms, total: 12.6 s
Wall time: 12.6 s
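A common variation at this point (a sketch, not used below) is to weight terms with TF-IDF rather than raw counts, which downweights words that appear in almost every post:
# TF-IDF weighting as an alternative to raw counts
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words="english", max_features=500)
transformed_tfidf = tfidf.fit_transform(cleaned)
print(transformed_tfidf.shape)  # expect (11314, 500)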
Clustering with KMeans (key parameters: n_clusters, n_init, max_iter)
%%time
# Group the posts with k-means clustering
from sklearn.cluster import KMeans
km = KMeans(n_clusters=20, n_jobs=-1)
km.fit(transformed)
labels = groups.target
plt.scatter(labels, km.labels_)
plt.xlabel('Newsgroup'); plt.ylabel('Cluster')
plt.show()
CPU times: user 448 ms, sys: 401 ms, total: 848 ms
Wall time: 1min 28s
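The scatter plot is hard to judge by eye; agreement between the clusters and the true labels can be quantified with sklearn's standard clustering metrics (a sketch reusing the fitted km above):
# Label-permutation-invariant measures of cluster quality
from sklearn import metrics
print('Homogeneity  : {:.3f}'.format(metrics.homogeneity_score(labels, km.labels_)))
print('Completeness : {:.3f}'.format(metrics.completeness_score(labels, km.labels_)))
print('Adjusted Rand: {:.3f}'.format(metrics.adjusted_rand_score(labels, km.labels_)))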
%%time
from sklearn.decomposition import NMF
# Factor the count matrix into 100 topics with non-negative matrix factorization
nmf = NMF(n_components=100, random_state=43).fit(transformed)
# Print the 8 highest-weighted words for each topic
for topic_idx, topic in enumerate(nmf.components_):
    label = '{}: '.format(topic_idx)
    print(label, " ".join([cv.get_feature_names()[i]
                           for i in topic.argsort()[:-9:-1]]))
0: wa thought later took left order seen taken
1: db bit data place stuff add time line
2: server using display screen support code mouse application
3: file section information write source change entry number
4: disk drive hard controller support card board head
5: entry rule program source number info email build
6: new york sale change service result study early
7: image software user package using display include support
8: window manager application using offer user information course
9: gun united control house american second national issue
10: hockey league team game division player list san
11: turkish government sent war study came american world
12: program change technology display information version application rate
13: space nasa technology service national international small communication
14: government political federal sure free private local country
15: output line open write read return build section
16: people country doing tell live killed lot saying
17: widget application value set type return function list
18: child case rate le report area research group
19: jew jewish world war history help research arab
20: armenian russian muslim turkish world city road today
21: president said group tax press working package job
22: ground box usually power code current house white
23: russian president american support food money important private
24: ibm color week memory hardware monitor software standard
25: anonymous posting service server user group message post
26: la win san went list year radio near
27: work job young school lot private create business
28: encryption technology access device policy security government data
29: tape driver work memory using cause note following
30: war military world attack way united russian force
31: god bible shall man come life hell love
32: atheist religious religion belief god sort feel idea
33: data available information user research set model based
34: center research medical institute national study test north
35: think lot try trying talk kind agree certainly
36: water city division list public similar north high
37: section military shall weapon person division application mean
38: good cover great pretty probably bad issue life
39: drive head single mode set using model type
40: israeli arab attack policy true apr fact stop
41: use note using usually similar available standard work
42: know tell way come sure understand let saw
43: car speed driver change high buy different design
44: internet email address information anonymous user network mail
45: like look sound long little guy pretty having
46: going come way mean kind sure working got
47: state united public national political federal member local
48: dod bike member computer list started live email
49: greek killed act word western muslim turkish talk
50: computer information public internet list issue network communication
51: law act federal specific issue clear order moral
52: book read reference list copy second study offer
53: argument form true evidence event truth particular known
54: make sense difference little sure making end tell
55: scsi hard pc drive device bus different data
56: time long having able lot order light response
57: gun rate crime city death study control difference
58: right second free shall security mean left american
59: went came said told started saw took woman
60: power period second san special le play goal
61: used using product way function version note single
62: problem work having using help apple running error
63: available version widget server includes sun set support
64: question answer ask asked science reason claim post
65: san information police said group league political including
66: number serial large men report following million le
67: year ago old best sale hit long project
68: want help let life reason trying copy tell
69: point way different line algorithm exactly idea view
70: run running home version start hit win speed
71: got shot play took goal went hit lead
72: thing saw sure got trying kind seen asked
73: graphic send mail message package server various computer
74: university science department general computer thanks engineering texas
75: just maybe start thought big probably look getting
76: key message public security algorithm standard method attack
77: doe mean anybody actually different ask reading difference
78: game win sound play left second lead great
79: ha able called taken given past exactly looking
80: believe belief christian truth evidence claim mean different
81: drug study information war group reason usa evidence
82: need help phone able needed kind thanks bike
83: did death let money fact man wanted body
84: chip clipper serial algorithm phone communication encryption key
85: card driver video support mode mouse board bus
86: church christian member group true bible different view
87: ftp available anonymous general nasa package source version
88: better player best play probably hit maybe big
89: human life person moral kill claim reason world
90: bit using let change mode attack size quite
91: say mean word act clear said read simply
92: health medical public national care study service user
93: article post usa read world discussion opinion gmt
94: team player win play city look bad great
95: day come word christian said tell little way
96: really lot sure look fact idea actually feel
97: unit disk size serial total national got return
98: image color version free available display current better
99: woman men muslim religion way man great world
CPU times: user 52.3 s, sys: 32.2 s, total: 1min 24s
Wall time: 38.7 s
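Beyond the per-topic word lists, nmf.transform maps each post to a vector of topic weights; a sketch (reusing the fitted objects above) that reports each post's dominant topic:
# Assign each post its highest-weight NMF topic
doc_topic = nmf.transform(transformed)
dominant = doc_topic.argmax(axis=1)
for i in range(3):
    print('post {} -> topic {}'.format(i, dominant[i]))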