%load_ext autoreload
%autoreload 2
import sys, os
sys.path.append('/Users/arimorcos/Github/getRedditDataset/')
#sys.path.append('D:\Documents\GitHub\getRedditDataset')
#from celebReddit import countWords
import redditDB
import datetime
import praw
from matplotlib import pyplot as pp
from matplotlib import colorbar as cb
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as PD
import scipy
from scipy import spatial
from sklearn import cluster, decomposition
%matplotlib inline
def addCopyright(ax, xOffset=0, yOffset=0):
    """Add a copyright notice below the axes; 3D axes require an extra z argument."""
    year = datetime.date.today().year
    if hasattr(ax, '_axis3don'):  # crude check for an Axes3D instance
        ax.text(0 + xOffset, -.05 + yOffset, 0, r'$\copyright$ Ari Morcos ' + str(year), transform=ax.transAxes)
    else:
        ax.text(0 + xOffset, -.05 + yOffset, r'$\copyright$ Ari Morcos ' + str(year), transform=ax.transAxes)
""" Establish database connection """
db = redditDB.RedditDB('March2-8_Top200')
""" Get comments from subreddits """
if not os.path.isfile('popSubs.npy'):
    # fetch the 200 most popular subreddits and cache them locally
    subList = praw.Reddit(user_agent='pop').get_popular_subreddits(limit=200)
    subList = [sub.display_name for sub in subList]
    np.save('popSubs.npy', subList)
else:
    subList = list(np.load('popSubs.npy'))
comments = db.getSubredditCommentText(subList[0])
""" Strip formatting """
comments = [''.join(s for s in comment if ord(s)>31 and ord(s)<126) for comment in comments]
""" Enumerate words to get list and values """
def countNWords(allWords):
    """Return a dict mapping each word to its number of occurrences."""
    wordCounts = {}
    for word in allWords:
        if word in wordCounts:
            wordCounts[word] += 1
        else:
            wordCounts[word] = 1
    return wordCounts
""" Concatenate all comments together and split into individual words """
allComments = " ".join(comments)
allWords = allComments.split()
allWords = [word.lower() for word in allWords]
wordCounts = countNWords(allWords)
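""" Aside (not in the original notebook): countNWords is equivalent to the standard-library Counter """
from collections import Counter
# Counter produces the same word -> count mapping as countNWords
assert dict(Counter(allWords)) == wordCounts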
""" sort by values and get 100 most frequent words"""
sortedWords = sorted(wordCounts, key=wordCounts.get, reverse=True)
mostFreqWords = sortedWords[0:100]
freqWordCounts = [wordCounts[word] for word in mostFreqWords]
reload = False
usedSubs = subList[0:50]
if not os.path.isfile('allCounts.npy') or reload:
    allCounts = np.zeros(shape=(len(mostFreqWords), len(usedSubs)))
    for subInd, sub in enumerate(usedSubs):
        # get comments and strip formatting
        comments = db.getSubredditCommentText(sub)
        comments = [''.join(s for s in comment if 31 < ord(s) < 126) for comment in comments]
        # concatenate all comments and split into individual lowercase words
        allComments = " ".join(comments)
        allWords = allComments.split()
        allWords = [word.lower() for word in allWords]
        # count words
        wordCounts = countNWords(allWords)
        # get total word count for normalization
        totWords = float(len(allWords))
        # frequency of each of the 100 most common words within this subreddit
        freqWordCounts = [wordCounts.get(word, 0) for word in mostFreqWords]
        freqWordCounts = [i / totWords for i in freqWordCounts]
        # store as one column per subreddit
        allCounts[:, subInd] = freqWordCounts
    # save
    np.save('allCounts', allCounts)
else:
    allCounts = np.load('allCounts.npy')
""" get pairwise distance """
normMax = np.max(allCounts, axis=1)
normCounts = np.transpose(np.divide(np.transpose(allCounts), normMax))
distVec = spatial.distance.pdist(np.transpose(normCounts), 'euclidean')
distMat = spatial.distance.squareform(distVec)
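""" Sanity check (an aside): pdist returns a condensed vector; squareform expands it to a square matrix """
nSubs = len(usedSubs)
assert distVec.shape == (nSubs * (nSubs - 1) // 2,)  # one entry per unordered pair
assert distMat.shape == (nSubs, nSubs)               # symmetric, zero diagonal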
""" print top words """
print '100 Most Frequent Words'
for word, ind in zip(mostFreqWords, range(len(mostFreqWords))):
print str(ind+1) + ': ' + word
100 Most Frequent Words
1: the  2: a  3: to  4: i  5: and  6: of  7: is  8: you  9: that  10: it
11: in  12: this  13: for  14: was  15: on  16: my  17: have  18: like  19: with  20: not
21: be  22: just  23: but  24: are  25: they  26: it's  27: he  28: your  29: if  30: so
31: at  32: as  33: or  34: what  35: one  36: me  37: all  38: get  39: don't  40: would
41: can  42: i'm  43: when  44: from  45: an  46: about  47: do  48: we  49: his  50: out
51: up  52: how  53: no  54: people  55: that's  56: more  57: think  58: there  59: some  60: because
61: know  62: had  63: has  64: who  65: by  66: only  67: their  68: she  69: then  70: really
71: than  72: them  73: it.  74: will  75: you're  76: see  77: even  78: why  79: got  80: time
81: did  82: were  83: make  84: could  85: good  86: go  87: never  88: her  89: been  90: those
91: still  92: other  93: where  94: actually  95: also  96: much  97: looks  98: going  99: want  100: didn't
%matplotlib inline
fig, ax = pp.subplots()
fig.set_figheight(15)
fig.set_figwidth(15)
axMat = ax.matshow(distMat)
ax.set_xticks(range(len(distMat)))
ax.set_xticklabels(usedSubs, rotation=90);  # distMat covers only the 50 subreddits in usedSubs
ax.set_yticks(range(len(distMat)))
ax.set_yticklabels(usedSubs);
cbar = fig.colorbar(axMat, shrink=0.8)
cbar.set_label('Euclidean distance')
addCopyright(ax)
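""" Optional aside (not in the original analysis): reorder the matrix so similar subreddits sit together """
from scipy.cluster import hierarchy
order = hierarchy.leaves_list(hierarchy.linkage(distVec, method='average'))
orderedMat = distMat[np.ix_(order, order)]  # plotting orderedMat gives a blockier, easier-to-read heatmap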
""" Plot most 10 most similar pairs"""
x = distMat.copy()
np.fill_diagonal(x, np.inf)
for ind in range(30):
minInd = np.argmin(x)
inds = np.unravel_index(minInd, distMat.shape)
print usedSubs[inds[0]] + ', ' + usedSubs[inds[1]] + ': ' + str(x[inds])
x[inds] = np.inf
x[inds[1], inds[0]] = np.inf
pics, WTF: 0.543759113231
worldnews, politics: 0.599345581854
leagueoflegends, DotA2: 0.601210671981
funny, WTF: 0.602313380524
funny, gifs: 0.621706933305
gifs, videos: 0.626716585181
funny, pics: 0.640588437023
funny, videos: 0.651860141535
WTF, videos: 0.693787405031
worldnews, news: 0.703743248721
news, politics: 0.706332605302
GlobalOffensive, DotA2: 0.70882878253
pics, videos: 0.710356004246
leagueoflegends, GlobalOffensive: 0.716566799853
pics, gifs: 0.717228536034
gifs, WTF: 0.723489229483
soccer, nba: 0.751889065628
pics, mildlyinteresting: 0.783822840038
gaming, pokemon: 0.796209985884
nfl, hockey: 0.799350222545
soccer, hockey: 0.821293217825
nba, nfl: 0.827574873963
WTF, mildlyinteresting: 0.841931458449
nba, hockey: 0.856905008487
funny, Jokes: 0.887100200675
news, atheism: 0.924369948462
funny, mildlyinteresting: 0.925252089592
funny, reactiongifs: 0.931403241427
DotA2, hearthstone: 0.934018748533
leagueoflegends, hearthstone: 0.965835121013
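""" Aside: the same ranking without the masking loop, using only the upper triangle of distMat """
iu = np.triu_indices(len(distMat), k=1)  # row/column indices of each unique pair
order = np.argsort(distMat[iu])          # ascending distance: most similar pairs first
for i, j in zip(iu[0][order[:30]], iu[1][order[:30]]):
    print(usedSubs[i] + ', ' + usedSubs[j] + ': ' + str(distMat[i, j]))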
""" Plot most 10 most dissimilar pairs"""
x = distMat.copy()
np.fill_diagonal(x, -1*np.inf)
for ind in range(30):
maxInd = np.argmax(x)
inds = np.unravel_index(maxInd, distMat.shape)
print usedSubs[inds[0]] + ', ' + usedSubs[inds[1]] + ': ' + str(x[inds])
x[inds] = -1*np.inf
x[inds[1], inds[0]] = -1*np.inf
thewalkingdead, circlejerk: 3.72546853929
gentlemanboners, circlejerk: 3.63429059482
Celebs, circlejerk: 3.51177898884
atheism, circlejerk: 3.45955903137
tumblr, circlejerk: 3.45577212578
CitiesSkylines, circlejerk: 3.45192167389
DestinyTheGame, circlejerk: 3.4319856229
nba, circlejerk: 3.42767314984
cringepics, circlejerk: 3.42654051308
nfl, circlejerk: 3.4203558873
TrollXChromosomes, circlejerk: 3.40431403526
tifu, circlejerk: 3.3686961706
technology, circlejerk: 3.34521983014
GlobalOffensive, circlejerk: 3.33084585987
science, circlejerk: 3.32799725149
Showerthoughts, circlejerk: 3.32153626566
trees, circlejerk: 3.32067892558
SquaredCircle, circlejerk: 3.32063189186
worldnews, circlejerk: 3.30517637544
TumblrInAction, circlejerk: 3.30423636681
reactiongifs, circlejerk: 3.28984964383
woahdude, circlejerk: 3.28281824828
soccer, circlejerk: 3.28108418512
movies, circlejerk: 3.27862732158
news, circlejerk: 3.26794021605
fatpeoplehate, circlejerk: 3.26387350263
leagueoflegends, circlejerk: 3.26382575757
gaming, circlejerk: 3.26274468077
worldnews, gentlemanboners: 3.25785124757
politics, gentlemanboners: 3.25149424013
""" perform pca """
pcaObj = decomposition.PCA()
reducedCounts = pcaObj.fit_transform(normCounts.transpose())[:,0:3]
""" plot cumsum of variance explained """
varExp = np.cumsum(pcaObj.explained_variance_ratio_)
fig = pp.figure(figsize=(15,15))
ax = pp.subplot()
ax.plot(varExp, marker='o');
ax.set_xlabel("Number of principal components")
ax.set_ylabel("Cumulative variance explained")
ax.set_ylim(bottom=0, top=1)
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels()):
    item.set_fontsize(20)
addCopyright(ax)
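""" Aside: components needed to reach a variance threshold (90% here is an arbitrary example) """
nComponents = int(np.searchsorted(varExp, 0.9)) + 1  # varExp is monotonically increasing
print(str(nComponents) + ' components explain 90% of the variance')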
""" cluster points """
apObj = cluster.AffinityPropagation(damping=0.75)
apObj.fit(reducedCounts);
len(set(apObj.labels_))
7
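""" Aside (this output is not in the original notebook): list the members of each cluster """
for lab in sorted(set(apObj.labels_)):
    members = [usedSubs[i] for i in np.where(apObj.labels_ == lab)[0]]
    print('Cluster ' + str(lab) + ': ' + ', '.join(members))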
# plot 3d
#%matplotlib qt
%matplotlib inline
fig = pp.figure(figsize=(15,15))
ax = pp.subplot(projection='3d')
ax.scatter(reducedCounts[:,0], reducedCounts[:,1], reducedCounts[:,2], s=120, c=apObj.labels_)
ax.set_xlabel('PC1'); ax.set_ylabel('PC2'); ax.set_zlabel('PC3')
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label, ax.zaxis.label] +
             ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
    item.set_fontsize(20)
addCopyright(ax)
%matplotlib inline
# plot 2d
fig = pp.figure(figsize=(15,15))
ax2 = pp.subplot()
ax2.scatter(reducedCounts[:,0], reducedCounts[:,1], s=50, c=apObj.labels_)
ax2.set_xlabel('PC1'); ax2.set_ylabel('PC2')
# get distance between all points
dists = spatial.distance.squareform(spatial.distance.pdist(reducedCounts[:,0:2], 'euclidean'))
np.fill_diagonal(dists,np.inf)
# label points, nudging labels with a close neighbor to reduce overlap
for point in range(len(usedSubs)):
    yOffset = 0
    closePoints = np.where(dists[point, :] < 0.05)[0]
    if closePoints.size > 0:
        yOffset = 0.01
        # if this point sits below its close neighbors, nudge the label down instead
        if reducedCounts[point, 1] < np.min(reducedCounts[closePoints, 1]):
            yOffset = -yOffset
    if point == 37 or point == 23:  # ALL THE KLUGE: manual nudges for specific overlapping labels
        yOffset = -0.02
    elif point == 27:
        yOffset = 0.02
    ax2.text(reducedCounts[point, 0] + 0.02, reducedCounts[point, 1] + yOffset, usedSubs[point],
             size=10, rotation=0, va='center', ha='left')
for item in ([ax2.title, ax2.xaxis.label, ax2.yaxis.label] +
             ax2.get_xticklabels() + ax2.get_yticklabels()):
    item.set_fontsize(20)
ax2.set_xlim(left=-1.6, right=1.6)
addCopyright(ax2)