import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.datasets as sk_data
import sklearn.metrics as metrics
from sklearn.cluster import KMeans
#import matplotlib as mpl
import seaborn as sns
%matplotlib inline
Generating our data
X, y = sk_data.make_blobs(n_samples=100, centers=3, n_features=30, center_box=(-10.0, 10.0), random_state=0)
sns.heatmap(X, xticklabels=False, yticklabels=False, linewidths=0,cbar=False)
Computing the pairwise distances for visualization purposes
We can compute pairwise distances using the sklearn.metrics functions summarized here: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
euclidean_dists = metrics.euclidean_distances(X)
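The same matrix can also be computed with the generic metrics.pairwise_distances function, which takes a metric argument; a minimal sketch (the choice of 'cosine' below is purely for illustration):
# pairwise_distances with metric='euclidean' reproduces euclidean_dists;
# other metrics such as 'cosine' or 'manhattan' work the same way.
cosine_dists = metrics.pairwise_distances(X, metric='cosine')
print(cosine_dists.shape)  # (100, 100): one row and one column per sample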
Visualizing the data using the heatmap of pairwise distances
sns.heatmap(euclidean_dists, xticklabels=False, yticklabels=False, linewidths=0, square=True,cbar=False)
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)
kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
labels = kmeans.labels_
error = kmeans.inertia_
print "The total error of the clustering is: ", error
print '\nCluster labels'
print labels
print '\n Cluster Centroids'
print centroids
The total error of the clustering is: 2733.8430819

Cluster labels
[1 2 0 0 2 1 2 2 1 2 0 1 0 1 2 0 0 1 0 2 0 2 0 1 2 2 1 0 0 0 0 1 0 1 0 2 0
 2 0 2 2 2 1 0 1 0 1 2 1 0 0 1 1 1 1 0 1 2 2 0 1 0 2 1 1 2 0 1 1 2 2 2 1 1
 0 1 2 1 2 1 2 2 2 1 0 0 2 1 1 0 1 2 2 0 1 0 0 2 2 0]

Cluster Centroids
[[-4.7833887   5.32946939 -0.87141823  1.38900567 -9.59956915  2.35207348
   2.22988468  2.03394692  8.9797878   3.67857655 -2.67618716 -1.17595897
   3.76433199 -8.46317271  3.28114395  3.73803392 -5.73436869 -7.0844462
  -3.75643598 -3.07904369  1.36974653 -0.95918462  9.91135428 -8.17722281
  -5.8656831  -6.76869078  3.12196673 -4.85745245 -0.70449349 -4.94582258]
 [ 0.88697885  4.29142902  1.93200132  1.10877989 -1.55994342  2.80616392
  -1.11495818  7.74595341  8.92512875 -2.29656298  6.09588722  0.47062896
   1.36408008  8.63168509 -8.54512921 -8.59161818 -9.64308952  6.92270491
   5.65321496  7.29061444  9.58822315  5.79602014 -0.84970449  5.46127493
  -7.77730238  2.75092191 -7.17026663  9.07475984  0.04245798 -1.98719465]
 [-7.0489904  -7.92501873  2.89710462 -7.17088692 -6.01151677 -2.66405834
   6.43970052 -8.20341647  6.54146052 -7.92978843  9.56983319 -0.86327902
   9.25897119  1.73061823  4.84528928 -9.26418246 -4.54021612 -7.47784575
  -4.15060719 -7.85665458 -3.76688414 -1.6692291  -8.78048843  3.78904162
   1.24247168 -4.73618733  0.27327032 -7.93180624  1.59974866  8.78601576]]
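The inertia reported above is simply the sum of squared distances from each point to the centroid of its assigned cluster. A quick sanity check, recomputing it by hand (this check is illustrative, not part of the original analysis):
# centroids[labels] picks, for every point, the centroid of its cluster
manual_error = np.sum((X - centroids[labels]) ** 2)
print(manual_error)  # should agree with kmeans.inertia_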
All the clustering classes provide three methods: fit(), predict(), and fit_predict().
fit() builds the model from the training data (e.g., for k-means, it finds the centroids),
predict() assigns labels to new data using the fitted model,
fit_predict() does both on the same data (e.g., in k-means, it finds the centroids and then assigns labels to that same dataset).
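To make the distinction concrete, here is a minimal sketch; the train/test split is hypothetical and only serves to illustrate the API:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(X, random_state=0)  # hypothetical held-out split
km = KMeans(init='k-means++', n_clusters=3, n_init=10)
km.fit(X_train)                    # learn the centroids from the training data
test_labels = km.predict(X_test)   # assign each held-out point to its nearest centroid
train_labels = km.fit_predict(X_train)  # fit and label the same data in one call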
# Reorder the rows of X so that points in the same cluster are consecutive, then plot
idx = np.argsort(labels)
rX = X[idx, :]
sns.heatmap(rX, xticklabels=False, yticklabels=False, linewidths=0, cbar=False)
# Rearrange the distance matrix in the same way, so that rows/columns
# with the same cluster label are consecutive
# print(labels)
# print(labels[idx])
rearranged_dists = euclidean_dists[idx, :][:, idx]
sns.heatmap(rearranged_dists, xticklabels=False, yticklabels=False, linewidths=0, square=True,cbar=False)
# Compute the clustering error (inertia) for k = 1, ..., 10
error = np.zeros(11)
error[0] = 0
for k in range(1, 11):
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    kmeans.fit_predict(X)
    error[k] = kmeans.inertia_

plt.plot(range(1, len(error)), error[1:])
plt.xlabel('Number of clusters')
plt.ylabel('Error')
Making this a function
def evaluate_clusters(X, max_clusters):
    # Plot the k-means error (inertia) for k = 1, ..., max_clusters
    error = np.zeros(max_clusters + 1)
    error[0] = 0
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
        kmeans.fit_predict(X)
        error[k] = kmeans.inertia_
    plt.plot(range(1, len(error)), error[1:])
    plt.xlabel('Number of clusters')
    plt.ylabel('Error')
evaluate_clusters(X,10)
from sklearn.datasets import fetch_20newsgroups
"""
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
'rec.autos',
'rec.sport.baseball'
]"""
categories = ['alt.atheism', 'sci.space','rec.sport.baseball']
news_data = fetch_20newsgroups(subset='train', categories=categories)
print(news_data.target, len(news_data.target))
print(news_data.target_names)
[0 0 2 ..., 1 1 2] 1670
['alt.atheism', 'rec.sport.baseball', 'sci.space']
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', min_df=4, max_df=0.8)
data = vectorizer.fit_transform(news_data.data)
In a large text corpus some words are extremely frequent (e.g. "the", "a", "is" in English) and therefore carry very little information about the actual contents of a document. If we fed raw count data directly to a classifier, these very frequent terms would overshadow the frequencies of rarer but more interesting terms.
In order to re-weight the counts into floating-point values better suited to a classifier, it is very common to use the tf-idf transform.
Tf stands for term frequency, and tf-idf for term frequency times inverse document frequency. This weighting scheme was originally developed for information retrieval (as a ranking function for search-engine results), and has also found good use in document classification and clustering.
If $N$ is the total number of documents in the corpus $D$, then
$$\text{idf}(t,D)=\log\frac{N}{|\{d\in D\mid t\in d \}|}$$
$$\text{tf-idf}(t,d)=\text{tf}(t,d)\times \text{idf}(t,D)$$
print(type(data), data.shape)
<class 'scipy.sparse.csr.csr_matrix'> (1670, 8238)
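To see the weighting at work on something small, here is a toy example on a hypothetical three-document corpus (the documents are made up; note that scikit-learn's TfidfVectorizer uses a smoothed, normalized variant of the idf formula above, so its values differ slightly from the raw definition):
toy_docs = ['the cat sat', 'the cat ran', 'dogs bark']  # hypothetical corpus
toy_vec = TfidfVectorizer()
toy_tfidf = toy_vec.fit_transform(toy_docs)
print(toy_vec.get_feature_names())   # vocabulary learned from the corpus
print(toy_tfidf.toarray().round(2))  # one row per document, one column per term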
fig, ax1 = plt.subplots(1,1,figsize=(15,10))
sns.heatmap(data[1:100,1:200].todense(), xticklabels=False, yticklabels=False,
linewidths=0, cbar=False, ax=ax1)
print(news_data.target)
print(news_data.target_names)
[0 0 2 ..., 1 1 2]
['alt.atheism', 'rec.sport.baseball', 'sci.space']
evaluate_clusters(data, 10)
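The next two calls rely on helpers that are not defined anywhere above. A plausible sketch of what they could look like follows, assuming ri_evaluate_clusters plots the Adjusted Rand Index of the k-means labels against the ground-truth labels and sc_evaluate_clusters plots the mean Silhouette Coefficient, mirroring the structure of evaluate_clusters:
def ri_evaluate_clusters(X, max_clusters, ground_truth):
    # Adjusted Rand Index of the k-means labels vs. the true labels, for each k
    ri = np.zeros(max_clusters + 1)
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
        kmeans.fit_predict(X)
        ri[k] = metrics.adjusted_rand_score(kmeans.labels_, ground_truth)
    plt.plot(range(1, len(ri)), ri[1:])
    plt.xlabel('Number of clusters')
    plt.ylabel('Adjusted Rand Index')

def sc_evaluate_clusters(X, max_clusters):
    # Mean silhouette coefficient of the clustering, for each k >= 2
    sc = np.zeros(max_clusters + 1)
    for k in range(2, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
        kmeans.fit_predict(X)
        sc[k] = metrics.silhouette_score(X, kmeans.labels_, metric='euclidean')
    plt.plot(range(2, len(sc)), sc[2:])
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')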
ri_evaluate_clusters(data,10,news_data.target)
sc_evaluate_clusters(data,10)
k=4
kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
kmeans.fit_predict(data)
array([1, 1, 3, ..., 2, 2, 3], dtype=int32)
print("Top terms per cluster:")
asc_order_centroids = kmeans.cluster_centers_.argsort()#[:, ::-1]
order_centroids = asc_order_centroids[:,::-1]
terms = vectorizer.get_feature_names()
for i in range(k):
print "Cluster %d:" % i
for ind in order_centroids[i, :10]:
print ' %s' % terms[ind]
print
Top terms per cluster:
Cluster 0: jewish players baseball edu roger princeton hall vb30 lafibm lafayette
Cluster 1: edu god keith com caltech people writes sgi atheists livesey
Cluster 2: edu year team game com baseball cs games runs university
Cluster 3: space nasa edu access henry com gov alaska moon digex
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    styles = open("../theme/custom.css", "r").read()
    return HTML(styles)
css_styling()