# Use rpy2 for loading R datasets
from rpy2.robjects.packages import importr
from rpy2.robjects.packages import data as rdata
from rpy2.robjects import pandas2ri
# Math and data processing
import numpy as np
import scipy as sp
import pandas as pd
# scikit-learn
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
# scipy
from scipy.cluster import hierarchy
# Visualization
from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
mpl.style.use('ggplot')
# USArrests dataset is in R base package
rbase = importr('base')
usarrests_rdf = rdata(rbase).fetch('USArrests')['USArrests']
usarrests = pandas2ri.ri2py(usarrests_rdf)
display(usarrests.head(5))
usarrests.info()
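# Note: pandas2ri.ri2py (used above) exists in rpy2 2.x only. On rpy2 >= 3.0 the
# equivalent conversion goes through a converter context; a minimal sketch, assuming
# rpy2 3.x is installed, would be:
#
#   import rpy2.robjects as ro
#   from rpy2.robjects.conversion import localconverter
#   with localconverter(ro.default_converter + pandas2ri.converter):
#       usarrests = ro.conversion.rpy2py(usarrests_rdf)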
# Mean and variance
print(usarrests.mean())
print(usarrests.var())
# Standardize the data
X = pd.DataFrame(scale(usarrests), index=usarrests.index, columns=usarrests.columns)
print(X.mean())
print(X.var())
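# Sanity check (supplementary): sklearn's scale() uses the population std (ddof=0), so a
# manual standardization with ddof=0 should reproduce X exactly. R's scale() divides by
# the sample std (ddof=1) instead, which is why results here can differ very slightly
# from the book's.
X_manual = (usarrests - usarrests.mean()) / usarrests.std(ddof=0)
print(np.allclose(X_manual, X))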
# Principal Component Analysis
pca = PCA()
usarrests_loadings = pd.DataFrame(pca.fit(X).components_.T, index=usarrests.columns, columns=['PC1', 'PC2', 'PC3', 'PC4'])
display(usarrests_loadings)
usarrests_score = pd.DataFrame(pca.fit_transform(X), index=X.index, columns=['PC1', 'PC2', 'PC3', 'PC4'])
display(usarrests_score)
# Standard deviation, Variance, and EVR of principal components
usarrests_score_stdvar = pd.DataFrame([np.sqrt(pca.explained_variance_), pca.explained_variance_, pca.explained_variance_ratio_], index=['STDEV', 'VAR', 'Explained VAR Ratio'], columns=['PC1', 'PC2', 'PC3', 'PC4'])
display(usarrests_score_stdvar)
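# Cross-check (supplementary): the scores are the centered data projected onto the
# loadings, and the explained variances are the sample variances (ddof=1) of the scores.
print(np.allclose(X.values @ pca.components_.T, usarrests_score))
print(np.allclose(usarrests_score.var(ddof=1), pca.explained_variance_))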
mpl.style.use('seaborn-white')
fig , ax1 = plt.subplots(figsize=(9,7))
ax1.set_xlim(-3.5,3.5)
ax1.set_ylim(-3.5,3.5)
# Plot Principal Components 1 and 2 (PC2 is sign-flipped to match the orientation of the book's figure)
for i in usarrests_score.index:
    ax1.annotate(i, (usarrests_score.PC1.loc[i], -usarrests_score.PC2.loc[i]), ha='center')
# Plot reference lines
ax1.hlines(0,-3.5,3.5, linestyles='dotted', colors='grey')
ax1.vlines(0,-3.5,3.5, linestyles='dotted', colors='grey')
ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
# Plot Principal Component loading vectors, using a second y-axis.
ax2 = ax1.twinx().twiny()
ax2.set_ylim(-1,1)
ax2.set_xlim(-1,1)
ax2.tick_params(axis='y', colors='orange')
ax2.set_xlabel('Principal Component loading vectors', color='orange')
# Plot labels for vectors. Variable 'a' is a small offset parameter to separate arrow tip and text.
a = 1.07
for i in usarrests_loadings[['PC1', 'PC2']].index:
    ax2.annotate(i, (usarrests_loadings.PC1.loc[i]*a, -usarrests_loadings.PC2.loc[i]*a), color='orange')
# Plot vectors (PC2 sign-flipped to match the annotations above)
for i in range(4):
    ax2.arrow(0, 0, usarrests_loadings.PC1.iloc[i], -usarrests_loadings.PC2.iloc[i], width=0.006)
mpl.style.use('ggplot')
plt.figure(figsize=(7,5))
plt.plot([1,2,3,4], pca.explained_variance_ratio_, '-o', label='Individual component')
plt.plot([1,2,3,4], np.cumsum(pca.explained_variance_ratio_), '-s', label='Cumulative')
plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xlim(0.75,4.25)
plt.ylim(0,1.05)
plt.xticks([1,2,3,4])
plt.legend(loc=2);
# Generate data
np.random.seed(2)
X = np.random.standard_normal((50,2))
X[:25,0] = X[:25,0]+3
X[:25,1] = X[:25,1]-4
# K-Means with K=2
km1 = KMeans(n_clusters=2, n_init=20)
km1.fit(X)
km1.labels_
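# The label integers are arbitrary; only the partition matters. A supplementary check
# against the known generating groups (the first 25 rows were shifted):
true_groups = np.repeat([0, 1], 25)
print(pd.crosstab(true_groups, km1.labels_, rownames=['true'], colnames=['kmeans']))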
# K-Means with K=3
np.random.seed(4)
km2 = KMeans(n_clusters=3, n_init=20)
km2.fit(X)
pd.Series(km2.labels_).value_counts()
km2.cluster_centers_
km2.labels_
# Sum of distances of samples to their closest cluster center.
km2.inertia_
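# Inertia computed by hand (supplementary), to make the definition concrete: the squared
# distance from each point to its assigned cluster center, summed over all points.
manual_inertia = ((X - km2.cluster_centers_[km2.labels_])**2).sum()
print(np.isclose(manual_inertia, km2.inertia_))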
# Plots
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(14,5))
ax1.scatter(X[:,0], X[:,1], s=40, c=km1.labels_, cmap=plt.cm.prism)
ax1.set_title('K-Means Clustering Results with K=2')
ax1.scatter(km1.cluster_centers_[:,0], km1.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2)
ax2.scatter(X[:,0], X[:,1], s=40, c=km2.labels_, cmap=plt.cm.prism)
ax2.set_title('K-Means Clustering Results with K=3')
ax2.scatter(km2.cluster_centers_[:,0], km2.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2);
fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))
for linkage, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)],
                       [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)
ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage');
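# To turn a dendrogram into flat cluster labels, scipy provides hierarchy.fcluster.
# A minimal sketch cutting the complete-linkage tree into two clusters:
flat_labels = hierarchy.fcluster(hierarchy.complete(X), t=2, criterion='maxclust')
print(pd.Series(flat_labels).value_counts())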
# NCI60 dataset is in R ISLR package
islr = importr('ISLR')
nci60_rdf = rdata(islr).fetch('NCI60')['NCI60']
list(nci60_rdf.names)
nci60_labs = pd.DataFrame(pandas2ri.ri2py(nci60_rdf[1]))
nci60_data = pd.DataFrame(scale(pandas2ri.ri2py(nci60_rdf[0])), index=nci60_labs[0])
display(nci60_data.head(5))
display(nci60_labs.head(5))
nci60_data.info()
nci60_labs[0].value_counts()
# PCA
pca = PCA()
nci60_pca = pd.DataFrame(pca.fit_transform(nci60_data))
# Plot
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,6))
color_idx = pd.factorize(nci60_labs[0])[0]
cmap = plt.cm.hsv
# Left plot
ax1.scatter(nci60_pca.iloc[:,0], nci60_pca.iloc[:,1], c=color_idx, cmap=cmap, alpha=0.5, s=50)
ax1.set_ylabel('Principal Component 2')
# Right plot
ax2.scatter(nci60_pca.iloc[:,0], nci60_pca.iloc[:,2], c=color_idx, cmap=cmap, alpha=0.5, s=50)
ax2.set_ylabel('Principal Component 3')
# Custom legend for the classes since we do not create scatter plots per class (which could have their own labels).
handles = []
labels = pd.factorize(nci60_labs[0].unique())
norm = mpl.colors.Normalize(vmin=0.0, vmax=14.0)
for i, v in zip(labels[0], labels[1]):
    handles.append(mpl.patches.Patch(color=cmap(norm(i)), label=v, alpha=0.5))
ax2.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# xlabel for both plots
for ax in fig.axes:
    ax.set_xlabel('Principal Component 1')
pd.DataFrame([nci60_pca.iloc[:,:5].std(axis=0, ddof=0).values,
              pca.explained_variance_ratio_[:5],
              np.cumsum(pca.explained_variance_ratio_[:5])],
             index=['Standard Deviation', 'Proportion of Variance', 'Cumulative Proportion'],
             columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
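# Note (supplementary): the standard deviations above use ddof=0, while sklearn's
# eigenvalue-based figures use the n-1 convention, so the two differ slightly:
print(np.sqrt(pca.explained_variance_[:5]))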
nci60_pca.iloc[:,:10].var(axis=0, ddof=0).plot(kind='bar', rot=0)
plt.ylabel('Variances');
# scree plot
fig , (ax1,ax2) = plt.subplots(1,2, figsize=(15,5))
# Left plot
ax1.plot(pca.explained_variance_ratio_, '-o')
ax1.set_ylabel('Proportion of Variance Explained')
ax1.set_ylim(ymin=-0.01)
# Right plot
ax2.plot(np.cumsum(pca.explained_variance_ratio_), '-ro')
ax2.set_ylabel('Cumulative Proportion of Variance Explained')
ax2.set_ylim(ymax=1.05)
for ax in fig.axes:
    ax.set_xlabel('Principal Component')
    ax.set_xlim(-1,65)
# dendrogram
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(20,20))
for linkage, ax in zip([hierarchy.complete(nci60_data), hierarchy.average(nci60_data), hierarchy.single(nci60_data)],
                       [ax1, ax2, ax3]):
    hierarchy.dendrogram(linkage, labels=nci60_data.index, orientation='right', color_threshold=0, leaf_font_size=10, ax=ax)
ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage');
# Cut dendrogram with complete linkage
plt.figure(figsize=(10,20))
cut4 = hierarchy.dendrogram(hierarchy.complete(nci60_data),
labels=nci60_data.index, orientation='right', color_threshold=140, leaf_font_size=10)
plt.vlines(140,0,plt.gca().yaxis.get_data_interval()[1], colors='r', linestyles='dashed');
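# The same cut as flat labels (supplementary): fcluster with the distance criterion at
# the threshold height used above.
nci60_hc_cut = hierarchy.fcluster(hierarchy.complete(nci60_data), t=140, criterion='distance')
print(pd.Series(nci60_hc_cut).value_counts().sort_index())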
# KMeans clustering of NCI60 with K=4
np.random.seed(2)
km_nci60 = KMeans(n_clusters=4, n_init=50)
km_nci60.fit(nci60_data)
km_nci60.labels_
# Observations per KMeans cluster
pd.Series(km_nci60.labels_).value_counts().sort_index()
# Hierarchical clustering: observations per cluster when the tree is cut into 4
nci60_cut = hierarchy.dendrogram(hierarchy.complete(nci60_data), truncate_mode='lastp', p=4, show_leaf_counts=True)
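# Supplementary cross-tabulation of the K-Means partition against the hierarchical one
# (using the flat labels computed from the 140-height cut above):
print(pd.crosstab(km_nci60.labels_, nci60_hc_cut, rownames=['kmeans'], colnames=['hclust']))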
# Hierarchy based on Principal Components 1 to 5
plt.figure(figsize=(10,20))
pca_cluster = hierarchy.dendrogram(hierarchy.complete(nci60_pca.iloc[:,:5]), labels=nci60_labs[0].values, orientation='right', color_threshold=100, leaf_font_size=10)
hierarchy.dendrogram(hierarchy.complete(nci60_pca.iloc[:,:5]), truncate_mode='lastp', p=4,
                     show_leaf_counts=True);
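# Supplementary comparison: flat labels from the tree built on the first 5 PCs versus
# the cut of the full-data tree above.
pca_hc_cut = hierarchy.fcluster(hierarchy.complete(nci60_pca.iloc[:,:5]), t=4, criterion='maxclust')
print(pd.crosstab(pca_hc_cut, nci60_hc_cut, rownames=['hclust on 5 PCs'], colnames=['hclust on all data']))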