Chapter 10 - Unsupervised Learning

In [1]:
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from scipy.cluster import hierarchy

%matplotlib inline
plt.style.use('seaborn-white')  # in newer Matplotlib releases this style is named 'seaborn-v0_8-white'

Lab 1: Principal Component Analysis

In [2]:
# The USArrests dataset is part of the base R distribution; it was exported from R to a csv file.
df = pd.read_csv('Data/USArrests.csv', index_col=0)
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, Alabama to Wyoming
Data columns (total 4 columns):
Murder      50 non-null float64
Assault     50 non-null int64
UrbanPop    50 non-null int64
Rape        50 non-null float64
dtypes: float64(2), int64(2)
memory usage: 2.0+ KB
In [3]:
df.mean()
Out[3]:
Murder        7.788
Assault     170.760
UrbanPop     65.540
Rape         21.232
dtype: float64
In [4]:
df.var()
Out[4]:
Murder        18.970465
Assault     6945.165714
UrbanPop     209.518776
Rape          87.729159
dtype: float64
In [5]:
# Standardize the variables before PCA; otherwise Assault (variance ~6945) would dominate the components.
X = pd.DataFrame(scale(df), index=df.index, columns=df.columns)
In [6]:
# The loading vectors
pca_loadings = pd.DataFrame(PCA().fit(X).components_.T, index=df.columns, columns=['V1', 'V2', 'V3', 'V4'])
pca_loadings
Out[6]:
V1 V2 V3 V4
Murder 0.535899 0.418181 -0.341233 0.649228
Assault 0.583184 0.187986 -0.268148 -0.743407
UrbanPop 0.278191 -0.872806 -0.378016 0.133878
Rape 0.543432 -0.167319 0.817778 0.089024
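As a quick sanity check (a sketch added here, not part of the original R lab), the columns of this loading matrix should form an orthonormal set:

V = pca_loadings.values
np.allclose(V.T @ V, np.eye(4))   # should evaluate to True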
In [7]:
# Fit the PCA model and transform X to get the principal components
pca = PCA()
df_plot = pd.DataFrame(pca.fit_transform(X), columns=['PC1', 'PC2', 'PC3', 'PC4'], index=X.index)
df_plot
Out[7]:
PC1 PC2 PC3 PC4
Alabama 0.985566 1.133392 -0.444269 0.156267
Alaska 1.950138 1.073213 2.040003 -0.438583
Arizona 1.763164 -0.745957 0.054781 -0.834653
Arkansas -0.141420 1.119797 0.114574 -0.182811
California 2.523980 -1.542934 0.598557 -0.341996
Colorado 1.514563 -0.987555 1.095007 0.001465
Connecticut -1.358647 -1.088928 -0.643258 -0.118469
Delaware 0.047709 -0.325359 -0.718633 -0.881978
Florida 3.013042 0.039229 -0.576829 -0.096285
Georgia 1.639283 1.278942 -0.342460 1.076797
Hawaii -0.912657 -1.570460 0.050782 0.902807
Idaho -1.639800 0.210973 0.259801 -0.499104
Illinois 1.378911 -0.681841 -0.677496 -0.122021
Indiana -0.505461 -0.151563 0.228055 0.424666
Iowa -2.253646 -0.104054 0.164564 0.017556
Kansas -0.796881 -0.270165 0.025553 0.206496
Kentucky -0.750859 0.958440 -0.028369 0.670557
Louisiana 1.564818 0.871055 -0.783480 0.454728
Maine -2.396829 0.376392 -0.065682 -0.330460
Maryland 1.763369 0.427655 -0.157250 -0.559070
Massachusetts -0.486166 -1.474496 -0.609497 -0.179599
Michigan 2.108441 -0.155397 0.384869 0.102372
Minnesota -1.692682 -0.632261 0.153070 0.067317
Mississippi 0.996494 2.393796 -0.740808 0.215508
Missouri 0.696787 -0.263355 0.377444 0.225824
Montana -1.185452 0.536874 0.246889 0.123742
Nebraska -1.265637 -0.193954 0.175574 0.015893
Nevada 2.874395 -0.775600 1.163380 0.314515
New Hampshire -2.383915 -0.018082 0.036855 -0.033137
New Jersey 0.181566 -1.449506 -0.764454 0.243383
New Mexico 1.980024 0.142849 0.183692 -0.339534
New York 1.682577 -0.823184 -0.643075 -0.013484
North Carolina 1.123379 2.228003 -0.863572 -0.954382
North Dakota -2.992226 0.599119 0.301277 -0.253987
Ohio -0.225965 -0.742238 -0.031139 0.473916
Oklahoma -0.311783 -0.287854 -0.015310 0.010332
Oregon 0.059122 -0.541411 0.939833 -0.237781
Pennsylvania -0.888416 -0.571100 -0.400629 0.359061
Rhode Island -0.863772 -1.491978 -1.369946 -0.613569
South Carolina 1.320724 1.933405 -0.300538 -0.131467
South Dakota -1.987775 0.823343 0.389293 -0.109572
Tennessee 0.999742 0.860251 0.188083 0.652864
Texas 1.355138 -0.412481 -0.492069 0.643195
Utah -0.550565 -1.471505 0.293728 -0.082314
Vermont -2.801412 1.402288 0.841263 -0.144890
Virginia -0.096335 0.199735 0.011713 0.211371
Washington -0.216903 -0.970124 0.624871 -0.220848
West Virginia -2.108585 1.424847 0.104775 0.131909
Wisconsin -2.079714 -0.611269 -0.138865 0.184104
Wyoming -0.629427 0.321013 -0.240659 -0.166652
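These scores are simply the standardized data projected onto the loading vectors; a minimal sketch of that check, using the objects defined above:

# Project the standardized data onto the loadings and compare with fit_transform's output.
np.allclose(X.values @ pca_loadings.values, df_plot.values)   # should evaluate to True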
In [8]:
fig , ax1 = plt.subplots(figsize=(9,7))

ax1.set_xlim(-3.5,3.5)
ax1.set_ylim(-3.5,3.5)

# Plot Principal Components 1 and 2 (PC2 is sign-flipped to match the orientation of the book's biplot)
for i in df_plot.index:
    ax1.annotate(i, (df_plot.PC1.loc[i], -df_plot.PC2.loc[i]), ha='center')

# Plot reference lines
ax1.hlines(0,-3.5,3.5, linestyles='dotted', colors='grey')
ax1.vlines(0,-3.5,3.5, linestyles='dotted', colors='grey')

ax1.set_xlabel('First Principal Component')
ax1.set_ylabel('Second Principal Component')
    
# Plot the Principal Component loading vectors on a second pair of axes (V2 sign-flipped to match the scores above).
ax2 = ax1.twinx().twiny() 

ax2.set_ylim(-1,1)
ax2.set_xlim(-1,1)
ax2.tick_params(axis='y', colors='orange')
ax2.set_xlabel('Principal Component loading vectors', color='orange')

# Plot labels for vectors. Variable 'a' is a small offset parameter to separate arrow tip and text.
a = 1.07  
for i in pca_loadings[['V1', 'V2']].index:
    ax2.annotate(i, (pca_loadings.V1.loc[i]*a, -pca_loadings.V2.loc[i]*a), color='orange')

# Plot vectors (positional indexing via .iloc; V2 is sign-flipped as above)
ax2.arrow(0,0, pca_loadings.V1.iloc[0], -pca_loadings.V2.iloc[0])
ax2.arrow(0,0, pca_loadings.V1.iloc[1], -pca_loadings.V2.iloc[1])
ax2.arrow(0,0, pca_loadings.V1.iloc[2], -pca_loadings.V2.iloc[2])
ax2.arrow(0,0, pca_loadings.V1.iloc[3], -pca_loadings.V2.iloc[3]);
In [9]:
# Standard deviation of the four principal components
np.sqrt(pca.explained_variance_)
Out[9]:
array([ 1.5908673 ,  1.00496987,  0.6031915 ,  0.4206774 ])
In [10]:
pca.explained_variance_
Out[10]:
array([ 2.53085875,  1.00996444,  0.36383998,  0.17696948])
In [11]:
pca.explained_variance_ratio_
Out[11]:
array([ 0.62006039,  0.24744129,  0.0891408 ,  0.04335752])
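Both quantities can be recovered from the score columns themselves; a short cross-check (sketch), using ddof=1 to match scikit-learn's (n-1) denominator:

np.allclose(df_plot.var(ddof=1), pca.explained_variance_)              # should be True
np.allclose(pca.explained_variance_ratio_,
            pca.explained_variance_ / pca.explained_variance_.sum())   # should be True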
In [12]:
plt.figure(figsize=(7,5))

plt.plot([1,2,3,4], pca.explained_variance_ratio_, '-o', label='Individual component')
plt.plot([1,2,3,4], np.cumsum(pca.explained_variance_ratio_), '-s', label='Cumulative')

plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xlim(0.75,4.25)
plt.ylim(0,1.05)
plt.xticks([1,2,3,4])
plt.legend(loc=2);

Lab 2: Clustering

10.5.1 K-Means Clustering

In [13]:
# Simulate data: shift the first 25 observations so that there are two true clusters
np.random.seed(2)
X = np.random.standard_normal((50,2))
X[:25,0] = X[:25,0]+3
X[:25,1] = X[:25,1]-4

K = 2

In [14]:
km1 = KMeans(n_clusters=2, n_init=20)
km1.fit(X)
Out[14]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=20, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [15]:
km1.labels_
Out[15]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1], dtype=int32)

See plot for K=2 below.

K = 3

In [16]:
np.random.seed(4)
km2 = KMeans(n_clusters=3, n_init=20)
km2.fit(X)
Out[16]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=20, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [17]:
pd.Series(km2.labels_).value_counts()
Out[17]:
1    21
0    20
2     9
dtype: int64
In [18]:
km2.cluster_centers_
Out[18]:
array([[-0.27876523,  0.51224152],
       [ 2.82805911, -4.11351797],
       [ 0.69945422, -2.14934345]])
In [19]:
km2.labels_
Out[19]:
array([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1,
       1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2], dtype=int32)
In [20]:
# Sum of squared distances of samples to their closest cluster center.
km2.inertia_
Out[20]:
68.973792009397258
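The same quantity can be recomputed by hand from labels_ and cluster_centers_; a minimal sketch, assuming the km2 fit above:

wcss = sum(((X[km2.labels_ == k] - c)**2).sum() for k, c in enumerate(km2.cluster_centers_))
np.isclose(wcss, km2.inertia_)   # should evaluate to True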
In [21]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(14,5))

ax1.scatter(X[:,0], X[:,1], s=40, c=km1.labels_, cmap=plt.cm.prism) 
ax1.set_title('K-Means Clustering Results with K=2')
ax1.scatter(km1.cluster_centers_[:,0], km1.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2)

ax2.scatter(X[:,0], X[:,1], s=40, c=km2.labels_, cmap=plt.cm.prism) 
ax2.set_title('K-Means Clustering Results with K=3')
ax2.scatter(km2.cluster_centers_[:,0], km2.cluster_centers_[:,1], marker='+', s=100, c='k', linewidth=2);

10.5.3 Hierarchical Clustering

scipy

In [22]:
fig, (ax1,ax2,ax3) = plt.subplots(3,1, figsize=(15,18))

for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)], ['c1','c2','c3'],
                                [ax1,ax2,ax3]):
    cluster = hierarchy.dendrogram(linkage, ax=ax, color_threshold=0)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage');
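To get explicit cluster assignments from a linkage tree (the analogue of R's cutree), hierarchy.fcluster can cut the tree; a sketch cutting the complete-linkage tree into two groups (new variable names are illustrative):

hc_complete = hierarchy.complete(X)
hc_labels = hierarchy.fcluster(hc_complete, t=2, criterion='maxclust')   # analogue of cutree(hc, 2)
pd.Series(hc_labels).value_counts()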

Lab 3: NCI60 Data Example

§ 10.6.1 PCA

In [23]:
# The NCI60 dataset from the ISLR package was exported from R as two csv files:
# one for the features (X) and one for the classes/types (y).
df2 = pd.read_csv('Data/NCI60_X.csv').drop('Unnamed: 0', axis=1)
df2.columns = np.arange(df2.columns.size)
df2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Columns: 6830 entries, 0 to 6829
dtypes: float64(6830)
memory usage: 3.3 MB
In [24]:
X = pd.DataFrame(scale(df2))
X.shape
Out[24]:
(64, 6830)
In [25]:
y = pd.read_csv('Data/NCI60_y.csv', usecols=[1], skiprows=1, names=['type'])
y.shape
Out[25]:
(64, 1)
In [26]:
y.type.value_counts()
Out[26]:
RENAL          9
NSCLC          9
MELANOMA       8
BREAST         7
COLON          7
OVARIAN        6
LEUKEMIA       6
CNS            5
PROSTATE       2
MCF7D-repro    1
K562B-repro    1
K562A-repro    1
MCF7A-repro    1
UNKNOWN        1
Name: type, dtype: int64
In [27]:
# Fit the PCA model and transform X to get the principal components
pca2 = PCA()
df2_plot = pd.DataFrame(pca2.fit_transform(X))
In [28]:
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(15,6))

color_idx = pd.factorize(y.type)[0]
cmap = plt.cm.hsv

# Left plot (PC2 sign-flipped to match the book's figure). Fixing vmin/vmax keeps the point
# colors consistent with the legend normalization built below.
ax1.scatter(df2_plot.iloc[:,0], -df2_plot.iloc[:,1], c=color_idx, cmap=cmap, vmin=0.0, vmax=14.0, alpha=0.5, s=50)
ax1.set_ylabel('Principal Component 2')

# Right plot
ax2.scatter(df2_plot.iloc[:,0], df2_plot.iloc[:,2], c=color_idx, cmap=cmap, vmin=0.0, vmax=14.0, alpha=0.5, s=50)
ax2.set_ylabel('Principal Component 3')

# Custom legend for the classes (y) since we do not create scatter plots per class (which could have their own labels).
handles = []
labels = pd.factorize(y.type.unique())
norm = mpl.colors.Normalize(vmin=0.0, vmax=14.0)

for i, v in zip(labels[0], labels[1]):
    handles.append(mpl.patches.Patch(color=cmap(norm(i)), label=v, alpha=0.5))

ax2.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

# xlabel for both plots
for ax in fig.axes:
    ax.set_xlabel('Principal Component 1')    
In [29]:
pd.DataFrame([df2_plot.iloc[:,:5].std(axis=0, ddof=0).to_numpy(),
              pca2.explained_variance_ratio_[:5],
              np.cumsum(pca2.explained_variance_ratio_[:5])],
             index=['Standard Deviation', 'Proportion of Variance', 'Cumulative Proportion'],
             columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5'])
Out[29]:
PC1 PC2 PC3 PC4 PC5
Standard Deviation 27.853469 21.481355 19.820465 17.032556 15.971807
Proportion of Variance 0.113589 0.067562 0.057518 0.042476 0.037350
Cumulative Proportion 0.113589 0.181151 0.238670 0.281145 0.318495
In [30]:
df2_plot.iloc[:,:10].var(axis=0, ddof=0).plot(kind='bar', rot=0)
plt.ylabel('Variances');
In [31]:
fig , (ax1,ax2) = plt.subplots(1,2, figsize=(15,5))

# Left plot
ax1.plot(pca2.explained_variance_ratio_, '-o')
ax1.set_ylabel('Proportion of Variance Explained')
ax1.set_ylim(ymin=-0.01)

# Right plot
ax2.plot(np.cumsum(pca2.explained_variance_ratio_), '-ro')
ax2.set_ylabel('Cumulative Proportion of Variance Explained')
ax2.set_ylim(ymax=1.05)

for ax in fig.axes:
    ax.set_xlabel('Principal Component')
    ax.set_xlim(-1,65)    

§ 10.6.2 Clustering

In [32]:
# Re-standardize the NCI60 data, using the cancer types as the row index so they appear as dendrogram labels.
X = pd.DataFrame(scale(df2), index=y.type, columns=df2.columns)
In [33]:
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(20,20))

for linkage, cluster, ax in zip([hierarchy.complete(X), hierarchy.average(X), hierarchy.single(X)],
                                ['c1','c2','c3'],
                                [ax1,ax2,ax3]):
    cluster = hierarchy.dendrogram(linkage, labels=X.index, orientation='right', color_threshold=0, leaf_font_size=10, ax=ax)

ax1.set_title('Complete Linkage')
ax2.set_title('Average Linkage')
ax3.set_title('Single Linkage');
In [34]:
plt.figure(figsize=(10,20))
cut4 = hierarchy.dendrogram(hierarchy.complete(X),
                            labels=X.index, orientation='right', color_threshold=140, leaf_font_size=10)
plt.vlines(140,0,plt.gca().yaxis.get_data_interval()[1], colors='r', linestyles='dashed');
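To turn this cut into explicit labels and tabulate them against the cancer types (mirroring the R lab's table(cutree(hc.out, 4), nci.labs)), hierarchy.fcluster can again be used; a sketch assuming a four-cluster cut:

hc_cut = hierarchy.fcluster(hierarchy.complete(X), t=4, criterion='maxclust')
pd.crosstab(hc_cut, y.type.values, rownames=['cluster'], colnames=['type'])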

KMeans

In [35]:
np.random.seed(2)
km4 = KMeans(n_clusters=4, n_init=50)
km4.fit(X)
Out[35]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=50, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [36]:
km4.labels_
Out[36]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)
In [37]:
# Observations per KMeans cluster
pd.Series(km4.labels_).value_counts().sort_index()
Out[37]:
0     8
1    23
2    24
3     9
dtype: int64

Hierarchical

In [38]:
# Observations per Hierarchical cluster
cut4b = hierarchy.dendrogram(hierarchy.complete(X), truncate_mode='lastp', p=4, show_leaf_counts=True)
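The four-cluster K-means and hierarchical solutions can then be compared directly (a sketch, analogous to table(km.out$cluster, hc.clusters) in the R lab, reusing hc_cut from the sketch above):

pd.crosstab(km4.labels_, hc_cut, rownames=['KMeans'], colnames=['Hierarchical'])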
In [39]:
# Hierarchy based on Principal Components 1 to 5
plt.figure(figsize=(10,20))
pca_cluster = hierarchy.dendrogram(hierarchy.complete(df2_plot.iloc[:,:5]), labels=y.type.values, orientation='right', color_threshold=100, leaf_font_size=10)
In [40]:
cut4c = hierarchy.dendrogram(hierarchy.complete(df2_plot), truncate_mode='lastp', p=4,
                             show_leaf_counts=True)
# See also color coding in plot above.