#!/usr/bin/env python
# coding: utf-8

# # A Network Tour of Data Science
# ###       Xavier Bresson, Winter 2016/17

# ## Assignment 1 : Unsupervised Clustering with the Normalized Association

# In[ ]:


# Load libraries

# Math
import numpy as np

# Visualization
get_ipython().run_line_magic('matplotlib', 'notebook')
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import ndimage

# Print output of LFR code
import subprocess

# Sparse matrix
import scipy.sparse
import scipy.sparse.linalg

# 3D visualization
import pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot

# Import data
import scipy.io

# Import functions in lib folder
import sys
sys.path.insert(1, 'lib')

# Import helper functions
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
from lib.utils import construct_kernel
from lib.utils import compute_kernel_kmeans_EM
from lib.utils import compute_purity

# Import distance function
import sklearn.metrics.pairwise

# Remove warnings
import warnings
warnings.filterwarnings("ignore")


# **Question 1:** Write down the mathematical relationship between Normalized Cut (NCut) and Normalized Association (NAssoc) for K clusters. It is not necessary to provide details.
#
# The Normalized Cut problem is defined as:

# $$
# \min_{\{S_k\}}\ NCut(\{S_k\}) := \sum_{k=1}^K \frac{Cut(S_k,S_k^c)}{Vol(S_k)} \ \textrm{ s.t. } \ \cup_{k=1}^{K} S_k = V, \ S_k \cap S_{k'}=\emptyset, \ \forall k \not= k' \quad\quad\quad(1)
# $$
#
# and the Normalized Association problem is defined as:

# $$
# \max_{\{S_k\}}\ NAssoc(\{S_k\}) := \sum_{k=1}^K \frac{Assoc(S_k,S_k)}{Vol(S_k)} \ \textrm{ s.t. } \ \cup_{k=1}^{K} S_k = V, \ S_k \cap S_{k'}=\emptyset, \ \forall k \not= k' .
# $$
#
# We may rewrite the Cut operator and the Volume operator with the Assoc operator as:

# $$
# Vol(S_k) = \sum_{i\in S_k, j\in V} W_{ij} \\
# Assoc(S_k,S_k) = \sum_{i\in S_k, j\in S_k} W_{ij} \\
# Cut(S_k,S_k^c) = \sum_{i\in S_k, j\in S_k^c=V\setminus S_k} W_{ij} = \sum_{i\in S_k, j\in V} W_{ij} - \sum_{i\in S_k, j\in S_k} W_{ij} = Vol(S_k) - Assoc(S_k,S_k)
# $$
#
# **Answer to Q1:**
#
# We have

# $$
# \frac{Cut(S_k,S_k^c)}{Vol(S_k)} = \frac{Vol(S_k) - Assoc(S_k,S_k)}{Vol(S_k)} = 1 - \frac{Assoc(S_k,S_k)}{Vol(S_k)}
# $$
#
# and

#
# $$
# \sum_{k=1}^K \frac{Cut(S_k,S_k^c)}{Vol(S_k)} = K - \sum_{k=1}^K \frac{Assoc(S_k,S_k)}{Vol(S_k)}
# $$
#
# The relationship between Normalized Cut (NCut) and Normalized Association (NAssoc) for K clusters is thus
#
# $$
# NCut(\{S_k\}) = K - NAssoc(\{S_k\}).
# $$
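# To make this relationship concrete, here is an optional numerical sanity check (a minimal sketch on a hypothetical 4-node weighted graph, not part of the original assignment): for any partition into $K$ clusters, the NCut and NAssoc values sum to $K$.

# In[ ]:


# Optional sanity check (hypothetical toy example): NCut + NAssoc = K
import numpy as np

# Hypothetical weighted adjacency matrix of a 4-node graph
W_toy = np.array([[0., 2., 1., 0.],
                  [2., 0., 0., 1.],
                  [1., 0., 0., 3.],
                  [0., 1., 3., 0.]])
clusters = [np.array([0, 1]), np.array([2, 3])]   # S_1 = {0,1}, S_2 = {2,3}, K = 2

ncut, nassoc = 0.0, 0.0
for S in clusters:
    Sc = np.setdiff1d(np.arange(4), S)       # complement S^c
    vol = W_toy[S, :].sum()                  # Vol(S) = sum_{i in S, j in V} W_ij
    assoc = W_toy[np.ix_(S, S)].sum()        # Assoc(S,S)
    cut = W_toy[np.ix_(S, Sc)].sum()         # Cut(S,S^c) = Vol(S) - Assoc(S,S)
    ncut += cut / vol
    nassoc += assoc / vol
print(ncut + nassoc)                         # prints 2.0 = K
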
# **Question 2:** Using the relationship between NCut and NAssoc from Q1, is it equivalent to maximize NAssoc by minimizing NCut or by maximizing NCut? That is,
#
# $$
# \max_{\{S_k\}}\ NAssoc(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
#
# $$
# \Updownarrow
# $$
#
# $$
# \min_{\{S_k\}}\ NCut(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
#
# or
#
# $$
# \max_{\{S_k\}}\ NCut(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
#
# It is not necessary to provide details.

# **Answer to Q2:**
#
# As $\min F \Leftrightarrow \max -F$ and $NCut(\{S_k\}) = K - NAssoc(\{S_k\})$ with $K$ constant, we have equivalence between the max NAssoc problem:
#
# $$
# \max_{\{S_k\}}\ NAssoc(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
#
# and the min NCut problem:
#
# $$
# \min_{\{S_k\}}\ NCut(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$

# **Question 3:** Solving the NCut problem in Q2 is NP-hard, so let us consider a spectral relaxation of NCut. Write down the Spectral Matrix $A$ of NCut that satisfies the equivalent functional optimization problem of Q2:
#
# $$
# \min_{Y}\ tr( Y^\top A Y) \ \textrm{ s.t. } \ Y^\top Y = I_K \textrm{ and } Y \in Ind_S, \quad\quad\quad(3)
# $$
#
# where
#
# $$
# Y \in Ind_S \ \textrm{ reads as } \ Y_{ik} =
# \left\{
# \begin{array}{ll}
# \big(\frac{D_{ii}}{Vol(S_k)}\big)^{1/2} & \textrm{if} \ i \in S_k\\
# 0 & \textrm{otherwise}
# \end{array}
# \right..
# $$
#
# and
#
# $$
# A=???
# $$
#
# It is not necessary to provide details.
#
# *Hint:* Let us introduce the indicator matrix $F$ of the clusters $S_k$ such that:
#
# $$
# F_{ik} =
# \left\{
# \begin{array}{ll}
# 1 & \textrm{if} \ i \in S_k\\
# 0 & \textrm{otherwise}
# \end{array}
# \right..
# $$
#
# We may rewrite the Cut operator and the Volume operator with $F$ as:
#
# $$
# Vol(S_k) = \sum_{i\in S_k, j\in V} W_{ij} = F_{\cdot,k}^\top D F_{\cdot,k}\\
# Cut(S_k,S_k^c) = \sum_{i\in S_k, j\in V} W_{ij} - \sum_{i\in S_k, j\in S_k} W_{ij} = F_{\cdot,k}^\top D F_{\cdot,k} - F_{\cdot,k}^\top W F_{\cdot,k} = F_{\cdot,k}^\top (D - W) F_{\cdot,k}
# $$
#
# We thus have
#
# $$
# \frac{Cut(S_k,S_k^c)}{Vol(S_k)} = \frac{ F_{\cdot,k}^\top (D - W) F_{\cdot,k} }{ F_{\cdot,k}^\top D F_{\cdot,k} }
# $$
#
# Set $\hat{F}_{\cdot,k}=D^{1/2}F_{\cdot,k}$ and observe that
#
# $$
# \frac{ F_{\cdot,k}^\top (D - W) F_{\cdot,k} }{ F_{\cdot,k}^\top D F_{\cdot,k} } = \frac{ \hat{F}_{\cdot,k}^\top D^{-1/2}(D - W)D^{-1/2} \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } = \frac{ \hat{F}_{\cdot,k}^\top (I - D^{-1/2}WD^{-1/2}) \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } ,
# $$
#
# where $L_N=I - D^{-1/2}WD^{-1/2}$ is the normalized graph Laplacian. Set $Y_{\cdot,k}=\frac{\hat{F}_{\cdot,k}}{\|\hat{F}_{\cdot,k}\|_2}$:
#
# $$
# \frac{ \hat{F}_{\cdot,k}^\top L_N \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } = Y_{\cdot,k}^\top L_N Y_{\cdot,k} \quad\quad\quad(2)
# $$
#
# Using (2), we can rewrite (1) as a functional optimization problem:
#
# $$
# \min_{Y}\ tr( Y^\top A Y) \ \textrm{ s.t. } \ Y^\top Y = I_K \textrm{ and } Y \in Ind_S,
# $$
#
# where
#
# $$
# Y \in Ind_S \ \textrm{ reads as } \ Y_{ik} =
# \left\{
# \begin{array}{ll}
# \big(\frac{D_{ii}}{Vol(S_k)}\big)^{1/2} & \textrm{if} \ i \in S_k\\
# 0 & \textrm{otherwise}
# \end{array}
# \right..
# $$
#
# and
#
# $$
# A=???
# $$

# **Answer to Q3:** Following the hint, each term of NCut is the Rayleigh quotient (2) of the normalized graph Laplacian, so the NCut problem (1) is exactly the trace problem (3) with
#
# $$
# A=L_N=I-D^{-1/2}WD^{-1/2}.
# $$

# **Question 4:** Drop the cluster indicator constraint $Y\in Ind_S$ in Q3: how do you compute the solution $Y^\star$ of (3)? Why is the first column of $Y^\star$ not relevant for clustering?

# **Answer to Q4:**
#
# Dropping the constraint $Y\in Ind_S$ in (3) leads to a standard spectral relaxation problem:
#
# $$
# \min_{Y}\ tr( Y^\top A Y) \ \textrm{ s.t. } \ Y^\top Y = I_K,
# $$
#
# whose solution $Y^\star$ is given by the eigenvectors associated with the $K$ smallest eigenvalues of $A=L_N=I-D^{-1/2}WD^{-1/2}$. Note that the first column of $Y^\star$ is $y_1=\frac{D^{1/2}1_{n\times 1}}{\|D^{1/2}1_{n\times 1}\|_2}$ (the constant signal $\frac{1}{\sqrt{n}}1_{n\times 1}$ for a regular graph), associated with the smallest eigenvalue $\lambda_1=0$ of $L_N$. It is determined by the vertex degrees alone, independently of any cluster structure, hence it is not relevant for clustering.
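# A minimal numerical illustration of this claim (a sketch on a hypothetical toy graph, not part of the original assignment): the smallest eigenvalue of $L_N$ is $0$ and its eigenvector is proportional to $D^{1/2}1_n$.

# In[ ]:


# Optional check (hypothetical toy example): first eigenpair of L_N
import numpy as np

W_toy = np.array([[0., 2., 1., 0.],
                  [2., 0., 0., 1.],
                  [1., 0., 0., 3.],
                  [0., 1., 3., 0.]])
d_toy = W_toy.sum(axis=0)                    # degree vector
D_isqrt = np.diag(1.0 / np.sqrt(d_toy))      # D^{-1/2}
L_N = np.eye(4) - D_isqrt @ W_toy @ D_isqrt  # normalized graph Laplacian

lam, V = np.linalg.eigh(L_N)                 # eigenvalues in increasing order
print(lam[0])                                # ~ 0 up to numerical precision
v1 = np.sqrt(d_toy) / np.linalg.norm(np.sqrt(d_toy))   # D^{1/2} 1_n, normalized
print(np.allclose(np.abs(V[:, 0]), v1))      # matches the first eigenvector up to sign
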
# **Question 5:** Plot in 3D the 2nd, 3rd, 4th columns of $Y^\star$.
# Hint: Compute the degree matrix $D$.
# Hint: You may use *D_sqrt_inv = scipy.sparse.diags(d_sqrt_inv.A.squeeze(), 0)* for creating $D^{-1/2}$.
# Hint: You may use *I = scipy.sparse.identity(d.size, dtype=W.dtype)* for creating a sparse identity matrix.
# Hint: You may use *lamb, U = scipy.sparse.linalg.eigsh(A, k=4, which='SM')* to perform the eigenvalue decomposition of $A$.
# Hint: You may use *ax.scatter(Xdisp, Ydisp, Zdisp, c=Cgt)* for 3D visualization.

# In[ ]:


# Load dataset: W is the Adjacency Matrix and Cgt is the ground truth clusters
mat = scipy.io.loadmat('datasets/mnist_2000_graph.mat')
W = mat['W']
n = W.shape[0]
Cgt = mat['Cgt'] - 1; Cgt = Cgt.squeeze()
nc = len(np.unique(Cgt))
print('Number of nodes =', n)
print('Number of classes =', nc)


# In[ ]:


# Your code here

# Construct Spectral Matrix A = L_N = I - D^{-1/2} W D^{-1/2}
d = W.sum(axis=0) + 1e-6                                     # degree vector (1e-6 avoids division by zero)
d_sqrt_inv = 1.0 / np.sqrt(d)
D_sqrt_inv = scipy.sparse.diags(d_sqrt_inv.A.squeeze(), 0)   # D^{-1/2}
I = scipy.sparse.identity(d.size, dtype=W.dtype)
A = I - D_sqrt_inv * (W * D_sqrt_inv)

# Compute the K=4 smallest eigenvalues/eigenvectors of A
lamb, U = scipy.sparse.linalg.eigsh(A, k=4, which='SM')

# Sort eigenvalues from smallest to largest
idx = lamb.argsort()                                         # increasing order
lamb, U = lamb[idx], U[:, idx]
print(lamb)

# Y*
Y = U

# Plot in 3D the 2nd, 3rd, 4th columns of Y*
Xdisp = Y[:, 1]
Ydisp = Y[:, 2]
Zdisp = Y[:, 3]

# 2D Visualization
plt.figure(14)
size_vertex_plot = 10
plt.scatter(Xdisp, Ydisp, s=size_vertex_plot*np.ones(n), c=Cgt)
plt.title('2D Visualization')
plt.show()

# 3D Visualization
fig = pylab.figure(15)
ax = Axes3D(fig)
ax.scatter(Xdisp, Ydisp, Zdisp, c=Cgt)
pylab.title('3D Visualization')
pyplot.show()
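# A quick optional check (a sketch, not part of the original assignment): *eigsh* returns orthonormal eigenvectors, so the relaxed constraint $Y^\top Y = I_K$ of (3) holds by construction.

# In[ ]:


# Optional: verify that the spectral embedding satisfies Y^T Y = I_K
print(np.allclose(Y.T @ Y, np.eye(Y.shape[1]), atol=1e-6))
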
# **Question 6:** Solve the unsupervised clustering problem for MNIST following the popular technique of [Ng, Jordan, Weiss, “On Spectral Clustering: Analysis and an algorithm”, 2002], i.e.
# (1) Compute $Y^\star$, the solution of Q4.
# (2) Normalize the rows of $Y^\star$ with the L2-norm.
# Hint: You may use *X = ( X.T / np.sqrt(np.sum(X**2,axis=1)+1e-10) ).T* for the L2-normalization of the rows of X.
# (3) Run standard K-Means on the normalized $Y^\star$ to get the clusters, and compute the clustering accuracy. You should get more than 50% accuracy.

# In[ ]:


# Your code here

# Normalize the rows of Y* with the L2-norm, i.e. ||y_i||_2 = 1
Y = ( Y.T / np.sqrt(np.sum(Y**2, axis=1) + 1e-10) ).T


# In[ ]:


# Your code here

# Run standard K-Means
Ker = construct_kernel(Y, 'linear')    # Compute linear Kernel for standard K-Means
Theta = np.ones(n)                     # Equal weight for each data point
[C_kmeans, En_kmeans] = compute_kernel_kmeans_EM(nc, Ker, Theta, 10)
acc = compute_purity(C_kmeans, Cgt, nc)
print('accuracy standard kmeans =', acc)
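# As an optional cross-check (a minimal sketch using scikit-learn's KMeans instead of the kernel K-Means helper from lib; it reuses Y, nc, Cgt from the cells above, and the parameters here are illustrative choices), standard K-Means on the normalized embedding should reach a similar purity.

# In[ ]:


# Optional cross-check with scikit-learn's K-Means (sklearn is already a
# dependency of the imports above)
from sklearn.cluster import KMeans

C_sk = KMeans(n_clusters=nc, n_init=10, random_state=0).fit_predict(Y)
acc_sk = compute_purity(C_sk, Cgt, nc)
print('accuracy sklearn kmeans =', acc_sk)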