#!/usr/bin/env python
# coding: utf-8
# # A Network Tour of Data Science
# ### Xavier Bresson, Winter 2016/17
# ## Assignment 1 : Unsupervised Clustering with the Normalized Association
# In[ ]:
# Load libraries
# Math
import numpy as np
# Visualization
get_ipython().run_line_magic('matplotlib', 'notebook')
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy import ndimage
# Print output of LFR code
import subprocess
# Sparse matrix
import scipy.sparse
import scipy.sparse.linalg
# 3D visualization
import pylab
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot
# Import data
import scipy.io
# Import functions in lib folder
import sys
sys.path.insert(1, 'lib')
# Import helper functions
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
from lib.utils import construct_kernel
from lib.utils import compute_kernel_kmeans_EM
from lib.utils import compute_purity
# Import distance function
import sklearn.metrics.pairwise
# Remove warnings
import warnings
warnings.filterwarnings("ignore")
# **Question 1:** Write down the mathematical relationship between Normalized Cut (NCut) and Normalized Association (NAssoc) for K clusters. It is not necessary to provide details.
#
# The Normalized Cut problem is defined as:
# $$
# \min_{\{S_k\}}\ NCut(\{S_k\}) := \sum_{k=1}^K \frac{Cut(S_k,S_k^c)}{Vol(S_k)} \ \textrm{ s.t. } \ \cup_{k=1}^{K} S_k = V, \ S_k \cap S_{k'}=\emptyset, \ \forall k \not= k' \quad\quad\quad(1)
# $$
#
# and the Normalized Association problem is defined as:
# $$
# \max_{\{S_k\}}\ NAssoc(\{S_k\}):= \sum_{k=1}^K \frac{Assoc(S_k,S_k)}{Vol(S_k)} \ \textrm{ s.t. } \ \cup_{k=1}^{K} S_k = V, \ S_k \cap S_{k'}=\emptyset, \ \forall k \not= k' .
# $$
#
#
# We may rewrite the Cut operator and the Volume operator with the Assoc operator as:
# $$
# Vol(S_k) = \sum_{i\in S_k, j\in V} W_{ij} \\
# Assoc(S_k,S_k) = \sum_{i\in S_k, j\in S_k} W_{ij} \\
# Cut(S_k,S_k^c) = \sum_{i\in S_k, j\in S_k^c=V\setminus S_k} W_{ij} = \sum_{i\in S_k, j\in V} W_{ij} - \sum_{i\in S_k, j\in S_k} W_{ij} = Vol(S_k) - Assoc(S_k,S_k)
# $$
#
#
# **Answer to Q1:** Your answer here.
#
# We have
# $$
# \frac{Cut(S_k,S_k^c)}{Vol(S_k)} = \frac{Vol(S_k) - Assoc(S_k,S_k)}{Vol(S_k)} = 1- \frac{Assoc(S_k,S_k)}{Vol(S_k)}
# $$
#
# and
#
# $$
# \sum_{k=1}^K \frac{Cut(S_k,S_k^c)}{Vol(S_k)} = K - \sum_{k=1}^K \frac{Assoc(S_k,S_k)}{Vol(S_k)}
# $$
#
# The relationship between Normalized Cut (NCut) and Normalized Association (NAssoc) for K clusters is thus
#
# $$
# NCut(\{S_k\}) = K - NAssoc(\{S_k\}).
# $$
#
# **Question 2:** Using the relationship between NCut and NAssoc from Q1, it is therefore equivalent to maximize NAssoc by minimizing or maximizing NCut? That is
#
# $$
# \max_{\{S_k\}}\ NAssoc(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
#
# $$
# \Updownarrow
# $$
#
# $$
# \min_{\{S_k\}}\ NCut(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
#
# or
#
# $$
# \max_{\{S_k\}}\ NCut(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
#
# It is not necessary to provide details.
# **Answer to Q2:** Your answer here.
#
# As $\min F \Leftrightarrow \max -F$, we have equivalence between the max NAssoc problem:
#
# $$
# \max_{\{S_k\}}\ NAssoc(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
#
# and the min NCut problem:
#
# $$
# \min_{\{S_k\}}\ NCut(\{S_k\}) \ \textrm{ s.t. } \cup_{k=1}^{K} S_k = V, \quad S_k \cap S_{k'}=\emptyset, \ \forall k \not= k'
# $$
# **Question 3:** Solving the NCut problem in Q2 is NP-hard => let us consider a spectral relaxation of NCut. Write down the Spectral Matrix A of NCut that satisfies the equivalent functional optimization problem of Q2:
#
# $$
# \min_{Y}\ tr( Y^\top A Y) \ \textrm{ s.t. } \ Y^\top Y = I_K \textrm{ and } Y \in Ind_S, \quad\quad\quad(3)
# $$
#
# where
#
# $$
# Y \in Ind_S \ \textrm{ reads as } \ Y_{ik} =
# \left\{
# \begin{array}{ll}
# \big(\frac{D_{ii}}{Vol(S_k)}\big)^{1/2} & \textrm{if} \ i \in S_k\\
# 0 & \textrm{otherwise}
# \end{array}
# \right..
# $$
#
# and
#
# $$
# A=???
# $$
#
# It is not necessary to provide details.
#
# *Hint:* Let us introduce the indicator matrix $F$ of the clusters $S_k$ such that:
#
# $$
# F_{ik} =
# \left\{
# \begin{array}{ll}
# 1 & \textrm{if} \ i \in S_k\\
# 0 & \textrm{otherwise}
# \end{array}
# \right..
# $$
#
# We may rewrite the Cut operator and the Volume operator with $F$ as:
#
# $$
# Vol(S_k) = \sum_{i\in S_k, j\in V} W_{ij} = F_{\cdot,k}^\top D F_{\cdot,k}\\
# Cut(S_k,S_k^c) = \sum_{i\in S_k, j\in V} W_{ij} - \sum_{i\in S_k, j\in S_k} W_{ij} = F_{\cdot,k}^\top D F_{\cdot,k} - F_{\cdot,k}^\top W F_{\cdot,k} = F_{\cdot,k}^\top (D - W) F_{\cdot,k} \quad
# $$
#
# We thus have
#
# $$
# \frac{Cut(S_k,S_k^c)}{Vol(S_k)} = \frac{ F_{\cdot,k}^\top (D - W) F_{\cdot,k} }{ F_{\cdot,k}^\top D F_{\cdot,k} }
# $$
#
#
# Set $\hat{F}_{\cdot,k}=D^{1/2}F_{\cdot,k}$ and observe that
#
# $$
# \frac{ F_{\cdot,k}^\top (D - W) F_{\cdot,k} }{ F_{\cdot,k}^\top D F_{\cdot,k} } = \frac{ \hat{F}_{\cdot,k}^\top D^{-1/2}(D - W)D^{-1/2} \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } = \frac{ \hat{F}_{\cdot,k}^\top (I - D^{-1/2}WD^{-1/2}) \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } ,
# $$
#
# with $L_N=I - D^{-1/2}WD^{-1/2}$ is the normalized graph Laplacian. Set $Y_{\cdot,k}=\frac{\hat{F}_{\cdot,k}}{\|\hat{F}_{\cdot,k}\|_2}$:
#
# $$
# \frac{ \hat{F}_{\cdot,k}^\top L_N \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } = Y_{\cdot,k}^\top L_N Y_{\cdot,k} \quad\quad\quad(2)
# $$
#
#
# Using (2), we can rewrite (1) as a functional optimization problem:
#
# $$
# \min_{Y}\ tr( Y^\top A Y) \ \textrm{ s.t. } \ Y^\top Y = I_K \textrm{ and } Y \in Ind_S,
# $$
#
# where
#
#
# $$
# Y \in Ind_S \ \textrm{ reads as } \ Y_{ik} =
# \left\{
# \begin{array}{ll}
# \big(\frac{D_{ii}}{Vol(S_k)}\big)^{1/2} & \textrm{if} \ i \in S_k\\
# 0 & \textrm{otherwise}
# \end{array}
# \right..
# $$
#
# and
#
# $$
# A=???
# $$
# **Answer to Q3:** Let us introduce the indicator matrix $F$ of the clusters $S_k$ such that:
#
# $$
# F_{ik} =
# \left\{
# \begin{array}{ll}
# 1 & \textrm{if} \ i \in S_k\\
# 0 & \textrm{otherwise}
# \end{array}
# \right..
# $$
#
# We may rewrite the Cut operator and the Volume operator with $F$ as:
#
# $$
# Vol(S_k) = \sum_{i\in S_k, j\in V} W_{ij} = F_{\cdot,k}^\top D F_{\cdot,k}\\
# Cut(S_k,S_k^c) = \sum_{i\in S_k, j\in V} W_{ij} - \sum_{i\in S_k, j\in S_k} W_{ij} = F_{\cdot,k}^\top D F_{\cdot,k} - F_{\cdot,k}^\top W F_{\cdot,k} = F_{\cdot,k}^\top (D - W) F_{\cdot,k} \quad
# $$
#
# We thus have
#
# $$
# \frac{Cut(S_k,S_k^c)}{Vol(S_k)} = \frac{ F_{\cdot,k}^\top (D - W) F_{\cdot,k} }{ F_{\cdot,k}^\top D F_{\cdot,k} }
# $$
#
#
# Set $\hat{F}_{\cdot,k}=D^{1/2}F_{\cdot,k}$ and observe that
#
# $$
# \frac{ F_{\cdot,k}^\top (D - W) F_{\cdot,k} }{ F_{\cdot,k}^\top D F_{\cdot,k} } = \frac{ \hat{F}_{\cdot,k}^\top D^{-1/2}(D - W)D^{-1/2} \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } = \frac{ \hat{F}_{\cdot,k}^\top (I - D^{-1/2}WD^{-1/2}) \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } ,
# $$
#
# where $L_N=I - D^{-1/2}WD^{-1/2}$ is the normalized graph Laplacian. Set $Y_{\cdot,k}=\frac{\hat{F}_{\cdot,k}}{\|\hat{F}_{\cdot,k}\|_2}$, we have:
#
# $$
# \frac{ \hat{F}_{\cdot,k}^\top L_N \hat{F}_{\cdot,k} }{ \hat{F}_{\cdot,k}^\top \hat{F}_{\cdot,k} } = Y_{\cdot,k}^\top L_N Y_{\cdot,k} \quad\quad\quad(2)
# $$
#
#
# Using (2), we can rewrite (1) as a functional optimization problem:
#
# $$
# \min_{Y}\ tr( Y^\top A Y) \ \textrm{ s.t. } \ Y^\top Y = I_K \textrm{ and } Y \in Ind_S,
# $$
#
# where
#
# $$
# Y \in Ind_S \ \textrm{ reads as } \ Y_{ik} =
# \left\{
# \begin{array}{ll}
# \big(\frac{D_{ii}}{Vol(S_k)}\big)^{1/2} & \textrm{if} \ i \in S_k\\
# 0 & \textrm{otherwise}
# \end{array}
# \right..
# $$
#
# and
#
# $$
# A=L_N=I-D^{-1/2}WD^{-1/2}.
# $$
# **Question 4:** Drop the cluster indicator constraint $Y\in Ind_S$ in Q3, how do you compute the solution $Y^\star$ of (3)? Why the first column of $Y^\star$ is not relevant for clustering?
# **Answer to Q4:** Your answer here.
#
# Dropping the constraint $Y\in Ind_S$ in (3) leads to a standard spectral relaxation problem:
#
# $$
# \min_{Y}\ tr( Y^\top A Y) \ \textrm{ s.t. } \ Y^\top Y = I_K,
# $$
#
# which solution $Y^\star$ is given by the $K$ smallest eigenvectors/eigenvalues of $A=L_N=I-D^{-1/2}WD^{-1/2}$. Note that the first column of $Y^\star$ is the constant signal $y_1=\frac{1}{\sqrt{n}}1_{n\times 1}$ associated to the smallest eigenvalue of $L_N$, which has value $\lambda_1=0$.
# **Question 5:** Plot in 3D the 2nd, 3rd, 4th columns of $Y^\star$.
# Hint: Compute the degree matrix $D$.
# Hint: You may use function *D_sqrt_inv = scipy.sparse.diags(d_sqrt_inv.A.squeeze(), 0)* for creating $D^{-1/2}$.
# Hint: You may use function *I = scipy.sparse.identity(d.size, dtype=W.dtype)* for creating a sparse identity matrix.
# Hint: You may use function *lamb, U = scipy.sparse.linalg.eigsh(A, k=4, which='SM')* to perform the eigenvalue decomposition of A.
# Hint: You may use function *ax.scatter(Xdisp, Ydisp, Zdisp, c=Cgt)* for 3D visualization.
# In[ ]:
# Load dataset: W is the Adjacency Matrix and Cgt is the ground truth clusters
mat = scipy.io.loadmat('datasets/mnist_2000_graph.mat')
W = mat['W']
n = W.shape[0]
Cgt = mat['Cgt'] - 1; Cgt = Cgt.squeeze()
nc = len(np.unique(Cgt))
print('Number of nodes =',n)
print('Number of classes =',nc);
# In[ ]:
# Your code here
# Construct Spectal Matrix A
d = W.sum(axis=0) + 1e-6 # degree vector
d = 1.0 / np.sqrt(d)
Dinv = scipy.sparse.diags(d.A.squeeze(), 0)
I = scipy.sparse.identity(d.size, dtype=W.dtype)
A = I - Dinv* (W* Dinv)
# Compute K smallest eigenvectors/eigenvalues of A
lamb, U = scipy.sparse.linalg.eigsh(A, k=4, which='SM')
# Sort eigenvalue from smallest to largest values
idx = lamb.argsort() # increasing order
lamb, U = lamb[idx], U[:,idx]
print(lamb)
# Y*
Y = U
# Plot in 3D the 2nd, 3rd, 4th columns of Y*
Xdisp = Y[:,1]
Ydisp = Y[:,2]
Zdisp = Y[:,3]
# 2D Visualization
plt.figure(14)
size_vertex_plot = 10
plt.scatter(Xdisp, Ydisp, s=size_vertex_plot*np.ones(n), c=Cgt)
plt.title('2D Visualization')
plt.show()
# 3D Visualization
fig = pylab.figure(15)
ax = Axes3D(fig)
ax.scatter(Xdisp, Ydisp, Zdisp, c=Cgt)
pylab.title('3D Visualization')
pyplot.show()
# **Question 6:** Solve the unsupervised clustering problem for MNIST following the popular technique of [Ng, Jordan, Weiss, “On Spectral Clustering: Analysis and an algorithm”, 2002], i.e.
# (1) Compute $Y^\star$? solution of Q4.
# (2) Normalize the rows of $Y^\star$? with the L2-norm.
# Hint: You may use function X = ( X.T / np.sqrt(np.sum(X**2,axis=1)+1e-10) ).T for the L2-normalization of the rows of X.
# (3) Run standard K-Means on normalized $Y^\star$? to get the clusters, and compute the clustering accuracy. You should get more than 50% accuracy.
# In[ ]:
# Your code here
# Normalize the rows of Y* with the L2 norm, i.e. ||y_i||_2 = 1
Y = ( Y.T / np.sqrt(np.sum(Y**2,axis=1)+1e-10) ).T
# In[ ]:
# Your code here
# Run standard K-Means
Ker = construct_kernel(Y,'linear') # Compute linear Kernel for standard K-Means
Theta = np.ones(n) # Equal weight for each data
[C_kmeans,En_kmeans] = compute_kernel_kmeans_EM(nc,Ker,Theta,10)
acc= compute_purity(C_kmeans,Cgt,nc)
print('accuracy standard kmeans=',acc)