#!/usr/bin/env python
# coding: utf-8
"""Visualise Fashion-MNIST with PCA and t-SNE.

Script exported from a Jupyter notebook; the ``# In[N]:`` markers show the
original cell boundaries.  It loads the Fashion-MNIST training set, shows a
grid of sample images, then plots 2-D embeddings from PCA, from t-SNE on the
raw pixels, and from t-SNE on a 50-component PCA reduction.
"""

# In[1]:

# Copy and paste from
# https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b
# with trivial modifications
#
# Author: Nobody in Computer Vision
# Date: 2022-03-05
#
# Contacts:
# Company: Longer Vision Technology
# Email: jiapei@longervision.com
# Website: https://www.longervision.com

import gzip
import os
import time

import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image as im
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Directory holding the gzip-compressed Fashion-MNIST IDX files.
DATA_DIR = '/media/lvision/Sabrent/datasets/fashion_mnist'

# Seed passed to t-SNE so the embeddings are reproducible between runs.
RS = 123


# In[2]:

# Fashion MNIST reader
def load_mnist(path, kind='train'):
    """Load MNIST data from `path`.

    Args:
        path: Directory containing ``<kind>-labels-idx1-ubyte.gz`` and
            ``<kind>-images-idx3-ubyte.gz``.
        kind: File-name prefix selecting which pair of files to read.

    Returns:
        Tuple ``(images, labels)``: ``images`` is a ``uint8`` array with one
        784-value row per label, ``labels`` is a 1-D ``uint8`` array.
    """
    labels_path = os.path.join(path, f'{kind}-labels-idx1-ubyte.gz')
    images_path = os.path.join(path, f'{kind}-images-idx3-ubyte.gz')

    # The first 8 bytes of the label file are header, not label data.
    with gzip.open(labels_path, 'rb') as lbpath:
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8, offset=8)

    # The first 16 bytes of the image file are header; the remainder is
    # reshaped to one 784-value row per label.
    with gzip.open(images_path, 'rb') as imgpath:
        images = np.frombuffer(
            imgpath.read(), dtype=np.uint8, offset=16
        ).reshape(len(labels), 784)

    return images, labels


# In[3]:

X_train, y_train = load_mnist(DATA_DIR, kind='train')
print(X_train.shape)
print(X_train[0].shape)
print(y_train)
print(y_train.shape)


# In[4]:

# The inline-backend magic only exists inside IPython/Jupyter; when the file
# is run with plain `python`, `get_ipython` is undefined, so skip it.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

sns.set_style('darkgrid')
sns.set_palette('muted')
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})


# In[5]:

# Show the first rows * columns training images in a grid.
plt.gray()
fig = plt.figure(figsize=(16, 8))
columns = 12
rows = 6
for i in range(columns * rows):
    data = im.fromarray(X_train[i].reshape(28, 28))
    fig.add_subplot(rows, columns, i + 1)
    plt.imshow(data)
    plt.axis('off')
plt.show()


# In[6]:

# Utility function to visualize the outputs of PCA and t-SNE
def fashion_scatter(x, colors):
    """Scatter-plot a 2-D embedding, coloured and annotated by class label.

    Args:
        x: 2-D array; columns 0 and 1 are used as the plot coordinates.
        colors: 1-D array of integer-valued class labels, one per row of
            ``x``.  Labels do not need to be contiguous or start at zero.

    Returns:
        Tuple ``(figure, axes, scatter, texts)`` where ``texts`` is the list
        of per-class text annotations.
    """
    # `np.int` was removed from NumPy; the builtin `int` is the replacement.
    labels = np.asarray(colors).astype(int)

    # Map every label to its rank among the distinct labels so the palette
    # is indexed safely even when the labels are not exactly 0..k-1.
    classes, class_index = np.unique(labels, return_inverse=True)

    # choose a color palette with seaborn.
    palette = np.array(sns.color_palette("hls", len(classes)))

    # create a scatter plot.
    f = plt.figure(figsize=(8, 8))
    ax = plt.subplot(aspect='equal')
    sc = ax.scatter(x[:, 0], x[:, 1], lw=0, s=40, c=palette[class_index])
    plt.xlim(-25, 25)
    plt.ylim(-25, 25)
    ax.axis('off')
    # NOTE(review): axis('tight') appears to refit the limits to the data,
    # overriding the fixed +/-25 window set above - confirm which is intended.
    ax.axis('tight')

    # add the labels for each digit corresponding to the label
    txts = []
    for label in classes:
        # Position of each label at median of data points.
        xtext, ytext = np.median(x[labels == label, :], axis=0)
        txt = ax.text(xtext, ytext, str(label), fontsize=24)
        # White outline keeps the text readable on top of the points.
        txt.set_path_effects([
            PathEffects.Stroke(linewidth=5, foreground="w"),
            PathEffects.Normal(),
        ])
        txts.append(txt)

    return f, ax, sc, txts


# In[7]:

# Subset first 20k data points to visualize
x_subset = X_train[0:20000]
y_subset = y_train[0:20000]

print(np.unique(y_subset))


# In[8]:

time_start = time.time()

pca = PCA(n_components=4)
pca_result = pca.fit_transform(x_subset)

print(f'PCA done! Time elapsed: {time.time() - time_start} seconds')


# In[9]:

pca_df = pd.DataFrame(pca_result, columns=['pca1', 'pca2', 'pca3', 'pca4'])

print(f'Variance explained per principal component: {pca.explained_variance_ratio_}')


# In[10]:

top_two_comp = pca_df[['pca1', 'pca2']]  # taking first and second principal component

fashion_scatter(top_two_comp.values, y_subset)  # Visualizing the PCA output


# In[11]:

time_start = time.time()

fashion_tsne = TSNE(learning_rate='auto', init='pca', random_state=RS).fit_transform(x_subset)

print(f't-SNE done! Time elapsed: {time.time() - time_start} seconds')


# In[12]:

fashion_scatter(fashion_tsne, y_subset)


# In[13]:

# Reduce to 50 principal components first so t-SNE runs on a smaller input.
time_start = time.time()

pca_50 = PCA(n_components=50)
pca_result_50 = pca_50.fit_transform(x_subset)

print(f'PCA with 50 components done! Time elapsed: {time.time() - time_start} seconds')

print(f'Cumulative variance explained by 50 principal components: {np.sum(pca_50.explained_variance_ratio_)}')


# In[14]:

time_start = time.time()

fashion_pca_tsne = TSNE(learning_rate='auto', init='pca', random_state=RS).fit_transform(pca_result_50)

print(f't-SNE done! Time elapsed: {time.time() - time_start} seconds')


# In[15]:

fashion_scatter(fashion_pca_tsne, y_subset)