"""Clustering walkthrough exported from a Colab notebook.

Defines a k-means routine and an agglomerative (complete-linkage) clusterer.
When run as a script it replays the notebook's exploratory steps on CSV files
stored on a mounted Google Drive.
"""

import os
import random

import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch

# Plotting and Colab helpers are only used by the walkthrough at the bottom of
# the file; guarding them keeps the clustering code importable without them.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

try:
    import seaborn as sns
except ImportError:
    sns = None

try:
    from google.colab import drive
except ImportError:
    drive = None

# Directory the notebook switched into with the IPython-only `%cd` magic.
DATA_DIR = "/content/drive/My Drive/colab_notebooks/machine_learning/data/"


def KMeanClustering(arr, K, eps):
    """Cluster the rows of ``arr`` into ``K`` groups with k-means.

    Args:
        arr: 2-D array-like with one observation per row.
        K: Number of clusters; must satisfy ``1 <= K <= len(arr)``.
        eps: Convergence threshold. Iteration stops once the summed squared
            centroid shift, divided by the number of rows, is no longer
            greater than ``eps``.

    Returns:
        A 5-tuple ``(assigned_cluster, centroid_val, clusters_lst,
        centroids_lst, j)``:

        * ``assigned_cluster`` - ``(n, 1)`` array of final cluster labels.
        * ``centroid_val`` - ``(K, n_features)`` array of final centroids.
        * ``clusters_lst`` - the label array produced by every iteration.
        * ``centroids_lst`` - the initial centroids followed by the centroids
          produced by every iteration (``j + 1`` entries).
        * ``j`` - number of iterations performed (always at least one).

    Raises:
        ValueError: If ``arr`` is not 2-D or ``K`` is outside ``1..len(arr)``.
    """
    arr = np.asarray(arr)
    if arr.ndim != 2:
        raise ValueError("arr must be 2-D (observations x features)")
    n = len(arr)
    if not 1 <= K <= n:
        raise ValueError(f"K must be between 1 and {n}, got {K}")

    # Every row, including row 0, is eligible as an initial centroid.
    seed_rows = random.sample(range(n), K)
    centroid_val = arr[seed_rows, :].astype(float)
    centroids_lst = [centroid_val]
    clusters_lst = []
    j = 0

    while True:
        # 1. Euclidean distance from every observation to every centroid.
        # 2. Assign each observation to its nearest centroid.
        distances = np.sqrt(
            np.sum(np.square(arr[:, np.newaxis, :] - centroid_val), axis=2)
        )
        assigned_cluster = np.argmin(distances, axis=1).reshape(n, 1)
        clusters_lst.append(assigned_cluster)

        # 3. Recompute each centroid as the mean of its members. A cluster
        #    that received no members keeps its previous centroid instead of
        #    turning into NaN (which would also end the loop silently).
        centroid_val_old = centroid_val
        centroid_val = centroid_val_old.copy()
        labels = assigned_cluster.ravel()
        for i in range(K):
            members = arr[labels == i]
            if len(members):
                centroid_val[i, :] = members.mean(axis=0)
        centroids_lst.append(centroid_val)

        # 4. Exit once the centroids have (almost) stopped moving. Written as
        #    `not diff > eps` so a NaN shift also terminates the loop.
        diff = (1 / n) * np.sum(np.square(centroid_val_old - centroid_val))
        j += 1
        if not diff > eps:
            break

    return assigned_cluster, centroid_val, clusters_lst, centroids_lst, j


class HierarchicalClustering:
    """Agglomerative clustering with complete linkage on a distance matrix.

    Attributes set by ``__init__``:
        arr: The pairwise-distance matrix (replaced by a float working copy
            once ``CompleteLinkage`` runs).
        n: Number of observations, ``len(arr)``.
        stagearr: The two distance rows of the pair merged most recently.
        clusters: Object array whose first ``clusterCounter`` slots hold the
            member indices of each multi-member cluster; unused slots are None.
        clusterCounter: Number of clusters currently stored in ``clusters``.
        verbose: Whether progress is printed.
    """

    def __init__(self, arr, verbose=True):
        """Store the distance matrix and allocate the cluster bookkeeping.

        Args:
            arr: Square pairwise-distance matrix. NOTE(review): the update
                step assumes it is symmetric - confirm against callers.
            verbose: Print progress while clustering (default keeps the
                original tracing behaviour).
        """
        self.arr = arr
        self.n = len(self.arr)
        self.verbose = verbose
        self.stagearr = np.zeros([2, self.n])
        # Disjoint clusters of two or more members: at most ceil(n / 2).
        capacity = int(np.ceil(self.n / 2))
        self._trace("l", capacity)
        self.clusters = np.empty(capacity, dtype=object)
        self.clusterCounter = 0

    def _trace(self, *parts):
        """Print ``parts`` when verbose tracing is enabled."""
        if getattr(self, "verbose", True):
            print(*parts)

    def CompleteLinkage(self):
        """Merge the closest pair of clusters until two clusters remain.

        The distance between two clusters is the largest distance between
        any of their members (complete linkage). The matrix keeps its size:
        the lower index of a merged pair carries the merged cluster and the
        higher index is retired by setting its row and column to infinity.

        Returns:
            ``self.clusters`` after the final merge.

        Raises:
            ValueError: If ``arr`` is not an ``n x n`` matrix.
        """
        if self.n == 0:
            return self.clusters

        dist = np.array(self.arr, dtype=float)  # copy; caller's data untouched
        if dist.shape != (self.n, self.n):
            raise ValueError("arr must be a square pairwise-distance matrix")
        # Only self-distances are masked, so genuine zero distances between
        # distinct observations can still be merged.
        np.fill_diagonal(dist, np.inf)
        self.arr = dist

        for k in range(self.n, 2, -1):
            self._trace("k", k)
            keep, drop = sorted(divmod(int(np.argmin(dist)), self.n))
            if not np.isfinite(dist[keep, drop]):
                break  # nothing left that can be merged
            self.pos = np.array([keep, drop])
            self._trace("self.pos", self.pos)

            # Record the merge before the matrix is updated.
            self.storeclusters()

            self.stagearr = dist[self.pos, :].copy()
            self._trace("stagearr", "\n", self.stagearr)

            # Complete linkage: distance to the merged cluster is the larger
            # of the distances to its two parts.
            merged_row = np.max(self.stagearr, axis=0)
            merged_row[self.pos] = np.inf
            dist[keep, :] = merged_row
            dist[:, keep] = merged_row
            dist[drop, :] = np.inf
            dist[:, drop] = np.inf
            self._trace("self.arr", "\n", self.arr)

        return self.clusters

    def storeclusters(self):
        """Fold the indices in ``self.pos`` into the stored clusters.

        Every stored cluster that shares an index with ``self.pos`` is united
        with it (duplicates removed); the union takes the slot of the first
        such cluster. When nothing overlaps, ``self.pos`` is stored as a new
        cluster in the next free slot.

        Returns:
            ``self.clusters``.
        """
        self._trace("we are in clusters ")
        pos = np.atleast_1d(self.pos)
        groups = list(self.clusters[:self.clusterCounter])
        touched = [i for i, grp in enumerate(groups) if np.isin(grp, pos).any()]
        members = np.unique(np.concatenate([pos] + [groups[i] for i in touched]))

        if touched:
            groups[touched[0]] = members
            for i in reversed(touched[1:]):
                del groups[i]
        else:
            groups.append(members)

        if len(groups) > len(self.clusters):
            self.clusters = np.empty(len(groups), dtype=object)
        self.clusters[:] = None
        for i, grp in enumerate(groups):
            self.clusters[i] = grp
        self.clusterCounter = len(groups)

        self._trace("Cluster Counter", self.clusterCounter)
        self._trace("Cluster", self.clusters)
        return self.clusters


if __name__ == "__main__":
    if drive is None or plt is None:
        raise ImportError("the walkthrough needs google.colab and matplotlib")

    drive.mount('/content/drive')
    os.chdir(DATA_DIR)

    df = pd.read_csv("clustering.csv")
    df = df[['ApplicantIncome', 'LoanAmount']]

    n_bins = 20
    for column in ('ApplicantIncome', 'LoanAmount'):
        y1 = df[column]
        plt.hist(y1, bins=n_bins, edgecolor="white")
        plt.show()

    # ---- K-means on the loan data -------------------------------------
    arr = np.array(df)
    K = 3
    eps = 1e-5
    clusters, centroids, clusters_lst, centroids_lst, n_iter = KMeanClustering(arr, K, eps)
    print(arr.shape)

    # `clusters` is (n, 1); flatten it before selecting rows so that only
    # row indices (and no column indices) are used.
    labels = clusters.ravel()
    index0 = np.flatnonzero(labels == 0)
    print(len(index0))
    cluster0 = arr[index0, :]
    print(cluster0.shape)
    index1 = np.flatnonzero(labels == 1)
    cluster1 = arr[index1, :]
    index2 = np.flatnonzero(labels == 2)
    cluster2 = arr[index2, :]

    plt.scatter(cluster0[:, 0], cluster0[:, 1], color='red', alpha=0.5, label='Cluster-1')
    plt.scatter(cluster1[:, 0], cluster1[:, 1], color='green', alpha=0.5, label='Cluster-2')
    plt.scatter(cluster2[:, 0], cluster2[:, 1], color='blue', alpha=0.5, label='Cluster-3')
    plt.title("Outcome of K-Mean clustering algorithm")
    plt.legend()
    plt.grid()
    plt.show()

    # ---- Hierarchical clustering --------------------------------------
    # Synthetic 25 x 2 sample for the SciPy dendrogram. The values come from
    # NumPy's generator, so that is the one that has to be seeded.
    random.seed(2)
    np.random.seed(2)
    x = np.random.normal(10, 5, 50).reshape(25, 2)
    x[:, 0] += 3
    x[:, 1] -= 4
    print(x)

    dendrogram = sch.dendrogram(sch.linkage(x, method='ward'))
    plt.show()

    # To run the code on the data set above and then a little larger data set.
    g = np.array([[0, 9, 3, 6, 11],
                  [9, 0, 7, 5, 10],
                  [3, 7, 0, 9, 2],
                  [6, 5, 9, 0, 8],
                  [11, 10, 2, 8, 0]])
    # NOTE(review): the notebook passed `x` here before it was defined; the
    # class works on a square distance matrix, so `g` is used - confirm.
    f = HierarchicalClustering(g)
    f.CompleteLinkage()

    # Scratch check of the membership test used when merging clusters.
    wanted = [3, 5]
    groups = [[7, 8], [4, 5], [6, 0]]
    print(any(v in wanted for v in groups[1]))

    # ---- Mall customers data ------------------------------------------
    os.chdir(DATA_DIR)
    df = pd.read_csv("Mall_Customers.csv")
    print(df.columns)
    df = df.loc[:, ['Annual Income (k$)', 'Spending Score (1-100)']]
    print(df)

    # ---- Pairwise squared distances via broadcasting -------------------
    a = np.array([[1, 2, 3], [4, 6, 7], [10, 11, 12]])
    print(a)
    sq_diff = np.square(a - a[:, np.newaxis, :])
    print(sq_diff)
    h = np.sum(sq_diff, axis=2)
    print(h)