#!/usr/bin/env python
# coding: utf-8

# # GMM
#
# A Gaussian mixture model (GMM) is a probabilistic approach to clustering that addresses many of the shortcomings of k-means. In this approach we describe each cluster by its centroid (mean), its covariance, and the size of the cluster (weight).
#
# ### Approach
#
# * Rather than identifying clusters by "nearest" centroids as k-means does, we fit a set of k Gaussians to the data.
# * Then we estimate the Gaussian distribution parameters, such as the mean and variance of each cluster, together with the weight of each cluster.
# * After learning the parameters, for each data point we can calculate the probabilities of it belonging to each of the clusters.

# ### How do we estimate GD params
#
# * Expectation maximization (EM) is the technique most commonly used to estimate the mixture model's parameters.
# * In frequentist probability theory, models are typically learned using maximum likelihood estimation, which seeks to maximize the probability, or likelihood, of the observed data given the model parameters.
#
# It is hard to visualize and grasp these concepts on a first pass. Here is some material for a deeper dive into GMMs:
# * http://www.cse.iitm.ac.in/~vplab/courses/DVP/PDF/gmm.pdf

# In[49]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="white", color_codes=True)

get_ipython().run_line_magic('matplotlib', 'inline')


# In[50]:


data = pd.read_csv('my_machine-learning/datasets/iris.csv')
data = data.drop('Id', axis=1)  # get rid of the Id column - don't need it
data.sample(5)


# In[51]:


X = data.iloc[:, 0:4]
y = data.iloc[:, -1]


# ### Standard Scaling

# In[52]:


from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(X)
X_scaled_array = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled_array, columns=X.columns)


# In[53]:


from sklearn.cluster import KMeans

kmean = KMeans(n_clusters=3)
y_kmeans = kmean.fit_predict(X_scaled)


# #### Adjusted Rand score
#
# * You can't just compare the SpeciesId with the cluster numbers, because they are both arbitrarily assigned integers.
# * But you can use the *adjusted Rand score* to quantify the goodness of the clustering, as compared with SpeciesId (the true labels).
# * E.g. this gives a perfect score of 1.0 even though the labels are reversed: adjusted_rand_score([0,0,1,1], [1,1,0,0]) # => 1.0
#
# See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html

# In[54]:


from sklearn.metrics.cluster import adjusted_rand_score

# first let's see how the k-means clustering did
score = adjusted_rand_score(y, y_kmeans)
score


# In[55]:


from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=3)
y_gmm = gmm.fit_predict(X_scaled)


# * GMM tries to fit normally distributed clusters, which is probably a good description of this data,
# * so it fits it better. k-means is biased towards spherically distributed clusters.
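# * Because GMM is a soft-clustering method, each point also gets a membership probability for every component. Below is a minimal sketch to inspect those probabilities and the learned parameters; it assumes the `gmm` and `X_scaled` objects fitted above, and `iris_probs` is just an illustrative name.

# In[ ]:


# each row sums to 1: the point's membership probabilities over the 3 components
iris_probs = gmm.predict_proba(X_scaled)
print(iris_probs[:5].round(3))

# the learned mixture parameters: one weight, mean vector and covariance matrix per component
print(gmm.weights_.round(3))   # mixing weights (relative cluster sizes)
print(gmm.means_.round(3))     # 3 x 4 matrix of component means (in scaled units)
print(gmm.covariances_.shape)  # (3, 4, 4) with the default covariance_type='full'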
# In[56]:


score = adjusted_rand_score(y, y_gmm)
score


# * **It is impossible to plot this 4-dimensional data directly without a dimensionality reduction technique like PCA.**
# * So we will create such a plot after learning PCA.

# ### Part 2

# In[70]:


import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import numpy as np
from sklearn.datasets import make_blobs, make_circles

X, y_true = make_blobs(n_samples=700, centers=4, cluster_std=0.5, random_state=2019)
X = X[:, ::-1]

from sklearn.mixture import GaussianMixture as GMM

gmm = GMM(n_components=4).fit(X)
labels = gmm.predict(X)
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
plt.show()


# In[17]:


# note: with the inline backend the figure is closed after plt.show(),
# so call savefig() before plt.show() in the cell above if this file comes out blank
plt.savefig('gmm.jpg')


# In[18]:


probs = gmm.predict_proba(X)
print(probs[:10].round(2))


# In[23]:


# scale marker size by how certain the model is about each point's assignment
size = 50 * probs.max(1) ** 2
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=size)
plt.show()


# In[21]:


from matplotlib.patches import Ellipse


def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw 1-, 2- and 3-sigma ellipses for a Gaussian with the given position and covariance."""
    ax = ax or plt.gca()

    # convert the covariance matrix to principal axes
    if covariance.shape == (2, 2):
        U, s, Vt = np.linalg.svd(covariance)
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)

    for nsig in range(1, 4):
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))


def plot_gmm(gmm, X, label=True, ax=None):
    """Fit the GMM to X, scatter the points and draw each component's ellipse."""
    ax = ax or plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=4, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=4, zorder=2)
    ax.axis('equal')

    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        # pass ax through so the ellipses land on the same axes as the scatter
        draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)


# In[24]:


rng = np.random.RandomState(13)
X_stretched = np.dot(X, rng.randn(2, 2))

gmm = GMM(n_components=4, covariance_type='full', random_state=42)
plt.figure(figsize=(10, 6))
plot_gmm(gmm, X_stretched)


# In[54]:


from sklearn.datasets import make_moons

moon, y = make_moons(100, noise=.04, random_state=0)
plt.figure(figsize=(8, 5))
plt.scatter(x=moon[:, 0], y=moon[:, 1])
plt.show()


# In[53]:


gmm2 = GMM(n_components=2, covariance_type='full', random_state=0)
plt.figure(figsize=(8, 5))
plot_gmm(gmm2, moon)


# In[56]:


gmm10 = GMM(n_components=10, covariance_type='full', random_state=0)
plt.figure(figsize=(8, 5))
plot_gmm(gmm10, moon, label=False)


# In[65]:


# draw new samples from the fitted 10-component mixture (GMM as a generative model)
Xnew = gmm10.sample(200)[0]
plt.figure(figsize=(10, 6))
plt.scatter(Xnew[:, 0], Xnew[:, 1])
plt.show()


# In[62]:


# use BIC/AIC to choose the number of components
n_components = np.arange(1, 21)
models = [GMM(n, covariance_type='full', random_state=0).fit(moon)
          for n in n_components]

plt.figure(figsize=(10, 6))
plt.plot(n_components, [m.bic(moon) for m in models], label='BIC')
plt.plot(n_components, [m.aic(moon) for m in models], label='AIC')
plt.legend(loc='best')
plt.xlabel('n_components')
plt.show()


# In[72]:


# Circles
X1 = make_circles(factor=0.5, noise=0.05, n_samples=1500)
# Moons
X2 = make_moons(n_samples=1500, noise=0.05)

fig, ax = plt.subplots(1, 2)
for i, X in enumerate([X1, X2]):
    fig.set_size_inches(18, 7)
    km = KMeans(n_clusters=2)
    km.fit(X[0])
    labels = km.predict(X[0])
    centroids = km.cluster_centers_

    ax[i].scatter(X[0][:, 0], X[0][:, 1], c=labels)
    ax[i].scatter(centroids[0, 0], centroids[0, 1], marker='*', s=400, c='r')
    ax[i].scatter(centroids[1, 0], centroids[1, 1], marker='+', s=300, c='green')

plt.suptitle('Simulated data', y=1.05, fontsize=22, fontweight='semibold')
plt.tight_layout()
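# * For comparison, the same two datasets can be handed to a GMM used as a density model. A rough sketch, reusing the `plot_gmm` helper and `X1`/`X2` defined above; the choice of 8 full-covariance components per dataset is an illustrative assumption, not a tuned value.

# In[ ]:


fig, ax = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
for i, X in enumerate([X1, X2]):
    # many full-covariance components let the mixture trace the curved,
    # non-spherical shapes that 2-cluster k-means handles poorly above
    gmm_density = GMM(n_components=8, covariance_type='full', random_state=0)
    plot_gmm(gmm_density, X[0], label=False, ax=ax[i])

plt.suptitle('GMM components fitted to the same simulated data', y=1.05,
             fontsize=22, fontweight='semibold')
plt.tight_layout()


# In[ ]: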