# Mount Google Drive and set it as the working directory (Colab environment).
from google.colab import drive
import os

drive.mount('/content/gdrive')

# Show where we are, then move into the Drive root so relative
# file paths (e.g. the CSV loaded below) resolve against Drive.
print(os.getcwd())
os.chdir("/content/gdrive/My Drive")
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True). /content/gdrive/My Drive
# K-Means Clustering
# Library imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the dataset (comma-separated Mall Customers file) and
# preview the first rows.
dataset = pd.read_csv("Mall_Customers.csv", sep=",")
dataset.head()
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 |
1 | 2 | Male | 21 | 15 | 81 |
2 | 3 | Female | 20 | 16 | 6 |
3 | 4 | Female | 23 | 16 | 77 |
4 | 5 | Female | 31 | 17 | 40 |
# Build the feature matrix: columns 3 and 4 of the dataset
# (Annual Income in k$ and Spending Score 1-100).
X = dataset.iloc[:, [3, 4]].values
X
array([[ 15, 39], [ 15, 81], [ 16, 6], [ 16, 77], [ 17, 40], [ 17, 76], [ 18, 6], [ 18, 94], [ 19, 3], [ 19, 72], [ 19, 14], [ 19, 99], [ 20, 15], [ 20, 77], [ 20, 13], [ 20, 79], [ 21, 35], [ 21, 66], [ 23, 29], [ 23, 98], [ 24, 35], [ 24, 73], [ 25, 5], [ 25, 73], [ 28, 14], [ 28, 82], [ 28, 32], [ 28, 61], [ 29, 31], [ 29, 87], [ 30, 4], [ 30, 73], [ 33, 4], [ 33, 92], [ 33, 14], [ 33, 81], [ 34, 17], [ 34, 73], [ 37, 26], [ 37, 75], [ 38, 35], [ 38, 92], [ 39, 36], [ 39, 61], [ 39, 28], [ 39, 65], [ 40, 55], [ 40, 47], [ 40, 42], [ 40, 42], [ 42, 52], [ 42, 60], [ 43, 54], [ 43, 60], [ 43, 45], [ 43, 41], [ 44, 50], [ 44, 46], [ 46, 51], [ 46, 46], [ 46, 56], [ 46, 55], [ 47, 52], [ 47, 59], [ 48, 51], [ 48, 59], [ 48, 50], [ 48, 48], [ 48, 59], [ 48, 47], [ 49, 55], [ 49, 42], [ 50, 49], [ 50, 56], [ 54, 47], [ 54, 54], [ 54, 53], [ 54, 48], [ 54, 52], [ 54, 42], [ 54, 51], [ 54, 55], [ 54, 41], [ 54, 44], [ 54, 57], [ 54, 46], [ 57, 58], [ 57, 55], [ 58, 60], [ 58, 46], [ 59, 55], [ 59, 41], [ 60, 49], [ 60, 40], [ 60, 42], [ 60, 52], [ 60, 47], [ 60, 50], [ 61, 42], [ 61, 49], [ 62, 41], [ 62, 48], [ 62, 59], [ 62, 55], [ 62, 56], [ 62, 42], [ 63, 50], [ 63, 46], [ 63, 43], [ 63, 48], [ 63, 52], [ 63, 54], [ 64, 42], [ 64, 46], [ 65, 48], [ 65, 50], [ 65, 43], [ 65, 59], [ 67, 43], [ 67, 57], [ 67, 56], [ 67, 40], [ 69, 58], [ 69, 91], [ 70, 29], [ 70, 77], [ 71, 35], [ 71, 95], [ 71, 11], [ 71, 75], [ 71, 9], [ 71, 75], [ 72, 34], [ 72, 71], [ 73, 5], [ 73, 88], [ 73, 7], [ 73, 73], [ 74, 10], [ 74, 72], [ 75, 5], [ 75, 93], [ 76, 40], [ 76, 87], [ 77, 12], [ 77, 97], [ 77, 36], [ 77, 74], [ 78, 22], [ 78, 90], [ 78, 17], [ 78, 88], [ 78, 20], [ 78, 76], [ 78, 16], [ 78, 89], [ 78, 1], [ 78, 78], [ 78, 1], [ 78, 73], [ 79, 35], [ 79, 83], [ 81, 5], [ 81, 93], [ 85, 26], [ 85, 75], [ 86, 20], [ 86, 95], [ 87, 27], [ 87, 63], [ 87, 13], [ 87, 75], [ 87, 10], [ 87, 92], [ 88, 13], [ 88, 86], [ 88, 15], [ 88, 69], [ 93, 14], [ 93, 90], [ 97, 32], [ 97, 86], [ 
98, 15], [ 98, 88], [ 99, 39], [ 99, 97], [101, 24], [101, 68], [103, 17], [103, 85], [103, 23], [103, 69], [113, 8], [113, 91], [120, 16], [120, 79], [126, 28], [126, 74], [137, 18], [137, 83]])
Nos quedamos con las variables 3 y 4 del conjunto de datos, las cuales corresponden al ingreso anual en miles y la puntuación del cliente.
# Elbow Method: find the optimal number of clusters.
from sklearn.cluster import KMeans

wcss = []  # within-cluster sum of squares for each candidate k
for i in range(1, 11):
    # NOTE: the loop body must be indented — the original export had lost
    # the indentation, which is a SyntaxError in Python.
    # 'k-means++' seeding converges faster than purely random initialization.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot WCSS against k; the "elbow" in the curve marks the optimal k.
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
En el bloque anterior, generamos los clusters para valores de 1 a 10 (en el rango de 1 a 11) y obtenemos para cada uno de ellos la suma de las distancias con el atributo inertia_ del objeto kmeans. En la gráfica observamos que la disminución en la suma de las distancias se atenúa cuando el número de clusters es igual a 5, por lo que, para este caso práctico, el número óptimo de clusters será de 5.
# Fit K-Means with the 5 clusters identified by the elbow method above.
# 'k-means++' speeds up convergence compared with random initialization;
# random_state=42 makes the labelling reproducible.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(X)
y_kmeans
array([2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 4, 0, 4, 1, 4, 1, 4, 0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 0, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4], dtype=int32)
# Attach each customer's cluster label to the dataframe as a 'pred' column.
dataset['pred'] = y_kmeans
dataset
CustomerID | Genre | Age | Annual Income (k$) | Spending Score (1-100) | pred | |
---|---|---|---|---|---|---|
0 | 1 | Male | 19 | 15 | 39 | 2 |
1 | 2 | Male | 21 | 15 | 81 | 3 |
2 | 3 | Female | 20 | 16 | 6 | 2 |
3 | 4 | Female | 23 | 16 | 77 | 3 |
4 | 5 | Female | 31 | 17 | 40 | 2 |
... | ... | ... | ... | ... | ... | ... |
195 | 196 | Female | 35 | 120 | 79 | 4 |
196 | 197 | Female | 45 | 126 | 28 | 1 |
197 | 198 | Male | 32 | 126 | 74 | 4 |
198 | 199 | Male | 32 | 137 | 18 | 1 |
199 | 200 | Male | 30 | 137 | 83 | 4 |
200 rows × 6 columns
# Per-cluster means of the numeric columns.
# numeric_only=True is required on pandas >= 2.0: without it, the non-numeric
# 'Genre' column makes mean() raise TypeError (older pandas silently dropped it,
# which is what the output shown above reflects).
dataset.groupby(['pred']).mean(numeric_only=True)
CustomerID | Age | Annual Income (k$) | Spending Score (1-100) | |
---|---|---|---|---|
pred | ||||
0 | 86.320988 | 42.716049 | 55.296296 | 49.518519 |
1 | 164.371429 | 41.114286 | 88.200000 | 17.114286 |
2 | 23.000000 | 45.217391 | 26.304348 | 20.913043 |
3 | 23.090909 | 25.272727 | 25.727273 | 79.363636 |
4 | 162.000000 | 32.692308 | 86.538462 | 82.128205 |
Para poder observar gráficamente la asignación de los 200 clientes a 5 grupos o clusters realizamos lo siguiente, le asignamos un color a cada grupo y marcamos los centroides en amarillo:
# Visualize the 5 clusters in the income/spending plane,
# one color per cluster, numbered 1-5 in the legend.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for idx, color in enumerate(cluster_colors):
    members = X[y_kmeans == idx]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label='Cluster ' + str(idx + 1))

# Centroids in yellow, drawn larger so they stand out.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()