import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go
import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

from google.colab import drive
import os

drive.mount('/content/gdrive')

# Set the working directory in Drive
print(os.getcwd())
os.chdir("/content/gdrive/My Drive")
%cd '/content/gdrive/MyDrive/Diplomado Python Análisis y Visualización de Datos/Modulo 5. Aprendizaje No Supervisado'

df = pd.read_csv('Mall_Customers.csv')
df.head()
df.isnull().sum()
df.describe()

# Distribution of each numeric feature (histplot; distplot is deprecated in recent seaborn)
plt.figure(figsize=(15, 6))
n = 0
for x in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
    n += 1
    plt.subplot(1, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.histplot(df[x], bins=30, kde=True)
    plt.title('Histogram of {}'.format(x))
plt.show()

# Encode Gender as a numeric column
label_encoder = preprocessing.LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df.head()
df.shape

# Correlation heatmap of the numeric features
plt.figure(figsize=(16, 8))
sns.heatmap(df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].corr(), annot=True)
plt.show()

plt.figure(figsize=(12, 6))
plt.subplot(121)
sns.scatterplot(x='Age', y='Annual Income (k$)', data=df)
plt.subplots_adjust(hspace=0.3, wspace=0.3)
plt.subplot(122)
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', data=df)

df = df.drop(columns=['CustomerID', 'Gender'])
df.head()

# Dendrogram; with Ward's method the metric must be euclidean
plt.figure(figsize=(16, 8))
dendrogram = sch.dendrogram(
    sch.linkage(df, method='ward', metric='euclidean'),
    orientation='top',  # other options: right, left, bottom
)
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distance')
plt.show()

# scikit-learn >= 1.2 uses metric= (formerly affinity=)
hc = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
y_hc = hc.fit_predict(df)
y_hc
y_hc.shape

df['cluster'] = y_hc
df

# 3D view of the clusters with plotly
trace1 = go.Scatter3d(
    x=df['Age'],
    y=df['Spending Score (1-100)'],
    z=df['Annual Income (k$)'],
    mode='markers',
    marker=dict(
        color=df['cluster'],
        size=10,
        line=dict(color=df['cluster'], width=12),
        opacity=0.8,
    ),
)
data = [trace1]
layout = go.Layout(
    title='Clusters from Agglomerative Clustering',
    scene=dict(
        xaxis=dict(title='Age'),
        yaxis=dict(title='Spending Score'),
        zaxis=dict(title='Annual Income'),
    ),
)
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

df.head()

# Annual Income vs Spending Score, colored by cluster
X = df.iloc[:, [1, 2]].values
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='purple', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='orange', label='Cluster 5')
plt.title('Customer clusters (Hierarchical Clustering)')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()

# Age vs Annual Income, colored by cluster
X = df.iloc[:, [0, 1]].values
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='purple', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='orange', label='Cluster 5')
plt.title('Customer clusters (Hierarchical Clustering)')
plt.xlabel('Age')
plt.ylabel('Annual Income (k$)')
plt.legend()
plt.show()

# Age vs Spending Score, colored by cluster (columns 0 and 2 of df)
X = df.iloc[:, [0, 2]].values
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s=100, c='green', label='Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s=100, c='purple', label='Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s=100, c='orange', label='Cluster 5')
plt.title('Customer clusters (Hierarchical Clustering)')
plt.xlabel('Age')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
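# --- Added sketch (not part of the original notebook): silhouette check ---
# The choice of n_clusters = 5 above comes from reading the dendrogram. A minimal,
# optional way to sanity-check it is to compare silhouette scores for a few candidate
# cluster counts on the same three features; the variable names `features` and
# `labels_k` below are illustrative.
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

features = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].values
for k in range(2, 8):
    labels_k = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(features)
    print('k =', k, 'silhouette =', round(silhouette_score(features, labels_k), 3))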
!wget -O cars_clus.csv https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/cars_clus.csv

# Read the data
filename = 'cars_clus.csv'
pdf = pd.read_csv(filename)
print("Shape: ", pdf.shape)
pdf.head(5)

# Coerce the numeric columns and drop rows with missing values
print("Shape before cleaning: ", pdf.shape)
num_cols = ['sales', 'resale', 'type', 'price', 'engine_s', 'horsepow', 'wheelbas',
            'width', 'length', 'curb_wgt', 'fuel_cap', 'mpg', 'lnsales']
pdf[num_cols] = pdf[num_cols].apply(pd.to_numeric, errors='coerce')
pdf = pdf.dropna()
pdf = pdf.reset_index(drop=True)
print("Shape after cleaning: ", pdf.shape)
pdf.head(5)

featureset = pdf[['engine_s', 'horsepow', 'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap', 'mpg']]
featureset
feature_mtx = featureset.values
feature_mtx
feature_mtx.shape

# Pairwise euclidean distance matrix (np.zeros; scipy.zeros was removed from scipy)
from scipy.spatial.distance import euclidean, squareform

leng = feature_mtx.shape[0]
D = np.zeros([leng, leng])
for i in range(leng):
    for j in range(leng):
        D[i, j] = euclidean(feature_mtx[i], feature_mtx[j])
D.shape
D

import pylab
import scipy.cluster.hierarchy

# linkage expects a condensed distance matrix, so convert the square matrix D
Z = scipy.cluster.hierarchy.linkage(squareform(D, checks=False), 'complete')
Z

from scipy.cluster.hierarchy import fcluster

# Cut the tree at a fixed distance
max_d = 3
clusters = fcluster(Z, max_d, criterion='distance')
clusters

# Or determine a fixed number of clusters
k = 5
clusters = fcluster(Z, k, criterion='maxclust')
clusters

fig = pylab.figure(figsize=(18, 50))

def llf(id):
    return '[%s %s %s]' % (pdf['manufact'][id], pdf['model'][id], int(float(pdf['type'][id])))

dendro = scipy.cluster.hierarchy.dendrogram(Z, leaf_label_func=llf, leaf_rotation=0,
                                            leaf_font_size=12, orientation='top')

from scipy.spatial import distance_matrix

dist_matrix = distance_matrix(feature_mtx, feature_mtx)
print(dist_matrix)

agglom = AgglomerativeClustering(n_clusters=6, linkage='complete')
agglom.fit(feature_mtx)
agglom.labels_

pdf['cluster_'] = agglom.labels_
pdf.head()
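# --- Added sketch (not part of the original notebook): cophenetic correlation ---
# A quick, optional diagnostic for the hierarchical linkage Z built above: the
# cophenetic correlation coefficient measures how faithfully the dendrogram
# preserves the original pairwise distances (values closer to 1 are better).
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

coph_corr, coph_dists = cophenet(Z, pdist(feature_mtx))
print('Cophenetic correlation:', round(coph_corr, 3))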
import matplotlib.cm as cm

n_clusters = max(agglom.labels_) + 1
colors = cm.rainbow(np.linspace(0, 1, n_clusters))
cluster_labels = list(range(0, n_clusters))

# Figure sized 16 x 14 inches
plt.figure(figsize=(16, 14))
for color, label in zip(colors, cluster_labels):
    subset = pdf[pdf.cluster_ == label]
    for i in subset.index:
        plt.text(subset.horsepow[i], subset.mpg[i], str(subset['model'][i]), rotation=25)
    plt.scatter(subset.horsepow, subset.mpg, s=subset.price * 10, c=color,
                label='cluster' + str(label), alpha=0.5)
#    plt.scatter(subset.horsepow, subset.mpg)
plt.legend()
plt.title('Clusters')
plt.xlabel('horsepow')
plt.ylabel('mpg')

pdf.groupby(['cluster_', 'type'])['cluster_'].count()

agg_cars = pdf.groupby(['cluster_', 'type'])[['horsepow', 'engine_s', 'mpg', 'price']].mean()
agg_cars

plt.figure(figsize=(16, 10))
for color, label in zip(colors, cluster_labels):
    subset = agg_cars.loc[(label,), ]
    for i in subset.index:
        plt.text(subset.loc[i][0] + 5, subset.loc[i][2],
                 'type=' + str(int(i)) + ', price=' + str(int(subset.loc[i][3])) + 'k')
    plt.scatter(subset.horsepow, subset.mpg, s=subset.price * 20, c=color, label='cluster' + str(label))
plt.legend()
plt.title('Clusters')
plt.xlabel('horsepow')
plt.ylabel('mpg')
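# --- Added sketch (not part of the original notebook): scaling before clustering ---
# The cars features above are clustered on their raw scales, so columns with large
# magnitudes (e.g. horsepow, length) dominate the euclidean distances. One common,
# optional refinement is to rescale every feature to [0, 1] before building the
# linkage; the *_scaled names below are illustrative.
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import linkage, fcluster

feature_mtx_scaled = MinMaxScaler().fit_transform(featureset.values)
Z_scaled = linkage(feature_mtx_scaled, method='complete', metric='euclidean')
clusters_scaled = fcluster(Z_scaled, 5, criterion='maxclust')
print(pd.Series(clusters_scaled).value_counts().sort_index())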