import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Synthetic data: four Gaussian blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# Elbow method: within-cluster sum of squares (inertia) for k = 1..10
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Final model with k = 4; plot the points and the fitted centroids
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1])
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()

# Mount Google Drive (Colab) and change to the working directory
from google.colab import drive
import os
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/'

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

dataset = pd.read_csv('Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values  # columns 3 and 4: annual income and spending score

# Dendrogram with Ward linkage, used to choose the number of clusters
plt.figure(figsize=(10, 6))
dendrogram = sch.dendrogram(sch.linkage(X, method='ward'))

# Agglomerative clustering with 5 clusters. Ward linkage implies Euclidean
# distance, so the old affinity='euclidean' argument (renamed 'metric' in
# scikit-learn 1.2 and removed in 1.4) is redundant and dropped here.
model = AgglomerativeClustering(n_clusters=5, linkage='ward')
model.fit(X)
labels = model.labels_

# Plot each cluster with its own colour
for k, color in enumerate(['red', 'blue', 'green', 'purple', 'orange']):
    plt.scatter(X[labels == k, 0], X[labels == k, 1], s=50, marker='o', color=color)
plt.show()

from sklearn.datasets import make_blobs
from sklearn.cluster import DBSCAN

# Data and parameter configuration
num_samples_total = 1000
cluster_centers = [(3, 3), (7, 7)]
num_classes = len(cluster_centers)
epsilon = 1.0
min_samples = 13

# Data generation
X, y = make_blobs(n_samples=num_samples_total, centers=cluster_centers,
                  n_features=num_classes, center_box=(0, 1), cluster_std=0.5)

# DBSCAN
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_
no_clusters = len(np.unique(labels)) - (1 if -1 in labels else 0)  # -1 marks noise, not a cluster
no_noise = np.sum(labels == -1)  # noise points (outliers)

print('# estimated clusters: %d' % no_clusters)
print('# noise points: %d' % no_noise)
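# Aside (a minimal sketch, not part of the original notebook): a common
# heuristic for choosing DBSCAN's eps is the k-distance plot. Sort each
# point's distance to its k-th nearest neighbour (with k = min_samples) and
# look for the "knee": distances below it correspond to dense regions.
# X and min_samples are reused from the DBSCAN setup above.
from sklearn.neighbors import NearestNeighbors

neigh = NearestNeighbors(n_neighbors=min_samples).fit(X)
distances, _ = neigh.kneighbors(X)   # distances to the min_samples nearest neighbours
k_dist = np.sort(distances[:, -1])   # distance to the k-th neighbour, sorted ascending
plt.plot(k_dist)
plt.title('k-distance plot (k = min_samples)')
plt.xlabel('Points sorted by k-distance')
plt.ylabel('Distance to %d-th neighbour' % min_samples)
plt.show()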
# Plot the data; this simple colour map paints cluster 1 blue and everything
# else (cluster 0 and any noise points) red
colors = list(map(lambda x: '#3b4cc0' if x == 1 else '#b40426', labels))
plt.scatter(X[:, 0], X[:, 1], c=colors, marker='o', picker=True)
plt.title('DBSCAN classification')
plt.xlabel('X[0] axis')
plt.ylabel('X[1] axis')
plt.show()

import pandas as pd

url = 'https://raw.githubusercontent.com/JJTorresDS/stocks-ds-edu/main/stocks.csv'
df = pd.read_csv(url, index_col=0)
df.head(5)

import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

# Summarise each stock with three features: mean, standard deviation and
# 10% trimmed mean of its price series
X = df.values
X_2 = np.average(X, axis=0)
X_3 = np.std(X, axis=0)

df_n = pd.DataFrame()
df_n['labels'] = df.columns
df_n['Valores'] = X_2
df_n['Sd'] = X_3
df_n.index = df_n['labels']
df_n['Trim'] = stats.trim_mean(X, 0.1)
df_n = df_n.drop(columns='labels')

# Rescale the three features to [0, 1]
df_n[['Valores', 'Sd', 'Trim']] = MinMaxScaler().fit_transform(df_n[['Valores', 'Sd', 'Trim']])
df_n

feature_mtx = df_n.values
feature_mtx

import scipy.spatial.distance

# Pairwise Euclidean distance matrix between stocks
# (np.zeros replaces the long-removed scipy.zeros alias)
leng = feature_mtx.shape[0]
D = np.zeros([leng, leng])
for i in range(leng):
    for j in range(leng):
        D[i, j] = scipy.spatial.distance.euclidean(feature_mtx[i], feature_mtx[j])

import pylab
import scipy.cluster.hierarchy

# linkage expects a condensed distance vector; passing the square matrix D
# directly would make SciPy treat its rows as observations and emit a warning
Z = scipy.cluster.hierarchy.linkage(scipy.spatial.distance.squareform(D), 'complete')
Z

fig = pylab.figure(figsize=(12, 6))

def llf(id):
    return '[%s]' % (df_n.index[id])

dendro = scipy.cluster.hierarchy.dendrogram(Z, leaf_label_func=llf, leaf_rotation=0,
                                            leaf_font_size=12, orientation='top')

agglom = AgglomerativeClustering(n_clusters=4, linkage='complete')
agglom.fit(feature_mtx)
agglom.labels_

df_n['cluster_'] = agglom.labels_
df_n.head()

# Label each stock on a log-log scatter of mean vs standard deviation
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(np.log(df_n.Valores), np.log(df_n.Sd))
for i, txt in enumerate(df_n.index):
    ax.annotate(txt, (np.log(df_n.Valores.iloc[i]), np.log(df_n.Sd.iloc[i])))

# Preprocessing and modelling
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
import statsmodels.api as sm

USArrests = sm.datasets.get_rdataset("USArrests", "datasets")
datos = USArrests.data

# Fit the PCA model with standardised data
pca_pipe = make_pipeline(StandardScaler(), PCA())
pca_pipe.fit(datos)

# Extract the trained PCA model from the pipeline
modelo_pca = pca_pipe.named_steps['pca']

# Percentage of variance explained by each component
print('Percentage of variance explained by each component')
print(modelo_pca.explained_variance_ratio_)

import seaborn as sns
sns.set_style("whitegrid")

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
ax.bar(x=np.arange(modelo_pca.n_components_) + 1,
       height=modelo_pca.explained_variance_ratio_)
for x, y in zip(np.arange(len(datos.columns)) + 1, modelo_pca.explained_variance_ratio_):
    label = round(y, 2)
    ax.annotate(label, (x, y), textcoords="offset points", xytext=(0, 10), ha='center')
ax.set_xticks(np.arange(modelo_pca.n_components_) + 1)
ax.set_ylim(0, 1.1)
ax.set_title('Percentage of variance explained by each component')
ax.set_xlabel('Principal component')
ax.set_ylabel('Prop. variance explained');
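# Aside (a short sketch, not in the original notebook): the rows of
# modelo_pca.components_ hold the loadings, i.e. the weight of each original
# variable in each principal component. Inspecting them as a table makes the
# components interpretable before projecting the observations below.
componentes = pd.DataFrame(modelo_pca.components_,
                           columns=datos.columns,
                           index=['PC1', 'PC2', 'PC3', 'PC4'])
print(componentes.round(3))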
# Projection of the training observations
# ==============================================================================
proyecciones = pca_pipe.transform(X=datos)
proyecciones = pd.DataFrame(
    proyecciones,
    columns=['PC1', 'PC2', 'PC3', 'PC4'],
    index=datos.index
)

# Equivalent manual projection: loadings matrix times the standardised data
proyecciones = np.dot(modelo_pca.components_, scale(datos).T)
proyecciones = pd.DataFrame(proyecciones, index=['PC1', 'PC2', 'PC3', 'PC4'])
proyecciones = proyecciones.transpose().set_index(datos.index)

# Scatter of the first two components, labelling each state
plt.figure(figsize=(15, 6))
proyecciones['val'] = proyecciones.index
ax = proyecciones.set_index('PC1')['PC2'].plot(style='o')

def label_point(x, y, val, ax):
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for i, point in a.iterrows():
        ax.text(point['x'], point['y'], str(point['val']))

label_point(proyecciones.PC1, proyecciones.PC2, proyecciones.val, ax)
plt.axvline(x=0, color='black')
plt.axhline(y=0, color='black')
plt.title('PC1 vs PC2, US states')
plt.xlabel('PC1', color='k')
plt.ylabel('PC2', color='black')
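# Aside (a minimal sketch, not in the original notebook): the cumulative
# explained variance is a quick way to decide how many components to keep;
# for USArrests the first two components already account for most of the
# variance, which is why the PC1 vs PC2 plot above is a reasonable summary.
prop_varianza_acum = modelo_pca.explained_variance_ratio_.cumsum()
print('Cumulative explained variance:', prop_varianza_acum.round(3))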