#!/usr/bin/env python # coding: utf-8 # In[1]: import numpy as np from scipy.spatial.distance import pdist, squareform from sklearn.datasets import load_iris from sklearn.cluster import DBSCAN as skDBSCAN # ### Implementation 1 # - based on brute force # - similar to scikit-learn algorithm='brute' # In[2]: class DBSCAN(): def __init__(self, eps=0.5, min_samples=5): self.eps = eps self.min_samples = min_samples def fit(self, X): dist = squareform(pdist(X)) neighbors = np.array([np.where(d <= self.eps)[0] for d in dist]) n_neighbors = np.array([len(neighbor) for neighbor in neighbors]) core_samples = n_neighbors >= self.min_samples labels = np.full(X.shape[0], -1) label_num = 0 stack = [] for i in range(len(labels)): if labels[i] != -1 or not core_samples[i]: continue stack.append(i) while len(stack): cur = stack.pop() if labels[cur] == -1: labels[cur] = label_num if core_samples[cur]: for neighbor in neighbors[cur]: if labels[neighbor] == -1 and neighbor not in stack: stack.append(neighbor) label_num += 1 self.core_sample_indices_ = np.where(core_samples)[0] self.labels_= labels return self def fit_predict(self, X): self.fit(X) return self.labels_ # In[3]: X, _ = load_iris(return_X_y=True) clust1 = DBSCAN().fit(X) clust2 = skDBSCAN().fit(X) assert np.allclose(clust1.core_sample_indices_, clust2.core_sample_indices_) assert np.array_equal(clust1.labels_, clust2.labels_)