The DBSCAN algorithm is a clustering algorithm which works really well for datasets in which samples congregate in large groups. cuML's DBSCAN expects a cuDF DataFrame, and constructs an adjacency graph to compute the distances between close neighbours. The DBSCAN model implemented in the cuML library can accept the following parameters:
The methods that can be used with DBSCAN are:
The model can take array-like objects, either on host (as NumPy arrays) or on device (as Numba arrays or any object supporting the `__cuda_array_interface__` protocol), as well as cuDF DataFrames. In order to convert your dataset to the cuDF format please read the cuDF documentation at https://rapidsai.github.io/projects/cudf/en/latest/. For additional information on the DBSCAN model please refer to the documentation at https://rapidsai.github.io/projects/cuml/en/latest/index.html
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN as skDBSCAN
from cuml import DBSCAN as cumlDBSCAN
import cudf
import os
# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for clustering
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached='data/mortgage.npy.gz'):
    """Load a random sample of the mortgage dataset, or generate random data.

    Parameters
    ----------
    nrows : int
        Number of rows (samples) to return.
    ncols : int
        Number of columns (features) to keep.
    cached : str
        Path to the gzipped ``.npy`` mortgage dataset. When the file does not
        exist, a uniform-random dataset of the requested shape is generated
        instead.

    Returns
    -------
    pandas.DataFrame
        DataFrame with columns named ``fea0 .. fea{ncols-1}``.
    """
    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # BUG FIX: np.random.randint's upper bound is already exclusive, so the
        # original `X.shape[0] - 1` could never sample the last row. Sampling
        # is with replacement, matching the original behavior.
        X = X[np.random.randint(0, X.shape[0], nrows), :ncols]
    else:
        # dataset not present locally: fall back to random data
        print('use random data')
        X = np.random.rand(nrows, ncols)
    df = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])})
    return df
# this function checks if the results obtained from two different methods are the same
from sklearn.metrics import mean_squared_error
def array_equal(a, b, threshold=5e-3, with_sign=True):
    """Return True when *a* and *b* are (approximately) equal.

    Both inputs are converted to NumPy arrays via ``to_nparray`` and compared
    by mean squared error against *threshold*, so small numerical differences
    between implementations (e.g. sklearn vs cuML) are tolerated.

    Parameters
    ----------
    a, b : array-like
        Values to compare; anything ``to_nparray`` accepts.
    threshold : float
        Maximum mean squared error for the inputs to count as equal.
    with_sign : bool
        When False, compare absolute values only (sign differences ignored).

    Returns
    -------
    bool
        True when ``mse(a, b) < threshold``.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    # idiom fix: `not with_sign` instead of comparing against False
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    res = mean_squared_error(a, b) < threshold
    return res
# the function converts a variable from ndarray or dataframe format to numpy array
def to_nparray(x):
    """Convert *x* (host array, dataframe, scalar, or cuDF object) to a NumPy array.

    NumPy arrays and pandas DataFrames are copied into a fresh ndarray,
    a lone ``np.float64`` scalar becomes a 1-element array, cuDF objects
    are routed through pandas, and anything else is returned untouched.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x
%%time
# --- notebook cell: build the input dataset and pick DBSCAN parameters ---
# nrows = number of samples
# ncols = number of features of each sample
nrows = 5000
ncols = 128
# load_data returns a pandas DataFrame (mortgage data if cached, else random)
X = load_data(nrows,ncols)
print('data',X.shape)
# eps = maximum distance between 2 sample points for them to be in the same neighborhood
# min_samples = number of samples that should be present in a neighborhood for it to be considered as a core point
eps = 3
min_samples = 2
%%time
# use the sklearn DBSCAN model to fit the dataset
# (CPU baseline; its cluster labels are read later via clustering_sk.labels_)
clustering_sk = skDBSCAN(eps = eps, min_samples = min_samples)
clustering_sk.fit(X)
%%time
# convert the pandas dataframe to cudf format
# NOTE: X is rebound to the GPU-resident cuDF DataFrame from here on
X = cudf.DataFrame.from_pandas(X)
%%time
# run the cuml DBSCAN model to fit the dataset
# (same eps/min_samples as the sklearn run so the two label sets are comparable)
clustering_cuml = cumlDBSCAN(eps = eps, min_samples = min_samples)
clustering_cuml.fit(X)
# check if the output of the sklearn model and the cuml model are equal or not
# array_equal compares the two label vectors by mean squared error against a
# small threshold, so identical clusterings (same label values) pass
passed = array_equal(clustering_sk.labels_,clustering_cuml.labels_)
message = 'compare dbscan: cuml vs sklearn labels_ %s'%('equal'if passed else 'NOT equal')
print(message)