K-Nearest Neighbors (KNN) is an unsupervised algorithm: to find the “closest” datapoint(s) to new, unseen data, one calculates a suitable “distance” between it and each existing point, and returns the K datapoints with the smallest distances.
cuML’s KNN expects a cuDF DataFrame or a NumPy array (automatic chunking into NumPy arrays will be added in a future release). It first fits a special data structure that approximates the distance calculations, bringing query time to O(p log n) instead of the brute-force O(np), where p is the number of features:
The KNN function accepts the following parameters:
The methods that can be used with KNN are:
The model can take array-like objects, either in host memory as NumPy arrays or in device memory (as Numba arrays or objects compliant with `__cuda_array_interface__`), as well as cuDF DataFrames. To convert your dataset to cuDF format, please read the cuDF documentation at https://rapidsai.github.io/projects/cudf/en/latest/. For additional information on the K-Nearest Neighbors model, please refer to the documentation at https://rapidsai.github.io/projects/cuml/en/latest/api.html#nearest-neighbors
import numpy as np
import pandas as pd
import cudf
import os
from sklearn.neighbors import NearestNeighbors as skKNN
from cuml.neighbors.nearest_neighbors import NearestNeighbors as cumlKNN
# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for clustering
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached='data/mortgage.npy.gz', source='mortgage'):
    """Load `nrows` x `ncols` of the cached mortgage dataset, or random data.

    Parameters
    ----------
    nrows : int
        Number of samples (rows) to return.
    ncols : int
        Number of features (columns) to return.
    cached : str
        Path to the gzipped mortgage .npy file.
    source : str
        'mortgage' to use the cached file when present; anything else
        (or a missing file) falls back to random data.

    Returns
    -------
    pandas.DataFrame with columns 'fea0' ... 'fea{ncols-1}'.
    """
    if os.path.exists(cached) and source == 'mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # Sample nrows rows with replacement. randint's upper bound is
        # already exclusive, so pass X.shape[0] (the original used
        # X.shape[0]-1, which could never select the last row).
        X = X[np.random.randint(0, X.shape[0], nrows), :ncols]
    else:
        # create a random dataset
        print('use random data')
        X = np.random.random((nrows, ncols)).astype('float32')
    df = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])}).fillna(0)
    return df
from sklearn.metrics import mean_squared_error
# this function checks if the results obtained from two different methods (sklearn and cuml) are the same
def array_equal(a, b, threshold=1e-3, with_sign=True, metric='mse'):
    """Check whether two array-likes agree to within `threshold`.

    Parameters
    ----------
    a, b : array-like
        Inputs; converted to NumPy arrays via `to_nparray`.
    threshold : float
        Tolerance applied to the chosen error metric.
    with_sign : bool
        When False, compare absolute values only (sign differences ignored).
    metric : str
        'mse'  -> mean squared error below threshold,
        'abs'  -> every elementwise |a-b| within threshold,
        'acc'  -> fraction of mismatched elements below threshold.

    Returns
    -------
    bool : True when the arrays are considered equal.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if with_sign == False:
        a, b = np.abs(a), np.abs(b)
    if metric == 'mse':
        error = mean_squared_error(a, b)
        res = error < threshold
    elif metric == 'abs':
        # Compare |a - b| so that large *negative* differences are also
        # detected (the original signed comparison silently accepted them).
        error = np.abs(a - b)
        res = len(error[error > threshold]) == 0
    elif metric == 'acc':
        error = np.sum(a != b) / (a.shape[0] * a.shape[1])
        res = error < threshold
    return res
# calculate the accuracy
def accuracy(a, b, threshold=1e-4):
    """Return True when the fraction of entries of `a` and `b` that differ
    by more than 1 is below `threshold`.

    Intended for comparing neighbor-index matrices, where off-by-one index
    differences (ties) are tolerated. Uses |a - b| so that negative
    differences count as mismatches too (the original signed comparison
    missed them).
    """
    a = to_nparray(a)
    b = to_nparray(b)
    c = np.abs(a - b)
    c = len(c[c > 1]) / (c.shape[0] * c.shape[1])
    return c < threshold
# the function converts a variable from ndarray or dataframe format to numpy array
def to_nparray(x):
    """Coerce `x` to a NumPy array.

    Handles NumPy arrays, pandas DataFrames, NumPy float64 scalars, and
    cuDF DataFrames/Series; any other object is returned unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        # wrap the scalar so callers always get an indexable array
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        # move device data to host first, then strip the pandas wrapper
        return x.to_pandas().values
    return x
%%time
# NOTE: %%time is a Jupyter cell magic (times the cell); these lines are
# notebook cells flattened into a script and will not run as plain Python.
# nrows = number of samples
# ncols = number of features of each sample
nrows = 2**15
ncols = 40
X = load_data(nrows,ncols)
print('data',X.shape)
# the number of neighbors whose labels are to be checked
n_neighbors = 10
%%time
# use the sklearn KNN model to fit the dataset
knn_sk = skKNN(metric = 'sqeuclidean', )
knn_sk.fit(X)
# query the fitted model with the training set itself: D_sk = distances,
# I_sk = indices of each sample's n_neighbors nearest neighbors
D_sk,I_sk = knn_sk.kneighbors(X,n_neighbors)
%%time
# convert the pandas dataframe to cudf dataframe (host -> GPU device memory)
X = cudf.DataFrame.from_pandas(X)
%%time
# use cuml's KNN model to fit the dataset
knn_cuml = cumlKNN()
knn_cuml.fit(X)
# calculate the distance and the indices of the samples present in the dataset
D_cuml,I_cuml = knn_cuml.kneighbors(X,n_neighbors)
# compare the distance obtained while using sklearn and cuml models
passed = array_equal(D_sk,D_cuml, metric='abs') # metric used can be 'acc', 'mse', or 'abs'
message = 'compare knn: cuml vs sklearn distances %s'%('equal'if passed else 'NOT equal')
print(message)
# compare the labels obtained while using sklearn and cuml models
# (accuracy tolerates small index differences, e.g. from ties in distance)
passed = accuracy(I_sk, I_cuml, threshold=1e-1)
message = 'compare knn: cuml vs sklearn indexes %s'%('equal'if passed else 'NOT equal')
print(message)