K-Nearest Neighbors (KNN) is an unsupervised algorithm: to find the “closest” datapoint(s) to new, unseen data, one calculates a suitable “distance” between it and each existing point, and returns the K datapoints with the smallest distances.
cuML’s KNN expects a cuDF DataFrame or a NumPy array (automatic chunking into NumPy arrays will be added in a future release). It first fits a special data structure that approximates the distance calculations, bringing query time to O(p log n) instead of the brute-force O(np), where p is the number of features:
The KNN function accepts the following parameters:
The methods that can be used with KNN are:
The model can take array-like objects, either in host memory as NumPy arrays or in device memory (as Numba arrays or objects compliant with `__cuda_array_interface__`), as well as cuDF DataFrames. To convert your dataset to cuDF format, please read the cuDF documentation at https://rapidsai.github.io/projects/cudf/en/latest/. For additional information on the K-Nearest Neighbors model, please refer to the documentation at https://rapidsai.github.io/projects/cuml/en/latest/api.html#nearest-neighbors
import numpy as np
import pandas as pd
import cudf
import os
from sklearn.neighbors import NearestNeighbors as skKNN
from cuml.neighbors.nearest_neighbors import NearestNeighbors as cumlKNN
# check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for clustering
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached='data/mortgage.npy.gz', source='mortgage'):
    """Load `nrows` x `ncols` of the cached mortgage dataset, or random data.

    Parameters
    ----------
    nrows : int
        Number of samples (rows) to return.
    ncols : int
        Number of features (columns) to return.
    cached : str
        Path to the gzipped mortgage .npy file.
    source : str
        'mortgage' to use the cached file when present; anything else
        (or a missing file) falls back to random data.

    Returns
    -------
    pandas.DataFrame with columns 'fea0' ... 'fea{ncols-1}'.
    """
    if os.path.exists(cached) and source == 'mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # Sample nrows rows with replacement. randint's upper bound is
        # already exclusive, so pass X.shape[0] (the original used
        # X.shape[0]-1, which could never select the last row).
        X = X[np.random.randint(0, X.shape[0], nrows), :ncols]
    else:
        # create a random dataset
        print('use random data')
        X = np.random.random((nrows, ncols)).astype('float32')
    df = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])}).fillna(0)
    return df
from sklearn.metrics import mean_squared_error
# this function checks if the results obtained from two different methods (sklearn and cuml) are the same
def array_equal(a, b, threshold=1e-3, with_sign=True, metric='mse'):
    """Check whether two array-likes agree to within `threshold`.

    Parameters
    ----------
    a, b : array-like
        Inputs; converted to NumPy arrays via `to_nparray`.
    threshold : float
        Tolerance applied to the chosen error metric.
    with_sign : bool
        When False, compare absolute values only (sign differences ignored).
    metric : str
        'mse'  -> mean squared error below threshold,
        'abs'  -> every elementwise |a-b| within threshold,
        'acc'  -> fraction of mismatched elements below threshold.

    Returns
    -------
    bool : True when the arrays are considered equal.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if with_sign == False:
        a, b = np.abs(a), np.abs(b)
    if metric == 'mse':
        error = mean_squared_error(a, b)
        res = error < threshold
    elif metric == 'abs':
        # Compare |a - b| so that large *negative* differences are also
        # detected (the original signed comparison silently accepted them).
        error = np.abs(a - b)
        res = len(error[error > threshold]) == 0
    elif metric == 'acc':
        error = np.sum(a != b) / (a.shape[0] * a.shape[1])
        res = error < threshold
    return res
# calculate the accuracy
def accuracy(a, b, threshold=1e-4):
    """Return True when the fraction of entries of `a` and `b` that differ
    by more than 1 is below `threshold`.

    Intended for comparing neighbor-index matrices, where off-by-one index
    differences (ties) are tolerated. Uses |a - b| so that negative
    differences count as mismatches too (the original signed comparison
    missed them).
    """
    a = to_nparray(a)
    b = to_nparray(b)
    c = np.abs(a - b)
    c = len(c[c > 1]) / (c.shape[0] * c.shape[1])
    return c < threshold
# the function converts a variable from ndarray or dataframe format to numpy array
def to_nparray(x):
    """Coerce `x` to a NumPy array.

    Handles NumPy arrays, pandas DataFrames, NumPy float64 scalars, and
    cuDF DataFrames/Series; any other object is returned unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        # wrap the scalar so callers always get an indexable array
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        # move device data to host first, then strip the pandas wrapper
        return x.to_pandas().values
    return x
%%time
# NOTE: %%time is a Jupyter cell magic (times the cell); these lines are
# notebook cells flattened into a script and will not run as plain Python.
# nrows = number of samples
# ncols = number of features of each sample
nrows = 2**15
ncols = 40
X = load_data(nrows,ncols)
print('data',X.shape)
# the number of neighbors whose labels are to be checked
n_neighbors = 10
%%time
# use the sklearn KNN model to fit the dataset
knn_sk = skKNN(metric = 'sqeuclidean', )
knn_sk.fit(X)
# query the fitted model with the training set itself: D_sk = distances,
# I_sk = indices of each sample's n_neighbors nearest neighbors
D_sk,I_sk = knn_sk.kneighbors(X,n_neighbors)
%%time
# convert the pandas dataframe to cudf dataframe (host -> GPU device memory)
X = cudf.DataFrame.from_pandas(X)
%%time
# use cuml's KNN model to fit the dataset
knn_cuml = cumlKNN()
knn_cuml.fit(X)
# calculate the distance and the indices of the samples present in the dataset
D_cuml,I_cuml = knn_cuml.kneighbors(X,n_neighbors)
# compare the distance obtained while using sklearn and cuml models
passed = array_equal(D_sk,D_cuml, metric='abs') # metric used can be 'acc', 'mse', or 'abs'
message = 'compare knn: cuml vs sklearn distances %s'%('equal'if passed else 'NOT equal')
print(message)
# compare the labels obtained while using sklearn and cuml models
# (accuracy tolerates small index differences, e.g. from ties in distance)
passed = accuracy(I_sk, I_cuml, threshold=1e-1)
message = 'compare knn: cuml vs sklearn indexes %s'%('equal'if passed else 'NOT equal')
print(message)