K-nearest neighbors (KNN) regression predicts the output for an unseen data sample from the labels of its k nearest neighbors in the training data.
The model accepts array-like objects as input, either on the host (NumPy arrays) or on the device (Numba device arrays or any cuda_array_interface-compliant object), as well as cuDF DataFrames.
For information on converting your dataset to cuDF format, refer to the cuDF documentation: https://docs.rapids.ai/api/cudf/stable
For additional information on cuML's Nearest Neighbors implementation: https://rapidsai.github.io/projects/cuml/en/stable/api.html#cuml.neighbors.KNeighborsRegressor
import os
import numpy as np
from sklearn.datasets import make_blobs
import pandas as pd
import cudf as gd
from sklearn.neighbors import KNeighborsRegressor as skKNR
from cuml.neighbors import KNeighborsRegressor as cumlKNR
# Problem size shared by the CPU and GPU runs below.
n_samples = 1 << 17  # number of training rows (131072)
n_features = 40  # number of columns per sample
n_query = 5000  # number of rows in the query (test) set
n_neighbors = 4  # k used by both regressors
%%time
# Build the host-side training set: make_blobs returns a (samples, labels)
# pair, and each half is wrapped in a pandas DataFrame.
X_host_train, y_host_train = (
    pd.DataFrame(part)
    for part in make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=5,
        random_state=0,
    )
)
%%time
# Build the host-side query set of n_query rows, generated with the same
# number of centers and the same seed as the training data, then wrap each
# half of the (samples, labels) pair in a pandas DataFrame.
X_host_test, y_host_test = (
    pd.DataFrame(part)
    for part in make_blobs(
        n_samples=n_query,
        n_features=n_features,
        centers=5,
        random_state=0,
    )
)
# Create cuDF copies of every host DataFrame so the cuML model can be fed
# device-side data. The conversions run in the order the names are listed.
X_device_train, y_device_train, X_device_test, y_device_test = map(
    gd.DataFrame.from_pandas,
    (X_host_train, y_host_train, X_host_test, y_host_test),
)
%%time
# CPU baseline: brute-force k-NN regression with scikit-learn.
# n_jobs=-1 asks scikit-learn to use all available CPU cores.
# fit() returns the estimator itself, so knn_sk is the fitted model.
knn_sk = skKNR(
    n_neighbors=n_neighbors,
    algorithm="brute",
    n_jobs=-1,
).fit(X_host_train, y_host_train)
sk_result = knn_sk.predict(X_host_test)
%%time
# GPU run: the same k-NN regression with cuML, trained and queried on the
# cuDF (device) copies of the data.
knn_cuml = cumlKNR(n_neighbors=n_neighbors)
knn_cuml.fit(X=X_device_train, y=y_device_train)
cuml_result = knn_cuml.predict(X=X_device_test)
# Check that the cuML predictions match the scikit-learn reference.
# The cuDF result is copied to the host with to_pandas(); as_gpu_matrix() is
# avoided because it is deprecated and absent from newer cuDF releases.
# NOTE(review): assumes predict() returned a cuDF DataFrame whose shape
# matches sk_result -- confirm against the installed cuML version.
passed = np.allclose(cuml_result.to_pandas().values, sk_result, atol=1e-9)
# The models are regressors, so the message refers to predictions rather
# than classes.
print('compare knn: cuml vs sklearn predictions %s'
      % ('equal' if passed else 'NOT equal'))