#!/usr/bin/env python
# coding: utf-8

# # K-Nearest Neighbors Regression Demo
#
# K-nearest neighbors regression uses the labels of neighborhoods around data
# samples to determine the outputs of unseen data samples.
#
# The model can take array-like objects, either on the host as NumPy arrays or
# on the device (Numba or cuda_array_interface-compliant arrays), as well as
# cuDF DataFrames as the input.
#
# For information on converting your dataset to cuDF format, refer to the cuDF
# documentation: https://docs.rapids.ai/api/cudf/stable
#
# For additional information on cuML's Nearest Neighbors implementation:
# https://rapidsai.github.io/projects/cuml/en/stable/api.html#cuml.neighbors.KNeighborsRegressor

# In[ ]:

import os
import time
from contextlib import contextmanager

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsRegressor as skKNR

import cudf as gd
from cuml.neighbors import KNeighborsRegressor as cumlKNR


@contextmanager
def _timed(label):
    """Print the CPU and wall-clock time spent inside the ``with`` body.

    Stands in for the notebook's ``%%time`` cell magic so that the script
    also runs under a plain Python interpreter, where ``get_ipython`` is
    not defined.
    """
    cpu_start = time.process_time()
    wall_start = time.perf_counter()
    try:
        yield
    finally:
        cpu_elapsed = time.process_time() - cpu_start
        wall_elapsed = time.perf_counter() - wall_start
        print('%s: CPU time %.3f s, wall time %.3f s'
              % (label, cpu_elapsed, wall_elapsed))


# ## Define Parameters

# In[ ]:

n_samples = 2**17    # rows in the training set
n_features = 40      # columns in both the training and the query set
n_query = 5000       # rows in the query (test) set
n_neighbors = 4      # neighbors used by both regressors


# ## Generate Data
#
# ### Host

# In[ ]:

with _timed('generate host training data'):
    X_host_train, y_host_train = make_blobs(
        n_samples=n_samples, n_features=n_features, centers=5, random_state=0)

    X_host_train = pd.DataFrame(X_host_train)
    y_host_train = pd.DataFrame(y_host_train)


# In[ ]:

# Same seed and number of centers as the training set above.
with _timed('generate host test data'):
    X_host_test, y_host_test = make_blobs(
        n_samples=n_query, n_features=n_features, centers=5, random_state=0)

    X_host_test = pd.DataFrame(X_host_test)
    y_host_test = pd.DataFrame(y_host_test)


# ### Device

# In[ ]:

X_device_train = gd.DataFrame.from_pandas(X_host_train)
y_device_train = gd.DataFrame.from_pandas(y_host_train)


# In[ ]:

X_device_test = gd.DataFrame.from_pandas(X_host_test)
y_device_test = gd.DataFrame.from_pandas(y_host_test)


# ## Scikit-learn Model

# In[ ]:

with _timed('scikit-learn fit + predict'):
    knn_sk = skKNR(algorithm="brute", n_neighbors=n_neighbors, n_jobs=-1)
    knn_sk.fit(X_host_train, y_host_train)

    sk_result = knn_sk.predict(X_host_test)


# ## cuML Model

# In[ ]:

with _timed('cuML fit + predict'):
    knn_cuml = cumlKNR(n_neighbors=n_neighbors)
    knn_cuml.fit(X_device_train, y_device_train)

    cuml_result = knn_cuml.predict(X_device_test)


# ## Compare Results

# In[ ]:

# Copy the cuML predictions to the host through pandas instead of the legacy
# ``as_gpu_matrix()`` accessor, which newer cuDF releases no longer provide.
# NOTE(review): depending on the cuML version the prediction may come back as
# a Series or as a single-column DataFrame; reshaping to sk_result's shape
# keeps np.allclose element-wise instead of broadcasting (n,) against (n, 1).
cuml_host = np.asarray(cuml_result.to_pandas()).reshape(sk_result.shape)

passed = np.allclose(cuml_host, sk_result, atol=1e-9)
print('compare knn: cuml vs sklearn classes %s'
      % ('equal' if passed else 'NOT equal'))