Ridge regression extends linear regression by adding L2 regularization of the coefficients. It can reduce the variance of the predictors and improve the conditioning of the problem.
The model accepts array-like objects, either on the host (as NumPy arrays) or on the device (as Numba arrays or other cuda_array_interface-compliant objects), as well as cuDF DataFrames, as input.
For information about cuDF, refer to the cuDF documentation.
For information about cuML's ridge regression API, refer to https://rapidsai.github.io/projects/cuml/en/0.11.0/api.html#cuml.Ridge.
import cudf
from cuml import make_regression, train_test_split
from cuml.metrics.regression import r2_score
from cuml.linear_model import Ridge as cuRidge
from sklearn.linear_model import Ridge as skRidge
# Dimensions of the synthetic regression problem.
n_samples = 1 << 20  # 2**20 = 1,048,576 rows
n_features = 399
# Seed handed to train_test_split so the train/test partition is repeatable.
random_state = 23
%%time
X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=0)
X = cudf.DataFrame.from_gpu_matrix(X)
y = cudf.DataFrame.from_gpu_matrix(y)[0]
X_cudf, X_cudf_test, y_cudf, y_cudf_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)
# Convert the cuDF train/test splits to pandas objects in host memory so the
# CPU (scikit-learn) model can later be compared against the GPU model on the
# same data.
X_train, X_test = X_cudf.to_pandas(), X_cudf_test.to_pandas()
y_train, y_test = y_cudf.to_pandas(), y_cudf_test.to_pandas()
%%time
ridge_sk = skRidge(fit_intercept=False, normalize=True, alpha=0.1)
ridge_sk.fit(X_train, y_train)
%%time
predict_sk= ridge_sk.predict(X_test)
%%time
# Score the scikit-learn predictions against the held-out cuDF targets using
# the r2_score imported from cuml.metrics.regression.
r2_score_sk = r2_score(y_cudf_test, predict_sk)
%%time
# Run the cuml ridge regression model to fit the training dataset.
# Eig is the faster algorithm, but svd is more accurate.
# In general svd uses significantly more memory and is slower than eig.
# If using CUDA 10.1, the memory difference is even bigger than in the other supported CUDA versions
ridge_cuml = cuRidge(fit_intercept=False, normalize=True, solver='eig', alpha=0.1)
ridge_cuml.fit(X_cudf, y_cudf)
%%time
# Predict on the held-out cuDF test split with the fitted cuML model.
predict_cuml = ridge_cuml.predict(X_cudf_test)
%%time
# Score the cuML predictions against the same held-out cuDF targets that were
# used for the scikit-learn score.
r2_score_cuml = r2_score(y_cudf_test, predict_cuml)
# Report the two R^2 scores, one per line, for a CPU vs. GPU comparison.
print(
    "R^2 score (SKL): %s" % r2_score_sk,
    "R^2 score (cuML): %s" % r2_score_cuml,
    sep="\n",
)