#!/usr/bin/env python
# coding: utf-8

# Comparison of dask_glm and scikit-learn on the
# [SUSY dataset](https://archive.ics.uci.edu/ml/datasets/SUSY).

# In[2]:

import numpy as np
import pandas as pd
import dask
from distributed import Client
import dask.array as da
from sklearn import linear_model
from dask_glm.estimators import LogisticRegression

# In[3]:

df = pd.read_csv("SUSY.csv.gz", header=None)
df.head()

# In[4]:

len(df)

# We have 5,000,000 rows of all-numeric data. We'll skip any feature
# engineering and preprocessing.

# In[5]:

# Column 0 is the binary target; the remaining columns are the features.
y = df[0].values
X = df.drop(0, axis=1).values

# In[6]:

C = 10       # for scikit-learn (inverse regularization strength)
λ = 1 / C    # for dask_glm (regularization strength)
# NOTE(review): λ is never passed to the dask-glm LogisticRegression below,
# so the two fits may not use matching regularization — confirm intent.

# In[7]:

from sklearn.preprocessing import scale

X = scale(X)

# ## Scikit-learn
#
# First, we run scikit-learn's `LogisticRegression` on the full dataset.

# In[8]:

get_ipython().run_cell_magic('time', '', "lm = linear_model.LogisticRegression(penalty='l1', C=C, solver='saga')\nlm.fit(X, y)\n")

# In[9]:

get_ipython().run_cell_magic('time', '', 'lm.score(X, y)\n')

# In[10]:

# %%time
# lm = linear_model.LogisticRegression(penalty='l1', C=C)
# lm.fit(X, y)

# In[11]:

# %%time
# lm.score(X, y)

# In[12]:

lm.coef_

# ## Dask GLM
#
# Now for the dask-glm version.

# In[13]:

client = Client()

# Chunk the data into blocks of K rows so the distributed scheduler can
# work on partitions in parallel.
K = 100000
dX = da.from_array(X, chunks=(K, X.shape[-1]))
dy = da.from_array(y, chunks=(K,))
# BUGFIX: persist the *dask* arrays (dX, dy), not the in-memory NumPy
# arrays (X, y) — the original call discarded the chunked collections,
# so the dask-glm fit never ran on the intended partitioned data.
dX, dy = dask.persist(dX, dy)
# BUGFIX: likewise rebalance the persisted dask collections across
# workers, not the NumPy arrays.
client.rebalance([dX, dy])

# In[14]:

get_ipython().run_cell_magic('time', '', 'dk = LogisticRegression()\ndk.fit(dX, dy)\n')

# In[15]:

get_ipython().run_cell_magic('time', '', 'dk.score(dX, dy)\n')

# In[16]:

dk.coef_

# | Library      | Training time | Score |
# | ------------ | ------------- | ----- |
# | dask-glm     | 1:08          | .788  |
# | scikit-learn | 6:01          | .788  |

# The saga fit is not perfect though (accuracy is slightly lower and the
# coefficients not identical):

# In[19]:

np.max(np.abs(dk.coef_ - lm.coef_))

# In[20]:

np.abs(dk.coef_ - lm.coef_)

# In[ ]: