#!/usr/bin/env python
# coding: utf-8

# This notebook is part of my [Python data science curriculum](http://www.terran.us/articles/python_curriculum.html)
#
# There are two major sets of documentation relevant for H2O:
#
# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/welcome.html
# http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/intro.html

# In[1]:

import pandas as pd
import altair as alt
alt.renderers.enable('notebook')


# # Cluster Init

# In[2]:

import h2o
h2o.init()


# # Data Loading

# At first I thought you could only load data from disk:

# In[3]:

from plotnine.data import diamonds
diamonds.to_csv('/tmp/diamonds.csv')
h2o_diamonds = h2o.import_file('/tmp/diamonds.csv')


# But in fact you __can__ transfer data directly from Python. The key piece of information is that you use the h2o.H2OFrame __constructor__ to do it. See references:
#
# http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/data.html#loading-data-from-a-python-object
# http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/frame.html#h2oframe

# In[4]:

# The destination_frame argument is optional, but if you don't use it,
# you get a horrible hex name.
h2o_diamonds2 = h2o.H2OFrame(diamonds, destination_frame='diamonds2')


# In[5]:

h2o.ls()


# In[6]:

h2o_diamonds2


# # Linear Regression

# ## Failed Attempt

# Let's fit a very simple model:

# In[7]:

h2o_lm = h2o.estimators.H2OGeneralizedLinearEstimator(family='gaussian')


# In[8]:

h2o_lm.train(x=['carat','cut','color','clarity'], y='price', training_frame=h2o_diamonds2)


# In[9]:

h2o_lm


# __This is not right at all. $R^2$ should be 0.91 for this model, not 0.35!__
#
# __Aha, it appears the default model is regularized.__ This is not stated explicitly, but it is implied by the available arguments (lambda_ and alpha are the usual elastic-net regularization parameters).

# ## Successful Attempt

# In[10]:

# Note the lambda_=0 to turn off regularization. (The trailing underscore
# avoids a clash with Python's `lambda` keyword.)
h2o_lm = h2o.estimators.H2OGeneralizedLinearEstimator(family='gaussian', lambda_=0)
h2o_lm.train(x=['carat','cut','color','clarity'], y='price', training_frame=h2o_diamonds2)
h2o_lm


# That's more like it!

# In[11]:

h2o_lm.coef()


# In[12]:

h2o.ls()


# # Random Forest

# In[13]:

h2o_rf = h2o.estimators.random_forest.H2ORandomForestEstimator()
h2o_rf.train(x=['carat','cut','color','clarity'], y='price', training_frame=h2o_diamonds2)
h2o_rf


# In[14]:

h2o_rf.varimp()


# # Gradient Boosting

# We will split the data into training and test sets, use cross-validation on the training data to tune the hyperparameters, and then evaluate on the test data. This is a standard workflow for high-variance ML models. A compact sketch of the whole workflow comes first; the detailed cells then walk through each step.
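# A minimal end-to-end sketch of that workflow, using the documented
# split_frame and model_performance API. The seed and hyperparameter values
# are arbitrary placeholders, and the sketch_* names are mine, chosen so they
# don't clobber the frames used in the detailed cells below.

# In[ ]:

# 1. Hold out a test set (seed makes the split reproducible).
sketch_train, sketch_test = h2o_diamonds2.split_frame(ratios=[0.75], seed=1234)
# 2. Tune on the training data only, using 5-fold cross-validation.
sketch_gb = h2o.estimators.gbm.H2OGradientBoostingEstimator(nfolds=5, ntrees=50)
sketch_gb.train(x=['carat','cut','color','clarity'], y='price',
                training_frame=sketch_train)
# 3. Score the chosen model once on the held-out test set.
sketch_gb.model_performance(test_data=sketch_test)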
# First, split the data:
#
# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/splitting-datasets.html

# In[15]:

diamonds_split = h2o_diamonds2.split_frame(ratios=[0.75],
                                           destination_frames=['diamonds_train','diamonds_test'])


# In[16]:

diamonds_split[0].dim


# Then fit a model with cross-validation by specifying nfolds:
#
# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/cross-validation.html

# In[17]:

h2o_gb = h2o.estimators.gbm.H2OGradientBoostingEstimator(nfolds=5)


# In[18]:

h2o_gb.train(x=['carat','cut','color','clarity'], y='price', training_frame=diamonds_split[0])


# In[19]:

h2o_gb


# That didn't print well; let's try this instead:

# In[20]:

h2o_gb.cross_validation_metrics_summary().as_data_frame()


# In[21]:

# Manual grid search over learning rate and number of trees, collecting the
# cross-validation metric summaries into one DataFrame.
results = pd.DataFrame()
for lr in [0.02, 0.05, 0.1, 0.2, 0.5]:
    for ntrees in [5, 50, 500]:
        h2o_gb = h2o.estimators.gbm.H2OGradientBoostingEstimator(nfolds=5, learn_rate=lr, ntrees=ntrees)
        h2o_gb.train(x=['carat','cut','color','clarity','x','y','z','depth','table'],
                     y='price', training_frame=diamonds_split[0])
        tmp = h2o_gb.cross_validation_metrics_summary().as_data_frame()
        tmp['lr'] = lr
        tmp['ntrees'] = ntrees
        results = pd.concat([results, tmp])


# In[22]:

tmp = results[lambda x: x.iloc[:,0] == 'rmse'].copy()
# The summary comes back with all columns as strings, so convert before sorting.
tmp['mean'] = tmp['mean'].astype('double')
tmp.sort_values('mean').head()


# In[23]:

c = alt.Chart(tmp[tmp.iloc[:,0] == 'rmse'])
c.mark_point().encode(x='lr:Q', color='ntrees:N', y='mean').interactive()

# I didn't end up using this because it was too dense to see on the chart:
#tmp['sd']=tmp['sd'].astype('double')
#tmp['low']=tmp['mean'] - 2*tmp.sd
#tmp['high']=tmp['mean'] + 2*tmp.sd


# In[24]:

h2o_gb_best = h2o.estimators.gbm.H2OGradientBoostingEstimator(learn_rate=0.05, ntrees=500)
h2o_gb_best.train(
    x=['carat','cut','color','clarity','x','y','z','depth','table'], y='price',
    training_frame=diamonds_split[0], validation_frame=diamonds_split[1])
h2o_gb_best


# Alternatively, you could try H2OGridSearch (see the sketch at the end of this notebook):
# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/grid-search.html#grid-search-example-in-python

# # Predictions
#
# We can also run the models and get the results back as Pandas objects:

# In[31]:

gb_predictions = h2o_gb_best.predict(diamonds_split[1])
lm_predictions = h2o_lm.predict(diamonds_split[1])


# In[43]:

# Label each copy of the test set with its model's predictions, stack them,
# and subsample for plotting. (DataFrame.append is deprecated in pandas, so
# use pd.concat; the H2O prediction frame has one column, named 'predict'.)
pred = pd.concat([
    diamonds_split[1].as_data_frame().assign(
        model='gb', pred=gb_predictions.as_data_frame()['predict']),
    diamonds_split[1].as_data_frame().assign(
        model='lm', pred=lm_predictions.as_data_frame()['predict']),
]).sample(1000)
alt.Chart(pred).mark_circle(opacity=0.25).encode(
    x='price', y='pred', color='model'
).interactive()
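# We can also compute test-set metrics directly, without pulling predictions
# back into Python. A short sketch using the documented model_performance
# method (`perf` is my own name):

# In[ ]:

perf = h2o_gb_best.model_performance(test_data=diamonds_split[1])
perf.rmse(), perf.r2()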
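# Appendix: the H2OGridSearch alternative mentioned above. This is a minimal
# sketch based on the grid-search documentation linked earlier; the grid
# values simply mirror the manual loop in In[21], and `gb_grid` is my own name.

# In[ ]:

from h2o.grid.grid_search import H2OGridSearch

gb_grid = H2OGridSearch(
    model=h2o.estimators.gbm.H2OGradientBoostingEstimator(nfolds=5),
    hyper_params={'learn_rate': [0.02, 0.05, 0.1, 0.2, 0.5],
                  'ntrees': [5, 50, 500]})
gb_grid.train(x=['carat','cut','color','clarity','x','y','z','depth','table'],
              y='price', training_frame=diamonds_split[0])
# Sort the grid results by cross-validated RMSE, best first.
gb_grid.get_grid(sort_by='rmse', decreasing=False)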