#!/usr/bin/env python
# coding: utf-8
"""Train and evaluate an H2O GBM classifier on the prostate data set.

Notebook export: each ``# In[n]:`` marker corresponds to one notebook cell.
The script imports ``prostate.csv``, drops the ``ID`` column, recodes zeros
in ``VOL`` and ``GLEASON`` as missing, converts ``CAPSULE`` to a factor and
fits a 50-tree bernoulli GBM, then prints its performance metrics.
"""

# In[1]:

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# In[2]:

# Connect to a running H2O cluster (h2o.init() starts a local one when no
# cluster is reachable).
h2o.init()

# In[3]:

# Private helper: finds files inside the h2o git project directory.
from h2o.utils.shared_utils import _locate

df = h2o.import_file(path=_locate("smalldata/logreg/prostate.csv"))

# In[4]:

df.describe()

# In[5]:

# Remove ID from training frame
train = df.drop("ID")

# In[6]:

# For VOL & GLEASON, a zero really means "missing".
# Assign through a row mask on `train` itself: writing into a column that was
# first extracted with train['VOL'] only changes that extracted frame and can
# leave `train` untouched.
for zero_is_missing in ("VOL", "GLEASON"):
    train[train[zero_is_missing] == 0, zero_is_missing] = None

# In[7]:

# Convert CAPSULE to a logical factor
train["CAPSULE"] = train["CAPSULE"].asfactor()

# In[8]:

# See that the data is ready
train.describe()

# In[9]:

# Run GBM
my_gbm = H2OGradientBoostingEstimator(
    distribution="bernoulli",
    ntrees=50,
    learn_rate=0.1,
)
# Predictors are every column index from 1 onward.
# NOTE(review): this assumes the response CAPSULE sits at index 0 once ID has
# been dropped -- confirm against the data set's column order.
# The validation frame is the training frame, so the validation metrics are
# computed on the same rows the model was fit on.
my_gbm.train(
    x=list(range(1, train.ncol)),
    y="CAPSULE",
    training_frame=train,
    validation_frame=train,
)

# In[10]:

my_gbm_metrics = my_gbm.model_performance(train)
my_gbm_metrics.show()