#!/usr/bin/env python
# coding: utf-8

# In[1]:


import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator


# In[2]:


# Attach to an H2O cluster; h2o.init() starts a local one when none is reachable
h2o.init()


# In[3]:


# NOTE: _locate is a private h2o helper used to find files within the h2o git project directory.
from h2o.utils.shared_utils import _locate

prostate_csv = _locate("smalldata/logreg/prostate.csv")
df = h2o.import_file(path=prostate_csv)


# In[4]:


# Summarize the freshly imported frame
df.describe()


# In[5]:


# Remove ID from the training frame so it is never offered to the model as a predictor
train = df.drop("ID")


# In[6]:


# For VOL & GLEASON, a zero really means "missing".
# The assignment must go through `train` itself (row mask + column name):
# `train['VOL']` yields a separate H2OFrame, so masking that object would
# leave the zeros in `train` untouched.
for zero_means_missing in ("VOL", "GLEASON"):
    is_zero = train[zero_means_missing] == 0
    train[is_zero, zero_means_missing] = None


# In[7]:


# Convert CAPSULE to a factor (categorical) column
train["CAPSULE"] = train["CAPSULE"].asfactor()


# In[8]:


# See that the data is ready: VOL and GLEASON should now report missing values
train.describe()


# In[9]:


# Run GBM
# Select predictors by name (every column except the response) instead of by
# position, so the model does not depend on CAPSULE being the first column.
response = "CAPSULE"
predictors = [name for name in train.names if name != response]

my_gbm = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=50, learn_rate=0.1)

# NOTE(review): validation_frame is the training frame itself, so the reported
# validation metrics are not an independent estimate -- consider a held-out split.
my_gbm.train(x=predictors, y=response, training_frame=train, validation_frame=train)


# In[10]:


# Score the fitted model on the training frame and print the metrics report
my_gbm_metrics = my_gbm.model_performance(test_data=train)
my_gbm_metrics.show()