#!/usr/bin/env python
# coding: utf-8

# ## Hyperparameter search

# In[22]:


#Import H2O and other libraries that will be used in this tutorial 
import h2o
import matplotlib as plt

#Import the Estimators
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

#Import h2o grid search 
import h2o.grid 
from h2o.grid.grid_search import H2OGridSearch


# In[23]:


# Start (or attach to) the H2O cluster. `h2o` is already imported in the
# imports cell at the top of the notebook, so it is not imported again here.
# An integer max_mem_size is read by H2O as gigabytes (see the h2o.init
# documentation); naming the constant keeps the unit visible to the reader.
H2O_MAX_MEM_GB = 16
h2o.init(max_mem_size=H2O_MAX_MEM_GB)


# In[24]:


# Source of the loan-level dataset used throughout the notebook.
LOAN_LEVEL_CSV_URL = "https://s3.amazonaws.com/data.h2o.ai/DAI-Tutorials/loan_level_500k.csv"

# Load the CSV into an H2O frame on the cluster.
loan_level = h2o.import_file(LOAN_LEVEL_CSV_URL)


# In[25]:


# Split the loan data into train / validation / test frames using ratios of
# 0.7 and 0.15 (the remainder goes to the test frame). The fixed seed keeps
# the split reproducible.
train, valid, test = loan_level.split_frame(ratios=[0.7, 0.15], seed=42)
print("train:%d valid:%d test:%d" % (train.nrows, valid.nrows, test.nrows))

# Response column to predict.
y = "DELINQUENT"

# Columns that must not be used as predictors; the response is listed here
# as well so that it is excluded from `x`.
ignore = ["DELINQUENT", "PREPAID", "PREPAYMENT_PENALTY_MORTGAGE_FLAG", "PRODUCT_TYPE"]

# Predictor columns, kept in the frame's own column order. A set difference
# would yield the same columns in an arbitrary order that can change between
# kernel sessions (string hashing is randomized), which makes the list hard
# to compare across runs.
x = [col for col in train.names if col not in ignore]


# ## Grid Search (Cartesian search is the default when no strategy is specified)

# In[26]:


# Cartesian grid over alpha and lambda: no search_criteria is supplied, so
# every combination of the values below (4 x 4) is a candidate model.
# The comprehension variable is deliberately not called `x`, because `x`
# already names the predictor list passed to train() below.
glm_cartesian_hyper_params = {
    "alpha": [step * 0.01 for step in range(4)],
    "lambda": [step * 1e-6 for step in range(4)],
}

# NOTE(review): lambda_search=True is combined with an explicit "lambda"
# grid -- confirm how H2O reconciles the two settings.
glm_cartesian_base = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True)

glm_grid = H2OGridSearch(
    glm_cartesian_base,
    hyper_params=glm_cartesian_hyper_params,
    grid_id="glm_grid_2",
)

# Train the grid and report how long it takes.
get_ipython().run_line_magic('time', 'glm_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)')


# ## Random Search

# In[27]:


# Random search over a much larger alpha/lambda space (100 x 1000 values).
# The comprehension variable is deliberately not called `x`, because `x`
# already names the predictor list passed to train() below.
glm_random_hyper_params = {
    "alpha": [step * 0.01 for step in range(100)],
    "lambda": [step * 1e-6 for step in range(1000)],
}

# RandomDiscrete search settings: a model-count cap, a runtime cap in
# seconds, and a fixed seed.
glm_random_criteria = {
    "strategy": "RandomDiscrete",
    "max_models": 100,
    "max_runtime_secs": 300,
    "seed": 42,
}

# NOTE(review): lambda_search=True is combined with an explicit "lambda"
# grid -- confirm how H2O reconciles the two settings.
glm_random_base = H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True)

# This rebinds `glm_grid`; the grid built in the Cartesian-search cell is no
# longer reachable through that name afterwards.
glm_grid = H2OGridSearch(
    glm_random_base,
    hyper_params=glm_random_hyper_params,
    grid_id="glm_grid",
    search_criteria=glm_random_criteria,
)

# Train the grid and report how long it takes.
get_ipython().run_line_magic('time', 'glm_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)')


# In[28]:


# List the objects currently stored on the H2O cluster.
h2o.ls()


# In[29]:


# Show the API documentation of the grid-search class (same class as
# h2o.grid.H2OGridSearch, imported by name at the top of the notebook).
help(H2OGridSearch)


# In[30]:


# Grid models ranked by AUC, highest first.
glm_grid.get_grid(sort_by="auc", decreasing=True)


# In[36]:


# First model of the grid in its current (unsorted) order.
glm_grid.models[0]


# In[32]:


# Summary of the models explored by the grid.
glm_grid.summary()


# In[37]:


# Keep the AUC-ranked grid (highest first) for the cells below.
sorted_glm_grid = glm_grid.get_grid(sort_by="auc", decreasing=True)


# In[38]:


# Parameters actually used by the top-ranked model.
best_glm = sorted_glm_grid[0]
best_glm.actual_params


# In[39]:


# F1 of the top two models: the best is printed, the runner-up is the cell output.
print(best_glm.F1())
runner_up_glm = sorted_glm_grid[1]
runner_up_glm.F1()


# In[40]:


# Evaluate the top-ranked model on the held-out test frame.
# Should give AUC of 0.8524 compared to the untuned version of 0.8523.
best_glm.model_performance(test)


# ## Random Forest

# In[41]:


# Random-forest grid. The strategy is RandomDiscrete (random search), not the
# default Cartesian search; with 2 x 2 hyper-parameter values there are only
# four combinations, so the max_models cap of 100 is not the limiting factor.
rf_hyper_params = {
    "ntrees": [50, 100],
    "max_depth": [10, 20],
}

# Random-search settings: model-count cap, runtime cap in seconds, fixed seed.
rf_search_criteria = {
    "strategy": "RandomDiscrete",
    "max_models": 100,
    "max_runtime_secs": 300,
    "seed": 42,
}

# Each candidate forest is built with 10-fold cross-validation.
rf_grid = H2OGridSearch(
    H2ORandomForestEstimator(nfolds=10),
    hyper_params=rf_hyper_params,
    search_criteria=rf_search_criteria,
    grid_id="rf_grid_2",
)

# Train the grid and report how long it takes.
get_ipython().run_line_magic('time', 'rf_grid.train(x=x, y=y, training_frame=train, validation_frame = valid)')


# In[42]:


rf_grid.get_grid(sort_by='auc', decreasing=True)


# ### Get the best model and train on top of that

# In[43]:


# Pick the grid model with the highest AUC.
best_model = rf_grid.get_grid(sort_by="auc", decreasing=True)[0]

# Continue training from that model through H2O's checkpoint mechanism.
# Per the H2O checkpoint documentation, a checkpointed forest must
#   * request a total `ntrees` larger than the checkpoint already contains, and
#   * keep `max_depth` identical to the checkpoint's value.
# The grid explores ntrees in {50, 100} and max_depth in {10, 20}, so leaving
# both at the estimator defaults can be rejected depending on which model
# wins. Both values are therefore taken from the winning model.
best_params = best_model.actual_params
EXTRA_TREES = 50  # additional trees to grow on top of the checkpoint

rf = H2ORandomForestEstimator(
    seed=42,
    model_id='default_random_forest',
    checkpoint=best_model.model_id,
    ntrees=best_params["ntrees"] + EXTRA_TREES,
    max_depth=best_params["max_depth"],
)
get_ipython().run_line_magic('time', 'rf.train(x=x, y=y, training_frame=train, validation_frame=valid)')


# In[45]:


# Summary of the continued (checkpointed) forest.
rf.summary()


# In[ ]: