#!/usr/bin/env python
# coding: utf-8

# ## Hyperparameter search
#
# Tunes GLM and random-forest models on the loan-level dataset with H2O grid
# search (Cartesian and random), then continues training the best forest from
# a checkpoint. Running it starts/attaches to an H2O cluster and downloads the
# dataset over the network.

# In[22]:

# Import H2O and other libraries that will be used in this tutorial
import time

import h2o
# NOTE(review): `plt` conventionally aliases matplotlib.pyplot rather than the
# top-level package. Nothing below uses it, so the import is left unchanged.
import matplotlib as plt

# Import the Estimators
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Import h2o grid search
import h2o.grid
from h2o.grid.grid_search import H2OGridSearch


# Number of trees added on top of the best grid model when training resumes
# from its checkpoint (see the "Get the best model" section below).
EXTRA_TREES = 50


def _timed_train(model, **train_kwargs):
    """Train *model* and print the elapsed wall-clock time.

    Plain-Python stand-in for the notebook's ``%time`` magic: the exported
    ``get_ipython().run_line_magic('time', ...)`` call only exists when the
    script runs under IPython and raises ``NameError`` otherwise.

    Args:
        model: Object exposing ``train(**kwargs)`` (an estimator or a grid).
        **train_kwargs: Forwarded unchanged to ``model.train``.
    """
    started = time.perf_counter()
    model.train(**train_kwargs)
    print("Wall time: %.1f s" % (time.perf_counter() - started))


# In[23]:

# An integer max_mem_size is read as gigabytes per the h2o.init documentation.
h2o.init(max_mem_size=16)


# In[24]:

loan_level = h2o.import_file(
    "https://s3.amazonaws.com/data.h2o.ai/DAI-Tutorials/loan_level_500k.csv"
)


# In[25]:

# Two ratios yield three frames: 70% train, 15% valid, remainder test.
train, valid, test = loan_level.split_frame([0.7, 0.15], seed=42)
print("train:%d valid:%d test:%d" % (train.nrows, valid.nrows, test.nrows))

y = "DELINQUENT"
ignore = ["DELINQUENT", "PREPAID", "PREPAYMENT_PENALTY_MORTGAGE_FLAG", "PRODUCT_TYPE"]
# Keep the frame's column order instead of going through set difference, so
# the predictor list is identical from run to run.
x = [name for name in train.names if name not in set(ignore)]


# ## Grid Search: Cartesian search (the default when no search_criteria is given)

# In[26]:

# The comprehension variable is named `i` so it cannot be confused with (or,
# on Python 2, overwrite) the predictor list `x` used in train() below.
# NOTE(review): lambda_search=True is combined with an explicit `lambda`
# grid; confirm how H2O reconciles the two settings.
glm_grid = H2OGridSearch(
    H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True),
    hyper_params={
        "alpha": [i * 0.01 for i in range(0, 4)],
        "lambda": [i * 1e-6 for i in range(0, 4)],
    },
    grid_id="glm_grid_2",
)

_timed_train(glm_grid, x=x, y=y, training_frame=train, validation_frame=valid)


# ## Random Search

# In[27]:

# Rebinds `glm_grid`: everything below inspects this random-search grid.
glm_grid = H2OGridSearch(
    H2OGeneralizedLinearEstimator(family="binomial", lambda_search=True),
    hyper_params={
        "alpha": [i * 0.01 for i in range(0, 100)],
        "lambda": [i * 1e-6 for i in range(0, 1000)],
    },
    grid_id="glm_grid",
    search_criteria={
        "strategy": "RandomDiscrete",
        "max_models": 100,
        "max_runtime_secs": 300,
        "seed": 42,
    },
)

_timed_train(glm_grid, x=x, y=y, training_frame=train, validation_frame=valid)


# The bare expressions in the cells below were notebook cell outputs; they
# only render in an interactive session and are kept as exported.

# In[28]:

h2o.ls()


# In[29]:

help(h2o.grid.H2OGridSearch)


# In[30]:

glm_grid.get_grid(sort_by="auc", decreasing=True)


# In[36]:

glm_grid.models[0]


# In[32]:

glm_grid.summary()


# In[37]:

sorted_glm_grid = glm_grid.get_grid(sort_by="auc", decreasing=True)


# In[38]:

sorted_glm_grid[0].actual_params


# In[39]:

print(sorted_glm_grid[0].F1())
sorted_glm_grid[1].F1()


# In[40]:

sorted_glm_grid[0].model_performance(test)
# should give AUC of 0.8524 compared to the untuned version of 0.8523


# ## Random Forest

# In[41]:

# Random search (strategy "RandomDiscrete") over a 2 x 2 space. max_models
# exceeds the four possible combinations, so the search can cover all of them
# unless max_runtime_secs stops it first.
rf_grid = H2OGridSearch(
    H2ORandomForestEstimator(nfolds=10),
    hyper_params={
        "ntrees": [50, 100],
        "max_depth": [10, 20],
    },
    search_criteria={
        "strategy": "RandomDiscrete",
        "max_models": 100,
        "max_runtime_secs": 300,
        "seed": 42,
    },
    grid_id="rf_grid_2",
)

_timed_train(rf_grid, x=x, y=y, training_frame=train, validation_frame=valid)


# In[42]:

rf_grid.get_grid(sort_by="auc", decreasing=True)


# ### Get the best model and train on top of that

# In[43]:

best_model = rf_grid.get_grid(sort_by="auc", decreasing=True)[0]
best_params = best_model.actual_params

# Resuming from a checkpoint: per the H2O checkpoint documentation the tree
# depth must match the checkpointed model and the requested ntrees must be
# larger than the number of trees it already holds. Leaving both at their
# defaults (as the original cell did) cannot satisfy this for a grid that
# only builds 50- or 100-tree models, so both are derived from the best model.
rf = H2ORandomForestEstimator(
    seed=42,
    model_id="default_random_forest",
    checkpoint=best_model.model_id,
    max_depth=best_params["max_depth"],
    ntrees=best_params["ntrees"] + EXTRA_TREES,
)

_timed_train(rf, x=x, y=y, training_frame=train, validation_frame=valid)


# In[45]:

rf.summary()


# In[ ]: