#!/usr/bin/env python
# coding: utf-8

# # Random Search
# > In this chapter you will be introduced to another popular automated hyperparameter tuning methodology called Random Search. You will learn what it is, how it works and, importantly, how it differs from Grid Search. You will learn some advantages and disadvantages of this method, and when to choose it over Grid Search. You will practice undertaking a Random Search with Scikit Learn as well as visualizing & interpreting the output. This is the Summary of lecture "Hyperparameter Tuning in Python", via datacamp.
#
# - toc: true
# - badges: true
# - comments: true
# - author: Chanseok Kang
# - categories: [Python, Datacamp, Machine_Learning]
# - image: images/grid_vs_random.png

# In[1]:


import numpy as np
import pandas as pd


# ## Introducing Random Search
# - Similar to grid search:
#     - Define an estimator, which hyperparameters to tune, and the range of values for each hyperparameter.
#     - Set a cross-validation scheme and a scoring function.
# > Note - Bergstra & Bengio (2012) show empirically and theoretically that randomly chosen trials are more efficient for hyperparameter optimization than trials on a grid.
# - Two main reasons:
#     1. Not every hyperparameter is equally important
#     2. A little trick of probability (made concrete in the sketch after the first exercise below)

# ### Randomly Sample Hyperparameters
# To undertake a random search, we first need to randomly sample our hyperparameter space.
#
# In this exercise, you will first create some lists of hyperparameters that can be zipped up to a list of lists. Then you will randomly sample hyperparameter combinations in preparation for running a random search.
#
# You will use just the hyperparameters `learning_rate` and `min_samples_leaf` of the GBM algorithm to keep the example illustrative and not overly complicated.

# In[2]:


from itertools import product
from pprint import pprint

# Create a list of values for the learning_rate hyperparameter
learn_rate_list = list(np.linspace(0.01, 1.5, 200))

# Create a list of values for the min_samples_leaf hyperparameter
min_samples_list = list(range(10, 41))

# Combination list
combinations_list = [list(x) for x in product(learn_rate_list, min_samples_list)]

# Sample hyperparameter combinations for a random search
random_combinations_index = np.random.choice(range(0, len(combinations_list)), 30, replace=False)
combinations_random_chosen = [combinations_list[x] for x in random_combinations_index]

# Print the result
pprint(combinations_random_chosen)
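# That "little trick of probability" can be made concrete. Assuming, purely for illustration, that the best-performing combinations occupy about 5% of the search space, the probability that at least one of n uniform random samples lands in that region is 1 - 0.95^n, so a few dozen random trials are already very likely to hit a good combination:

# In[ ]:


# Chance of hitting a top-5% region at least once in n random trials
for n in [10, 30, 60, 100]:
    print(f'{n:>3} trials: {1 - 0.95 ** n:.3f}')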
# ### Randomly Search with Random Forest
# To solidify your knowledge of random sampling, let's try a similar exercise but using different hyperparameters and a different algorithm.
#
# As before, create some lists of hyperparameters that can be zipped up to a list of lists. You will use the hyperparameters `criterion`, `max_depth` and `max_features` of the random forest algorithm. Then you will randomly sample hyperparameter combinations in preparation for running a random search.

# In[3]:


import random

# Create lists for criterion and max_features
criterion_list = ['gini', 'entropy']
max_feature_list = ['auto', 'sqrt', 'log2', None]

# Create a list of values for the max_depth hyperparameter
max_depth_list = list(range(3, 56))

# Combination list
combinations_list = [list(x) for x in product(criterion_list, max_feature_list, max_depth_list)]

# Sample hyperparameter combinations for a random search
combinations_random_chosen = random.sample(combinations_list, 30)

# Print the result
pprint(combinations_random_chosen)


# ### Visualizing a Random Search
# Visualizing the search space of random search lets you easily see the coverage of this technique, and therefore the effect of your sample size on the search space.
#
# In this exercise you will use several different sample sizes of hyperparameter combinations and produce visualizations of the search space.
#
# The function `sample_and_visualize_hyperparameters()` takes a single argument (the number of combinations to sample) and then randomly samples hyperparameter combinations, just like you did in the last exercise! The function then visualizes the combinations.

# In[4]:


import matplotlib.pyplot as plt

def sample_and_visualize_hyperparameters(n_samples):
    # If asking for all combinations, just use the entire list
    if n_samples == len(combinations_list):
        combinations_random_chosen = combinations_list
    else:
        random_combinations_index = np.random.choice(range(0, len(combinations_list)), n_samples, replace=False)
        combinations_random_chosen = [combinations_list[x] for x in random_combinations_index]

    # Pull out the x (learn_rate) and y (min_samples_leaf) values to plot
    rand_x, rand_y = [x[0] for x in combinations_random_chosen], [x[1] for x in combinations_random_chosen]

    # Plot
    plt.clf()
    plt.scatter(rand_x, rand_y, c=['blue'] * len(combinations_random_chosen))
    plt.gca().set(xlabel='learn_rate', ylabel='min_samples_leaf', title='Random Search Hyperparameters')
    plt.gca().set_xlim([0.01, 1.5])
    plt.gca().set_ylim([10, 29])


# In[5]:


# Create a list of values for the learning_rate hyperparameter
learn_rate_list = list(np.linspace(0.01, 1.5, 200))

# Create a list of values for the min_samples_leaf hyperparameter
min_samples_list = list(range(10, 30))

# Combination list
combinations_list = [list(x) for x in product(learn_rate_list, min_samples_list)]

# Confirm how many hyperparameter combinations there are & print
number_combs = len(combinations_list)
print(number_combs)

# Sample and visualize the specified number of combinations
sample_and_visualize_hyperparameters(50)


# In[6]:


# Sample and visualize specified combinations
sample_and_visualize_hyperparameters(500)


# In[7]:


# Sample and visualize specified combinations
sample_and_visualize_hyperparameters(1500)


# In[8]:


# Sample all the hyperparameter combinations & visualize
sample_and_visualize_hyperparameters(number_combs)


# ## Random Search in Scikit Learn
# - The same steps as `GridSearchCV`:
#     1. Decide on an algorithm/estimator
#     2. Define which hyperparameters we will tune
#     3. Define a range of values for each hyperparameter
#     4. Set a cross-validation scheme
#     5. Define a scoring function
#     6. Include extra useful information or functionality
# - Random Search adds one extra step:
#     7. Decide how many samples to take, then sample (sketched by hand below)
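# Before handing these steps over to Scikit Learn, it helps to see what they amount to by hand. A minimal sketch, using a small synthetic dataset from `make_classification` (so the cell runs on its own) and the `combinations_list` built above: evaluate a few randomly sampled combinations with cross-validation and keep the best.

# In[ ]:


from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Stand-in data, purely for illustration
X_demo, y_demo = make_classification(n_samples=500, random_state=42)

# Randomly sample a handful of (learning_rate, min_samples_leaf) combinations
demo_combinations = random.sample(combinations_list, 5)

best_score, best_params = -np.inf, None
for learn_rate, min_samples_leaf in demo_combinations:
    model = GradientBoostingClassifier(learning_rate=learn_rate, min_samples_leaf=min_samples_leaf)
    score = cross_val_score(model, X_demo, y_demo, scoring='accuracy', cv=3).mean()
    if score > best_score:
        best_score, best_params = score, (learn_rate, min_samples_leaf)

print(best_score, best_params)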
# ### The RandomizedSearchCV Object
# Just like `GridSearchCV` from Scikit Learn, `RandomizedSearchCV` provides many useful features to assist with efficiently undertaking a random search. You're going to create a `RandomizedSearchCV` object, making the small adjustment needed from the `GridSearchCV` object.
#
# The desired options are:
#
# - A default Gradient Boosting Classifier estimator
# - 5-fold cross-validation
# - Use accuracy to score the models
# - Use 4 cores for processing in parallel
# - Ensure you refit the best model and return training scores
# - Randomly sample 10 models
#
# The hyperparameter grid should be for `learning_rate` (150 values between 0.1 and 2) and `min_samples_leaf` (all values between and including 20 and 64).

# In[9]:


from sklearn.model_selection import train_test_split

credit_card = pd.read_csv('./dataset/credit-card-full.csv')

# Convert the categorical variables into dummy variables
credit_card = pd.get_dummies(credit_card, columns=['SEX', 'EDUCATION', 'MARRIAGE'], drop_first=True)

X = credit_card.drop(['ID', 'default payment next month'], axis=1)
y = credit_card['default payment next month']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)


# In[10]:


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

# Create the parameter grid
param_grid = {'learning_rate': np.linspace(0.1, 2, 150),
              'min_samples_leaf': list(range(20, 65))}

# Create a random search object
random_GBM_class = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(),
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
    refit=True,
    return_train_score=True
)

# Fit to the training data
random_GBM_class.fit(X_train, y_train)

# Print the values used for both hyperparameters
print(random_GBM_class.cv_results_['param_learning_rate'])
print(random_GBM_class.cv_results_['param_min_samples_leaf'])
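# Since `refit=True`, the best model has already been refit on the whole training set, and the standard search attributes expose the winner directly:

# In[ ]:


# Best cross-validated accuracy and the hyperparameters that achieved it
print(random_GBM_class.best_score_)
print(random_GBM_class.best_params_)

# The refit best model can be used for prediction straight away
predictions = random_GBM_class.best_estimator_.predict(X_test)
print(predictions[:5])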
# ### RandomizedSearchCV in Scikit Learn
# Let's practice building a `RandomizedSearchCV` object using Scikit Learn.
#
# The hyperparameter grid should be for `max_depth` (all values between and including 5 and 25) and `max_features` ('auto' and 'sqrt').
#
# The desired options for the `RandomizedSearchCV` object are:
#
# - A RandomForestClassifier estimator with `n_estimators` of 80
# - 3-fold cross-validation (`cv`)
# - Use `roc_auc` to score the models
# - Use 4 cores for processing in parallel (`n_jobs`)
# - Ensure you refit the best model and return training scores
# - Only sample 5 models for efficiency (`n_iter`)
#
# Remember, the chosen hyperparameters can be extracted from `cv_results_`, which has a column per hyperparameter. For example, the column for the hyperparameter `criterion` would be `param_criterion`.

# In[11]:


from sklearn.ensemble import RandomForestClassifier

# Create the parameter grid
param_grid = {'max_depth': list(range(5, 26)),
              'max_features': ['auto', 'sqrt']}

# Create a random search object
random_rf_class = RandomizedSearchCV(
    estimator=RandomForestClassifier(n_estimators=80),
    param_distributions=param_grid,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=4,
    cv=3,
    refit=True,
    return_train_score=True
)

# Fit to the training data
random_rf_class.fit(X_train, y_train)

# Print the values used for both hyperparameters
print(random_rf_class.cv_results_['param_max_depth'])
print(random_rf_class.cv_results_['param_max_features'])


# ## Comparing Grid and Random Search
# - Comparison:
#
# | Grid Search | Random Search |
# | ----------- | ------------- |
# | Exhaustively tries all combinations within the sample space | Randomly selects a subset of combinations within the sample space (of a size you must specify) |
# | No sampling methodology | Can select a sampling methodology other than uniform (e.g. a `scipy.stats` distribution - sketched at the end of this post) |
# | More computationally expensive | Less computationally expensive |
# | Guaranteed to find the best score in the sample space | Not guaranteed to find the best score in the sample space (but likely to find a good one faster) |

# ### Grid and Random Search Side by Side
# Visualizing the search spaces of random and grid search together allows you to easily see the coverage that each technique has, and therefore brings to life their specific advantages and disadvantages.
#
# In this exercise, you will sample hyperparameter combinations in a grid search way as well as in a random search way, then plot these to see the difference.

# In[13]:


def visualize_search(grid_combinations_chosen, random_combinations_chosen):
    grid_x, grid_y = [x[0] for x in grid_combinations_chosen], [x[1] for x in grid_combinations_chosen]
    rand_x, rand_y = [x[0] for x in random_combinations_chosen], [x[1] for x in random_combinations_chosen]

    # Plot all together: grid samples in red, random samples in blue
    plt.scatter(grid_x + rand_x, grid_y + rand_y, c=['red'] * len(grid_x) + ['blue'] * len(rand_x))
    plt.gca().set(xlabel='learn_rate', ylabel='min_samples_leaf', title='Grid and Random Search Hyperparameters')
    plt.gca().set_xlim([0.01, 3.0])
    plt.gca().set_ylim([5, 25])


# In[14]:


learn_rate_list = np.linspace(0.01, 3.0, 200)
min_samples_leaf_list = range(5, 25)

combinations_list = [list(x) for x in product(learn_rate_list, min_samples_leaf_list)]

# Sample grid coordinates
grid_combinations_chosen = combinations_list[0:300]

# Create a list of sample indexes
sample_indexes = list(range(0, len(combinations_list)))

# Randomly sample 300 indexes
random_indexes = np.random.choice(sample_indexes, 300, replace=False)

# Use indexes to create random sample
random_combinations_chosen = [combinations_list[index] for index in random_indexes]

# Call the function to produce the visualization
visualize_search(grid_combinations_chosen, random_combinations_chosen)
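# As the comparison table notes, random search is not limited to sampling from a pre-enumerated uniform grid. A minimal sketch, reusing the credit card training data from above: `RandomizedSearchCV` also accepts `scipy.stats` distributions in `param_distributions`, so `learning_rate` can be drawn from a continuous range rather than a fixed list.

# In[ ]:


from scipy.stats import randint, uniform

# Continuous distribution for learning_rate, discrete for min_samples_leaf
param_dist = {'learning_rate': uniform(0.01, 1.49),   # uniform on [0.01, 1.5]
              'min_samples_leaf': randint(20, 65)}    # integers 20 to 64 inclusive

random_GBM_dist = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(),
    param_distributions=param_dist,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    n_jobs=4
)
random_GBM_dist.fit(X_train, y_train)

# Each sampled learning_rate is now an arbitrary float, not one of 150 fixed values
print(random_GBM_dist.cv_results_['param_learning_rate'])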