!pip install scikeras import pandas as pd import numpy as np import matplotlib.pyplot as plt %matplotlib inline import seaborn as sns import warnings import tensorflow as tf from tensorflow import keras from keras import layers, models from keras.callbacks import ModelCheckpoint, EarlyStopping from keras.layers import Dense from keras.metrics import Accuracy, F1Score from keras.models import Sequential from keras.utils import to_categorical from keras import backend as K from sklearn.model_selection import cross_val_score, KFold, train_test_split from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline from scikeras.wrappers import KerasClassifier, KerasRegressor warnings.filterwarnings('ignore') tf.get_logger().setLevel('ERROR') np.random.seed(42) tf.random.set_seed(42) # mount the colab from google.colab import drive drive.mount('/content/drive') # For saving and retrieving the trained model WORKSPACE_PATH = "/content/drive/MyDrive/Colab Notebooks/models/" # Read the dataset from github url = 'https://raw.githubusercontent.com/quickheaven/scs-3546-deep-learning/master/datasets/' X_train = pd.read_csv(url + 'X_train.csv', index_col=0) X_test =pd.read_csv(url + 'X_test.csv', index_col=0) y_train =pd.read_csv(url + 'y_train.csv', index_col=0) y_test = pd.read_csv(url + 'y_test.csv', index_col=0) # Initialize input dim and output nodes NUM_INPUT_DIM = X_train.shape[1] NUM_OUTPUT_NODES = len(y_train['URL_Type_obf_Type'].unique()) # y_train_dummy = to_categorical(y_train) y_test_dummy = to_categorical(y_test) def get_base_model_config(): """ This function returns the based model configuration for the experiments. Parameters ---------- None Returns ---------- dict - The dictionary containing the based model configuration. 
""" early_stopping_callback = EarlyStopping(monitor='val_accuracy', mode='max', patience=5, restore_best_weights=True) callbacks = [early_stopping_callback] model_config = { # ################################## "model_name": None, "input_dim": NUM_INPUT_DIM, "custom_layers": list(), "callbacks": callbacks, "validation_split": 0.20, "loss": "categorical_crossentropy", "output_nodes": NUM_OUTPUT_NODES, "output_activation": "softmax", "metrics": ['accuracy'], # ################################## "batch_size": 32, "epochs": 200, "hidden_activation": "relu", "weights_initializer": "random_normal", # ################################## "normalization": None, "optimizer": "adam", "learning_rate": 0.001, "regularizer": None, "dropout_rate": None, # ################################## "is_save_model": False, "workspace_path": WORKSPACE_PATH, "verbose": 1 } return model_config def get_optimizer(optimizer_name, learning_rate): """ (str. float) -> keras.optimizers This method returns the optimizer that will be use in the experiment. Parameters ---------- optimizer_name - The name of the optimizer to use in the experiment. values are adagrad, rmsprop, adam and None. learning_rate - The rate of learning to use in the optimizer. Returns ---------- keras.optimizer - The keras optimizer object. 
""" optimizer=None if optimizer_name == 'adagrad': optimizer = keras.optimizers.Adagrad(learning_rate=learning_rate) elif 'rmsprop': optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate) elif'adam' : optimizer = keras.optimizers.Adam(learning_rate=learning_rate) else : optimizer = keras.optimizers.SGD(learning_rate=learning_rate) return optimizer def create_and_run_model(model_config, X, y): # Build the new model model = Sequential(layers=model_config['custom_layers'], name=model_config['model_name']) model.summary() optimizer = get_optimizer(model_config["optimizer"], model_config["learning_rate"]) # Compile the model model.compile(optimizer=optimizer, loss=model_config['loss'], metrics=model_config['metrics']) # Fit the model history = model.fit(X, y, batch_size=model_config['batch_size'], callbacks=model_config['callbacks'], epochs=model_config['epochs'], validation_split=model_config['validation_split'], verbose=model_config["verbose"]) if True == model_config['is_save_model']: model_file = str(model_config['workspace_path']) + str(model_config['model_name']) + '.h5' model.save(model_file) return history def plot_accuracy_measures(accuracy_measures, title): import matplotlib.pyplot as plt plt.figure(figsize=(8, 5)) for experiment in accuracy_measures.keys(): plt.plot(accuracy_measures[experiment], label=experiment, linewidth=3) plt.title(title) plt.xlabel("Epochs") plt.ylabel("Accuracy") plt.legend() plt.show() def plot_learning_curves(history, title): # plot curves for loss plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('Learning Curves ' + title) plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Validation'], loc='upper left') plt.plot(figsize=(8, 5)) plt.grid(True) plt.show() model_config = get_base_model_config() accuracy_measures = {} model_config = get_base_model_config() batch_sizes = [32, 64, 128] for size in batch_sizes: model_config['batch_size'] = size custom_layers = [ layers.Dense(16, 
activation=model_config['hidden_activation'], input_dim=model_config['input_dim']), layers.Dense(model_config['output_nodes'], activation=model_config['output_activation']) ]; model_name = 'Epoch_and_batch_size_' + str(size) model_config['model_name'] = model_name model_config['custom_layers'].clear() model_config['custom_layers'].extend(custom_layers) history = create_and_run_model(model_config, X_train, y_train_dummy) plot_learning_curves(history, model_name) accuracy_measures[model_name] = history.history['accuracy'] accuracy_measures_batch_epoch = accuracy_measures.copy() plot_accuracy_measures(accuracy_measures, "Compare Batch Size and Epoch") model_config['batch_size'] = 32 model_config['epochs'] = 50 model_config = get_base_model_config() # Hyperparameters: model_config['batch_size'] = 32 model_config['epochs'] = 50 accuracy_measures = {} # reused the previous model config from previous experiment. # but on this case only increase the number of layers # I will start small by checking if one or two nodes is enough to get a good result. layer_list = [] for hidden_layer_count in range(1,11): custom_layers = [] # Simply use 16 nodes per layer for now. 
# Input if hidden_layer_count == 1: custom_layers.append(layers.Dense(16, activation=model_config['hidden_activation'], input_dim=model_config['input_dim'])) else: # Additional Layers custom_layers.append(layers.Dense(16, activation=model_config['hidden_activation'], input_dim=model_config['input_dim'])) for i in range(1, hidden_layer_count): custom_layers.append(layers.Dense(16, activation=model_config['hidden_activation'])) # Output layers custom_layers.append(layers.Dense(model_config['output_nodes'], activation=model_config['output_activation'])) model_name = 'Layers_' + str(hidden_layer_count) model_config['model_name'] = model_name model_config['custom_layers'].clear() model_config['custom_layers'].extend(custom_layers) history = create_and_run_model(model_config, X_train, y_train_dummy) plot_learning_curves(history, model_name) accuracy_measures[model_name] = history.history['accuracy'] accuracy_measures_hidden_layers = accuracy_measures.copy() plot_accuracy_measures(accuracy_measures, "Compare Hidden Layers") accuracy_measures_tmp = {} accuracy_measures_tmp['Layers_1'] = accuracy_measures['Layers_1'] accuracy_measures_tmp['Layers_2'] = accuracy_measures['Layers_2'] accuracy_measures_tmp['Layers_3'] = accuracy_measures['Layers_3'] accuracy_measures_tmp['Layers_7'] = accuracy_measures['Layers_7'] accuracy_measures_tmp['Layers_9'] = accuracy_measures['Layers_9'] accuracy_measures_tmp['Layers_10'] = accuracy_measures['Layers_10'] plot_accuracy_measures(accuracy_measures_tmp, "Compare Hidden Layers") # reset the accuracy measures accuracy_measures = {} model_config = get_base_model_config() # Hyperparameters: model_config['batch_size'] = 32 model_config['epochs'] = 50 # Exhaustive approach. 
Try all combinations of layers and the number of nodes; node_list_layer_1 = [16, 24, 32] node_list_layer_2 = [16, 24] #node_list_layer_3 = [16] for nbr_of_nodes_layer_1 in node_list_layer_1: for nbr_of_nodes_layer_2 in node_list_layer_2: #for nbr_of_nodes_layer_3 in node_list_layer_3: custom_layers = [ layers.Dense(nbr_of_nodes_layer_1, activation=model_config['hidden_activation'], input_dim=model_config['input_dim']), layers.Dense(nbr_of_nodes_layer_2, activation=model_config['hidden_activation']), #layers.Dense(nbr_of_nodes_layer_3, activation=model_config['hidden_activation']), layers.Dense(model_config['output_nodes'], activation=model_config['output_activation']) ]; model_name = 'Nodes-' + str(nbr_of_nodes_layer_1) + '-' + str(nbr_of_nodes_layer_2) #model_name = 'Nodes-' + str(nbr_of_nodes_layer_1) + '-' + str(nbr_of_nodes_layer_2) + '-' + str(nbr_of_nodes_layer_3) model_config['model_name'] = model_name model_config['custom_layers'].clear() model_config['custom_layers'].extend(custom_layers) history = create_and_run_model(model_config, X_train, y_train_dummy) plot_learning_curves(history, model_name) accuracy_measures[model_name] = history.history['accuracy'] accuracy_measures_nodes = accuracy_measures.copy() plot_accuracy_measures(accuracy_measures, "Compare Nodes in a Layer") accuracy_measures_tmp = {} accuracy_measures_tmp['Nodes-16-16'] = accuracy_measures['Nodes-16-16'] accuracy_measures_tmp['Nodes-24-24'] = accuracy_measures['Nodes-24-24'] accuracy_measures_tmp['Nodes-32-24'] = accuracy_measures['Nodes-32-24'] plot_accuracy_measures(accuracy_measures_tmp, "Compare Nodes in a Layer") accuracy_measures = {} model_config = get_base_model_config() # Hyperparameters: model_config['batch_size'] = 32 model_config['epochs'] = 50 # Number of layers: 2 # Number of nodes per each layer: 24-24 activation_list = ['relu','sigmoid','tanh'] for activation in activation_list: model_config['hidden_activation'] = activation custom_layers = [ layers.Dense(24, 
activation=model_config['hidden_activation'], input_dim=model_config['input_dim']), layers.Dense(24, activation=model_config['hidden_activation']), layers.Dense(model_config['output_nodes'], activation=model_config['output_activation']) ]; model_name = 'Model-' + activation model_config['model_name'] = model_name model_config['custom_layers'].clear() model_config['custom_layers'].extend(custom_layers) history = create_and_run_model(model_config, X_train, y_train_dummy) plot_learning_curves(history, model_name) accuracy_measures[model_name] = history.history['accuracy'] accuracy_activation = accuracy_measures.copy() plot_accuracy_measures(accuracy_measures, "Compare Activation Functions") model_config['hidden_activation'] = 'relu' accuracy_measures = {} model_config = get_base_model_config() # Hyperparameters: model_config['batch_size'] = 32 model_config['epochs'] = 50 # Number of layers: 2 # Number of nodes per each layer: 24-24 model_config['hidden_activation'] = 'relu' initializer_list = ['random_normal', "random_uniform"] for initializer in initializer_list: model_config['weights_initializer'] = initializer custom_layers = [ layers.Dense(24, kernel_initializer=model_config['weights_initializer'], activation=model_config['hidden_activation'], input_dim=model_config['input_dim']), layers.Dense(24, kernel_initializer=model_config['weights_initializer'], activation=model_config['hidden_activation']), layers.Dense(model_config['output_nodes'], activation=model_config['output_activation']) ]; model_name = 'Model-' + initializer model_config['model_name'] = model_name model_config['custom_layers'].clear() model_config['custom_layers'].extend(custom_layers) history = create_and_run_model(model_config, X_train, y_train_dummy) plot_learning_curves(history, model_name) accuracy_measures[model_name] = history.history['accuracy'] accuracy_measures_weight_init = accuracy_measures.copy() plot_accuracy_measures(accuracy_measures, "Compare Weights Initializers") 
model_config['weights_initializer'] = 'random_normal'

# ---------------------------------------------------------------------------
# Experiment 6: batch normalization vs. no normalization.
# Layers: 2; nodes: 24-24; activation: relu; initializer: random_normal.
# ---------------------------------------------------------------------------
accuracy_measures = {}
model_config = get_base_model_config()
# Hyperparameters:
model_config['batch_size'] = 32
model_config['epochs'] = 50
model_config['hidden_activation'] = 'relu'
model_config['weights_initializer'] = 'random_normal'

normalization_list = ['batch', 'none']
for normalization in normalization_list:
    if normalization == 'none':
        custom_layers = [
            layers.Dense(24,
                         kernel_initializer=model_config['weights_initializer'],
                         activation=model_config['hidden_activation'],
                         input_dim=model_config['input_dim']),
            layers.Dense(24,
                         kernel_initializer=model_config['weights_initializer'],
                         activation=model_config['hidden_activation']),
            layers.Dense(model_config['output_nodes'],
                         activation=model_config['output_activation'])
        ]
    else:
        # Insert a BatchNormalization layer after each hidden Dense layer.
        custom_layers = [
            layers.Dense(24,
                         kernel_initializer=model_config['weights_initializer'],
                         activation=model_config['hidden_activation'],
                         input_dim=model_config['input_dim']),
            layers.BatchNormalization(),
            layers.Dense(24,
                         kernel_initializer=model_config['weights_initializer'],
                         activation=model_config['hidden_activation']),
            layers.BatchNormalization(),
            layers.Dense(model_config['output_nodes'],
                         activation=model_config['output_activation'])
        ]

    model_name = 'Normalization-' + normalization
    model_config['model_name'] = model_name
    model_config['custom_layers'].clear()
    model_config['custom_layers'].extend(custom_layers)

    history = create_and_run_model(model_config, X_train, y_train_dummy)
    plot_learning_curves(history, model_name)
    accuracy_measures[model_name] = history.history['accuracy']

accuracy_measures_normalization = accuracy_measures.copy()
plot_accuracy_measures(accuracy_measures, "Compare Normalization Techniques")

# ---------------------------------------------------------------------------
# Experiment 7: optimizer.
# Layers: 2; nodes: 24-24; activation: relu; initializer: random_normal;
# batch normalization: not added.
# ---------------------------------------------------------------------------
accuracy_measures = {}
model_config = get_base_model_config()
# Hyperparameters:
model_config['batch_size'] = 32
model_config['epochs'] = 50
model_config['hidden_activation'] = 'relu'
model_config['weights_initializer'] = 'random_normal'

optimizer_list = ['sgd', 'rmsprop', 'adam', 'adagrad']
for optimizer in optimizer_list:
    model_config['optimizer'] = optimizer

    custom_layers = [
        layers.Dense(24,
                     kernel_initializer=model_config['weights_initializer'],
                     activation=model_config['hidden_activation'],
                     input_dim=model_config['input_dim']),
        layers.Dense(24,
                     kernel_initializer=model_config['weights_initializer'],
                     activation=model_config['hidden_activation']),
        layers.Dense(model_config['output_nodes'],
                     activation=model_config['output_activation'])
    ]

    model_name = 'Optimizer-' + optimizer
    model_config['model_name'] = model_name
    model_config['custom_layers'].clear()
    model_config['custom_layers'].extend(custom_layers)

    history = create_and_run_model(model_config, X_train, y_train_dummy)
    plot_learning_curves(history, model_name)
    accuracy_measures[model_name] = history.history['accuracy']

accuracy_measures_optimizer = accuracy_measures.copy()
plot_accuracy_measures(accuracy_measures, "Compare Optimizers")

model_config['optimizer'] = 'adam'

# ---------------------------------------------------------------------------
# Experiment 8: learning rate.
# Layers: 2; nodes: 24-24; activation: relu; initializer: random_normal;
# batch normalization: not added; optimizer: adam.
# ---------------------------------------------------------------------------
accuracy_measures = {}
model_config = get_base_model_config()
# Hyperparameters:
model_config['batch_size'] = 32
model_config['epochs'] = 50
model_config['hidden_activation'] = 'relu'
model_config['weights_initializer'] = 'random_normal'
model_config['optimizer'] = 'adam'

learning_rate_list = [0.001, 0.005, 0.01, 0.1]
for learning_rate in learning_rate_list:
    model_config['learning_rate'] = learning_rate

    custom_layers = [
        layers.Dense(24,
                     kernel_initializer=model_config['weights_initializer'],
                     activation=model_config['hidden_activation'],
                     input_dim=model_config['input_dim']),
        layers.Dense(24,
                     kernel_initializer=model_config['weights_initializer'],
                     activation=model_config['hidden_activation']),
        layers.Dense(model_config['output_nodes'],
                     activation=model_config['output_activation'])
    ]

    model_name = 'Learning-Rate-' + str(learning_rate)
    model_config['model_name'] = model_name
    model_config['custom_layers'].clear()
    model_config['custom_layers'].extend(custom_layers)

    history = create_and_run_model(model_config, X_train, y_train_dummy)
    plot_learning_curves(history, model_name)
    accuracy_measures[model_name] = history.history['accuracy']

accuracy_measures_learning_rate = accuracy_measures.copy()
plot_accuracy_measures(accuracy_measures, "Compare Learning Rates")

model_config['learning_rate'] = 0.001

# ---------------------------------------------------------------------------
# Experiment 9: kernel regularizer.
# Layers: 2; nodes: 24-24; activation: relu; initializer: random_normal;
# batch normalization: not added; optimizer: adam; learning rate: 0.001.
# ---------------------------------------------------------------------------
accuracy_measures = {}
model_config = get_base_model_config()
# Hyperparameters:
model_config['batch_size'] = 32
model_config['epochs'] = 50
model_config['hidden_activation'] = 'relu'
model_config['weights_initializer'] = 'random_normal'
model_config['optimizer'] = 'adam'
model_config['learning_rate'] = 0.001

regularizer_list = ['l1', 'l2', 'l1_l2', 'None']
for regularizer in regularizer_list:
    # The sentinel string 'None' means "no regularizer".
    if regularizer == 'None':
        regularizer = None
    model_config['regularizer'] = regularizer
    print('Testing: {}'.format(regularizer))

    custom_layers = [
        layers.Dense(24,
                     kernel_regularizer=model_config['regularizer'],
                     kernel_initializer=model_config['weights_initializer'],
                     activation=model_config['hidden_activation'],
                     input_dim=model_config['input_dim']),
        layers.Dense(24,
                     kernel_regularizer=model_config['regularizer'],
                     kernel_initializer=model_config['weights_initializer'],
                     activation=model_config['hidden_activation']),
        layers.Dense(model_config['output_nodes'],
                     activation=model_config['output_activation'])
    ]

    model_name = 'Regularizer-' + str(regularizer)
    model_config['model_name'] = model_name
    model_config['custom_layers'].clear()
    model_config['custom_layers'].extend(custom_layers)

    history = create_and_run_model(model_config, X_train, y_train_dummy)
    plot_learning_curves(history, model_name)
    # Use the validation accuracy here — regularization targets generalization.
    accuracy_measures[model_name] = history.history['val_accuracy']

accuracy_measures_regularizer = accuracy_measures.copy()
plot_accuracy_measures(accuracy_measures, "Compare Regularizers")

model_config['regularizer'] = None

# ---------------------------------------------------------------------------
# Experiment 10: dropout rate.
# Layers: 2; nodes: 24-24; activation: relu; initializer: random_normal;
# batch normalization: not added; optimizer: adam; learning rate: 0.001;
# regularizer: none.
# ---------------------------------------------------------------------------
accuracy_measures = {}
model_config = get_base_model_config()
# Hyperparameters:
model_config['batch_size'] = 32
model_config['epochs'] = 50
model_config['hidden_activation'] = 'relu'
model_config['weights_initializer'] = 'random_normal'
model_config['optimizer'] = 'adam'
model_config['learning_rate'] = 0.001
model_config['regularizer'] = None
model_config['dropout_rate'] = None

dropout_list = [0.0, 0.001, 0.1, 0.2, 0.5]
for dropout in dropout_list:
    model_config['dropout_rate'] = dropout

    if model_config['dropout_rate'] > 0.0:
        model_name = 'Dropout' + str(dropout)
        # Dropout after each hidden Dense layer.
        custom_layers = [
            layers.Dense(24,
                         kernel_regularizer=model_config['regularizer'],
                         kernel_initializer=model_config['weights_initializer'],
                         activation=model_config['hidden_activation'],
                         input_dim=model_config['input_dim']),
            layers.Dropout(model_config['dropout_rate']),
            layers.Dense(24,
                         kernel_regularizer=model_config['regularizer'],
                         kernel_initializer=model_config['weights_initializer'],
                         activation=model_config['hidden_activation']),
            layers.Dropout(model_config['dropout_rate']),
            layers.Dense(model_config['output_nodes'],
                         activation=model_config['output_activation'])
        ]
    else:
        model_name = 'Dropout-None'
        custom_layers = [
            layers.Dense(24,
                         kernel_regularizer=model_config['regularizer'],
                         kernel_initializer=model_config['weights_initializer'],
                         activation=model_config['hidden_activation'],
                         input_dim=model_config['input_dim']),
            layers.Dense(24,
                         kernel_regularizer=model_config['regularizer'],
                         kernel_initializer=model_config['weights_initializer'],
                         activation=model_config['hidden_activation']),
            layers.Dense(model_config['output_nodes'],
                         activation=model_config['output_activation'])
        ]

    model_config['model_name'] = model_name
    model_config['custom_layers'].clear()
    model_config['custom_layers'].extend(custom_layers)

    history = create_and_run_model(model_config, X_train, y_train_dummy)
    plot_learning_curves(history, model_name)
    # Use the validation accuracy — dropout targets generalization.
    accuracy_measures[model_name] = history.history['val_accuracy']

accuracy_measures_dropout = accuracy_measures.copy()
plot_accuracy_measures(accuracy_measures, "Compare Dropout")

model_config['dropout_rate'] = 0.001


def get_best_param_model_config(model_name):
    """(str) -> dict

    Return the model configuration with the best hyperparameters found by
    the experiments above.

    Parameters
    ----------
    model_name : str
        Name to assign to the model (also used for the saved .h5 filename).

    Returns
    -------
    dict
        Model configuration ready to pass to create_and_run_model.
    """
    # Default model config
    model_config = get_base_model_config()
    model_config['model_name'] = model_name

    # Include F1-Score in the metrics so we can use that score during
    # evaluation. average='macro' makes F1Score return a single scalar
    # instead of an array with one value per class.
    # BUG FIX: the original assignment ended with a stray trailing comma,
    # which wrapped the metrics list in a 1-tuple and handed model.compile
    # the wrong structure.
    model_config['metrics'] = ['accuracy', F1Score(average='macro')]

    # Final hyperparameters chosen from the experiments:
    model_config['batch_size'] = 32
    model_config['epochs'] = 50
    # Number of layers: 2; nodes per layer: 24-24 (comment corrected — the
    # original said "3 layers, 32-16-16" but the code below builds 24-24).
    model_config['hidden_activation'] = 'relu'
    model_config['weights_initializer'] = 'random_normal'
    # Batch Normalization: not added
    model_config['optimizer'] = 'adam'
    model_config['learning_rate'] = 0.001
    model_config['regularizer'] = None
    model_config['dropout_rate'] = 0.001

    custom_layers = [
        layers.Dense(24,
                     kernel_regularizer=model_config['regularizer'],
                     kernel_initializer=model_config['weights_initializer'],
                     activation=model_config['hidden_activation'],
                     input_dim=model_config['input_dim']),
        layers.Dropout(model_config['dropout_rate']),
        layers.Dense(24,
                     kernel_regularizer=model_config['regularizer'],
                     kernel_initializer=model_config['weights_initializer'],
                     activation=model_config['hidden_activation']),
        layers.Dropout(model_config['dropout_rate']),
        layers.Dense(model_config['output_nodes'],
                     activation=model_config['output_activation'])
    ]
    model_config['custom_layers'].clear()
    model_config['custom_layers'].extend(custom_layers)

    # Save the model from here on so it can be reloaded for evaluation.
    model_config['is_save_model'] = True
    return model_config


from keras.models import load_model


def evaluate_model(model_config, X, y):
    """(dict, array-like, array-like) -> tuple

    Load a previously trained model from disk and evaluate it.

    Parameters
    ----------
    model_config : dict
        Configuration that identifies the saved model (workspace_path and
        model_name) and the evaluation verbosity.
    X, y : array-like
        Test features and one-hot encoded test labels.

    Returns
    -------
    tuple of (loss, accuracy, f1_score)
        The scores from model.evaluate. (The original docstring claimed a
        DataFrame was returned but nothing was; the metrics are now
        returned — callers that ignore the result are unaffected.)
    """
    model_file = str(model_config['workspace_path']) + str(model_config['model_name']) + '.h5'
    model = load_model(model_file)

    # Metric order matches compile(): loss, accuracy, f1 (see
    # get_best_param_model_config).
    test_loss, test_accuracy, test_f1_score = model.evaluate(
        X, y, verbose=model_config['verbose'])
    print(f"Test accuracy: {test_accuracy}, Test F1-Score: {test_f1_score}")
    return test_loss, test_accuracy, test_f1_score


# ---------------------------------------------------------------------------
# Train the final model with the best hyperparameters, then evaluate it on
# the held-out test set.
# ---------------------------------------------------------------------------
accuracy_measures = {}

model_name = 'Multiclass_Clarissification_of_Malicious_URL'
best_param_model_config = get_best_param_model_config(model_name)
best_model_history = create_and_run_model(best_param_model_config, X_train, y_train_dummy)
plot_learning_curves(best_model_history, model_name)

evaluate_model(best_param_model_config, X_test, y_test_dummy)