#!/usr/bin/env python
# coding: utf-8

# ## Index
#
# 1. [Workspace configuration](#Workspace-configuration)
# 2. [Import packages](#Import-packages)
# 3. [Load workspace](#Load-workspace)
# 4. [Create or connect to an experiment](#Create-or-connect-to-an-experiment)
# 5. [Local execution](#Local-execution)
#     - [Load data](#Load-data)
#     - [Train a model](#Train-a-model)
#     - [Run the experiment](#Run-the-experiment)
# 6. [Remote execution with DSVM](#Remote-execution)
#     - [Create or connect to a remote Linux DSVM](#Create-or-connect-to-a-remote-Linux-DSVM)
#     - [Upload data to the cloud](#Upload-data-to-the-cloud)
#     - [Create get data file](#Create-get-data-file)
#     - [Train a model on remote compute target](#Train-a-model-on-remote-compute-target)
# 7. [Cancel runs](#Cancel-runs)
# 8. [Explore results](#Explore-results)
# 9. [Retrieve models](#Retrieve-models)
#     - [Best model with the highest value](#Best-model-with-the-highest-value)
#     - [Best model based on other metrics](#Best-model-based-on-other-metrics)
#     - [Model from a specific iteration](#Model-from-a-specific-iteration)
# 10. [Test the best model](#Test-the-best-model)
# 11. [Register the best model](#Register-the-best-model)

# ## Workspace configuration
#
# Configure a workspace to enable communication between your local computer and remote Azure resources.
#
# The config file only needs to be written once; later runs of this notebook can load it directly instead of creating a new one.

# In[1]:


from azureml.core import Workspace

subscription_id = '364eeb5b-f3c7-42b8-b15f-08afee51aa96'
resource_group = 'Xiangzhe-ML'
workspace_name = 'Xiangzhe-WS'

try:
    ws = Workspace(subscription_id = subscription_id,
                   resource_group = resource_group,
                   workspace_name = workspace_name)
    ws.write_config()
    print('Library configuration succeeded')
except Exception:
    print('Workspace not found')
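# If the workspace does not exist yet, it can also be provisioned from the SDK instead of the portal.
# The cell below is an optional, minimal sketch using `Workspace.create()`; the `location` value
# ('westeurope') is only a placeholder and is not part of the original setup.

# In[ ]:


# optional: create the workspace if it could not be found above
# ws = Workspace.create(name = workspace_name,
#                       subscription_id = subscription_id,
#                       resource_group = resource_group,
#                       create_resource_group = True,
#                       location = 'westeurope')   # placeholder region, replace with your own
# ws.write_config()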
# ## Import packages

# In[1]:


import azureml.core
import pandas as pd
from azureml.core.workspace import Workspace
from azureml.train.automl.run import AutoMLRun
import time
import logging
from matplotlib import pyplot as plt
from matplotlib.pyplot import imshow
import random
import numpy as np
import os


# ## Load workspace
#
# Create a workspace object from the existing workspace. `Workspace.from_config()` reads the file **aml_config/config.json** and loads the details into an object named `ws`.

# In[2]:


# load workspace configuration from the config.json file in the current folder
ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, sep = '\t')


# ## Create or connect to an experiment

# In[3]:


experiment_name = 'automl-regression-nyc-taxi'

from azureml.core import Experiment
exp = Experiment(workspace = ws, name = experiment_name)


# ---
# ## Local execution
#
# ### Load data

# In[9]:


from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# load data
pd_dataframe = pd.read_pickle('data/data_after_prep.pkl')
print('Data loading finished.')

y = np.array(pd_dataframe["trip_duration"]).astype(float)
y = np.log(y)
X = pd_dataframe.drop(["trip_duration"], axis = 1)


# ### Train a model
#
# Function `AutoMLConfig` description: https://docs.microsoft.com/en-gb/python/api/azureml-train-automl/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py
#
# Here is a brief summary of the properties used in our model:
#
# |Property|Description|
# |---|---|
# |**task**|'classification' or 'regression', depending on what kind of ML problem to solve.|
# |**primary_metric**|Metric that you want to optimize.|
# |**max_time_sec**|Time limit in seconds for each iteration.|
# |**iterations**|Number of iterations. In each iteration, a model is trained on the data with a specific pipeline.|
# |**n_cross_validations**|Number of cross-validation splits.|
# |**preprocess**|*True/False*. Enables the experiment to preprocess the input. Preprocessing handles *missing data* and performs some common *feature extraction*.|
# |**verbosity**|Verbosity level for the AutoML log file.|
# |**X**|The training features to use when fitting pipelines during the AutoML experiment.|
# |**y**|The training labels to use when fitting pipelines during the AutoML experiment.|
# |**path**|Full path to the Azure ML project folder.|

# In[10]:


project_folder = './projects/automl-regression-nyc-taxi'

if not os.path.exists(project_folder):
    os.makedirs(project_folder)


# In[11]:


from azureml.train.automl import AutoMLConfig

# randomly choose 100 samples to train
n = 100
sample_indices = np.random.permutation(X.shape[0])[0:n]

# local compute
automl_config_local = AutoMLConfig(task = 'regression',
                                   primary_metric = 'spearman_correlation',
                                   max_time_sec = 600,
                                   iterations = 5,
                                   n_cross_validations = 5,
                                   preprocess = True,
                                   verbosity = logging.INFO,
                                   X = X.iloc[sample_indices],
                                   y = y[sample_indices],
                                   path = project_folder)


# ### Run the experiment

# In[12]:


local_run = exp.submit(automl_config_local, show_output = True)
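# Once the local run has finished, its results can be inspected from the same run object.
# The cell below is an optional sketch that waits for completion and retrieves the best run and
# fitted model found so far; it only uses `wait_for_completion` and `get_output`, the same methods
# applied to the remote run later in this notebook.

# In[ ]:


# optional: block until the local run finishes, then look at the best pipeline it found
local_run.wait_for_completion(show_output = False)
local_best_run, local_fitted_model = local_run.get_output()
print(local_best_run)
print(local_fitted_model)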
# ---
# ## Remote execution
#
# ### Create or connect to a remote Linux DSVM

# In[13]:


from azureml.core.compute import DsvmCompute

dsvm_name = 'mydsvm'
try:
    dsvm_compute = DsvmCompute(ws, dsvm_name)
    print('Found an existing DSVM.')
except Exception:
    print('Creating a new DSVM.')
    dsvm_config = DsvmCompute.provisioning_configuration(vm_size = "Standard_D2_v2")
    dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)
    dsvm_compute.wait_for_completion(show_output = True)


# ### Upload data to the cloud

# In[14]:


from azureml.core import Workspace, Datastore

#blob_datastore = Datastore(ws, blob_datastore_name)
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)


# In[21]:


#ds.upload_files("./data/data_after_prep.pkl")
ds.upload(src_dir='./data', target_path='data', overwrite=True, show_progress=True)


# In[22]:


from azureml.core.runconfig import DataReferenceConfiguration

dr = DataReferenceConfiguration(datastore_name=ds.name,
                                path_on_datastore='data',
                                mode='download',   # download files from the datastore to the compute target
                                overwrite=True)


# In[23]:


from azureml.core.runconfig import RunConfiguration

# create a new RunConfig object
conda_run_config = RunConfiguration(framework="python")

# set the compute target to the Linux DSVM
conda_run_config.target = dsvm_compute.name

# set the data reference of the run configuration
conda_run_config.data_references = {ds.name: dr}


# ### Create get data file

# In[24]:


project_folder = './projects/automl-regression-nyc-taxi'

if not os.path.exists(project_folder):
    os.makedirs(project_folder)


# In[25]:


get_ipython().run_cell_magic('writefile', '$project_folder/get_data.py', '\nimport numpy as np\nimport pandas as pd\nimport os\nfrom os.path import expanduser, join, dirname\n\n\ndef get_data():\n    # download data from cloud\n    pd_dataframe = pd.read_pickle(join(dirname(os.path.realpath(__file__)),\n                                       os.environ["AZUREML_DATAREFERENCE_workspacefilestore"],\n                                       "data_after_prep.pkl"))\n\n    y = np.array(pd_dataframe["trip_duration"]).astype(float)\n    y = np.log(y)\n    X = pd_dataframe.drop(["trip_duration"], axis = 1)\n\n    # randomly choose samples to train\n    n = 300\n    sample_indices = np.random.permutation(X.shape[0])[0:n]\n\n    return { "X" : X.iloc[sample_indices], "y" : y[sample_indices] }\n')


# ### Train a model on remote compute target
#
# Function `AutoMLConfig` description: https://docs.microsoft.com/en-gb/python/api/azureml-train-automl/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py
#
# _**Note**_: For a remote DSVM and Batch AI you cannot pass Numpy arrays directly to `AutoMLConfig`; the data is supplied through the `get_data.py` script instead.

# In[26]:


from azureml.train.automl import AutoMLConfig

automl_settings = {
    "max_time_sec": 600,
    "iterations": 5,
    "n_cross_validations": 5,
    "primary_metric": 'spearman_correlation',
    "preprocess": True,
    "max_cores_per_iteration": 2,
    "verbosity": logging.INFO
}

automl_config_remote = AutoMLConfig(task = 'regression',
                                    debug_log = 'automl_errors.log',
                                    path = project_folder,
                                    run_configuration = conda_run_config,
                                    data_script = project_folder + "/get_data.py",
                                    **automl_settings)


# In[29]:


remote_run = exp.submit(automl_config_remote, show_output = False)


# ## Cancel runs

# In[ ]:


remote_run.cancel()


# ---
# ## Explore results

# In[105]:


remote_run


# ### Retrieve all child runs
#
# We can use the cell below to fetch all the child runs and see their individual metrics.

# In[30]:


children = list(remote_run.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

import pandas as pd
rundata = pd.DataFrame(metricslist).sort_index(axis = 1)
rundata
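# With the metrics collected in the `rundata` DataFrame, they can also be visualized using the
# matplotlib import from earlier. The cell below is an optional sketch that plots the primary
# metric across iterations; it assumes 'spearman_correlation' appears in the metric index, which
# may vary with the AutoML version.

# In[ ]:


metric_name = 'spearman_correlation'
if metric_name in rundata.index:
    # one bar per iteration, showing the value of the primary metric
    rundata.loc[metric_name].plot(kind = 'bar')
    plt.xlabel('iteration')
    plt.ylabel(metric_name)
    plt.title('Primary metric per iteration')
    plt.show()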
# ## Retrieve models
#
# ### Best model with the highest value
#
# The `get_output` method returns the best run and the fitted model. The model includes the pipeline and any pre-processing.

# In[31]:


best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)


# ### Best model based on other metrics
#
# For example, the cell below shows the run and the model with the smallest `root_mean_squared_error` value.

# In[ ]:


lookup_metric = "root_mean_squared_error"
best_run, fitted_model = remote_run.get_output(metric = lookup_metric)
print(best_run)
print(fitted_model)


# ### Model from a specific iteration
#
# For example, the cell below shows the run and the model from the 3rd iteration.

# In[ ]:


iteration = 3
third_run, third_model = remote_run.get_output(iteration = iteration)
print(third_run)
print(third_model)


# ## Test the best model

# In[32]:


pd_dataframe = pd.read_pickle("./data/sub_data_after_prep.pkl")

y_test = np.array(pd_dataframe["trip_duration"]).astype(float)
y_test = np.log(y_test)
X_test = pd_dataframe.drop(["trip_duration"], axis = 1)


# In[35]:


# predict on one randomly chosen test sample
random_index = np.random.randint(0, len(X_test) - 1)

y_pred = fitted_model.predict(X_test.iloc[[random_index]])
y_residual_test = abs(y_test[random_index] - y_pred[0])


# In[36]:


print("actual value:", y_test[random_index])
print("prediction:", y_pred[0])
print("residual:", y_residual_test)


# ## Register the best model
#
# We can use the cell below to register the best model in the workspace.

# In[ ]:


description = 'Automated Machine Learning Model'
tags = None
remote_run.register_model(description = description, tags = tags)
remote_run.model_id   # use this id to deploy the model as a web service in Azure


# In[ ]:
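# The registered model can be retrieved from the workspace later, for example just before deploying
# it as a web service. The cell below is an optional sketch that lists the registered models and
# locates ours by its id; it only relies on the generic `Model` API from `azureml.core.model`.

# In[ ]:


from azureml.core.model import Model

# list all models registered in the workspace and find the one registered above
for m in Model.list(ws):
    if m.id == remote_run.model_id:
        print('name:', m.name, '| id:', m.id, '| version:', m.version)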