#!/usr/bin/env python
# coding: utf-8

# ## Index
#
# 1. [Workspace configuration](#Workspace-configuration)
# 2. [Import packages](#Import-packages)
# 3. [Load workspace](#Load-workspace)
# 4. [Create or connect to an experiment](#Create-or-connect-to-an-experiment)
# 5. [Local execution](#Local-execution)
#     - [Load data](#Load-data)
#     - [Train a model](#Train-a-model)
#     - [Run the experiment](#Run-the-experiment)
# 6. [Remote execution with DSVM](#Remote-execution)
#     - [Create or connect to a remote Linux DSVM](#Create-or-connect-to-a-remote-Linux-DSVM)
#     - [Upload data to the cloud](#Upload-data-to-the-cloud)
#     - [Create get data file](#Create-get-data-file)
#     - [Train a model on remote compute target](#Train-a-model-on-remote-compute-target)
# 7. [Cancel runs](#Cancel-runs)
# 8. [Explore results](#Explore-results)
# 9. [Retrieve models](#Retrieve-models)
#     - [Best model with the highest value](#Best-model-with-the-highest-value)
#     - [Best model based on other metrics](#Best-model-based-on-other-metrics)
#     - [Model from a specific iteration](#Model-from-a-specific-iteration)
# 10. [Test the best model](#Test-the-best-model)
# 11. [Register the best model](#Register-the-best-model)

# ## Workspace configuration
#
# Configure a workspace to enable communication between your local computer and remote Azure resources.
#
# The config file only needs to be written once; later runs of this notebook can load it directly instead of creating a new one.

# In[1]:


from azureml.core import Workspace

subscription_id = '364eeb5b-f3c7-42b8-b15f-08afee51aa96'
resource_group = 'Xiangzhe-ML'
workspace_name = 'Xiangzhe-WS'

try:
    ws = Workspace(subscription_id = subscription_id,
                   resource_group = resource_group,
                   workspace_name = workspace_name)
    ws.write_config()
    print('Library configuration succeeded')
except Exception:
    print('Workspace not found')
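# If the workspace does not exist yet, it can also be provisioned from the SDK instead of the portal.
# The cell below is an optional, minimal sketch using `Workspace.create()`; the `location` value
# ('westeurope') is only a placeholder and is not part of the original setup.

# In[ ]:


# optional: create the workspace if it could not be found above
# ws = Workspace.create(name = workspace_name,
#                       subscription_id = subscription_id,
#                       resource_group = resource_group,
#                       create_resource_group = True,
#                       location = 'westeurope')   # placeholder region, replace with your own
# ws.write_config()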
# ## Import packages

# In[1]:


import azureml.core
import pandas as pd
from azureml.core.workspace import Workspace
from azureml.train.automl.run import AutoMLRun
import time
import logging
from matplotlib import pyplot as plt
from matplotlib.pyplot import imshow
import random
import numpy as np
import os


# ## Load workspace
#
# Create a workspace object from the existing workspace. `Workspace.from_config()` reads the file **aml_config/config.json** and loads the details into an object named `ws`.

# In[2]:


# load workspace configuration from the config.json file in the current folder
ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, sep = '\t')


# ## Create or connect to an experiment

# In[3]:


experiment_name = 'automl-regression-nyc-taxi'

from azureml.core import Experiment
exp = Experiment(workspace = ws, name = experiment_name)


# ---
# ## Local execution
#
# ### Load data

# In[9]:


from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# load data
pd_dataframe = pd.read_pickle('data/data_after_prep.pkl')
print('Data loading finished.')

y = np.array(pd_dataframe["trip_duration"]).astype(float)
y = np.log(y)
X = pd_dataframe.drop(["trip_duration"], axis = 1)


# ### Train a model
#
# Function `AutoMLConfig` description: https://docs.microsoft.com/en-gb/python/api/azureml-train-automl/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py
#
# Here is a brief summary of the properties used in our model:
#
# |Property|Description|
# |---|---|
# |**task**|'classification' or 'regression', depending on what kind of ML problem to solve.|
# |**primary_metric**|Metric that you want to optimize.|
# |**max_time_sec**|Time limit in seconds for each iteration.|
# |**iterations**|Number of iterations. In each iteration, a model is trained on the data with a specific pipeline.|
# |**n_cross_validations**|Number of cross-validation splits.|
# |**preprocess**|*True/False*. Enables the experiment to preprocess the input. Preprocessing handles *missing data* and performs some common *feature extraction*.|
# |**verbosity**|Verbosity level for the AutoML log file.|
# |**X**|The training features to use when fitting pipelines during the AutoML experiment.|
# |**y**|The training labels to use when fitting pipelines during the AutoML experiment.|
# |**path**|Full path to the Azure ML project folder.|

# In[10]:


project_folder = './projects/automl-regression-nyc-taxi'

if not os.path.exists(project_folder):
    os.makedirs(project_folder)


# In[11]:


from azureml.train.automl import AutoMLConfig

# randomly choose 100 samples to train
n = 100
sample_indices = np.random.permutation(X.shape[0])[0:n]

# local compute
automl_config_local = AutoMLConfig(task = 'regression',
                                   primary_metric = 'spearman_correlation',
                                   max_time_sec = 600,
                                   iterations = 5,
                                   n_cross_validations = 5,
                                   preprocess = True,
                                   verbosity = logging.INFO,
                                   X = X.iloc[sample_indices],
                                   y = y[sample_indices],
                                   path = project_folder)


# ### Run the experiment

# In[12]:


local_run = exp.submit(automl_config_local, show_output = True)
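# Once the local run has finished, its results can be inspected from the same run object.
# The cell below is an optional sketch that waits for completion and retrieves the best run and
# fitted model found so far; it only uses `wait_for_completion` and `get_output`, the same methods
# applied to the remote run later in this notebook.

# In[ ]:


# optional: block until the local run finishes, then look at the best pipeline it found
local_run.wait_for_completion(show_output = False)
local_best_run, local_fitted_model = local_run.get_output()
print(local_best_run)
print(local_fitted_model)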
# ---
# ## Remote execution
#
# ### Create or connect to a remote Linux DSVM

# In[13]:


from azureml.core.compute import DsvmCompute

dsvm_name = 'mydsvm'
try:
    dsvm_compute = DsvmCompute(ws, dsvm_name)
    print('Found an existing DSVM.')
except Exception:
    print('Creating a new DSVM.')
    dsvm_config = DsvmCompute.provisioning_configuration(vm_size = "Standard_D2_v2")
    dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)
    dsvm_compute.wait_for_completion(show_output = True)


# ### Upload data to the cloud

# In[14]:


from azureml.core import Workspace, Datastore

#blob_datastore = Datastore(ws, blob_datastore_name)
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)


# In[21]:


#ds.upload_files("./data/data_after_prep.pkl")
ds.upload(src_dir='./data', target_path='data', overwrite=True, show_progress=True)


# In[22]:


from azureml.core.runconfig import DataReferenceConfiguration

dr = DataReferenceConfiguration(datastore_name=ds.name,
                                path_on_datastore='data',
                                mode='download',   # download files from the datastore to the compute target
                                overwrite=True)


# In[23]:


from azureml.core.runconfig import RunConfiguration

# create a new RunConfig object
conda_run_config = RunConfiguration(framework="python")

# set the compute target to the Linux DSVM
conda_run_config.target = dsvm_compute.name

# set the data reference of the run configuration
conda_run_config.data_references = {ds.name: dr}


# ### Create get data file

# In[24]:


project_folder = './projects/automl-regression-nyc-taxi'

if not os.path.exists(project_folder):
    os.makedirs(project_folder)


# In[25]:


get_ipython().run_cell_magic('writefile', '$project_folder/get_data.py', '\nimport numpy as np\nimport pandas as pd\nimport os\nfrom os.path import expanduser, join, dirname\n\n\ndef get_data():\n    # download data from cloud\n    pd_dataframe = pd.read_pickle(join(dirname(os.path.realpath(__file__)),\n                                       os.environ["AZUREML_DATAREFERENCE_workspacefilestore"],\n                                       "data_after_prep.pkl"))\n\n    y = np.array(pd_dataframe["trip_duration"]).astype(float)\n    y = np.log(y)\n    X = pd_dataframe.drop(["trip_duration"], axis = 1)\n\n    # randomly choose samples to train\n    n = 300\n    sample_indices = np.random.permutation(X.shape[0])[0:n]\n\n    return { "X" : X.iloc[sample_indices], "y" : y[sample_indices] }\n')


# ### Train a model on remote compute target
#
# Function `AutoMLConfig` description: https://docs.microsoft.com/en-gb/python/api/azureml-train-automl/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py
#
# _**Note**_: For a remote DSVM and Batch AI you cannot pass Numpy arrays directly to `AutoMLConfig`; the data is supplied through the `get_data.py` script instead.

# In[26]:


from azureml.train.automl import AutoMLConfig

automl_settings = {
    "max_time_sec": 600,
    "iterations": 5,
    "n_cross_validations": 5,
    "primary_metric": 'spearman_correlation',
    "preprocess": True,
    "max_cores_per_iteration": 2,
    "verbosity": logging.INFO
}

automl_config_remote = AutoMLConfig(task = 'regression',
                                    debug_log = 'automl_errors.log',
                                    path = project_folder,
                                    run_configuration = conda_run_config,
                                    data_script = project_folder + "/get_data.py",
                                    **automl_settings)


# In[29]:


remote_run = exp.submit(automl_config_remote, show_output = False)


# ## Cancel runs

# In[ ]:


remote_run.cancel()


# ---
# ## Explore results

# In[105]:


remote_run


# ### Retrieve all child runs
#
# We can use the cell below to fetch all the child runs and see their individual metrics.

# In[30]:


children = list(remote_run.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

import pandas as pd
rundata = pd.DataFrame(metricslist).sort_index(axis = 1)
rundata
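# With the metrics collected in the `rundata` DataFrame, they can also be visualized using the
# matplotlib import from earlier. The cell below is an optional sketch that plots the primary
# metric across iterations; it assumes 'spearman_correlation' appears in the metric index, which
# may vary with the AutoML version.

# In[ ]:


metric_name = 'spearman_correlation'
if metric_name in rundata.index:
    # one bar per iteration, showing the value of the primary metric
    rundata.loc[metric_name].plot(kind = 'bar')
    plt.xlabel('iteration')
    plt.ylabel(metric_name)
    plt.title('Primary metric per iteration')
    plt.show()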
# ## Retrieve models
#
# ### Best model with the highest value
#
# The `get_output` method returns the best run and the fitted model. The model includes the pipeline and any pre-processing.

# In[31]:


best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)


# ### Best model based on other metrics
#
# For example, the cell below shows the run and the model with the smallest `root_mean_squared_error` value.

# In[ ]:


lookup_metric = "root_mean_squared_error"
best_run, fitted_model = remote_run.get_output(metric = lookup_metric)
print(best_run)
print(fitted_model)


# ### Model from a specific iteration
#
# For example, the cell below shows the run and the model from the 3rd iteration.

# In[ ]:


iteration = 3
third_run, third_model = remote_run.get_output(iteration = iteration)
print(third_run)
print(third_model)


# ## Test the best model

# In[32]:


pd_dataframe = pd.read_pickle("./data/sub_data_after_prep.pkl")

y_test = np.array(pd_dataframe["trip_duration"]).astype(float)
y_test = np.log(y_test)
X_test = pd_dataframe.drop(["trip_duration"], axis = 1)


# In[35]:


# predict on one randomly chosen test sample
random_index = np.random.randint(0, len(X_test) - 1)

y_pred = fitted_model.predict(X_test.iloc[[random_index]])
y_residual_test = abs(y_test[random_index] - y_pred[0])


# In[36]:


print("actual value:", y_test[random_index])
print("prediction:", y_pred[0])
print("residual:", y_residual_test)


# ## Register the best model
#
# We can use the cell below to register the best model in the workspace.

# In[ ]:


description = 'Automated Machine Learning Model'
tags = None
remote_run.register_model(description = description, tags = tags)
remote_run.model_id   # use this id to deploy the model as a web service in Azure


# In[ ]:
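# The registered model can be retrieved from the workspace later, for example just before deploying
# it as a web service. The cell below is an optional sketch that lists the registered models and
# locates ours by its id; it only relies on the generic `Model` API from `azureml.core.model`.

# In[ ]:


from azureml.core.model import Model

# list all models registered in the workspace and find the one registered above
for m in Model.list(ws):
    if m.id == remote_run.model_id:
        print('name:', m.name, '| id:', m.id, '| version:', m.version)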