#!/usr/bin/env python
# coding: utf-8

# # Multinomial Partial Dependency plot
# 
# Authors Lauren DiPerna, Veronika Maurerova

# ## Build a GLM with the Iris Dataset

# In[7]:


# Import the Iris Dataset and Build a GLM
import h2o
h2o.init()
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# import the iris dataset:
# this dataset is used to classify the type of iris plant
# the original dataset can be found at https://archive.ics.uci.edu/ml/datasets/Iris
# iris = h2o.import_file("http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv")
iris = h2o.import_file("../../smalldata/iris/iris_wheader.csv")

# convert response column to a factor
iris['class'] = iris['class'].asfactor()

# set the predictor names and the response column name
predictors = iris.col_names[:-1]
response = 'class'

# split into train and validation
train, valid = iris.split_frame(ratios = [.8], seed=1234)

# build model
model = H2OGeneralizedLinearEstimator(family = 'multinomial')
model.train(x = predictors, y = response, training_frame = train, validation_frame = valid)


# ## Specify Feature of Interest
# In the cell below, if you decide to use a different dataset, model, or features please update the following variables:
# * model
# * data_pdp
# * col

# In[8]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# hide progress bar
h2o.no_progress()

# specify the model to you:
model = model

# specify the dataframe to use
data_pdp = iris

# specify the feature of interest, available features include: 
# ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']
# col = "sepal_len"
# col = 'sepal_wid'
col = 'petal_len'
# col = 'petal_wid'

# create a copy of the column of interest, so that values are preserved after each run
col_data = data_pdp[col]


# ## Generate a PDP per class manualy

# In[9]:


# get a list of the classes in your target
classes = h2o.as_list(data_pdp['class'].unique(), use_pandas=False,header=False)
classes = [class_val[0] for class_val in classes]


# create bins for the pdp plot
bins = data_pdp[col].quantile(prob=list(np.linspace(0.05,1,19)))[:,1].unique()
bins = bins.as_data_frame().values.tolist()
bins = [bin_val[0] for bin_val in bins]
bins.sort()


# Loop over each class and print the pdp for the given feature
for class_val in classes:
    mean_responses = []

    for bin_val in bins:
        # warning this line modifies the dataset.
        # when you rerun on a new column make sure to return
        # all columns to their original values.
        data_pdp[col] = bin_val
        response = model.predict(data_pdp)
        mean_response = response[:,class_val].mean()[0]
        mean_responses.append(mean_response)
        mean_responses

    pdp_manual = pd.DataFrame({col: bins, 'mean_response':mean_responses},columns=[col,'mean_response'])
    plt.plot(pdp_manual[col], pdp_manual.mean_response);
    plt.xlabel(col);
    plt.ylabel('mean_response');
    plt.title('PDP for Class {0}'.format(class_val));
    plt.show()


# reset col value to original value for future runs of this cell
data_pdp[col] = col_data


# ## Use target parameter and plot H2O multinomial PDP 

# In[11]:


# h2o multinomial PDP class setosa
data = model.partial_plot(data=iris, cols=["petal_len"], plot_stddev=False, plot=True, targets=["Iris-setosa"])


# In[12]:


# h2o multinomial PDP class versicolor
data = model.partial_plot(data=iris, cols=["petal_len"], plot_stddev=False, plot=True, targets=["Iris-versicolor"])


# In[13]:


# h2o multinomial PDP class virginica
data = model.partial_plot(data=iris, cols=["petal_len"], plot_stddev=False, plot=True, targets=["Iris-virginica"])


# In[14]:


# h2o multinomial PDP all classes
data = model.partial_plot(data=iris, cols=["petal_len"], plot_stddev=False, plot=True, targets=["Iris-setosa", "Iris-versicolor", "Iris-virginica"])


# In[16]:


# h2o multinomial PDP all classes with stddev
data = model.partial_plot(data=iris, cols=["petal_len"], plot_stddev=True, plot=True, targets=["Iris-setosa", "Iris-versicolor", "Iris-virginica"])