#!/usr/bin/env python
# coding: utf-8

# In[1]:


import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator


# In[2]:


# Connect to a cluster
# No arguments are passed, so the library's default connection settings apply.
# This runs before any frame is imported or any model is built below.
h2o.init()


# In[3]:


# `_locate` is a private h2o helper used to find files within the h2o git
# project directory.
from h2o.utils.shared_utils import _locate

weather_path = _locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = _locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = _locate("smalldata/chicago/chicagoCrimes10k.csv.zip")

# Weather: the first column is parsed as a time, the remaining six as numeric.
print("Import and Parse weather data")
weather_col_types = ["time"] + ["numeric"] * 6
weather = h2o.import_file(path=weather_path, col_types=weather_col_types)
# NOTE(review): the result of drop() is not assigned, so `weather` presumably
# still carries its "date" column afterwards. The later merge step renames
# weather columns 1-3 by index, which relies on "date" staying at index 0, so
# do not "fix" this line without adjusting those indices as well.
weather.drop("date")
weather.describe()

# Census: one numeric column, one categorical column, then seven numeric ones.
print("Import and Parse census data")
census_col_types = ["numeric", "enum"] + ["numeric"] * 7
census = h2o.import_file(path=census_path, col_types=census_col_types)
census.describe()

# Crimes: column types are left to the parser.
print("Import and Parse crimes data")
crimes = h2o.import_file(path=crimes_path)
crimes.describe()


# In[4]:


def refine_date_col(data, col, pattern):
    """Add calendar-derived feature columns to ``data`` in place.

    Reads the date column ``col`` and writes these columns back into ``data``:
    ``Day``, ``Month``, ``Year``, ``WeekNum``, ``WeekDay``, ``HourOfDay``,
    ``Weekend`` and ``Season``. Nothing is returned.

    :param data: frame to extend; it is modified in place.
    :param col: name of the column in ``data`` that holds the dates.
    :param pattern: date format string. Currently unused: the explicit
        ``as_date(pattern)`` conversion was disabled because H2O parses the
        column as a date by default. Kept so existing callers keep working.
    """
    data["Day"] = data[col].day()
    # NOTE(review): the offsets below assume month() is 0-based and year()
    # counts from 1900 -- confirm against the H2O version in use.
    data["Month"] = data[col].month() + 1     # Since H2O indexes from 0
    data["Year"] = data[col].year() + 1900    # Start of epoch is 1900
    data["WeekNum"] = data[col].week()
    data["WeekDay"] = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()

    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.

    # Weekend flag: true when the weekday label is "Sun" or "Sat".
    data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"))

    # Season from the 1-12 month number.
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct.
    # Winter = Nov, Dec, Jan, Feb.
    # The third break is 8 (not 7) so that August lands in "Summer" as stated
    # above; this assumes cut() uses right-closed intervals, i.e. (5, 8]
    # covers months 6, 7 and 8.
    season_breaks = [0, 2, 5, 8, 10, 12]
    season_labels = ["Winter", "Spring", "Summer", "Autumn", "Winter"]
    data["Season"] = data["Month"].cut(season_breaks, season_labels)
    
# Derive the calendar feature columns on the crimes frame, then rebind `crimes`
# to the frame returned by drop(), i.e. without the raw "Date" column.
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()


# In[5]:


# Merge crimes data with weather and census
# Rename the key columns so each lookup frame shares its key names with
# `crimes` (presumably merge() joins on the column names the frames have in
# common -- verify against the H2O merge documentation).
census.set_name(0, "Community Area")
# Indices 1-3 are renamed by position; this relies on the weather frame still
# having its time column at index 0.
weather.set_name(1, "Month")
weather.set_name(2, "Day")
weather.set_name(3, "Year")
# merge() returns the merged frame rather than changing `crimes` itself, so the
# result has to be assigned back; otherwise the joins are silently discarded.
# all_x=True / all_y=False keeps every crimes row and drops unmatched lookup rows.
crimes = crimes.merge(census, all_x=True, all_y=False)
crimes = crimes.merge(weather, all_x=True, all_y=False)
crimes.describe()


# In[6]:


# Create test/train split
# One random draw per row (seed 1234): rows below 0.8 form the training set,
# all remaining rows form the test set.
r = crimes["Arrest"].runif(1234)
train = crimes[r < 0.8]
test = crimes[r >= 0.8]

# Simple GBM - Predict Arrest
# Predictors are every column of `crimes` except the response column.
crimes_names_x = list(crimes.names)
crimes_names_x.remove("Arrest")

gbm_params = dict(ntrees=10, max_depth=6, distribution="bernoulli")
data_gbm = H2OGradientBoostingEstimator(**gbm_params)
data_gbm.train(
    x=crimes_names_x,
    y="Arrest",
    training_frame=train,
    validation_frame=test,
)

# Simple Deep Learning - Predict Arrest (disabled)
# data_dl = H2ODeepLearningEstimator(variable_importances=True,
#                                    loss                ="Automatic")

# data_dl.train(x                =crimes_names_x,
#               y                ="Arrest",
#               training_frame  =train,
#               validation_frame=test)


# In[7]:


# GBM performance on train/test data
# Scored in the order (train, test); each AUC is taken from its own
# model_performance() result.
train_auc_gbm, test_auc_gbm = (
    data_gbm.model_performance(frame).auc() for frame in (train, test)
)

# Deep Learning performance on train/test data (disabled)
# train_auc_dl = data_dl.model_performance(train).auc()
# test_auc_dl  = data_dl.model_performance(test) .auc()

# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
gbm_row = ["GBM", train_auc_gbm, test_auc_gbm]
# dl_row = ["DL ", train_auc_dl, test_auc_dl]
table = [gbm_row]
h2o.display.H2OTableDisplay(table, columns_labels=header)


# In[8]:


# Create new H2OFrame of crime observations
# Each key becomes a column; each list holds one value per example row.
examples = {
            "Date":                 ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
            "IUCR":                 [1811, 1150],
            "Primary.Type":         ["NARCOTICS", "DECEPTIVE PRACTICE"],
            "Location.Description": ["STREET", "RESIDENCE"],
            "Domestic":             ["false", "false"],
            "Beat":                 [422, 923],
            "District":             [4, 9],
            "Ward":                 [7, 14],
            "Community.Area":       [46, 63],
            "FBI.Code":             [18, 11]
            }

crime_examples = h2o.H2OFrame(examples)

# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
# drop() and merge() hand back a new frame, so their results are assigned back
# here (the same way `crimes` is rebound after its own drop("Date")); calling
# them without the assignment leaves `crime_examples` unchanged.
crime_examples = crime_examples.drop("Date")
# Rename the census key to the dotted spelling used by the examples above.
census.set_name(0, "Community.Area")
# NOTE(review): the two examples are listed in ascending Community.Area order;
# confirm that merge() keeps that row order, because the prediction table later
# pairs rows of this frame with `examples` by position.
crime_examples = crime_examples.merge(census, all_x=True, all_y=False)
crime_examples.describe()


# In[9]:


# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
# dl_pred  = data_dl .predict(crime_examples)

# Make a pretty HTML table printout of the results
# One table row per example: its FBI code next to the value in the "true"
# column of the GBM prediction at the same row position.
# With the DL model enabled, the header would gain "DL Arrest Prob" and each
# row would also carry dl_pred[row, "true"].
header = ["FBI Code", "GBM Arrest Prob"]
table = [
    [fbi_code, gbm_pred[row, "true"]]
    for row, fbi_code in enumerate(examples["FBI.Code"])
]
h2o.display.H2OTableDisplay(table, columns_labels=header)