#!/usr/bin/env python
# coding: utf-8
"""Chicago crimes demo exported from a Jupyter notebook.

Loads the Chicago weather, census and crimes sample data into H2O, derives
date features, joins the three tables, trains a GBM that predicts the
``Arrest`` column, and scores two hand-written crime observations.
"""

# In[1]:

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# In[2]:

# Connect to a cluster
h2o.init()

# In[3]:

# Private function, used to find files within the h2o git project directory.
from h2o.utils.shared_utils import _locate

weather_path = _locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = _locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = _locate("smalldata/chicago/chicagoCrimes10k.csv.zip")

print("Import and Parse weather data")
weather = h2o.import_file(path=weather_path,
                          col_types=["time"] + ["numeric"] * 6)
# H2OFrame.drop returns a new frame, so the result has to be re-bound;
# the original call discarded it and the "date" column was never removed.
weather = weather.drop("date")
weather.describe()

print("Import and Parse census data")
census = h2o.import_file(path=census_path,
                         col_types=["numeric", "enum"] + ["numeric"] * 7)
census.describe()

print("Import and Parse crimes data")
crimes = h2o.import_file(path=crimes_path)
crimes.describe()

# In[4]:


def refine_date_col(data, col, pattern):
    """Add calendar feature columns derived from a date column, in place.

    New columns written to ``data``: ``Day``, ``Month``, ``Year``,
    ``WeekNum``, ``WeekDay``, ``HourOfDay``, ``Weekend`` and ``Season``.

    Args:
        data: H2OFrame that holds the date column; it is modified in place.
        col: Name of the date column inside ``data``.
        pattern: Date format string. Currently unused because H2O parses
            the column as a date on import; kept so existing callers that
            pass it keep working.

    Returns:
        None.
    """
    # data[col] = data[col].as_date(pattern)
    # As of 5/29/2016 H2O defaults parse as a date

    # NOTE(review): the +1 / +1900 offsets below assume the zero-based month
    # and 1900-based year described in the original comments -- confirm
    # against the H2O version in use.
    data["Day"] = data[col].day()
    data["Month"] = data[col].month() + 1    # Since H2O indexes from 0
    data["Year"] = data[col].year() + 1900   # Start of epoch is 1900
    data["WeekNum"] = data[col].week()
    data["WeekDay"] = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()

    # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
    data.describe()

    # Create weekend and season cols.
    # data["Weekend"] = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"))

    # Seasons as produced by the breaks below (intervals closed on the
    # right): Winter = Jan, Feb, Nov, Dec. Spring = Mar, Apr, May.
    # Summer = Jun, Jul. Autumn = Aug, Sep, Oct.
    data["Season"] = data["Month"].cut(
        [0, 2, 5, 7, 10, 12],
        ["Winter", "Spring", "Summer", "Autumn", "Winter"])


refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()

# In[5]:

# Merge crimes data with weather and census.
# Rename the key columns so both sides of each merge share column names.
census.set_name(0, "Community Area")
weather.set_name(1, "Month")
weather.set_name(2, "Day")
weather.set_name(3, "Year")

# H2OFrame.merge returns the merged frame; re-bind it, otherwise the joins
# are computed and thrown away and the model never sees census/weather data.
crimes = crimes.merge(census, all_x=True, all_y=False)
crimes = crimes.merge(weather, all_x=True, all_y=False)
crimes.describe()

# In[6]:

# Create test/train split
r = crimes["Arrest"].runif(1234)
train = crimes[r < 0.8]
test = crimes[r >= 0.8]

# Simple GBM - Predict Arrest
crimes_names_x = crimes.names[:]
crimes_names_x.remove("Arrest")
data_gbm = H2OGradientBoostingEstimator(ntrees=10,
                                        max_depth=6,
                                        distribution="bernoulli")

data_gbm.train(x=crimes_names_x,
               y="Arrest",
               training_frame=train,
               validation_frame=test)

# Simple Deep Learning - Predict Arrest
# data_dl = H2ODeepLearningEstimator(variable_importances=True,
#                                    loss="Automatic")
# data_dl.train(x=crimes_names_x,
#               y="Arrest",
#               training_frame=train,
#               validation_frame=test)

# In[7]:

# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm = data_gbm.model_performance(test).auc()

# Deep Learning performance on train/test data
# train_auc_dl = data_dl.model_performance(train).auc()
# test_auc_dl = data_dl.model_performance(test).auc()

# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table = [
    ["GBM", train_auc_gbm, test_auc_gbm],
    # ["DL ", train_auc_dl, test_auc_dl]
]
h2o.display.H2OTableDisplay(table, columns_labels=header)

# In[8]:

# Create new H2OFrame of crime observations
examples = {
    "Date": ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
    "IUCR": [1811, 1150],
    "Primary.Type": ["NARCOTICS", "DECEPTIVE PRACTICE"],
    "Location.Description": ["STREET", "RESIDENCE"],
    "Domestic": ["false", "false"],
    "Beat": [422, 923],
    "District": [4, 9],
    "Ward": [7, 14],
    "Community.Area": [46, 63],
    "FBI.Code": [18, 11],
}

crime_examples = h2o.H2OFrame(examples)

# Refine date column and merge with census data.
# As above, drop() and merge() return new frames and must be re-bound.
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples = crime_examples.drop("Date")
census.set_name(0, "Community.Area")
crime_examples = crime_examples.merge(census, all_x=True, all_y=False)
crime_examples.describe()

# In[9]:

# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
# dl_pred = data_dl.predict(crime_examples)

# Make a pretty HTML table printout of the results
# header = ["FBI Code", "GBM Arrest Prob", "DL Arrest Prob"]
# table = [
#     [examples["FBI.Code"][0], gbm_pred[0, "true"], dl_pred[0, "true"]],
#     [examples["FBI.Code"][1], gbm_pred[1, "true"], dl_pred[1, "true"]]
# ]
header = ["FBI Code", "GBM Arrest Prob"]
table = [
    [examples["FBI.Code"][0], gbm_pred[0, "true"]],
    [examples["FBI.Code"][1], gbm_pred[1, "true"]],
]
h2o.display.H2OTableDisplay(table, columns_labels=header)