import h2o
# Connect to a cluster
h2o.init()
H2O cluster uptime: | 17 seconds 548 milliseconds |
H2O cluster version: | 3.1.0.99999 |
H2O cluster name: | anqi_fu |
H2O cluster total nodes: | 1 |
H2O cluster total memory: | 1.78 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
weather_path = h2o.locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = h2o.locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = h2o.locate("smalldata/chicago/chicagoCrimes10k.csv.zip")
print "Import and Parse weather data"
weather = h2o.import_frame(path=weather_path)
weather.drop("date")
weather.describe()
print "Import and Parse census data"
census = h2o.import_frame(path=census_path)
census.describe()
print "Import and Parse crimes data"
crimes = h2o.import_frame(path=crimes_path)
crimes.describe()
Import and Parse weather data Parse Progress: [##################################################] 100% Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoAllWeather.csv . Parsed 5,162 rows and 7 cols Rows: 5,162 Cols: 7 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C1N | 1-Byte Integers (w/o NAs) | 2 | 28.57143 | 10.2 KB | 11.221008 |
C1S | 1-Byte Fractions | 4 | 57.14286 | 20.5 KB | 22.510675 |
CStr | String | 1 | 14.285715 | 60.3 KB | 66.26832 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.17:54321 | 91.0 KB | 5162.0 | 1.0 | 7.0 |
mean | 91.0 KB | 5162.0 | 1.0 | 7.0 |
min | 91.0 KB | 5162.0 | 1.0 | 7.0 |
max | 91.0 KB | 5162.0 | 1.0 | 7.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 91.0 KB | 5162.0 | 1.0 | 7.0 |
Column-by-Column Summary:
date | month | day | year | maxTemp | meanTemp | minTemp | |
type | string | int | int | int | int | int | int |
mins | NaN | 1.0 | 1.0 | 2001.0 | -2.0 | -9.0 | -18.0 |
maxs | NaN | 12.0 | 31.0 | 2015.0 | 103.0 | 93.0 | 82.0 |
sigma | NaN | 3.46905171694 | 8.79895173997 | 4.0773409057 | 21.4829777237 | 19.9302399266 | 19.0207297123 |
zero_count | 0 | 0 | 0 | 0 | 0 | 2 | 16 |
missing_count | 0 | 0 | 0 | 0 | 13 | 13 | 13 |
Import and Parse census data Parse Progress: [##################################################] 100% Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoCensus.csv . Parsed 79 rows and 9 cols Rows: 79 Cols: 9 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C1 | 1-Byte Integers | 2 | 22.222223 | 294 B | 9.312638 |
C1S | 1-Byte Fractions | 1 | 11.111112 | 163 B | 5.1631293 |
C2S | 2-Byte Fractions | 4 | 44.444447 | 968 B | 30.662022 |
C4 | 4-Byte Integers | 1 | 11.111112 | 384 B | 12.163446 |
CStr | String | 1 | 11.111112 | 1.3 KB | 42.698765 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.17:54321 | 3.1 KB | 79.0 | 1.0 | 9.0 |
mean | 3.1 KB | 79.0 | 1.0 | 9.0 |
min | 3.1 KB | 79.0 | 1.0 | 9.0 |
max | 3.1 KB | 79.0 | 1.0 | 9.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 3.1 KB | 79.0 | 1.0 | 9.0 |
Column-by-Column Summary:
Community Area Number | COMMUNITY AREA NAME | PERCENT OF HOUSING CROWDED | PERCENT HOUSEHOLDS BELOW POVERTY | PERCENT AGED 16 UNEMPLOYED | PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA | PERCENT AGED UNDER 18 OR OVER 64 | PER CAPITA INCOME | HARDSHIP INDEX | |
type | int | string | real | real | real | real | real | int | int |
mins | 1.0 | NaN | 0.3 | 3.3 | 4.7 | 2.5 | 13.5 | 8201.0 | 1.0 |
maxs | 77.0 | NaN | 15.8 | 56.5 | 35.9 | 54.8 | 51.5 | 88669.0 | 98.0 |
sigma | 22.3718573212 | NaN | 3.65898144135 | 11.457230913 | 7.49949670861 | 11.7465143511 | 7.28442108494 | 15196.4055413 | 28.6905556516 |
zero_count | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
missing_count | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 2 |
Import and Parse crimes data Parse Progress: [##################################################] 100% Imported /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoCrimes10k.csv.zip . Parsed 9,999 rows and 22 cols Rows: 9,999 Cols: 22 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 4 | 4.5454545 | 320 B | 0.03695244 |
C1 | 1-Byte Integers | 32 | 36.363636 | 80.2 KB | 9.488462 |
C1N | 1-Byte Integers (w/o NAs) | 8 | 9.090909 | 20.1 KB | 2.3721156 |
C2 | 2-Byte Integers | 16 | 18.181818 | 79.2 KB | 9.362824 |
C4 | 4-Byte Integers | 12 | 13.636364 | 118.0 KB | 13.950008 |
CStr | String | 8 | 9.090909 | 391.1 KB | 46.252445 |
C8D | 64-bit Reals | 8 | 9.090909 | 156.8 KB | 18.537191 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.17:54321 | 845.7 KB | 9999.0 | 4.0 | 88.0 |
mean | 845.7 KB | 9999.0 | 4.0 | 88.0 |
min | 845.7 KB | 9999.0 | 4.0 | 88.0 |
max | 845.7 KB | 9999.0 | 4.0 | 88.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 845.7 KB | 9999.0 | 4.0 | 88.0 |
Column-by-Column Summary:
ID | Case Number | Date | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | |
type | int | string | string | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum |
mins | 21735.0 | NaN | NaN | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 2015.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 |
maxs | 9962898.0 | NaN | NaN | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 2015.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 |
sigma | 396787.564221 | NaN | NaN | 1915.88517194 | 927.751435583 | 9.16241735944 | 60.1059382029 | 25.5963972463 | 0.455083515588 | 0.35934414686 | 695.76029875 | 6.94547493301 | 13.6495661144 | 21.2748762223 | 7.57423857911 | 16496.4493681 | 31274.0163199 | 0.0 | 10.0824464345 | 0.0860186579359 | 0.0600357970653 | 2469.64729385 |
zero_count | 0 | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 |
missing_count | 0 | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 |
def refine_date_col(data, col, pattern):
data[col] = data[col].as_date(pattern)
data["Day"] = data[col].day()
data["Month"] = data[col].month() + 1 # Since H2O indexes from 0
data["Year"] = data[col].year() + 1900 # Start of epoch is 1900
data["WeekNum"] = data[col].week()
data["WeekDay"] = data[col].dayOfWeek()
data["HourOfDay"] = data[col].hour()
data.describe() # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
# Create weekend and season cols
# Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
# data["Weekend"] = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
data["Weekend"] = h2o.ifelse(data["WeekDay"] == "Sun" or data["WeekDay"] == "Sat", 1, 0)[0]
data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()
Rows: 9,999 Cols: 27 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 9 | 8.333334 | 720 B | 0.10067465 |
C1 | 1-Byte Integers | 32 | 29.62963 | 80.2 KB | 11.489216 |
C1N | 1-Byte Integers (w/o NAs) | 23 | 21.296297 | 57.9 KB | 8.29671 |
C2 | 2-Byte Integers | 16 | 14.814815 | 79.2 KB | 11.337085 |
C4 | 4-Byte Integers | 12 | 11.111112 | 118.0 KB | 16.891531 |
C8 | 64-bit Integers | 4 | 3.7037036 | 78.4 KB | 11.222987 |
CStr | String | 4 | 3.7037036 | 127.2 KB | 18.215822 |
C8D | 64-bit Reals | 8 | 7.4074073 | 156.8 KB | 22.445974 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.17:54321 | 698.4 KB | 9999.0 | 4.0 | 108.0 |
mean | 698.4 KB | 9999.0 | 4.0 | 108.0 |
min | 698.4 KB | 9999.0 | 4.0 | 108.0 |
max | 698.4 KB | 9999.0 | 4.0 | 108.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 698.4 KB | 9999.0 | 4.0 | 108.0 |
Column-by-Column Summary:
ID | Case Number | Date | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | Day | Month | WeekNum | WeekDay | HourOfDay | |
type | int | string | int | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum | int | int | int | enum | int |
mins | 21735.0 | NaN | 1.42203063e+12 | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 3915.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 | 1.0 | 2.0 | 4.0 | 0.0 | 0.0 |
maxs | 9962898.0 | NaN | 1.42346782e+12 | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 3915.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 | 31.0 | 3.0 | 6.0 | 6.0 | 23.0 |
sigma | 396787.564221 | NaN | 433879245.188 | 1915.88517194 | 927.751435583 | 9.16241735944 | 60.1059382029 | 25.5963972463 | 0.455083515588 | 0.35934414686 | 695.76029875 | 6.94547493301 | 13.6495661144 | 21.2748762223 | 7.57423857911 | 16496.4493681 | 31274.0163199 | 0.0 | 10.0824464345 | 0.0860186579359 | 0.0600357970653 | 2469.64729385 | 11.1801043358 | 0.493492406787 | 0.738929830409 | 1.93284056432 | 6.47321735807 |
zero_count | 0 | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 | 0 | 0 | 0 | 1038 | 374 |
missing_count | 0 | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 | 0 | 0 | 0 | 0 | 0 |
Rows: 9,999 Cols: 28 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 13 | 11.607142 | 1.0 KB | 0.16332634 |
CBS | Bits | 4 | 3.5714288 | 1.5 KB | 0.2404352 |
C1 | 1-Byte Integers | 32 | 28.57143 | 80.2 KB | 12.9040365 |
C1N | 1-Byte Integers (w/o NAs) | 23 | 20.535715 | 57.9 KB | 9.318395 |
C2 | 2-Byte Integers | 16 | 14.285715 | 79.2 KB | 12.733171 |
C4 | 4-Byte Integers | 12 | 10.714286 | 118.0 KB | 18.97161 |
CStr | String | 4 | 3.5714288 | 127.2 KB | 20.458979 |
C8D | 64-bit Reals | 8 | 7.1428576 | 156.8 KB | 25.210047 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.17:54321 | 621.8 KB | 9999.0 | 4.0 | 112.0 |
mean | 621.8 KB | 9999.0 | 4.0 | 112.0 |
min | 621.8 KB | 9999.0 | 4.0 | 112.0 |
max | 621.8 KB | 9999.0 | 4.0 | 112.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 621.8 KB | 9999.0 | 4.0 | 112.0 |
Column-by-Column Summary:
ID | Case Number | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | Day | Month | WeekNum | WeekDay | HourOfDay | Weekend | Season | |
type | int | string | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum | int | int | int | enum | int | int | enum |
mins | 21735.0 | NaN | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 3915.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 | 1.0 | 2.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 |
maxs | 9962898.0 | NaN | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 3915.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 | 31.0 | 3.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
sigma | 396787.564221 | NaN | 1915.88517194 | 927.751435583 | 9.16241735944 | 60.1059382029 | 25.5963972463 | 0.455083515588 | 0.35934414686 | 695.76029875 | 6.94547493301 | 13.6495661144 | 21.2748762223 | 7.57423857911 | 16496.4493681 | 31274.0163199 | 0.0 | 10.0824464345 | 0.0860186579359 | 0.0600357970653 | 2469.64729385 | 11.1801043358 | 0.493492406787 | 0.738929830409 | 1.93284056432 | 6.47321735807 | 0.365802434041 | 0.493492406787 |
zero_count | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 | 0 | 0 | 0 | 1038 | 374 | 8408 | 5805 |
missing_count | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Merge crimes data with weather and census
census["Community Area Number"]._name = "Community Area"
weather["month"]._name = "Month"
weather["day"] ._name = "Day"
weather["year"] ._name = "Year"
crimes.merge(census, allLeft=True, allRite=False)
crimes.merge(weather, allLeft=True, allRite=False)
--------------------------------------------------------------------------- EnvironmentError Traceback (most recent call last) <ipython-input-5-e946a6af6204> in <module>() 4 weather["day"] ._name = "Day" 5 weather["year"] ._name = "Year" ----> 6 crimes.merge(census, allLeft=True, allRite=False) 7 crimes.merge(weather, allLeft=True, allRite=False) /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/frame.pyc in merge(self, other, allLeft, allRite) 1022 expr2 = "(, "+expr+" (del %"+lkey+" #0) (del %"+rkey+" #0) )" 1023 -> 1024 h2o.rapids(expr2) # merge in h2o 1025 # Make backing H2OVecs for the remote h2o vecs 1026 j = h2o.frame(tmp_key) # Fetch the frame as JSON /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/h2o.pyc in rapids(expr) 487 :return: The JSON response of the Rapids execution 488 """ --> 489 result = H2OConnection.post_json("Rapids", ast=urllib.quote(expr), _rest_version=99) 490 if result['error'] is not None: 491 raise EnvironmentError("rapids expression not evaluated: {0}".format(str(result['error']))) /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in post_json(url_suffix, file_upload_info, **kwargs) 360 if __H2OCONN__ is None: 361 raise ValueError("No h2o connection. Did you run `h2o.init()` ?") --> 362 return __H2OCONN__._rest_json(url_suffix, "POST", file_upload_info, **kwargs) 363 364 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs): /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs) 363 364 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs): --> 365 raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs) 366 return self._process_tables(raw_txt.json()) 367 /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs) 429 raise EnvironmentError(("h2o-py got an unexpected HTTP status code:\n {} {} (method = {}; url = {}). \n"+ \ 430 "detailed error messages: {}") --> 431 .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs)) 432 433 # TODO: is.logging? -> write to logs EnvironmentError: h2o-py got an unexpected HTTP status code: 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). detailed error messages: water.DException$DistributedException: from /172.16.2.17:54321; by class water.rapids.ASTMerge$MergeSet$MakeHash; class water.exceptions.H2OIllegalArgumentException: unimplemented
# Create test/train split
data_split = h2o.split_frame(data, ratios = [0.8,0.2])
train = data_split[1]
test = data_split[2]
# Simple GBM - Predict Arrest
data_gbm = h2o.gbm(x =train.drop("Arrest"),
y =train ["Arrest"],
validation_x =test .drop("Arrest"),
validation_y =test ["Arrest"],
ntrees =10,
max_depth =6,
distribution ="bernoulli")
# Simple Deep Learning
data_dl = h2o.deeplearning(x =train.drop("Arrest"),
y =train ["Arrest"],
validation_x =test .drop("Arrest"),
validation_y =test ["Arrest"],
variable_importances=True,
loss ="Automatic")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-12-347776b381b3> in <module>() 1 # Create test/train split ----> 2 data_split = h2o.split_frame(data, ratios = [0.8,0.2]) 3 train = data_split[1] 4 test = data_split[2] 5 NameError: name 'data' is not defined
# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm = data_gbm.model_performance(test) .auc()
# Deep Learning performance on train/test data
train_auc_dl = data_dl.model_performance(train).auc()
test_auc_dl = data_dl.model_performance(test) .auc()
# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table = [
["GBM", train_auc_gbm, test_auc_gbm],
["DL ", train_auc_dl, test_auc_dl]
]
h2o.H2ODisplay(table, header)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-2-f7c2ab3a3e26> in <module>() 1 # GBM performance on train/test data ----> 2 train_auc_gbm = data_gbm.model_performance(train).auc() 3 test_auc_gbm = data_gbm.model_performance(test) .auc() 4 5 # Deep Learning performance on train/test data NameError: name 'data_gbm' is not defined
# Create new H2OFrame of crime observations
examples = {
"Date": ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
"IUCR": [1811, 1150],
"Primary.Type": ["NARCOTICS", "DECEPTIVE PRACTICE"],
"Location.Description": ["STREET", "RESIDENCE"],
"Domestic": ["false", "false"],
"Beat": [422, 923],
"District": [4, 9],
"Ward": [7, 14],
"Community.Area": [46, 63],
"FBI.Code": [18, 11]
}
crime_examples = h2o.H2OFrame(python_obj = examples)
# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples.drop("Date")
crime_examples.merge(census, allLeft=True, allRite=False)
Parse Progress: [##################################################] 100% Uploaded py634b18a9-7e84-40ca-b265-b2fe43e064aa into cluster with 2 rows and 10 cols Rows: 2 Cols: 16 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 7 | 43.75 | 560 B | 43.818466 |
C1N | 1-Byte Integers (w/o NAs) | 4 | 25.0 | 280 B | 21.909233 |
C2 | 2-Byte Integers | 2 | 12.5 | 144 B | 11.267606 |
C2S | 2-Byte Fractions | 1 | 6.25 | 88 B | 6.885759 |
CStr | String | 2 | 12.5 | 206 B | 16.118937 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.17:54321 | 1.2 KB | 2.0 | 1.0 | 16.0 |
mean | 1.2 KB | 2.0 | 1.0 | 16.0 |
min | 1.2 KB | 2.0 | 1.0 | 16.0 |
max | 1.2 KB | 2.0 | 1.0 | 16.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.2 KB | 2.0 | 1.0 | 16.0 |
Column-by-Column Summary:
Location.Description | FBI.Code | Primary.Type | Community.Area | District | Beat | Domestic | IUCR | Date | Ward | Day | Month | Year | WeekNum | WeekDay | HourOfDay | |
type | string | int | string | int | int | int | enum | int | int | int | int | int | int | int | enum | int |
mins | NaN | 11.0 | NaN | 46.0 | 4.0 | 422.0 | 0.0 | 1150.0 | 1.423465239e+12 | 7.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 |
maxs | NaN | 18.0 | NaN | 63.0 | 9.0 | 923.0 | 0.0 | 1811.0 | 1.423467838e+12 | 14.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 |
sigma | NaN | 4.94974746831 | NaN | 12.0208152802 | 3.53553390593 | 354.260497374 | 0.0 | 467.397582364 | 1837770.5243 | 4.94974746831 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
zero_count | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
missing_count | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
--------------------------------------------------------------------------- EnvironmentError Traceback (most recent call last) <ipython-input-6-85bb7c75c897> in <module>() 16 17 # Refine date column and merge with census data ---> 18 refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p") 19 crime_examples.drop("Date") 20 crime_examples.merge(census, allLeft=True, allRite=False) <ipython-input-4-c2702228f9f1> in refine_date_col(data, col, pattern) 15 # data["Weekend"] = h2o.ifelse(data["WeekDay"] in ("Sun", "Sat"), 1, 0)[0] 16 data["Weekend"] = h2o.ifelse(data["WeekDay"] == "Sun" or data["WeekDay"] == "Sat", 1, 0)[0] ---> 17 data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"]) 18 19 refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p") /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/frame.pyc in cut(self, breaks, labels, include_lowest, right, dig_lab) 1256 1257 expr = "(cut '{}' {} {} {} {} #{}".format(self.key(), breaks_list, labels_list, "%TRUE" if include_lowest else "%FALSE", "%TRUE" if right else "%FALSE", dig_lab) -> 1258 res = h2o.rapids(expr) 1259 return H2OVec(self._name, Expr(op=res["vec_ids"][0]["name"], length=res["num_rows"])) 1260 /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/h2o.pyc in rapids(expr) 487 :return: The JSON response of the Rapids execution 488 """ --> 489 result = H2OConnection.post_json("Rapids", ast=urllib.quote(expr), _rest_version=99) 490 if result['error'] is not None: 491 raise EnvironmentError("rapids expression not evaluated: {0}".format(str(result['error']))) /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in post_json(url_suffix, file_upload_info, **kwargs) 360 if __H2OCONN__ is None: 361 raise ValueError("No h2o connection. Did you run `h2o.init()` ?") --> 362 return __H2OCONN__._rest_json(url_suffix, "POST", file_upload_info, **kwargs) 363 364 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs): /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs) 363 364 def _rest_json(self, url_suffix, method, file_upload_info, **kwargs): --> 365 raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs) 366 return self._process_tables(raw_txt.json()) 367 /Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs) 429 raise EnvironmentError(("h2o-py got an unexpected HTTP status code:\n {} {} (method = {}; url = {}). \n"+ \ 430 "detailed error messages: {}") --> 431 .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs)) 432 433 # TODO: is.logging? -> write to logs EnvironmentError: h2o-py got an unexpected HTTP status code: 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). detailed error messages: Data vector is constant!
# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
dl_pred = data_dl .predict(crime_examples)
# TODO: Replace with a pretty HTML table
gbm_pred.describe()
dl_pred.describe()