import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# Connect to an H2O cluster. The call prints the cluster status summary
# (version, node count, free memory, connection ip/port) shown right after it.
h2o.init()
H2O cluster uptime: | 1 hours 13 minutes 22 seconds 521 milliseconds |
H2O cluster version: | 3.7.0.99999 |
H2O cluster name: | ludirehak |
H2O cluster total nodes: | 1 |
H2O cluster total free memory: | 3.24 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
H2O Connection proxy: | None |
Python Version: | 3.5.1 |
# _locate is a private H2O function; it is used here to find the demo files
# within the h2o git project directory.
from h2o.utils.shared_utils import _locate

weather_path = _locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = _locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = _locate("smalldata/chicago/chicagoCrimes10k.csv.zip")

# Weather: one time column followed by six numeric columns.
print("Import and Parse weather data")
weather = h2o.import_file(path=weather_path, col_types=["time"] + 6 * ["numeric"])
# NOTE(review): the frame returned by drop() is discarded, so `weather` still
# carries its "date" column (see the describe() output). The positional
# set_name() calls in the merge step rely on that column layout, so do not
# turn this into an assignment without updating those indices as well.
weather.drop("date")
weather.describe()

# Census: a numeric id, a categorical area name, then seven numeric columns.
print("Import and Parse census data")
census = h2o.import_file(path=census_path, col_types=["numeric", "enum"] + 7 * ["numeric"])
census.describe()

# Crimes: column types are left for the H2O parser to guess.
print("Import and Parse crimes data")
crimes = h2o.import_file(path=crimes_path)
crimes.describe()
Import and Parse weather data Parse Progress: [##################################################] 100% Rows:5,162 Cols:7 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 19 | 8.225108 | 1.5 KB | 3.3811588 |
C0D | Constant Reals | 33 | 14.285715 | 2.6 KB | 5.872539 |
C1 | 1-Byte Integers | 8 | 3.4632034 | 1.7 KB | 3.9528418 |
C1N | 1-Byte Integers (w/o NAs) | 135 | 58.441555 | 29.6 KB | 67.35625 |
C1S | 1-Byte Fractions | 36 | 15.584415 | 8.5 KB | 19.437214 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.43:54321 | 43.9 KB | 5162.0 | 33.0 | 231.0 |
mean | 43.9 KB | 5162.0 | 33.0 | 231.0 |
min | 43.9 KB | 5162.0 | 33.0 | 231.0 |
max | 43.9 KB | 5162.0 | 33.0 | 231.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 43.9 KB | 5162.0 | 33.0 | 231.0 |
date | month | day | year | maxTemp | meanTemp | minTemp | |
---|---|---|---|---|---|---|---|
type | time | int | int | int | int | int | int |
mins | NaN | 1.0 | 1.0 | 2001.0 | -2.0 | -9.0 | -18.0 |
mean | 0.0 | 6.47442851607904 | 15.708252615265401 | 2007.5714839209609 | 58.871042920955524 | 50.31035152456788 | 41.4812584967955 |
maxs | NaN | 12.0 | 31.0 | 2015.0 | 103.0 | 93.0 | 82.0 |
sigma | -0.0 | 3.469051716937685 | 8.798951739966594 | 4.077340905700527 | 21.482977723685387 | 19.930239926608884 | 19.020729712312264 |
zeros | -5162 | 0 | 0 | 0 | 0 | 2 | 16 |
missing | 5162 | 0 | 0 | 0 | 13 | 13 | 13 |
0 | nan | 1.0 | 1.0 | 2001.0 | 23.0 | 14.0 | 6.0 |
1 | nan | 1.0 | 2.0 | 2001.0 | 18.0 | 12.0 | 6.0 |
2 | nan | 1.0 | 3.0 | 2001.0 | 28.0 | 18.0 | 8.0 |
3 | nan | 1.0 | 4.0 | 2001.0 | 30.0 | 24.0 | 19.0 |
4 | nan | 1.0 | 5.0 | 2001.0 | 36.0 | 30.0 | 21.0 |
5 | nan | 1.0 | 6.0 | 2001.0 | 33.0 | 26.0 | 19.0 |
6 | nan | 1.0 | 7.0 | 2001.0 | 34.0 | 28.0 | 21.0 |
7 | nan | 1.0 | 8.0 | 2001.0 | 26.0 | 20.0 | 14.0 |
8 | nan | 1.0 | 9.0 | 2001.0 | 23.0 | 16.0 | 10.0 |
9 | nan | 1.0 | 10.0 | 2001.0 | 34.0 | 26.0 | 19.0 |
Import and Parse census data Parse Progress: [##################################################] 100% Rows:79 Cols:9 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C1 | 1-Byte Integers | 3 | 33.333336 | 441 B | 22.546013 |
C1S | 1-Byte Fractions | 1 | 11.111112 | 163 B | 8.333334 |
C2S | 2-Byte Fractions | 4 | 44.444447 | 968 B | 49.488754 |
C4 | 4-Byte Integers | 1 | 11.111112 | 384 B | 19.6319 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.43:54321 | 1.9 KB | 79.0 | 1.0 | 9.0 |
mean | 1.9 KB | 79.0 | 1.0 | 9.0 |
min | 1.9 KB | 79.0 | 1.0 | 9.0 |
max | 1.9 KB | 79.0 | 1.0 | 9.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.9 KB | 79.0 | 1.0 | 9.0 |
Community Area Number | COMMUNITY AREA NAME | PERCENT OF HOUSING CROWDED | PERCENT HOUSEHOLDS BELOW POVERTY | PERCENT AGED 16 UNEMPLOYED | PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA | PERCENT AGED UNDER 18 OR OVER 64 | PER CAPITA INCOME | HARDSHIP INDEX | |
---|---|---|---|---|---|---|---|---|---|
type | int | enum | real | real | real | real | real | int | int |
mins | 1.0 | 0.0 | 0.30000000000000004 | 3.3000000000000003 | 4.7 | 2.5 | 13.5 | 8201.0 | 1.0 |
mean | 39.0 | NaN | 4.920512820512822 | 21.73974358974359 | 15.341025641025642 | 20.33076923076924 | 35.71794871794871 | 25597.000000000004 | 49.506493506493506 |
maxs | 77.0 | 78.0 | 15.8 | 56.5 | 35.9 | 54.800000000000004 | 51.5 | 88669.0 | 98.0 |
sigma | 22.371857321197094 | NaN | 3.6589814413502006 | 11.457230912971083 | 7.49949670860991 | 11.746514351100048 | 7.284421084944952 | 15196.405541331917 | 28.69055565156158 |
zeros | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
missing | 2 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 2 |
0 | nan | COMMUNITY AREA NAME | nan | nan | nan | nan | nan | nan | nan |
1 | 1.0 | Rogers Park | 7.7 | 23.6 | 8.700000000000001 | 18.2 | 27.5 | 23939.0 | 39.0 |
2 | 2.0 | West Ridge | 7.800000000000001 | 17.2 | 8.8 | 20.8 | 38.5 | 23040.0 | 46.0 |
3 | 3.0 | Uptown | 3.8000000000000003 | 24.0 | 8.9 | 11.8 | 22.200000000000003 | 35787.0 | 20.0 |
4 | 4.0 | Lincoln Square | 3.4000000000000004 | 10.9 | 8.200000000000001 | 13.4 | 25.5 | 37524.0 | 17.0 |
5 | 5.0 | North Center | 0.30000000000000004 | 7.5 | 5.2 | 4.5 | 26.200000000000003 | 57123.0 | 6.0 |
6 | 6.0 | Lake View | 1.1 | 11.4 | 4.7 | 2.6 | 17.0 | 60058.0 | 5.0 |
7 | 7.0 | Lincoln Park | 0.8 | 12.3 | 5.1000000000000005 | 3.6 | 21.5 | 71551.0 | 2.0 |
8 | 8.0 | Near North Side | 1.9000000000000001 | 12.9 | 7.0 | 2.5 | 22.6 | 88669.0 | 1.0 |
9 | 9.0 | Edison Park | 1.1 | 3.3000000000000003 | 6.5 | 7.4 | 35.300000000000004 | 40959.0 | 8.0 |
Import and Parse crimes data Parse Progress: [##################################################] 100% Rows:9,999 Cols:22 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 1 | 4.5454545 | 80 B | 0.0092869 |
C1 | 1-Byte Integers | 8 | 36.363636 | 78.6 KB | 9.349084 |
C1N | 1-Byte Integers (w/o NAs) | 2 | 9.090909 | 19.7 KB | 2.337271 |
C2 | 2-Byte Integers | 4 | 18.181818 | 78.4 KB | 9.317509 |
C4 | 4-Byte Integers | 3 | 13.636364 | 117.4 KB | 13.952581 |
CStr | String | 2 | 9.090909 | 390.7 KB | 46.446617 |
C8D | 64-bit Reals | 2 | 9.090909 | 156.4 KB | 18.587654 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.43:54321 | 841.2 KB | 9999.0 | 1.0 | 22.0 |
mean | 841.2 KB | 9999.0 | 1.0 | 22.0 |
min | 841.2 KB | 9999.0 | 1.0 | 22.0 |
max | 841.2 KB | 9999.0 | 1.0 | 22.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 841.2 KB | 9999.0 | 1.0 | 22.0 |
ID | Case Number | Date | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | string | string | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum |
mins | 21735.0 | NaN | NaN | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 2015.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 |
mean | 9931318.737373699 | NaN | NaN | NaN | 1189.676513569939 | NaN | NaN | NaN | 0.29282928292829274 | 0.15231523152315235 | 1159.6180618061765 | 11.348988512757918 | 22.954095409541008 | 37.447644764476536 | 12.740123622682114 | 1163880.5981498407 | 1885916.1498424308 | 2015.0 | NaN | 41.842565224673535 | -87.67414052209607 | NaN |
maxs | 9962898.0 | NaN | NaN | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 2015.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 |
sigma | 396787.5642214295 | NaN | NaN | NaN | 927.7514355826443 | NaN | NaN | NaN | 0.4550835155878833 | 0.3593441468595258 | 695.7602987498396 | 6.945474933012859 | 13.649566114361296 | 21.274876222320856 | 7.574238579108433 | 16496.449368147238 | 31274.01631985589 | 0.0 | NaN | 0.08601865793584824 | 0.06003579706529789 | NaN |
zeros | 0 | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 |
missing | 0 | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 |
0 | 9955810.0 | HY144797 | 02/08/2015 11:43:40 PM | 081XX S COLES AVE | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 422.0 | 4.0 | 7.0 | 46.0 | 18.0 | 1198273.0 | 1851626.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.747693646 | -87.54903538900001 | (41.747693646, -87.549035389) |
1 | 9955861.0 | HY144838 | 02/08/2015 11:41:42 PM | 118XX S STATE ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | true | true | 522.0 | 5.0 | 34.0 | 53.0 | nan | 1178335.0 | 1826581.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.679442289 | -87.622850758 | (41.679442289, -87.622850758) |
2 | 9955801.0 | HY144779 | 02/08/2015 11:30:22 PM | 002XX S LARAMIE AVE | 2026.0 | NARCOTICS | POSS: PCP | SIDEWALK | true | false | 1522.0 | 15.0 | 29.0 | 25.0 | 18.0 | 1141717.0 | 1898581.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.877773330000004 | -87.755117993 | (41.87777333, -87.755117993) |
3 | 9956197.0 | HY144787 | 02/08/2015 11:30:23 PM | 006XX E 67TH ST | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 321.0 | nan | 6.0 | 42.0 | 18.0 | nan | nan | 2015.0 | 02/15/2015 12:43:39 PM | nan | nan | |
4 | 9955846.0 | HY144829 | 02/08/2015 11:30:58 PM | 0000X S MAYFIELD AVE | 610.0 | BURGLARY | FORCIBLE ENTRY | APARTMENT | false | false | 1513.0 | 15.0 | 29.0 | 25.0 | 5.0 | 1137239.0 | 1899372.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.880025548000006 | -87.77154132400001 | (41.880025548, -87.771541324) |
5 | 9955835.0 | HY144778 | 02/08/2015 11:30:21 PM | 010XX W 48TH ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | false | true | 933.0 | 9.0 | 3.0 | 61.0 | nan | 1169986.0 | 1873019.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.807059405000004 | -87.65206589 | (41.807059405, -87.65206589) |
6 | 9955872.0 | HY144822 | 02/08/2015 11:27:24 PM | 015XX W ARTHUR AVE | 1320.0 | CRIMINAL DAMAGE | TO VEHICLE | STREET | false | false | 2432.0 | 24.0 | 40.0 | 1.0 | 14.0 | 1164732.0 | 1943222.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.999814056000005 | -87.669342967 | (41.999814056, -87.669342967) |
7 | 21752.0 | HY144738 | 02/08/2015 11:26:12 PM | 060XX W GRAND AVE | 110.0 | HOMICIDE | FIRST DEGREE MURDER | STREET | true | false | 2512.0 | 25.0 | 37.0 | 19.0 | nan | 1135910.0 | 1914206.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.920755683 | -87.776067514 | (41.920755683, -87.776067514) |
8 | 9955808.0 | HY144775 | 02/08/2015 11:20:33 PM | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | OTHER | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) |
9 | 9958275.0 | HY146732 | 02/08/2015 11:15:36 PM | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | HOTEL/MOTEL | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 2015.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) |
def refine_date_col(data, col, pattern):
    """Add calendar-derived feature columns to ``data`` in place.

    The columns ``Day``, ``Month``, ``Year``, ``WeekNum``, ``WeekDay``,
    ``HourOfDay``, ``Weekend`` and ``Season`` are assigned on ``data``; a
    column that already carries one of these names is overwritten.

    Args:
        data: H2OFrame containing the time column ``col``. Modified in place.
        col: Name of the time column the features are derived from.
        pattern: Date format string of ``col``. Currently unused (H2O parses
            the column as a date on import); kept so existing callers that
            pass it keep working.

    Returns:
        None. The new columns are written directly onto ``data``.
    """
    # An explicit data[col].as_date(pattern) conversion is no longer needed:
    # as of 5/29/2016 H2O parses the column as a date by default.
    data["Day"] = data[col].day()
    # month() is already 1-based and year() already yields the full year.
    # The former "+ 1" / "+ 1900" offsets turned 02/08/2015 into
    # Month 3 / Year 3915, which also made the later join on
    # Month/Day/Year against the weather data impossible.
    data["Month"] = data[col].month()
    data["Year"] = data[col].year()
    data["WeekNum"] = data[col].week()
    data["WeekDay"] = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()

    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.

    # Weekend flag: true when the day of week is Saturday or Sunday.
    data["Weekend"] = (data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat")

    # Season buckets over right-closed month intervals (lo, hi]:
    #   Winter = Jan, Feb | Spring = Mar, Apr, May | Summer = Jun, Jul, Aug
    #   Autumn = Sep, Oct | Winter = Nov, Dec
    # The third break is 8 (not 7) so that August lands in Summer as intended.
    data["Season"] = data["Month"].cut(
        [0, 2, 5, 8, 10, 12],
        ["Winter", "Spring", "Summer", "Autumn", "Winter"],
    )
# Derive the calendar feature columns from the raw "Date" column, then remove
# that column (drop() returns a new frame, hence the reassignment).
refine_date_col(data=crimes, col="Date", pattern="%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()
Rows:9,999 Cols:27 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 1 | 3.7037036 | 80 B | 0.0110837 |
C1 | 1-Byte Integers | 8 | 29.62963 | 78.6 KB | 11.157955 |
C1N | 1-Byte Integers (w/o NAs) | 7 | 25.925926 | 68.8 KB | 9.763211 |
C2 | 2-Byte Integers | 4 | 14.814815 | 78.4 KB | 11.12027 |
C4 | 4-Byte Integers | 3 | 11.111112 | 117.4 KB | 16.652143 |
C8 | 64-bit Integers | 1 | 3.7037036 | 78.2 KB | 11.092008 |
CStr | String | 1 | 3.7037036 | 127.0 KB | 18.019316 |
C8D | 64-bit Reals | 2 | 7.4074073 | 156.4 KB | 22.184013 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.43:54321 | 704.9 KB | 9999.0 | 1.0 | 27.0 |
mean | 704.9 KB | 9999.0 | 1.0 | 27.0 |
min | 704.9 KB | 9999.0 | 1.0 | 27.0 |
max | 704.9 KB | 9999.0 | 1.0 | 27.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 704.9 KB | 9999.0 | 1.0 | 27.0 |
ID | Case Number | Date | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | Day | Month | WeekNum | WeekDay | HourOfDay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | string | int | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum | int | int | int | enum | int |
mins | 21735.0 | NaN | 1422030630000.0 | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 3915.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 | 1.0 | 2.0 | 4.0 | 0.0 | 0.0 |
mean | 9931318.737373699 | NaN | 1422714450809.2847 | NaN | 1189.676513569939 | NaN | NaN | NaN | 0.29282928292829274 | 0.15231523152315235 | 1159.6180618061765 | 11.348988512757918 | 22.954095409541008 | 37.447644764476536 | 12.740123622682114 | 1163880.5981498407 | 1885916.1498424308 | 3915.0 | NaN | 41.842565224673535 | -87.67414052209607 | NaN | 17.683968396839663 | 2.419441944194423 | 5.1808180818082 | NaN | 13.631963196319662 |
maxs | 9962898.0 | NaN | 1423467820000.0 | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 3915.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 | 31.0 | 3.0 | 6.0 | 6.0 | 23.0 |
sigma | 396787.5642214295 | NaN | 433879245.1905283 | NaN | 927.7514355826443 | NaN | NaN | NaN | 0.4550835155878833 | 0.3593441468595258 | 695.7602987498396 | 6.945474933012859 | 13.649566114361296 | 21.274876222320856 | 7.574238579108433 | 16496.449368147238 | 31274.01631985589 | 0.0 | NaN | 0.08601865793584824 | 0.06003579706529789 | NaN | 11.180104335827702 | 0.4934924067865386 | 0.7389298304087689 | NaN | 6.4732173580715475 |
zeros | 0 | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 | 0 | 0 | 0 | 1038 | 374 |
missing | 0 | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 | 0 | 0 | 0 | 0 | 0 |
0 | 9955810.0 | HY144797 | 1423467820000.0 | 081XX S COLES AVE | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 422.0 | 4.0 | 7.0 | 46.0 | 18.0 | 1198273.0 | 1851626.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.747693646 | -87.54903538900001 | (41.747693646, -87.549035389) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
1 | 9955861.0 | HY144838 | 1423467702000.0 | 118XX S STATE ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | true | true | 522.0 | 5.0 | 34.0 | 53.0 | nan | 1178335.0 | 1826581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.679442289 | -87.622850758 | (41.679442289, -87.622850758) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
2 | 9955801.0 | HY144779 | 1423467022000.0 | 002XX S LARAMIE AVE | 2026.0 | NARCOTICS | POSS: PCP | SIDEWALK | true | false | 1522.0 | 15.0 | 29.0 | 25.0 | 18.0 | 1141717.0 | 1898581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.877773330000004 | -87.755117993 | (41.87777333, -87.755117993) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
3 | 9956197.0 | HY144787 | 1423467023000.0 | 006XX E 67TH ST | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 321.0 | nan | 6.0 | 42.0 | 18.0 | nan | nan | 3915.0 | 02/15/2015 12:43:39 PM | nan | nan | 8.0 | 3.0 | 6.0 | Sun | 23.0 | |
4 | 9955846.0 | HY144829 | 1423467058000.0 | 0000X S MAYFIELD AVE | 610.0 | BURGLARY | FORCIBLE ENTRY | APARTMENT | false | false | 1513.0 | 15.0 | 29.0 | 25.0 | 5.0 | 1137239.0 | 1899372.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.880025548000006 | -87.77154132400001 | (41.880025548, -87.771541324) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
5 | 9955835.0 | HY144778 | 1423467021000.0 | 010XX W 48TH ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | false | true | 933.0 | 9.0 | 3.0 | 61.0 | nan | 1169986.0 | 1873019.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.807059405000004 | -87.65206589 | (41.807059405, -87.65206589) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
6 | 9955872.0 | HY144822 | 1423466844000.0 | 015XX W ARTHUR AVE | 1320.0 | CRIMINAL DAMAGE | TO VEHICLE | STREET | false | false | 2432.0 | 24.0 | 40.0 | 1.0 | 14.0 | 1164732.0 | 1943222.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.999814056000005 | -87.669342967 | (41.999814056, -87.669342967) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
7 | 21752.0 | HY144738 | 1423466772000.0 | 060XX W GRAND AVE | 110.0 | HOMICIDE | FIRST DEGREE MURDER | STREET | true | false | 2512.0 | 25.0 | 37.0 | 19.0 | nan | 1135910.0 | 1914206.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.920755683 | -87.776067514 | (41.920755683, -87.776067514) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
8 | 9955808.0 | HY144775 | 1423466433000.0 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | OTHER | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
9 | 9958275.0 | HY146732 | 1423466136000.0 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | HOTEL/MOTEL | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 |
Rows:9,999 Cols:28 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 1 | 3.5714288 | 80 B | 0.0124154 |
CBS | Bits | 2 | 7.1428576 | 2.6 KB | 0.4097082 |
C1 | 1-Byte Integers | 8 | 28.57143 | 78.6 KB | 12.498584 |
C1N | 1-Byte Integers (w/o NAs) | 7 | 25.0 | 68.8 KB | 10.936261 |
C2 | 2-Byte Integers | 4 | 14.285715 | 78.4 KB | 12.456371 |
C4 | 4-Byte Integers | 3 | 10.714286 | 117.4 KB | 18.652899 |
CStr | String | 1 | 3.5714288 | 127.0 KB | 20.184338 |
C8D | 64-bit Reals | 2 | 7.1428576 | 156.4 KB | 24.849424 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.43:54321 | 629.3 KB | 9999.0 | 1.0 | 28.0 |
mean | 629.3 KB | 9999.0 | 1.0 | 28.0 |
min | 629.3 KB | 9999.0 | 1.0 | 28.0 |
max | 629.3 KB | 9999.0 | 1.0 | 28.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 629.3 KB | 9999.0 | 1.0 | 28.0 |
ID | Case Number | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | Day | Month | WeekNum | WeekDay | HourOfDay | Weekend | Season | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | string | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum | int | int | int | enum | int | int | enum |
mins | 21735.0 | NaN | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 3915.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 | 1.0 | 2.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 |
mean | 9931318.737373699 | NaN | NaN | 1189.676513569939 | NaN | NaN | NaN | 0.29282928292829274 | 0.15231523152315235 | 1159.6180618061765 | 11.348988512757918 | 22.954095409541008 | 37.447644764476536 | 12.740123622682114 | 1163880.5981498407 | 1885916.1498424308 | 3915.0 | NaN | 41.842565224673535 | -87.67414052209607 | NaN | 17.683968396839663 | 2.419441944194423 | 5.1808180818082 | NaN | 13.631963196319662 | 0.35753575357535755 | NaN |
maxs | 9962898.0 | NaN | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 3915.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 | 31.0 | 3.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
sigma | 396787.5642214295 | NaN | NaN | 927.7514355826443 | NaN | NaN | NaN | 0.4550835155878833 | 0.3593441468595258 | 695.7602987498396 | 6.945474933012859 | 13.649566114361296 | 21.274876222320856 | 7.574238579108433 | 16496.449368147238 | 31274.01631985589 | 0.0 | NaN | 0.08601865793584824 | 0.06003579706529789 | NaN | 11.180104335827702 | 0.4934924067865386 | 0.7389298304087689 | NaN | 6.4732173580715475 | 0.47929835538994453 | NaN |
zeros | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 | 0 | 0 | 0 | 1038 | 374 | 6424 | 5805 |
missing | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 9955810.0 | HY144797 | 081XX S COLES AVE | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 422.0 | 4.0 | 7.0 | 46.0 | 18.0 | 1198273.0 | 1851626.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.747693646 | -87.54903538900001 | (41.747693646, -87.549035389) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
1 | 9955861.0 | HY144838 | 118XX S STATE ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | true | true | 522.0 | 5.0 | 34.0 | 53.0 | nan | 1178335.0 | 1826581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.679442289 | -87.622850758 | (41.679442289, -87.622850758) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
2 | 9955801.0 | HY144779 | 002XX S LARAMIE AVE | 2026.0 | NARCOTICS | POSS: PCP | SIDEWALK | true | false | 1522.0 | 15.0 | 29.0 | 25.0 | 18.0 | 1141717.0 | 1898581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.877773330000004 | -87.755117993 | (41.87777333, -87.755117993) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
3 | 9956197.0 | HY144787 | 006XX E 67TH ST | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 321.0 | nan | 6.0 | 42.0 | 18.0 | nan | nan | 3915.0 | 02/15/2015 12:43:39 PM | nan | nan | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring | |
4 | 9955846.0 | HY144829 | 0000X S MAYFIELD AVE | 610.0 | BURGLARY | FORCIBLE ENTRY | APARTMENT | false | false | 1513.0 | 15.0 | 29.0 | 25.0 | 5.0 | 1137239.0 | 1899372.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.880025548000006 | -87.77154132400001 | (41.880025548, -87.771541324) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
5 | 9955835.0 | HY144778 | 010XX W 48TH ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | false | true | 933.0 | 9.0 | 3.0 | 61.0 | nan | 1169986.0 | 1873019.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.807059405000004 | -87.65206589 | (41.807059405, -87.65206589) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
6 | 9955872.0 | HY144822 | 015XX W ARTHUR AVE | 1320.0 | CRIMINAL DAMAGE | TO VEHICLE | STREET | false | false | 2432.0 | 24.0 | 40.0 | 1.0 | 14.0 | 1164732.0 | 1943222.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.999814056000005 | -87.669342967 | (41.999814056, -87.669342967) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
7 | 21752.0 | HY144738 | 060XX W GRAND AVE | 110.0 | HOMICIDE | FIRST DEGREE MURDER | STREET | true | false | 2512.0 | 25.0 | 37.0 | 19.0 | nan | 1135910.0 | 1914206.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.920755683 | -87.776067514 | (41.920755683, -87.776067514) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
8 | 9955808.0 | HY144775 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | OTHER | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
9 | 9958275.0 | HY146732 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | HOTEL/MOTEL | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
# Merge crimes data with weather and census.
# Rename the key columns so they match the crimes column names the merge
# joins on. The weather indices 1-3 assume the "date" column is still at
# position 0 of the weather frame.
census.set_name(0, "Community Area")
weather.set_name(1, "Month")
weather.set_name(2, "Day")
weather.set_name(3, "Year")
# merge() returns a new frame rather than modifying `crimes`; the results
# must be assigned back, otherwise the census and weather columns are lost
# (previously describe() still reported the unmerged 28 columns).
# all_x=True keeps every crime row even when no census/weather row matches.
crimes = crimes.merge(census, all_x=True, all_y=False)
crimes = crimes.merge(weather, all_x=True, all_y=False)
crimes.describe()
Rows:9,999 Cols:28 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 1 | 3.5714288 | 80 B | 0.0124154 |
CBS | Bits | 2 | 7.1428576 | 2.6 KB | 0.4097082 |
C1 | 1-Byte Integers | 8 | 28.57143 | 78.6 KB | 12.498584 |
C1N | 1-Byte Integers (w/o NAs) | 7 | 25.0 | 68.8 KB | 10.936261 |
C2 | 2-Byte Integers | 4 | 14.285715 | 78.4 KB | 12.456371 |
C4 | 4-Byte Integers | 3 | 10.714286 | 117.4 KB | 18.652899 |
CStr | String | 1 | 3.5714288 | 127.0 KB | 20.184338 |
C8D | 64-bit Reals | 2 | 7.1428576 | 156.4 KB | 24.849424 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.43:54321 | 629.3 KB | 9999.0 | 1.0 | 28.0 |
mean | 629.3 KB | 9999.0 | 1.0 | 28.0 |
min | 629.3 KB | 9999.0 | 1.0 | 28.0 |
max | 629.3 KB | 9999.0 | 1.0 | 28.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 629.3 KB | 9999.0 | 1.0 | 28.0 |
ID | Case Number | Block | IUCR | Primary Type | Description | Location Description | Arrest | Domestic | Beat | District | Ward | Community Area | FBI Code | X Coordinate | Y Coordinate | Year | Updated On | Latitude | Longitude | Location | Day | Month | WeekNum | WeekDay | HourOfDay | Weekend | Season | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | string | enum | int | enum | enum | enum | enum | enum | int | int | int | int | int | int | int | int | enum | real | real | enum | int | int | int | enum | int | int | enum |
mins | 21735.0 | NaN | 0.0 | 110.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 111.0 | 1.0 | 1.0 | 1.0 | 2.0 | 1100317.0 | 1814255.0 | 3915.0 | 0.0 | 41.64507243 | -87.906463888 | 0.0 | 1.0 | 2.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 |
mean | 9931318.737373699 | NaN | NaN | 1189.676513569939 | NaN | NaN | NaN | 0.29282928292829274 | 0.15231523152315235 | 1159.6180618061765 | 11.348988512757918 | 22.954095409541008 | 37.447644764476536 | 12.740123622682114 | 1163880.5981498407 | 1885916.1498424308 | 3915.0 | NaN | 41.842565224673535 | -87.67414052209607 | NaN | 17.683968396839663 | 2.419441944194423 | 5.1808180818082 | NaN | 13.631963196319662 | 0.35753575357535755 | NaN |
maxs | 9962898.0 | NaN | 6517.0 | 5131.0 | 26.0 | 198.0 | 90.0 | 1.0 | 1.0 | 2535.0 | 25.0 | 50.0 | 77.0 | 26.0 | 1205069.0 | 1951533.0 | 3915.0 | 32.0 | 42.022646183 | -87.524773286 | 8603.0 | 31.0 | 3.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
sigma | 396787.5642214295 | NaN | NaN | 927.7514355826443 | NaN | NaN | NaN | 0.4550835155878833 | 0.3593441468595258 | 695.7602987498396 | 6.945474933012859 | 13.649566114361296 | 21.274876222320856 | 7.574238579108433 | 16496.449368147238 | 31274.01631985589 | 0.0 | NaN | 0.08601865793584824 | 0.06003579706529789 | NaN | 11.180104335827702 | 0.4934924067865386 | 0.7389298304087689 | NaN | 6.4732173580715475 | 0.47929835538994453 | NaN |
zeros | 0 | 0 | 3 | 0 | 11 | 933 | 19 | 7071 | 8476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 603 | 0 | 0 | 1 | 0 | 0 | 0 | 1038 | 374 | 6424 | 5805 |
missing | 0 | 0 | 0 | 419 | 0 | 0 | 6 | 0 | 0 | 0 | 162 | 0 | 0 | 2557 | 162 | 162 | 0 | 0 | 162 | 162 | 162 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 9955810.0 | HY144797 | 081XX S COLES AVE | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 422.0 | 4.0 | 7.0 | 46.0 | 18.0 | 1198273.0 | 1851626.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.747693646 | -87.54903538900001 | (41.747693646, -87.549035389) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
1 | 9955861.0 | HY144838 | 118XX S STATE ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | true | true | 522.0 | 5.0 | 34.0 | 53.0 | nan | 1178335.0 | 1826581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.679442289 | -87.622850758 | (41.679442289, -87.622850758) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
2 | 9955801.0 | HY144779 | 002XX S LARAMIE AVE | 2026.0 | NARCOTICS | POSS: PCP | SIDEWALK | true | false | 1522.0 | 15.0 | 29.0 | 25.0 | 18.0 | 1141717.0 | 1898581.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.877773330000004 | -87.755117993 | (41.87777333, -87.755117993) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
3 | 9956197.0 | HY144787 | 006XX E 67TH ST | 1811.0 | NARCOTICS | POSS: CANNABIS 30GMS OR LESS | STREET | true | false | 321.0 | nan | 6.0 | 42.0 | 18.0 | nan | nan | 3915.0 | 02/15/2015 12:43:39 PM | nan | nan | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring | |
4 | 9955846.0 | HY144829 | 0000X S MAYFIELD AVE | 610.0 | BURGLARY | FORCIBLE ENTRY | APARTMENT | false | false | 1513.0 | 15.0 | 29.0 | 25.0 | 5.0 | 1137239.0 | 1899372.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.880025548000006 | -87.77154132400001 | (41.880025548, -87.771541324) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
5 | 9955835.0 | HY144778 | 010XX W 48TH ST | 486.0 | BATTERY | DOMESTIC BATTERY SIMPLE | APARTMENT | false | true | 933.0 | 9.0 | 3.0 | 61.0 | nan | 1169986.0 | 1873019.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.807059405000004 | -87.65206589 | (41.807059405, -87.65206589) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
6 | 9955872.0 | HY144822 | 015XX W ARTHUR AVE | 1320.0 | CRIMINAL DAMAGE | TO VEHICLE | STREET | false | false | 2432.0 | 24.0 | 40.0 | 1.0 | 14.0 | 1164732.0 | 1943222.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.999814056000005 | -87.669342967 | (41.999814056, -87.669342967) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
7 | 21752.0 | HY144738 | 060XX W GRAND AVE | 110.0 | HOMICIDE | FIRST DEGREE MURDER | STREET | true | false | 2512.0 | 25.0 | 37.0 | 19.0 | nan | 1135910.0 | 1914206.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.920755683 | -87.776067514 | (41.920755683, -87.776067514) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
8 | 9955808.0 | HY144775 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | OTHER | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
9 | 9958275.0 | HY146732 | 001XX W WACKER DR | 460.0 | BATTERY | SIMPLE | HOTEL/MOTEL | false | false | 122.0 | 1.0 | 42.0 | 32.0 | nan | 1175384.0 | 1902088.0 | 3915.0 | 02/15/2015 12:43:39 PM | 41.886707818000005 | -87.63139635600001 | (41.886707818, -87.631396356) | 8.0 | 3.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
# Create test/train split.
# One uniform random number is drawn per row (1234 is the argument handed to
# runif); rows below 0.8 go to the training frame, the rest to the test frame.
r = crimes["Arrest"].runif(1234)
train = crimes[r < 0.8]
test = crimes[r >= 0.8]

# Simple GBM - Predict Arrest.
# Every column of the crimes frame except the response is offered as a predictor.
crimes_names_x = [name for name in crimes.names if name != "Arrest"]
data_gbm = H2OGradientBoostingEstimator(
    ntrees=10,
    max_depth=6,
    distribution="bernoulli",
)
data_gbm.train(
    x=crimes_names_x,
    y="Arrest",
    training_frame=train,
    validation_frame=test,
)

# Simple Deep Learning - Predict Arrest (currently disabled).
# data_dl = H2ODeepLearningEstimator(variable_importances=True,
#                                    loss="Automatic")
# data_dl.train(x=crimes_names_x,
#               y="Arrest",
#               training_frame=train,
#               validation_frame=test)
gbm Model Build Progress: [##################################################] 100% deeplearning Model Build Progress: [##################################################] 100%
# GBM performance on train/test data: score the model on each frame in turn
# (train first, then test) and keep only the AUC.
train_auc_gbm, test_auc_gbm = (
    data_gbm.model_performance(frame).auc() for frame in (train, test)
)

# Deep Learning performance on train/test data (currently disabled).
# train_auc_dl = data_dl.model_performance(train).auc()
# test_auc_dl = data_dl.model_performance(test).auc()

# Make a pretty HTML table printout of the results: one row per model.
header = ["Model", "AUC Train", "AUC Test"]
gbm_row = ["GBM", train_auc_gbm, test_auc_gbm]
# dl_row = ["DL ", train_auc_dl, test_auc_dl]
table = [gbm_row]
h2o.display.H2OTableDisplay(table, columns_labels=header)
Model | AUC Train | AUC Test |
GBM | 0.9568221 | 0.9307979 |
DL | 0.8956055 | 0.8841564 |
Model | AUC Train | AUC Test |
GBM | 0.9568221 | 0.9307979 |
DL | 0.8956055 | 0.8841564 |
# Create new H2OFrame of crime observations (two hand-written rows).
# NOTE(review): these keys use dots ("Primary.Type", "FBI.Code", ...) while the
# crimes frame used for training names the same columns with spaces
# ("Primary Type", "FBI Code", ...). The model may therefore not recognise
# them at predict time; confirm before relying on the predictions.
examples = {
    "Date": ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
    "IUCR": [1811, 1150],
    "Primary.Type": ["NARCOTICS", "DECEPTIVE PRACTICE"],
    "Location.Description": ["STREET", "RESIDENCE"],
    "Domestic": ["false", "false"],
    "Beat": [422, 923],
    "District": [4, 9],
    "Ward": [7, 14],
    "Community.Area": [46, 63],
    "FBI.Code": [18, 11]
}
crime_examples = h2o.H2OFrame(examples)
# Refine date column and merge with census data.
# refine_date_col adds its derived date columns to the frame it is given.
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
# H2OFrame.drop and H2OFrame.merge both return a NEW frame instead of changing
# the receiver, so their results must be assigned back. Without that the raw
# "Date" column stays in the frame and the census columns are never attached.
# NOTE: any recorded cell output below predates this fix and will differ.
crime_examples = crime_examples.drop("Date")
# Rename the census key so it matches the dotted "Community.Area" column above.
census.set_name(0, "Community.Area")
crime_examples = crime_examples.merge(census, all_x=True, all_y=False)
crime_examples.describe()
Parse Progress: [##################################################] 100% Rows:2 Cols:16 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 7 | 43.75 | 560 B | 43.75 |
C1N | 1-Byte Integers (w/o NAs) | 4 | 25.0 | 280 B | 21.875 |
C2 | 2-Byte Integers | 2 | 12.5 | 144 B | 11.25 |
C2S | 2-Byte Fractions | 1 | 6.25 | 88 B | 6.875 |
CStr | String | 2 | 12.5 | 208 B | 16.25 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.43:54321 | 1.3 KB | 2.0 | 1.0 | 16.0 |
mean | 1.3 KB | 2.0 | 1.0 | 16.0 |
min | 1.3 KB | 2.0 | 1.0 | 16.0 |
max | 1.3 KB | 2.0 | 1.0 | 16.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.3 KB | 2.0 | 1.0 | 16.0 |
Primary.Type | Domestic | FBI.Code | Ward | District | Community.Area | Location.Description | Date | IUCR | Beat | Day | Month | Year | WeekNum | WeekDay | HourOfDay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | string | enum | int | int | int | int | string | int | int | int | int | int | int | int | enum | int |
mins | NaN | 0.0 | 11.0 | 7.0 | 4.0 | 46.0 | NaN | 1423465239000.0 | 1150.0 | 422.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 |
mean | NaN | 0.0 | 14.5 | 10.5 | 6.5 | 54.5 | NaN | 1423466538500.0 | 1480.5 | 672.5 | 8.0 | 3.0 | 3915.0 | 6.0 | NaN | 23.0 |
maxs | NaN | 0.0 | 18.0 | 14.0 | 9.0 | 63.0 | NaN | 1423467838000.0 | 1811.0 | 923.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 |
sigma | NaN | 0.0 | 4.949747468305833 | 4.949747468305833 | 3.5355339059327378 | 12.020815280171307 | NaN | 1837770.524303837 | 467.39758236430794 | 354.26049737446033 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 |
zeros | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | NARCOTICS | false | 18.0 | 7.0 | 4.0 | 46.0 | STREET | 1423467838000.0 | 1811.0 | 422.0 | 8.0 | 3.0 | 3915.0 | 6.0 | Sun | 23.0 |
1 | DECEPTIVE PRACTICE | false | 11.0 | 14.0 | 9.0 | 63.0 | RESIDENCE | 1423465239000.0 | 1150.0 | 923.0 | 8.0 | 3.0 | 3915.0 | 6.0 | Sun | 23.0 |
Rows:2 Cols:18 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 9 | 50.0 | 720 B | 50.0 |
C1N | 1-Byte Integers (w/o NAs) | 4 | 22.222223 | 280 B | 19.444445 |
C2 | 2-Byte Integers | 2 | 11.111112 | 144 B | 10.0 |
C2S | 2-Byte Fractions | 1 | 5.555556 | 88 B | 6.111111 |
CStr | String | 2 | 11.111112 | 208 B | 14.444445 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.43:54321 | 1.4 KB | 2.0 | 1.0 | 18.0 |
mean | 1.4 KB | 2.0 | 1.0 | 18.0 |
min | 1.4 KB | 2.0 | 1.0 | 18.0 |
max | 1.4 KB | 2.0 | 1.0 | 18.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.4 KB | 2.0 | 1.0 | 18.0 |
Primary.Type | Domestic | FBI.Code | Ward | District | Community.Area | Location.Description | Date | IUCR | Beat | Day | Month | Year | WeekNum | WeekDay | HourOfDay | Weekend | Season | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | string | enum | int | int | int | int | string | int | int | int | int | int | int | int | enum | int | int | enum |
mins | NaN | 0.0 | 11.0 | 7.0 | 4.0 | 46.0 | NaN | 1423465239000.0 | 1150.0 | 422.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
mean | NaN | 0.0 | 14.5 | 10.5 | 6.5 | 54.5 | NaN | 1423466538500.0 | 1480.5 | 672.5 | 8.0 | 3.0 | 3915.0 | 6.0 | NaN | 23.0 | 1.0 | NaN |
maxs | NaN | 0.0 | 18.0 | 14.0 | 9.0 | 63.0 | NaN | 1423467838000.0 | 1811.0 | 923.0 | 8.0 | 3.0 | 3915.0 | 6.0 | 6.0 | 23.0 | 1.0 | 1.0 |
sigma | NaN | 0.0 | 4.949747468305833 | 4.949747468305833 | 3.5355339059327378 | 12.020815280171307 | NaN | 1837770.524303837 | 467.39758236430794 | 354.26049737446033 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 0.0 | 0.0 | NaN |
zeros | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | NARCOTICS | false | 18.0 | 7.0 | 4.0 | 46.0 | STREET | 1423467838000.0 | 1811.0 | 422.0 | 8.0 | 3.0 | 3915.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
1 | DECEPTIVE PRACTICE | false | 11.0 | 14.0 | 9.0 | 63.0 | RESIDENCE | 1423465239000.0 | 1150.0 | 923.0 | 8.0 | 3.0 | 3915.0 | 6.0 | Sun | 23.0 | 1.0 | Spring |
# Predict probability of arrest from new observations.
gbm_pred = data_gbm.predict(crime_examples)
# dl_pred = data_dl.predict(crime_examples)

# Make a pretty HTML table printout of the results.
# Variant with the Deep Learning column (currently disabled):
# header = ["FBI Code", "GBM Arrest Prob", "DL Arrest Prob"]
# table = [
#     [examples["FBI.Code"][0], gbm_pred[0, "true"], dl_pred[0, "true"]],
#     [examples["FBI.Code"][1], gbm_pred[1, "true"], dl_pred[1, "true"]],
# ]
header = ["FBI Code", "GBM Arrest Prob"]
# One row per example: its FBI code next to the value of the "true" column of
# the prediction frame at the same row index.
table = [
    [fbi_code, gbm_pred[row, "true"]]
    for row, fbi_code in enumerate(examples["FBI.Code"])
]
h2o.display.H2OTableDisplay(table, columns_labels=header)
FBI Code | GBM Arrest Prob | DL Arrest Prob |
18 | 0.1199714 | 0.3047381 |
11 | 0.1199714 | 0.2496035 |
FBI Code | GBM Arrest Prob | DL Arrest Prob |
18 | 0.1199714 | 0.3047381 |
11 | 0.1199714 | 0.2496035 |