import h2o
import time
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: help the manager of CitiBike NYC load-balance bicycles across the
# CitiBike network of stations by predicting the number of bike trips taken
# from each station every day. Use 10 million rows of historical data, and
# eventually add weather data.
# Connect to a cluster
h2o.init()
H2O cluster uptime: | 1 hours 58 minutes 9 seconds 765 milliseconds |
H2O cluster version: | 3.1.0.99999 |
H2O cluster name: | spencer |
H2O cluster total nodes: | 1 |
H2O cluster total memory: | 14.22 GB |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster healthy: | True |
H2O Connection ip: | 127.0.0.1 |
H2O Connection port: | 54321 |
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = False
def mylocate(s):
    if data_source_is_s3:
        return "s3n://h2o-public-test-data/" + s
    else:
        return h2o.locate(s)
# Pick either the big or the small demo.
# Big data is 10M rows
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
big_test   = [mylocate("bigdata/laptop/citibike-nyc/2013-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-08.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-09.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-10.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-11.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-12.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-01.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-02.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-03.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-04.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-05.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-06.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-08.csv")]
# ----------
# 1- Load data - 1 row per bicycle trip. Columns include the start and end
# station, the trip duration, and the trip start time and day. The larger
# dataset totals about 10 million rows.
print "Import and Parse bike data"
data = h2o.import_frame(path=big_test)
Import and Parse bike data
Parse Progress: [##################################################] 100%
Parsed 10,407,546 rows and 15 cols:
File1 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-07.csv |
File2 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-08.csv |
File3 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-09.csv |
File4 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-10.csv |
File5 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-11.csv |
File6 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-12.csv |
File7 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-01.csv |
File8 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-02.csv |
File9 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-03.csv |
File10 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-04.csv |
File11 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-05.csv |
File12 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-06.csv |
File13 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-07.csv |
File14 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-08.csv |
# ----------
# 2- Light data munging: group the bike starts per day, converting the 10M
# rows of trips into about 140,000 station/day combos. We will predict the
# number of trip starts per station per day.
# Convert the start time to days since the epoch
starttime = data["starttime"]
msecsPerDay = 1000*60*60*24   # timestamps are in milliseconds
data["Days"] = (starttime/msecsPerDay).floor()
data.describe()
Rows: 10,407,546 Cols: 16
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 117 | 1.5298117 | 9.1 KB | 0.0015500536 |
C1 | 1-Byte Integers | 478 | 6.25 | 10.0 MB | 1.7289143 |
C1N | 1-Byte Integers (w/o NAs) | 478 | 6.25 | 10.0 MB | 1.7289143 |
C1S | 1-Byte Fractions | 839 | 10.970188 | 17.5 MB | 3.042758 |
C2 | 2-Byte Integers | 2616 | 34.20502 | 108.8 MB | 18.8909 |
C2S | 2-Byte Fractions | 314 | 4.1056485 | 12.9 MB | 2.2460942 |
C4 | 4-Byte Integers | 214 | 2.7981172 | 17.9 MB | 3.1005228 |
C4S | 4-Byte Fractions | 389 | 5.086297 | 32.4 MB | 5.625424 |
C8 | 64-bit Integers | 680 | 8.891213 | 113.5 MB | 19.704786 |
C8D | 64-bit Reals | 1523 | 19.913704 | 253.0 MB | 43.930134 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.37:54321 | 575.9 MB | 10407546.0 | 478.0 | 7648.0 |
mean | 575.9 MB | 10407546.0 | 478.0 | 7648.0 |
min | 575.9 MB | 10407546.0 | 478.0 | 7648.0 |
max | 575.9 MB | 10407546.0 | 478.0 | 7648.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 575.9 MB | 10407546.0 | 478.0 | 7648.0 |
Column-by-Column Summary:
tripduration | starttime | stoptime | start station id | start station name | start station latitude | start station longitude | end station id | end station name | end station latitude | end station longitude | bikeid | usertype | birth year | gender | Days | |
type | int | time | time | int | enum | real | real | int | enum | real | real | int | enum | int | int | int |
mins | 60.0 | 1.372662e+12 | 1.372662242e+12 | 72.0 | 0.0 | 40.680342423 | -74.01713445 | 72.0 | 0.0 | 40.680342423 | -74.01713445 | 14529.0 | 0.0 | 1899.0 | 0.0 | 15887.0 |
maxs | 6250750.0 | 1.409554787e+12 | 1.409563605e+12 | 3002.0 | 339.0 | 40.771522 | -73.9500479759 | 3002.0 | 339.0 | 40.771522 | -73.9500479759 | 21689.0 | 1.0 | 1998.0 | 2.0 | 16314.0 |
sigma | 2985.10540532 | 11806578171.7 | 11806555707.8 | 355.755989765 | 103.210304227 | 0.0197100508736 | 0.0123453320185 | 360.070380844 | 103.205091206 | 0.0197309578633 | 0.0124311861598 | 1938.80517884 | 0.324807387506 | 11.132784905 | 0.563019777794 | 136.647269305 |
zero_count | 0 | 0 | 0 | 0 | 56836 | 0 | 0 | 0 | 55167 | 0 | 0 | 0 | 1247534 | 0 | 1248517 | 0 |
missing_count | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1247644 | 0 | 0 |
# Now do a monster Group-By. Count bike starts per-station per-day. Ends up
# with about 340 stations times 400 days (140,000 rows). This is what we want
# to predict.
group_by_cols = ["Days","start station name"]
aggregates = {"bikes": ["count", 0, "all"]}
bpd = data.group_by(cols=group_by_cols, aggregates=aggregates) # Compute bikes-per-day
bpd.show()
bpd.describe()
bpd.dim()
First 10 rows and first 3 columns:
Days | start station name | bikes |
16234 | Concord St & Bridge St | 15 |
16106 | Cumberland St & Lafayette Ave | 6 |
15978 | DeKalb Ave & Hudson Ave | 36 |
16088 | Allen St & Hester St | 55 |
15945 | Allen St & Rivington St | 140 |
16251 | Clinton St & Grand St | 79 |
16123 | Clinton St & Joralemon St | 6 |
15995 | Clinton St & Tillary St | 22 |
16313 | Greenwich St & N Moore St | 74 |
16185 | Hancock St & Bedford Ave | 14 |
Rows: 139,261 Cols: 3
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C2 | 2-Byte Integers | 96 | 100.0 | 822.4 KB | 100.0 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.37:54321 | 822.4 KB | 139261.0 | 32.0 | 96.0 |
mean | 822.4 KB | 139261.0 | 32.0 | 96.0 |
min | 822.4 KB | 139261.0 | 32.0 | 96.0 |
max | 822.4 KB | 139261.0 | 32.0 | 96.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 822.4 KB | 139261.0 | 32.0 | 96.0 |
Column-by-Column Summary:
Days | start station name | bikes | |
type | int | enum | int |
mins | 15887.0 | 0.0 | 1.0 |
maxs | 16314.0 | 339.0 | 680.0 |
sigma | 123.635133897 | 98.50295732 | 64.1243887565 |
zero_count | 0 | 428 | 0 |
missing_count | 0 | 0 | 0 |
[139261, 3]
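# For readers more at home in pandas, the group-by above is the moral
# equivalent of this local sketch (a hypothetical toy DataFrame `df`; only
# feasible in memory for the small test file, not the full 10M rows):
import pandas as pd
df = pd.DataFrame({"Days": [15887, 15887, 15888],
                   "start station name": ["Allen St & Hester St"]*2 + ["Clinton St & Grand St"]})
bpd_pd = df.groupby(["Days", "start station name"]).size().reset_index(name="bikes")
print bpd_pd
# H2O runs the same count distributed across the cluster, so the full dataset
# never has to fit in Python's memory.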
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print "Quantiles of bikes-per-day"
bpd["bikes"].quantile().show()
Quantiles of bikes-per-day
First 9 rows and first 2 columns:
Probs | bikesQuantiles |
0.01 | 2 |
0.1 | 11 |
0.25 | 26 |
0.333 | 35 |
0.5 | 58 |
0.667 | 89 |
0.75 | 107 |
0.9 | 157 |
0.99 | 291 |
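# The same summary can be reproduced locally with numpy (a sketch on made-up
# counts; the probability list mirrors the defaults shown above):
import numpy as np
probs = [0.01, 0.1, 0.25, 0.333, 0.5, 0.667, 0.75, 0.9, 0.99]
counts = np.random.poisson(lam=60, size=139261)  # stand-in for bpd["bikes"]
print np.percentile(counts, [100*p for p in probs])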
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
msecs = bpd["Days"]*msecsPerDay
bpd["Month"] = msecs.month().asfactor()
# Add in day-of-week (work-week; fewer bike rides on Sunday than Monday)
bpd["DayOfWeek"] = msecs.dayOfWeek()
print "Bikes-Per-Day"
bpd.describe()
Bikes-Per-Day
Rows: 139,261 Cols: 5
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C1N | 1-Byte Integers (w/o NAs) | 64 | 40.0 | 276.2 KB | 25.145071 |
C2 | 2-Byte Integers | 96 | 60.000004 | 822.4 KB | 74.85493 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.37:54321 | 1.1 MB | 139261.0 | 32.0 | 160.0 |
mean | 1.1 MB | 139261.0 | 32.0 | 160.0 |
min | 1.1 MB | 139261.0 | 32.0 | 160.0 |
max | 1.1 MB | 139261.0 | 32.0 | 160.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.1 MB | 139261.0 | 32.0 | 160.0 |
Column-by-Column Summary:
Days | start station name | bikes | Month | DayOfWeek | |
type | int | enum | int | enum | enum |
mins | 15887.0 | 0.0 | 1.0 | 0.0 | 0.0 |
maxs | 16314.0 | 339.0 | 680.0 | 11.0 | 6.0 |
sigma | 123.635133897 | 98.50295732 | 64.1243887565 | 3.20373100216 | 2.00302100015 |
zero_count | 0 | 428 | 0 | 9949 | 19880 |
missing_count | 0 | 0 | 0 | 0 | 0 |
# ----------
# 3- Fit a model on train, using test as validation
# Function for doing a classic test/train/holdout split
def split_fit_predict(data):
    global gbm0, drf0, glm0, dl0
    # Classic test/train split
    r = data['Days'].runif()   # Random UNIForm numbers, one per row
    train = data[r < 0.6]
    test  = data[(0.6 <= r) & (r < 0.9)]
    hold  = data[0.9 <= r]
    print "Training data has",train.ncol(),"columns and",train.nrow(),"rows, test has",test.nrow(),"rows, holdout has",hold.nrow()
    # Run GBM
    s = time.time()
    gbm0 = h2o.gbm(x=train.drop("bikes"),
                   y=train["bikes"],
                   validation_x=test.drop("bikes"),
                   validation_y=test["bikes"],
                   ntrees=500,   # 500 works well
                   max_depth=6,
                   learn_rate=0.1)
    gbm_elapsed = time.time() - s
    # Run DRF
    s = time.time()
    drf0 = h2o.random_forest(x=train.drop("bikes"),
                             y=train["bikes"],
                             validation_x=test.drop("bikes"),
                             validation_y=test["bikes"],
                             ntrees=250,
                             max_depth=30)
    drf_elapsed = time.time() - s
    # Run GLM
    s = time.time()
    glm0 = h2o.glm(x=train.drop("bikes"),
                   y=train["bikes"],
                   validation_x=test.drop("bikes"),
                   validation_y=test["bikes"],
                   Lambda=[1e-5],
                   family="poisson")
    glm_elapsed = time.time() - s
    # Run DL
    s = time.time()
    dl0 = h2o.deeplearning(x=train.drop("bikes"),
                           y=train["bikes"],
                           validation_x=test.drop("bikes"),
                           validation_y=test["bikes"],
                           hidden=[50,50,50,50],
                           epochs=50)
    dl_elapsed = time.time() - s
    # ----------
    # 4- Score on holdout set & report
    train_r2_gbm = gbm0.model_performance(train).r2()
    test_r2_gbm  = gbm0.model_performance(test).r2()
    hold_r2_gbm  = gbm0.model_performance(hold).r2()
    # print "GBM R2 TRAIN=",train_r2_gbm,", R2 TEST=",test_r2_gbm,", R2 HOLDOUT=",hold_r2_gbm
    train_r2_drf = drf0.model_performance(train).r2()
    test_r2_drf  = drf0.model_performance(test).r2()
    hold_r2_drf  = drf0.model_performance(hold).r2()
    # print "DRF R2 TRAIN=",train_r2_drf,", R2 TEST=",test_r2_drf,", R2 HOLDOUT=",hold_r2_drf
    train_r2_glm = glm0.model_performance(train).r2()
    test_r2_glm  = glm0.model_performance(test).r2()
    hold_r2_glm  = glm0.model_performance(hold).r2()
    # print "GLM R2 TRAIN=",train_r2_glm,", R2 TEST=",test_r2_glm,", R2 HOLDOUT=",hold_r2_glm
    train_r2_dl  = dl0.model_performance(train).r2()
    test_r2_dl   = dl0.model_performance(test).r2()
    hold_r2_dl   = dl0.model_performance(hold).r2()
    # print " DL R2 TRAIN=",train_r2_dl,", R2 TEST=",test_r2_dl,", R2 HOLDOUT=",hold_r2_dl
    # Make a pretty HTML table printout of the results
    header = ["Model", "R2 TRAIN", "R2 TEST", "R2 HOLDOUT", "Model Training Time (s)"]
    table = [["GBM", train_r2_gbm, test_r2_gbm, hold_r2_gbm, round(gbm_elapsed,3)],
             ["DRF", train_r2_drf, test_r2_drf, hold_r2_drf, round(drf_elapsed,3)],
             ["GLM", train_r2_glm, test_r2_glm, hold_r2_glm, round(glm_elapsed,3)],
             ["DL ", train_r2_dl,  test_r2_dl,  hold_r2_dl,  round(dl_elapsed,3)]]
    h2o.H2ODisplay(table, header)
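# Note that the 0.6/0.9 cut points above give an approximate, not exact,
# 60/30/10 split: runif draws an independent U(0,1) per row. A quick local
# illustration of the same idea with numpy (my sketch, not part of the demo):
import numpy as np
r = np.random.uniform(size=1000000)
print (r < 0.6).mean(), ((0.6 <= r) & (r < 0.9)).mean(), (0.9 <= r).mean()
# Prints fractions close to 0.6 0.3 0.1, matching the row counts reported below.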
# --------------
# Split the data (into test & train), fit some models and predict on the holdout data
split_fit_predict(bpd)
# Here we see an R^2 of about 0.92 for GBM and 0.78 for GLM. This means that
# given just the station, the month, and the day-of-week, we can predict over
# 90% of the variance in bike-trip starts.
Training data has 5 columns and 83754 rows, test has 41608 rows, holdout has 13899
gbm Model Build Progress: [##################################################] 100%
drf Model Build Progress: [##################################################] 100%
glm Model Build Progress: [##################################################] 100%
deeplearning Model Build Progress: [##################################################] 100%
Model | R2 TRAIN | R2 TEST | R2 HOLDOUT | Model Training Time (s) |
GBM | 0.967596306547 | 0.921163552313 | 0.924511283963 | 21.313 |
DRF | 0.849449692569 | 0.819592185529 | 0.824331359117 | 27.972 |
GLM | 0.784089976397 | 0.784183689467 | 0.788781938295 | 0.446 |
DL | 0.90367986631 | 0.88016358277 | 0.882139604948 | 54.633 |
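# For reference, the R^2 reported above is 1 - SS_res/SS_tot: the fraction of
# the target's variance the model explains. A minimal numpy sketch of the same
# metric (hypothetical arrays y and yhat, not H2O's internal implementation):
import numpy as np
def r2(y, yhat):
    ss_res = np.sum((y - yhat)**2)         # residual sum of squares
    ss_tot = np.sum((y - np.mean(y))**2)   # total sum of squares
    return 1.0 - ss_res/ss_tot
print r2(np.array([1.0, 2.0, 3.0]), np.array([1.1, 1.9, 3.2]))  # near 1 for a good fit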
# ----------
# 5- Now let's add some weather
# Load weather data
wthr1 = h2o.import_frame(path=[mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
                               mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
# Peek at the data
wthr1.describe()
Parse Progress: [##################################################] 100%
Parsed 17,520 rows and 50 cols:
File1 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv |
File2 | /Users/spencer/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv |
Rows: 17,520 Cols: 50
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 107 | 6.294118 | 8.4 KB | 0.7889721 |
C0D | Constant Reals | 436 | 25.647058 | 34.1 KB | 3.2148771 |
CXI | Sparse Integers | 17 | 1.0 | 1.5 KB | 0.13991351 |
C1 | 1-Byte Integers | 346 | 20.352942 | 197.4 KB | 18.634672 |
C1N | 1-Byte Integers (w/o NAs) | 214 | 12.588236 | 122.3 KB | 11.544063 |
C1S | 1-Byte Fractions | 214 | 12.588236 | 125.3 KB | 11.822968 |
C2S | 2-Byte Fractions | 196 | 11.529412 | 214.5 KB | 20.242111 |
C4S | 4-Byte Fractions | 170 | 10.0 | 356.1 KB | 33.612423 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.37:54321 | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
mean | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
min | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
max | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
Column-by-Column Summary:
Year Local | Month Local | Day Local | Hour Local | Year UTC | Month UTC | Day UTC | Hour UTC | Cavok Reported | Cloud Ceiling (m) | Cloud Cover Fraction | Cloud Cover Fraction 1 | Cloud Cover Fraction 2 | Cloud Cover Fraction 3 | Cloud Cover Fraction 4 | Cloud Cover Fraction 5 | Cloud Cover Fraction 6 | Cloud Height (m) 1 | Cloud Height (m) 2 | Cloud Height (m) 3 | Cloud Height (m) 4 | Cloud Height (m) 5 | Cloud Height (m) 6 | Dew Point (C) | Humidity Fraction | Precipitation One Hour (mm) | Pressure Altimeter (mbar) | Pressure Sea Level (mbar) | Pressure Station (mbar) | Snow Depth (cm) | Temperature (C) | Visibility (km) | Weather Code 1 | Weather Code 1/ Description | Weather Code 2 | Weather Code 2/ Description | Weather Code 3 | Weather Code 3/ Description | Weather Code 4 | Weather Code 4/ Description | Weather Code 5 | Weather Code 5/ Description | Weather Code 6 | Weather Code 6/ Description | Weather Code Most Severe / Icon Code | Weather Code Most Severe | Weather Code Most Severe / Description | Wind Direction (degrees) | Wind Gust (m/s) | Wind Speed (m/s) | |
type | int | int | int | int | int | int | int | int | int | real | real | real | real | real | int | int | int | real | real | real | int | int | int | real | real | real | real | int | int | int | real | real | int | enum | int | enum | int | enum | int | enum | int | enum | int | enum | int | int | enum | int | real | real |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.0 | 0.0 | 0.0 | 0.25 | 0.5 | NaN | NaN | NaN | 60.96 | 213.36 | 365.76 | NaN | NaN | NaN | -26.7 | 0.1251 | 0.0 | 983.2949 | NaN | NaN | NaN | -15.6 | 0.001 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 | 0.0 | 1.0 | 0.0 | 10.0 | 7.2 | 0.0 |
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 2015.0 | 12.0 | 31.0 | 23.0 | 0.0 | 3657.6 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 3657.5999 | 3657.5999 | 3657.5999 | NaN | NaN | NaN | 24.4 | 1.0 | 26.924 | 1042.2113 | NaN | NaN | NaN | 36.1 | 16.0934 | 60.0 | 11.0 | 60.0 | 10.0 | 36.0 | 7.0 | 27.0 | 4.0 | 27.0 | 2.0 | 3.0 | 0.0 | 16.0 | 60.0 | 11.0 | 360.0 | 20.58 | 10.8 |
sigma | 0.500014270017 | 3.44794972385 | 8.79649804852 | 6.92238411188 | 0.500584411716 | 3.44782405458 | 8.79561488868 | 6.92230165203 | 0.0 | 995.339856966 | 0.462720830993 | 0.42770569708 | 0.197155690367 | 0.0861015598104 | NaN | NaN | NaN | 962.743095854 | 916.73861349 | 887.215847511 | NaN | NaN | NaN | 10.9731282097 | 0.185792011866 | 2.56215129179 | 7.46451697179 | NaN | NaN | NaN | 10.0396739531 | 3.69893623033 | 5.70486576983 | 2.47814708663 | 6.13386253912 | 1.83976235335 | 5.80553286364 | 1.28967553698 | 3.12340844261 | 1.282164369 | 6.15223536611 | 0.60207972894 | 0.0 | 0.0 | 4.07386062702 | 5.70486576983 | 2.47814708663 | 106.350000031 | 1.81511871115 | 1.61469790524 |
zero_count | 0 | 0 | 0 | 730 | 0 | 0 | 0 | 730 | 17455 | 0 | 8758 | 8758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 268 | 0 | 501 | 0 | 0 | 0 | 0 | 269 | 0 | 0 | 17 | 0 | 30 | 0 | 13 | 0 | 20 | 0 | 12 | 0 | 2 | 14980 | 0 | 17 | 0 | 0 | 2768 |
missing_count | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65 | 10780 | 375 | 375 | 14682 | 16535 | 17520 | 17520 | 17520 | 9103 | 14683 | 16535 | 17520 | 17520 | 17520 | 67 | 67 | 15660 | 360 | 17520 | 17520 | 17520 | 67 | 412 | 14980 | 14980 | 16477 | 16477 | 17181 | 17181 | 17433 | 17433 | 17504 | 17504 | 17518 | 17518 | 0 | 14980 | 14980 | 9382 | 14381 | 1283 |
# Lots of columns in there! Let's plan on converting the time columns to
# time-since-epoch so we can 'join' with the bike data, and gather weather
# that might affect cyclists - rain, snow, temperature. Alas, the snow column
# is all NAs, so drop it. Also add in dew point and humidity just in case.
# Slice out just the columns of interest and drop the rest.
wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)",
               "Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)",
               "Weather Code 1/ Description"]]
wthr2.setName(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
wthr2.setName(wthr2.index("Weather Code 1/ Description"), "WC1")
wthr2.describe()
# Much better!
Rows: 17,520 Cols: 9
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 46 | 15.0326805 | 3.6 KB | 1.780005 |
C1 | 1-Byte Integers | 34 | 11.111112 | 19.4 KB | 9.592678 |
C1N | 1-Byte Integers (w/o NAs) | 90 | 29.411766 | 51.5 KB | 25.494701 |
C1S | 1-Byte Fractions | 42 | 13.725491 | 24.0 KB | 11.894592 |
C2S | 2-Byte Fractions | 94 | 30.718956 | 103.4 KB | 51.238026 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.37:54321 | 201.9 KB | 17520.0 | 34.0 | 306.0 |
mean | 201.9 KB | 17520.0 | 34.0 | 306.0 |
min | 201.9 KB | 17520.0 | 34.0 | 306.0 |
max | 201.9 KB | 17520.0 | 34.0 | 306.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 201.9 KB | 17520.0 | 34.0 | 306.0 |
Column-by-Column Summary:
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | |
type | int | int | int | int | real | real | real | real | enum |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | -26.7 | 0.1251 | 0.0 | -15.6 | 0.0 |
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 24.4 | 1.0 | 26.924 | 36.1 | 11.0 |
sigma | 0.500014270017 | 3.44794972385 | 8.79649804852 | 6.92238411188 | 10.9731282097 | 0.185792011866 | 2.56215129179 | 10.0396739531 | 2.47814708663 |
zero_count | 0 | 0 | 0 | 730 | 268 | 0 | 501 | 269 | 17 |
missing_count | 0 | 0 | 0 | 0 | 67 | 67 | 15660 | 67 | 14980 |
# Filter down to the weather at Noon
wthr3 = wthr2[ wthr2["Hour Local"]==12 ]
# Let's now get days since the epoch: convert year/month/day into epoch time,
# then back to epoch days. mktime needs zero-based months and days, but the
# data is 1-based.
wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
secsPerDay=1000*60*60*24
wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
wthr3.describe()
# msec looks sane (numbers like 1.3e12 are in the correct range for
# milliseconds since 1970), and the epoch-day numbers match closely with
# those in the CitiBike dataset.
Rows: 730 Cols: 11
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 80 | 21.390373 | 6.3 KB | 11.955688 |
C0D | Constant Reals | 13 | 3.4759357 | 1.0 KB | 1.9427994 |
C1 | 1-Byte Integers | 30 | 8.021391 | 2.6 KB | 5.0176535 |
C1N | 1-Byte Integers (w/o NAs) | 56 | 14.973262 | 4.9 KB | 9.375875 |
C1S | 1-Byte Fractions | 34 | 9.090909 | 3.5 KB | 6.698922 |
C2S | 2-Byte Fractions | 34 | 9.090909 | 4.2 KB | 8.062618 |
C8D | 64-bit Reals | 127 | 33.95722 | 29.8 KB | 56.946445 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.37:54321 | 52.3 KB | 730.0 | 34.0 | 374.0 |
mean | 52.3 KB | 730.0 | 34.0 | 374.0 |
min | 52.3 KB | 730.0 | 34.0 | 374.0 |
max | 52.3 KB | 730.0 | 34.0 | 374.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 52.3 KB | 730.0 | 34.0 | 374.0 |
Column-by-Column Summary:
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | msec | Days | |
type | int | int | int | int | real | real | real | real | enum | int | int |
mins | 2013.0 | 1.0 | 1.0 | 12.0 | -26.7 | 0.1723 | 0.0 | -13.9 | 0.0 | 1.3570704e+12 | 15706.0 |
maxs | 2014.0 | 12.0 | 31.0 | 12.0 | 23.3 | 1.0 | 12.446 | 34.4 | 10.0 | 1.420056e+12 | 16435.0 |
sigma | 0.500342818004 | 3.45021529307 | 8.80227802701 | 0.0 | 11.1062964725 | 0.179945027923 | 2.36064248615 | 10.3989855149 | 2.74674726123 | 18219740080.4 | 210.877136425 |
zero_count | 0 | 0 | 0 | 0 | 14 | 0 | 15 | 7 | 1 | 0 | 0 |
missing_count | 0 | 0 | 0 | 0 | 3 | 3 | 660 | 3 | 620 | 0 | 0 |
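# Repeating the earlier epoch-day spot check on this frame's min and max Days
# (15706 and 16435 above) backs up that claim: the weather spans exactly the
# two calendar years.
import datetime
print datetime.date(1970,1,1) + datetime.timedelta(days=15706)  # 2013-01-01
print datetime.date(1970,1,1) + datetime.timedelta(days=16435)  # 2014-12-31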
# Let's drop the extra time columns to make an easy-to-handle dataset.
wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")
# Also, most rain values are missing - let's assume those are zero-rain days
rain = wthr4["Rain (mm)"]
rain[ rain.isna() ] = 0
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
print "Merge Daily Weather with Bikes-Per-Day"
bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
bpd_with_weather.describe()
bpd_with_weather.show()
Merge Daily Weather with Bikes-Per-Day
Rows: 139,261 Cols: 10
Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C1 | 1-Byte Integers | 32 | 10.0 | 138.1 KB | 2.4677303 |
C1N | 1-Byte Integers (w/o NAs) | 64 | 20.0 | 276.2 KB | 4.9354606 |
C2 | 2-Byte Integers | 96 | 30.000002 | 822.4 KB | 14.692484 |
C8D | 64-bit Reals | 128 | 40.0 | 4.3 MB | 77.90433 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.37:54321 | 5.5 MB | 139261.0 | 32.0 | 320.0 |
mean | 5.5 MB | 139261.0 | 32.0 | 320.0 |
min | 5.5 MB | 139261.0 | 32.0 | 320.0 |
max | 5.5 MB | 139261.0 | 32.0 | 320.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 5.5 MB | 139261.0 | 32.0 | 320.0 |
Column-by-Column Summary:
Days | start station name | bikes | Month | DayOfWeek | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | Dew Point (C) | |
type | int | enum | int | enum | enum | real | real | real | enum | real |
mins | 15887.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.1723 | 0.0 | -13.9 | 0.0 | -26.7 |
maxs | 16314.0 | 339.0 | 680.0 | 11.0 | 6.0 | 1.0 | 8.382 | 34.4 | 10.0 | 23.3 |
sigma | 123.635133897 | 98.50295732 | 64.1243887565 | 3.20373100216 | 2.00302100015 | 0.178408938664 | 1.76771960813 | 10.9454511961 | 2.962709609 | 11.7308194576 |
zero_count | 0 | 428 | 0 | 9949 | 19880 | 0 | 2909 | 1598 | 324 | 1954 |
missing_count | 0 | 0 | 0 | 0 | 0 | 981 | 128246 | 981 | 119130 | 981 |
First 10 rows and first 10 columns:
Days | start station name | bikes | Month | DayOfWeek | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | Dew Point (C) |
16234 | Concord St & Bridge St | 15 | 6 | Thu | 0.934 | 0.508 | 20.0 | mist | 18.9 |
16106 | Cumberland St & Lafayette Ave | 6 | 2 | Tue | 0.9228 | | 0.0 | mist | -1.1 |
15978 | DeKalb Ave & Hudson Ave | 36 | 9 | Sun | 0.4688 | | 20.0 | | 8.3 |
16088 | Allen St & Hester St | 55 | 1 | Fri | 1.0 | | 1.0 | | 1.0 |
15945 | Allen St & Rivington St | 140 | 8 | Tue | 0.5681 | 0.0 | 28.3 | light rain | 18.9 |
16251 | Clinton St & Grand St | 79 | 6 | Sun | 0.5275 | | 27.2 | | 16.7 |
16123 | Clinton St & Joralemon St | 6 | 2 | Fri | 0.3141 | | 9.4 | | -6.7 |
15995 | Clinton St & Tillary St | 22 | 10 | Wed | 0.6765 | | 20.6 | | 14.4 |
16313 | Greenwich St & N Moore St | 74 | 8 | Sat | 0.6287 | | 28.9 | | 21.1 |
16185 | Hancock St & Bedford Ave | 14 | 4 | Thu | 0.2092 | | 15.0 | | -7.2 |
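# The merge above is a left join on the shared "Days" column: every
# station/day row is kept, and weather is attached where a matching day
# exists, which is why Rain and WC1 show blanks above. In pandas terms
# (a local sketch with toy frames, not the H2O call):
import pandas as pd
bpd_pd  = pd.DataFrame({"Days": [15887, 15888], "bikes": [52, 48]})
wthr_pd = pd.DataFrame({"Days": [15887], "Rain (mm)": [0.0]})
print bpd_pd.merge(wthr_pd, on="Days", how="left")  # 15888 keeps NaN weather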
# 7 - Test/Train split again, model build again, this time with weather
split_fit_predict(bpd_with_weather)
Training data has 10 columns and 83357 rows, test has 41881 rows, holdout has 14023
gbm Model Build Progress: [##################################################] 100%
drf Model Build Progress: [##################################################] 100%
glm Model Build Progress: [##################################################] 100%
deeplearning Model Build Progress: [##################################################] 100%
Model | R2 TRAIN | R2 TEST | R2 HOLDOUT | Model Training Time (s) |
GBM | 0.966646796949 | 0.925059327615 | 0.924489377492 | 28.364 |
DRF | 0.896356580051 | 0.846221607246 | 0.849226966424 | 146.965 |
GLM | 0.90033401895 | 0.874089944288 | 0.888666983774 | 0.245 |
DL | 0.949803523619 | 0.921881717344 | 0.923630675348 | 60.212 |