import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles
# across the CitiBike network of stations, by predicting the number of bike
# trips taken from the station every day. Use 10 million rows of historical
# data, and eventually add weather data.
# Connect to a cluster (attaches to a local H2O at the default port,
# starting one if none is running).
h2o.init()
Connecting to H2O server at http://localhost:54321... successful!
/Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:90: DeprecationWarning: DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead. def _ipython_display_formatter_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:96: DeprecationWarning: DisplayFormatter._formatters_default is deprecated: use @default decorator instead. def _formatters_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:675: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead. def _deferred_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:667: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead. def _singleton_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:670: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead. def _type_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:667: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead. def _singleton_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:670: DeprecationWarning: PlainTextFormatter._type_printers_default is deprecated: use @default decorator instead. def _type_printers_default(self): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/IPython/core/formatters.py:675: DeprecationWarning: PlainTextFormatter._deferred_printers_default is deprecated: use @default decorator instead. def _deferred_printers_default(self):
H2O cluster uptime: | 2 mins 28 secs |
H2O cluster version: | 3.9.1.99999 |
H2O cluster name: | H2O_from_python_navdeepgill_t343ab |
H2O cluster total nodes: | 1 |
H2O cluster free memory: | 3.244 Gb |
H2O cluster total cores: | 8 |
H2O cluster allowed cores: | 8 |
H2O cluster is healthy: | True |
H2O cluster is locked: | True |
H2O connection url: | http://localhost:54321 |
H2O connection proxy: | None |
Python version: | 2.7.11 final |
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = False
def mylocate(s):
    """Resolve dataset path *s*: an s3n:// URL when data_source_is_s3 is set,
    otherwise a local path found via the h2o git project directory."""
    if data_source_is_s3:
        return "s3n://h2o-public-test-data/" + s
    return _locate(s)
# Pick either the big or the small demo.
# Big data is 10M rows
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
# Fourteen consecutive months of trip files, 2013-07 through 2014-08.
_big_test_months = ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11",
                    "2013-12", "2014-01", "2014-02", "2014-03", "2014-04",
                    "2014-05", "2014-06", "2014-07", "2014-08"]
big_test = [mylocate("bigdata/laptop/citibike-nyc/" + m + ".csv")
            for m in _big_test_months]
# ----------
# 1- Load data - 1 row per bicycle trip. Has columns showing the start and end
# station, trip duration and trip start time and day. The larger dataset
# totals about 10 million rows
print("Import and Parse bike data")
# NOTE(review): with data_source_is_s3 = True this resolves to an s3n:// path
# and requires AWS credentials on the cluster (see the failure trace below).
data = h2o.import_file(path=small_test)
Import and Parse bike data Warning: Method get_json in class H2OConnection is deprecated.
--------------------------------------------------------------------------- H2OResponseError Traceback (most recent call last) <ipython-input-6-4d7c875f5051> in <module>() 23 # totals about 10 million rows 24 print("Import and Parse bike data") ---> 25 data = h2o.import_file(path=small_test) /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc in import_file(path, destination_frame, parse, header, sep, col_names, col_types, na_strings) 336 337 return H2OFrame()._import_parse(path, destination_frame, header, sep, col_names, --> 338 col_types, na_strings) 339 340 /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/frame.pyc in _import_parse(self, path, destination_frame, header, separator, column_names, column_types, na_strings) 196 197 def _import_parse(self, path, destination_frame, header, separator, column_names, column_types, na_strings): --> 198 rawkey = h2o.lazy_import(path) 199 self._parse(rawkey,destination_frame, header, separator, column_names, column_types, na_strings) 200 return self /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc in lazy_import(path) 213 A path to a data file (remote or local). 214 """ --> 215 return [_import(p)[0] for p in path] if isinstance(path, (list, tuple)) else _import(path) 216 217 /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/h2o.pyc in _import(path) 217 218 def _import(path): --> 219 j = h2oconn.get_json(url_suffix="ImportFiles", path=path) 220 if j['fails']: raise ValueError("ImportFiles of " + path + " failed on " + str(j['fails'])) 221 return j['destination_frames'] /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/utils/backward_compatibility.pyc in <lambda>(*args, **kwargs) 64 self._bcin = { 65 # Creating lambdas in a loop, need to make sure that `fun` is bound to each lambda separately. 
---> 66 name: (lambda fun: lambda *args, **kwargs: fun(self, *args, **kwargs))(fun) 67 for name, fun in viewitems(self._bc["im"]) 68 } /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc in <lambda>(*args, **kwargs) 665 "post": lambda *args, **kwargs: _deprecated_post(*args, **kwargs), 666 "delete": lambda *args, **kwargs: _deprecated_delete(*args, **kwargs), --> 667 "get_json": lambda *args, **kwargs: _deprecated_get(*args, **kwargs), 668 "post_json": lambda *args, **kwargs: _deprecated_post(*args, **kwargs), 669 } /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc in _deprecated_get(self, url_suffix, **kwargs) 1155 restver = kwargs.pop("_rest_version") if "_rest_version" in kwargs else 3 1156 endpoint = "GET /%d/%s" % (restver, url_suffix) -> 1157 return self.request(endpoint, data=kwargs) 1158 1159 def _deprecated_post(self, url_suffix, **kwargs): /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc in request(self, endpoint, data, json, filename) 232 auth=self._auth, verify=self._verify_ssl_cert, proxies=self._proxies) 233 self._log_end_transaction(start_time, resp) --> 234 return self._process_response(resp) 235 236 except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e: /Users/navdeepgill/anaconda/lib/python2.7/site-packages/h2o/connection.pyc in _process_response(response) 585 # Client errors (400 = "Bad Request", 404 = "Not Found", 412 = "Precondition Failed") 586 if status_code in {400, 404, 412} and isinstance(data, (H2OErrorV3, H2OModelBuilderErrorV3)): --> 587 raise H2OResponseError(data) 588 589 # Server errors (notably 500 = "Server Error") H2OResponseError: Server error java.lang.IllegalArgumentException: Error: AWS Access Key ID and Secret Access Key must be specified as the username or password (respectively) of a s3n URL, or by setting the fs.s3n.awsAccessKeyId or fs.s3n.awsSecretAccessKey properties (respectively). 
Request: GET /3/ImportFiles params: {'path': 's3n://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv'}
# ----------
# 2- light data munging: group the bike starts per-day, converting the 10M rows
# of trips to about 140,000 station&day combos - predicting the number of trip
# starts per-station-per-day.
# Convert start time to: Day since the Epoch
start_time = data["starttime"]
# Despite the name, this is MILLIseconds per day: h2o time columns are in ms.
secsPerDay = 1000 * 60 * 60 * 24
data["Days"] = (start_time / secsPerDay).floor()
data.describe()
Rows:1,037,712 Cols:16 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 17 | 2.2135415 | 1.3 KB | 0.0022872 |
C1 | 1-Byte Integers | 48 | 6.25 | 1016.6 KB | 1.7506603 |
C1N | 1-Byte Integers (w/o NAs) | 48 | 6.25 | 1016.6 KB | 1.7506603 |
C1S | 1-Byte Fractions | 79 | 10.286459 | 1.6 MB | 2.8878725 |
C2 | 2-Byte Integers | 243 | 31.640625 | 10.0 MB | 17.696283 |
C2S | 2-Byte Fractions | 49 | 6.3802085 | 2.0 MB | 3.5701983 |
C4 | 4-Byte Integers | 32 | 4.166667 | 2.6 MB | 4.6726856 |
C4S | 4-Byte Fractions | 39 | 5.078125 | 3.2 MB | 5.6373096 |
C8 | 64-bit Integers | 60 | 7.8125 | 9.9 MB | 17.432673 |
C8D | 64-bit Reals | 153 | 19.921875 | 25.3 MB | 44.59937 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
mean | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
min | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
max | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 56.7 MB | 1037712.0 | 48.0 | 768.0 |
tripduration | starttime | stoptime | start station id | start station name | start station latitude | start station longitude | end station id | end station name | end station latitude | end station longitude | bikeid | usertype | birth year | gender | Days | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | time | time | int | enum | real | real | int | enum | real | real | int | enum | int | int | int |
mins | 60.0 | 1.380610868e+12 | 1.380611083e+12 | 72.0 | 0.0 | 40.680342423 | -74.01713445 | 72.0 | 0.0 | 40.680342423 | -74.01713445 | 14529.0 | 0.0 | 1899.0 | 0.0 | 15979.0 |
mean | 825.614754383 | 1.38191371692e+12 | 1.38191454253e+12 | 443.714212614 | NaN | 40.7345188586 | -73.9911328848 | 443.207421712 | NaN | 40.7342847885 | -73.9912702982 | 17644.0716451 | 0.906095332809 | 1975.77839486 | 1.12375591686 | 15993.8523906 |
maxs | 1259480.0 | 1.383289197e+12 | 1.38341851e+12 | 3002.0 | 329.0 | 40.770513 | -73.9500479759 | 3002.0 | 329.0 | 40.770513 | -73.9500479759 | 20757.0 | 1.0 | 1997.0 | 2.0 | 16010.0 |
sigma | 2000.3732323 | 778871729.132 | 778847387.503 | 354.434325075 | NaN | 0.0195734073053 | 0.0123161234106 | 357.398217058 | NaN | 0.0195578458116 | 0.0123855811965 | 1717.68112134 | 0.291696182123 | 11.1314906238 | 0.544380593291 | 9.02215033588 |
zeros | 0 | 0 | 0 | 0 | 5239 | 0 | 0 | 0 | 5449 | 0 | 0 | 0 | 97446 | 0 | 97498 | 0 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 97445 | 0 | 0 |
0 | 326.0 | 1.380610868e+12 | 1.380611194e+12 | 239.0 | Willoughby St & Fleet St | 40.69196566 | -73.9813018 | 366.0 | Clinton Ave & Myrtle Ave | 40.693261 | -73.968896 | 16052.0 | Subscriber | 1982.0 | 1.0 | 15979.0 |
1 | 729.0 | 1.380610881e+12 | 1.38061161e+12 | 322.0 | Clinton St & Tillary St | 40.696192 | -73.991218 | 398.0 | Atlantic Ave & Furman St | 40.69165183 | -73.9999786 | 19412.0 | Customer | nan | 0.0 | 15979.0 |
2 | 520.0 | 1.380610884e+12 | 1.380611404e+12 | 174.0 | E 25 St & 1 Ave | 40.7381765 | -73.97738662 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 19645.0 | Subscriber | 1984.0 | 1.0 | 15979.0 |
3 | 281.0 | 1.380610885e+12 | 1.380611166e+12 | 430.0 | York St & Jay St | 40.7014851 | -73.98656928 | 323.0 | Lawrence St & Willoughby St | 40.69236178 | -73.98631746 | 16992.0 | Subscriber | 1985.0 | 1.0 | 15979.0 |
4 | 196.0 | 1.380610887e+12 | 1.380611083e+12 | 403.0 | E 2 St & 2 Ave | 40.72502876 | -73.99069656 | 401.0 | Allen St & Rivington St | 40.72019576 | -73.98997825 | 15690.0 | Subscriber | 1986.0 | 1.0 | 15979.0 |
5 | 1948.0 | 1.380610908e+12 | 1.380612856e+12 | 369.0 | Washington Pl & 6 Ave | 40.73224119 | -74.00026394 | 307.0 | Canal St & Rutgers St | 40.71427487 | -73.98990025 | 19846.0 | Subscriber | 1977.0 | 1.0 | 15979.0 |
6 | 1327.0 | 1.380610908e+12 | 1.380612235e+12 | 254.0 | W 11 St & 6 Ave | 40.73532427 | -73.99800419 | 539.0 | Metropolitan Ave & Bedford Ave | 40.71534825 | -73.96024116 | 14563.0 | Subscriber | 1986.0 | 2.0 | 15979.0 |
7 | 1146.0 | 1.380610917e+12 | 1.380612063e+12 | 490.0 | 8 Ave & W 33 St | 40.751551 | -73.993934 | 438.0 | St Marks Pl & 1 Ave | 40.72779126 | -73.98564945 | 16793.0 | Subscriber | 1959.0 | 1.0 | 15979.0 |
8 | 380.0 | 1.380610918e+12 | 1.380611298e+12 | 468.0 | Broadway & W 55 St | 40.7652654 | -73.98192338 | 385.0 | E 55 St & 2 Ave | 40.75797322 | -73.96603308 | 16600.0 | Customer | nan | 0.0 | 15979.0 |
9 | 682.0 | 1.380610925e+12 | 1.380611607e+12 | 300.0 | Shevchenko Pl & E 6 St | 40.728145 | -73.990214 | 519.0 | Pershing Square N | 40.75188406 | -73.97770164 | 15204.0 | Subscriber | 1992.0 | 1.0 | 15979.0 |
# Now do a monster Group-By. Count bike starts per-station per-day. Ends up
# with about 340 stations times 400 days (140,000 rows). This is what we want
# to predict.
# Chain the group-by straight into the counted frame; the intermediate
# GroupBy object is not needed afterwards.
bpd = data.group_by(["Days","start station name"]).count().get_frame()
bpd.set_name(2, "bikes")  # rename the generated count column
bpd.show()
bpd.describe()
bpd.dim
Days | start station name | bikes |
---|---|---|
15979 | 1 Ave & E 15 St | 97 |
15979 | 1 Ave & E 18 St | 75 |
15979 | 1 Ave & E 30 St | 113 |
15979 | 10 Ave & W 28 St | 74 |
15979 | 11 Ave & W 27 St | 139 |
15979 | 11 Ave & W 41 St | 60 |
15979 | 12 Ave & W 40 St | 90 |
15979 | 2 Ave & E 31 St | 88 |
15979 | 2 Ave & E 58 St | 55 |
15979 | 3 Ave & Schermerhorn St | 8 |
Rows:10,450 Cols:3 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 1 | 1.0416667 | 80 B | 0.1364815 |
C1N | 1-Byte Integers (w/o NAs) | 1 | 1.0416667 | 412 B | 0.7028798 |
C1S | 1-Byte Fractions | 31 | 32.291664 | 12.4 KB | 21.714207 |
C2 | 2-Byte Integers | 63 | 65.625 | 44.3 KB | 77.446434 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 57.2 KB | 10450.0 | 32.0 | 96.0 |
mean | 57.2 KB | 10450.0 | 32.0 | 96.0 |
min | 57.2 KB | 10450.0 | 32.0 | 96.0 |
max | 57.2 KB | 10450.0 | 32.0 | 96.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 57.2 KB | 10450.0 | 32.0 | 96.0 |
Days | start station name | bikes | |
---|---|---|---|
type | int | enum | int |
mins | 15979.0 | 0.0 | 1.0 |
mean | 15994.4415311 | NaN | 99.3025837321 |
maxs | 16010.0 | 329.0 | 553.0 |
sigma | 9.23370172444 | NaN | 72.9721964301 |
zeros | 0 | 32 | 0 |
missing | 0 | 0 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 97.0 |
1 | 15979.0 | 1 Ave & E 18 St | 75.0 |
2 | 15979.0 | 1 Ave & E 30 St | 113.0 |
3 | 15979.0 | 10 Ave & W 28 St | 74.0 |
4 | 15979.0 | 11 Ave & W 27 St | 139.0 |
5 | 15979.0 | 11 Ave & W 41 St | 60.0 |
6 | 15979.0 | 12 Ave & W 40 St | 90.0 |
7 | 15979.0 | 2 Ave & E 31 St | 88.0 |
8 | 15979.0 | 2 Ave & E 58 St | 55.0 |
9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 |
[10450, 3]
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print("Quantiles of bikes-per-day")
# Default probes (1%,10%,25%,33%,50%,67%,75%,90%,99%) — see table below.
bpd["bikes"].quantile().show()
Quantiles of bikes-per-day
Probs | bikesQuantiles |
---|---|
0.01 | 4.49 |
0.1 | 19 |
0.25 | 43 |
0.333 | 57 |
0.5 | 87 |
0.667 | 118 |
0.75 | 137 |
0.9 | 192 |
0.99 | 334.51 |
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
# Rebuild a millisecond timestamp from the day number so h2o's time
# accessors (month / dayOfWeek) can be applied.
day_msec = bpd["Days"] * secsPerDay
bpd["Month"] = day_msec.month().asfactor()
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = day_msec.dayOfWeek()
print("Bikes-Per-Day")
bpd.describe()
Bikes-Per-Day Rows:10,450 Cols:5 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 33 | 20.625 | 2.6 KB | 3.6613781 |
CBS | Bits | 6 | 3.7500002 | 666 B | 0.9236658 |
C1N | 1-Byte Integers (w/o NAs) | 27 | 16.875 | 10.4 KB | 14.803617 |
C1S | 1-Byte Fractions | 31 | 19.375 | 12.4 KB | 17.65228 |
C2 | 2-Byte Integers | 63 | 39.375 | 44.3 KB | 62.959057 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 70.4 KB | 10450.0 | 32.0 | 160.0 |
mean | 70.4 KB | 10450.0 | 32.0 | 160.0 |
min | 70.4 KB | 10450.0 | 32.0 | 160.0 |
max | 70.4 KB | 10450.0 | 32.0 | 160.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 70.4 KB | 10450.0 | 32.0 | 160.0 |
Days | start station name | bikes | Month | DayOfWeek | |
---|---|---|---|---|---|
type | int | enum | int | enum | enum |
mins | 15979.0 | 0.0 | 1.0 | 0.0 | 0.0 |
mean | 15994.4415311 | NaN | 99.3025837321 | 0.968612440191 | NaN |
maxs | 16010.0 | 329.0 | 553.0 | 1.0 | 6.0 |
sigma | 9.23370172444 | NaN | 72.9721964301 | 0.174371128617 | NaN |
zeros | 0 | 32 | 0 | 328 | 1635 |
missing | 0 | 0 | 0 | 0 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 97.0 | 9 | Mon |
1 | 15979.0 | 1 Ave & E 18 St | 75.0 | 9 | Mon |
2 | 15979.0 | 1 Ave & E 30 St | 113.0 | 9 | Mon |
3 | 15979.0 | 10 Ave & W 28 St | 74.0 | 9 | Mon |
4 | 15979.0 | 11 Ave & W 27 St | 139.0 | 9 | Mon |
5 | 15979.0 | 11 Ave & W 41 St | 60.0 | 9 | Mon |
6 | 15979.0 | 12 Ave & W 40 St | 90.0 | 9 | Mon |
7 | 15979.0 | 2 Ave & E 31 St | 88.0 | 9 | Mon |
8 | 15979.0 | 2 Ave & E 58 St | 55.0 | 9 | Mon |
9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 | 9 | Mon |
# ----------
# 3- Fit a model on train; using test as validation
# Function for doing a classic test/train/holdout split
def split_fit_predict(data):
    """Split *data* into train/test/holdout, fit four models, report MSEs.

    Fits GBM, DRF, GLM (poisson) and DeepLearning models predicting the
    "bikes" column from every other column, then prints an HTML table of
    per-split MSE and training time.  The fitted models are also published
    as module globals gbm0/drf0/glm0/dl0 for interactive inspection.

    :param data: H2OFrame with a "bikes" response column and a "Days" column.
    """
    global gbm0, drf0, glm0, dl0
    # Classic Test/Train split
    r = data['Days'].runif()  # Random UNIForm numbers, one per row
    train = data[r < 0.6]
    test = data[(0.6 <= r) & (r < 0.9)]
    hold = data[0.9 <= r]
    print("Training data has", train.ncol, "columns and", train.nrow,
          "rows, test has", test.nrow, "rows, holdout has", hold.nrow)
    bike_names_x = data.names
    bike_names_x.remove("bikes")

    def _timed_train(model):
        # Train `model` on the shared splits; return elapsed wall-clock secs.
        # Reads bike_names_x at call time, so the WC1 removal below is seen
        # by the models trained after it (GLM, DL) but not before (GBM, DRF).
        s = time.time()
        model.train(x=bike_names_x, y="bikes",
                    training_frame=train, validation_frame=test)
        return time.time() - s

    # Run GBM
    gbm0 = H2OGradientBoostingEstimator(ntrees=500,  # 500 works well
                                        max_depth=6,
                                        learn_rate=0.1)
    gbm_elapsed = _timed_train(gbm0)
    # Run DRF
    drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)
    drf_elapsed = _timed_train(drf0)
    # Run GLM: drop the (mostly-missing) weather description column first.
    if "WC1" in bike_names_x:
        bike_names_x.remove("WC1")
    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family="poisson")
    glm_elapsed = _timed_train(glm0)
    # Run DL
    dl0 = H2ODeepLearningEstimator(hidden=[50, 50, 50, 50], epochs=50)
    dl_elapsed = _timed_train(dl0)

    # ----------
    # 4- Score on holdout set & report
    def _mses(model):
        # MSE of `model` on each of the three splits, in table-column order.
        return [model.model_performance(f).mse() for f in (train, test, hold)]

    # make a pretty HTML table printout of the results
    header = ["Model", "mse TRAIN", "mse TEST", "mse HOLDOUT",
              "Model Training Time (s)"]
    table = [
        ["GBM"] + _mses(gbm0) + [round(gbm_elapsed, 3)],
        ["DRF"] + _mses(drf0) + [round(drf_elapsed, 3)],
        ["GLM"] + _mses(glm0) + [round(glm_elapsed, 3)],
        ["DL "] + _mses(dl0) + [round(dl_elapsed, 3)],
    ]
    h2o.display.H2ODisplay(table, header)
# --------------
# Split the data (into test & train), fit some models and predict on the holdout data
# (the fitted models are left in the globals gbm0/drf0/glm0/dl0 for inspection)
split_fit_predict(bpd)
# Here we see (per the table above) a holdout R^2 of roughly 0.92 for GBM and
# 0.84 for GLM — exact numbers vary by run. This means that given just the
# station, the month, and the day-of-week we can explain about 90% of the
# variance of the bike-trip-starts.
Training data has 5 columns and 6172 rows, test has 3238 rows, holdout has 1040 gbm Model Build Progress: [##################################################] 100% drf Model Build Progress: [##################################################] 100% glm Model Build Progress: [##################################################] 100% deeplearning Model Build Progress: [##################################################] 100%
Model | R2 TRAIN | R2 TEST | R2 HOLDOUT | Model Training Time (s) |
GBM | 0.9976981 | 0.9274821 | 0.9183267 | 5.612 |
DRF | 0.8294274 | 0.7694496 | 0.7611063 | 5.607 |
GLM | 0.8597208 | 0.8465429 | 0.8447966 | 0.14 |
DL | 0.9546943 | 0.9115880 | 0.8978001 | 6.845 |
# ----------
# 5- Now lets add some weather
# Load weather data: hourly NYC observations for 2013 and 2014,
# parsed from two files into a single frame.
wthr1 = h2o.import_file(path=[mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
                              mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
# Peek at the data
wthr1.describe()
Parse Progress: [##################################################] 100% Rows:17,520 Cols:50 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 107 | 6.294118 | 8.4 KB | 0.7889721 |
C0D | Constant Reals | 436 | 25.647058 | 34.1 KB | 3.2148771 |
CXI | Sparse Integers | 17 | 1.0 | 1.5 KB | 0.1399135 |
C1 | 1-Byte Integers | 346 | 20.352942 | 197.4 KB | 18.634672 |
C1N | 1-Byte Integers (w/o NAs) | 214 | 12.588236 | 122.3 KB | 11.544063 |
C1S | 1-Byte Fractions | 214 | 12.588236 | 125.3 KB | 11.822968 |
C2S | 2-Byte Fractions | 196 | 11.529412 | 214.5 KB | 20.242111 |
C4S | 4-Byte Fractions | 170 | 10.0 | 356.1 KB | 33.612423 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
mean | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
min | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
max | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 1.0 MB | 17520.0 | 34.0 | 1700.0 |
Year Local | Month Local | Day Local | Hour Local | Year UTC | Month UTC | Day UTC | Hour UTC | Cavok Reported | Cloud Ceiling (m) | Cloud Cover Fraction | Cloud Cover Fraction 1 | Cloud Cover Fraction 2 | Cloud Cover Fraction 3 | Cloud Cover Fraction 4 | Cloud Cover Fraction 5 | Cloud Cover Fraction 6 | Cloud Height (m) 1 | Cloud Height (m) 2 | Cloud Height (m) 3 | Cloud Height (m) 4 | Cloud Height (m) 5 | Cloud Height (m) 6 | Dew Point (C) | Humidity Fraction | Precipitation One Hour (mm) | Pressure Altimeter (mbar) | Pressure Sea Level (mbar) | Pressure Station (mbar) | Snow Depth (cm) | Temperature (C) | Visibility (km) | Weather Code 1 | Weather Code 1/ Description | Weather Code 2 | Weather Code 2/ Description | Weather Code 3 | Weather Code 3/ Description | Weather Code 4 | Weather Code 4/ Description | Weather Code 5 | Weather Code 5/ Description | Weather Code 6 | Weather Code 6/ Description | Weather Code Most Severe / Icon Code | Weather Code Most Severe | Weather Code Most Severe / Description | Wind Direction (degrees) | Wind Gust (m/s) | Wind Speed (m/s) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | int | int | int | int | int | real | real | real | real | real | int | int | int | real | real | real | int | int | int | real | real | real | real | int | int | int | real | real | int | enum | int | enum | int | enum | int | enum | int | enum | int | enum | int | int | enum | int | real | real |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 0.0 | 0.0 | 61.0 | 0.0 | 0.0 | 0.25 | 0.5 | NaN | NaN | NaN | 60.96 | 213.36 | 365.76 | NaN | NaN | NaN | -26.7 | 0.1251 | 0.0 | 983.2949 | NaN | NaN | NaN | -15.6 | 0.001 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 3.0 | 0.0 | 0.0 | 1.0 | 0.0 | 10.0 | 7.2 | 0.0 |
mean | 2013.5 | 6.52602739726 | 15.7205479452 | 11.5 | 2013.50057078 | 6.52511415525 | 15.721347032 | 11.5001141553 | 0.0 | 1306.31195846 | 0.416742490522 | 0.361207349081 | 0.872445384073 | 0.963045685279 | 0.0 | 0.0 | 0.0 | 1293.9822682 | 1643.73900166 | 2084.89386376 | 0.0 | 0.0 | 0.0 | 4.31304646766 | 0.596736389159 | 1.37993010753 | 1017.82581441 | 0.0 | 0.0 | 0.0 | 12.5789090701 | 14.3914429682 | 4.84251968504 | NaN | 3.65867689358 | NaN | 2.84660766962 | NaN | 2.01149425287 | NaN | 4.125 | NaN | 3.0 | 0.0 | 1.37848173516 | 4.84251968504 | NaN | 194.69525682 | 9.42216948073 | 2.41032887849 |
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 2015.0 | 12.0 | 31.0 | 23.0 | 0.0 | 3657.6 | 1.0 | 1.0 | 1.0 | 1.0 | NaN | NaN | NaN | 3657.5999 | 3657.5999 | 3657.5999 | NaN | NaN | NaN | 24.4 | 1.0 | 26.924 | 1042.2113 | NaN | NaN | NaN | 36.1 | 16.0934 | 60.0 | 11.0 | 60.0 | 10.0 | 36.0 | 7.0 | 27.0 | 4.0 | 27.0 | 2.0 | 3.0 | 0.0 | 16.0 | 60.0 | 11.0 | 360.0 | 20.58 | 10.8 |
sigma | 0.500014270017 | 3.44794972385 | 8.79649804852 | 6.92238411188 | 0.500584411716 | 3.44782405458 | 8.79561488868 | 6.92230165203 | 0.0 | 995.339856966 | 0.462720830993 | 0.42770569708 | 0.197155690367 | 0.0861015598104 | -0.0 | -0.0 | -0.0 | 962.743095854 | 916.73861349 | 887.215847511 | -0.0 | -0.0 | -0.0 | 10.9731282097 | 0.185792011866 | 2.56215129179 | 7.46451697179 | -0.0 | -0.0 | -0.0 | 10.0396739531 | 3.69893623033 | 5.70486576983 | NaN | 6.13386253912 | NaN | 5.80553286364 | NaN | 3.12340844261 | NaN | 6.15223536611 | NaN | 0.0 | 0.0 | 4.07386062702 | 5.70486576983 | NaN | 106.350000031 | 1.81511871115 | 1.61469790524 |
zeros | 0 | 0 | 0 | 730 | 0 | 0 | 0 | 730 | 17455 | 0 | 8758 | 8758 | 0 | 0 | -17520 | -17520 | -17520 | 0 | 0 | 0 | -17520 | -17520 | -17520 | 268 | 0 | 501 | 0 | -17520 | -17520 | -17520 | 269 | 0 | 0 | 17 | 0 | 30 | 0 | 13 | -5044 | -5024 | -11241 | -11229 | -17030 | -17028 | 14980 | 0 | 17 | 0 | 0 | 2768 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 65 | 10780 | 375 | 375 | 14682 | 16535 | 17520 | 17520 | 17520 | 9103 | 14683 | 16535 | 17520 | 17520 | 17520 | 67 | 67 | 15660 | 360 | 17520 | 17520 | 17520 | 67 | 412 | 14980 | 14980 | 16477 | 16477 | 17181 | 17181 | 17433 | 17433 | 17504 | 17504 | 17518 | 17518 | 0 | 14980 | 14980 | 9382 | 14381 | 1283 |
0 | 2013.0 | 1.0 | 1.0 | 0.0 | 2013.0 | 1.0 | 1.0 | 5.0 | 0.0 | 2895.6 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 2895.5999 | 3352.8 | nan | nan | nan | nan | -5.0 | 0.5447 | nan | 1013.0917 | nan | nan | nan | 3.3 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 2.57 | |||||||
1 | 2013.0 | 1.0 | 1.0 | 1.0 | 2013.0 | 1.0 | 1.0 | 6.0 | 0.0 | 3048.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 3048.0 | nan | nan | nan | nan | nan | -4.4 | 0.5463 | nan | 1012.0759 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | 9.77 | 4.63 | |||||||
2 | 2013.0 | 1.0 | 1.0 | 2.0 | 2013.0 | 1.0 | 1.0 | 7.0 | 0.0 | 1828.8 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1828.7999 | nan | nan | nan | nan | nan | -3.3 | 0.619 | nan | 1012.4145 | nan | nan | nan | 3.3 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 7.72 | 1.54 | |||||||
3 | 2013.0 | 1.0 | 1.0 | 3.0 | 2013.0 | 1.0 | 1.0 | 8.0 | 0.0 | 1463.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1463.04 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 3.09 | |||||||
4 | 2013.0 | 1.0 | 1.0 | 4.0 | 2013.0 | 1.0 | 1.0 | 9.0 | 0.0 | 1402.1 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1402.08 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.7531 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | nan | 4.12 | |||||||
5 | 2013.0 | 1.0 | 1.0 | 5.0 | 2013.0 | 1.0 | 1.0 | 10.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | nan | 3.09 | |||||||
6 | 2013.0 | 1.0 | 1.0 | 6.0 | 2013.0 | 1.0 | 1.0 | 11.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3 | 0.5934 | nan | 1012.0759 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 9.26 | 3.09 | |||||||
7 | 2013.0 | 1.0 | 1.0 | 7.0 | 2013.0 | 1.0 | 1.0 | 12.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -3.3 | 0.5934 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | 9.26 | 4.63 | |||||||
8 | 2013.0 | 1.0 | 1.0 | 8.0 | 2013.0 | 1.0 | 1.0 | 13.0 | 0.0 | 1524.0 | 1.0 | 1.0 | nan | nan | nan | nan | nan | 1524.0 | nan | nan | nan | nan | nan | -2.8 | 0.6425 | nan | 1012.4145 | nan | nan | nan | 3.3 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | 260.0 | nan | 3.09 | |||||||
9 | 2013.0 | 1.0 | 1.0 | 9.0 | 2013.0 | 1.0 | 1.0 | 14.0 | 0.0 | 1524.0 | 1.0 | 0.9 | 1.0 | nan | nan | nan | nan | 1524.0 | 3657.5999 | nan | nan | nan | nan | -2.8 | 0.6159 | nan | 1012.4145 | nan | nan | nan | 3.9 | 16.0934 | nan | nan | nan | nan | nan | nan | 0.0 | nan | nan | 9.26 | 3.09 |
# Lots of columns in there! Lets plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature. Alas, drop the "snow" column since it's
# all NA's. Also add in dew point and humidity just in case. Slice out just
# the columns of interest and drop the rest.
keep_cols = ["Year Local", "Month Local", "Day Local", "Hour Local",
             "Dew Point (C)", "Humidity Fraction",
             "Precipitation One Hour (mm)", "Temperature (C)",
             "Weather Code 1/ Description"]
wthr2 = wthr1[keep_cols]
# Shorten the two unwieldy names.
renames = {"Precipitation One Hour (mm)": "Rain (mm)",
           "Weather Code 1/ Description": "WC1"}
for old_name, new_name in renames.items():
    wthr2.set_name(wthr2.names.index(old_name), new_name)
wthr2.describe()
# Much better!
Rows:17,520 Cols:9 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 46 | 15.0326805 | 3.6 KB | 1.780005 |
C1 | 1-Byte Integers | 34 | 11.111112 | 19.4 KB | 9.592678 |
C1N | 1-Byte Integers (w/o NAs) | 90 | 29.411766 | 51.5 KB | 25.494701 |
C1S | 1-Byte Fractions | 42 | 13.725491 | 24.0 KB | 11.894592 |
C2S | 2-Byte Fractions | 94 | 30.718956 | 103.4 KB | 51.238026 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 201.9 KB | 17520.0 | 34.0 | 306.0 |
mean | 201.9 KB | 17520.0 | 34.0 | 306.0 |
min | 201.9 KB | 17520.0 | 34.0 | 306.0 |
max | 201.9 KB | 17520.0 | 34.0 | 306.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 201.9 KB | 17520.0 | 34.0 | 306.0 |
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | |
---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | real | real | real | enum |
mins | 2013.0 | 1.0 | 1.0 | 0.0 | -26.7 | 0.1251 | 0.0 | -15.6 | 0.0 |
mean | 2013.5 | 6.52602739726 | 15.7205479452 | 11.5 | 4.31304646766 | 0.596736389159 | 1.37993010753 | 12.5789090701 | NaN |
maxs | 2014.0 | 12.0 | 31.0 | 23.0 | 24.4 | 1.0 | 26.924 | 36.1 | 11.0 |
sigma | 0.500014270017 | 3.44794972385 | 8.79649804852 | 6.92238411188 | 10.9731282097 | 0.185792011866 | 2.56215129179 | 10.0396739531 | NaN |
zeros | 0 | 0 | 0 | 730 | 268 | 0 | 501 | 269 | 17 |
missing | 0 | 0 | 0 | 0 | 67 | 67 | 15660 | 67 | 14980 |
0 | 2013.0 | 1.0 | 1.0 | 0.0 | -5.0 | 0.5447 | nan | 3.3 | |
1 | 2013.0 | 1.0 | 1.0 | 1.0 | -4.4 | 0.5463 | nan | 3.9 | |
2 | 2013.0 | 1.0 | 1.0 | 2.0 | -3.3 | 0.619 | nan | 3.3 | |
3 | 2013.0 | 1.0 | 1.0 | 3.0 | -2.8 | 0.6159 | nan | 3.9 | |
4 | 2013.0 | 1.0 | 1.0 | 4.0 | -2.8 | 0.6159 | nan | 3.9 | |
5 | 2013.0 | 1.0 | 1.0 | 5.0 | -2.8 | 0.6159 | nan | 3.9 | |
6 | 2013.0 | 1.0 | 1.0 | 6.0 | -3.3 | 0.5934 | nan | 3.9 | |
7 | 2013.0 | 1.0 | 1.0 | 7.0 | -3.3 | 0.5934 | nan | 3.9 | |
8 | 2013.0 | 1.0 | 1.0 | 8.0 | -2.8 | 0.6425 | nan | 3.3 | |
9 | 2013.0 | 1.0 | 1.0 | 9.0 | -2.8 | 0.6159 | nan | 3.9 |
# Filter down to the weather at Noon: keep only rows where the local hour is 12.
is_noon = wthr2["Hour Local"] == 12
wthr3 = wthr2[is_noon]
# Let's now get Days since the epoch... convert year/month/day into Epoch
# time (msec), then back down to Epoch days. mktime wants zero-based month
# and day, but the data is 1-based, hence the -1 adjustments.
wthr3["msec"] = h2o.H2OFrame.mktime(
    year=wthr3["Year Local"],
    month=wthr3["Month Local"] - 1,
    day=wthr3["Day Local"] - 1,
    hour=wthr3["Hour Local"])
secsPerDay = 1000 * 60 * 60 * 24
wthr3["Days"] = (wthr3["msec"] / secsPerDay).floor()
wthr3.describe()
# msec looks sane (numbers like 1.3e12 are in the correct range for msec since
# 1970). Epoch Days matches closely with the epoch day numbers from the
# CitiBike dataset.
Rows:730 Cols:11 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 80 | 21.390373 | 6.3 KB | 12.498779 |
C0D | Constant Reals | 13 | 3.4759357 | 1.0 KB | 2.0310516 |
C1 | 1-Byte Integers | 30 | 8.021391 | 2.6 KB | 5.2455816 |
C1N | 1-Byte Integers (w/o NAs) | 56 | 14.973262 | 4.9 KB | 9.801778 |
C1S | 1-Byte Fractions | 34 | 9.090909 | 3.5 KB | 7.0032225 |
C2S | 2-Byte Fractions | 34 | 9.090909 | 4.2 KB | 8.4288645 |
CUD | Unique Reals | 25 | 6.6844916 | 3.6 KB | 7.2297626 |
C8D | 64-bit Reals | 102 | 27.272728 | 23.9 KB | 47.76096 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 50.0 KB | 730.0 | 34.0 | 374.0 |
mean | 50.0 KB | 730.0 | 34.0 | 374.0 |
min | 50.0 KB | 730.0 | 34.0 | 374.0 |
max | 50.0 KB | 730.0 | 34.0 | 374.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 50.0 KB | 730.0 | 34.0 | 374.0 |
Year Local | Month Local | Day Local | Hour Local | Dew Point (C) | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | msec | Days | |
---|---|---|---|---|---|---|---|---|---|---|---|
type | int | int | int | int | real | real | real | real | enum | int | int |
mins | 2013.0 | 1.0 | 1.0 | 12.0 | -26.7 | 0.1723 | 0.0 | -13.9 | 0.0 | 1.3570704e+12 | 15706.0 |
mean | 2013.5 | 6.52602739726 | 15.7205479452 | 12.0 | 4.23012379642 | 0.539728198074 | 1.53125714286 | 14.0687757909 | NaN | 1.3885608526e+12 | 16070.5 |
maxs | 2014.0 | 12.0 | 31.0 | 12.0 | 23.3 | 1.0 | 12.446 | 34.4 | 10.0 | 1.420056e+12 | 16435.0 |
sigma | 0.500342818004 | 3.45021529307 | 8.80227802701 | 0.0 | 11.1062964725 | 0.179945027923 | 2.36064248615 | 10.3989855149 | NaN | 18219740080.4 | 210.877136425 |
zeros | 0 | 0 | 0 | 0 | 14 | 0 | -174 | 7 | -83 | 0 | 0 |
missing | 0 | 0 | 0 | 0 | 3 | 3 | 660 | 3 | 620 | 0 | 0 |
0 | 2013.0 | 1.0 | 1.0 | 12.0 | -3.3 | 0.5934 | nan | 3.9 | 1.3570704e+12 | 15706.0 | |
1 | 2013.0 | 1.0 | 2.0 | 12.0 | -11.7 | 0.4806 | nan | -2.2 | 1.3571568e+12 | 15707.0 | |
2 | 2013.0 | 1.0 | 3.0 | 12.0 | -10.6 | 0.5248 | nan | -2.2 | 1.3572432e+12 | 15708.0 | |
3 | 2013.0 | 1.0 | 4.0 | 12.0 | -7.2 | 0.4976 | nan | 2.2 | 1.3573296e+12 | 15709.0 | |
4 | 2013.0 | 1.0 | 5.0 | 12.0 | -7.2 | 0.426 | nan | 4.4 | 1.357416e+12 | 15710.0 | |
5 | 2013.0 | 1.0 | 6.0 | 12.0 | -1.7 | 0.6451 | nan | 4.4 | haze | 1.3575024e+12 | 15711.0 |
6 | 2013.0 | 1.0 | 7.0 | 12.0 | -6.1 | 0.4119 | nan | 6.1 | 1.3575888e+12 | 15712.0 | |
7 | 2013.0 | 1.0 | 8.0 | 12.0 | -1.7 | 0.5314 | nan | 7.2 | 1.3576752e+12 | 15713.0 | |
8 | 2013.0 | 1.0 | 9.0 | 12.0 | 0.6 | 0.56 | nan | 8.9 | haze | 1.3577616e+12 | 15714.0 |
9 | 2013.0 | 1.0 | 10.0 | 12.0 | -6.1 | 0.3952 | nan | 6.7 | 1.357848e+12 | 15715.0 |
# Drop off the extra time columns to make an easy-to-handle dataset.
wthr4 = wthr3
for _time_col in ["Year Local", "Month Local", "Day Local", "Hour Local", "msec"]:
    wthr4 = wthr4.drop(_time_col)
# Most rain numbers are missing - assume a missing reading means a zero-rain day.
rain_col = wthr4["Rain (mm)"]
rain_col[rain_col.isna()] = 0
wthr4["Rain (mm)"] = rain_col
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
# (left join on the shared "Days" column; keep every bike row).
print("Merge Daily Weather with Bikes-Per-Day")
bpd_with_weather = bpd.merge(wthr4, all_x=True, all_y=False)
bpd_with_weather.describe()
bpd_with_weather.show()
Merge Daily Weather with Bikes-Per-Day Rows:10,450 Cols:10 Chunk compression summary:
chunk_type | chunk_name | count | count_percentage | size | size_percentage |
C0L | Constant Integers | 66 | 20.625 | 5.2 KB | 3.6253278 |
C0D | Constant Reals | 33 | 10.3125 | 2.6 KB | 1.8126639 |
CBS | Bits | 6 | 1.8750001 | 666 B | 0.4572857 |
C1 | 1-Byte Integers | 4 | 1.25 | 1.5 KB | 1.0821055 |
C1N | 1-Byte Integers (w/o NAs) | 28 | 8.75 | 10.8 KB | 7.599456 |
C1S | 1-Byte Fractions | 31 | 9.6875 | 12.4 KB | 8.739238 |
C2 | 2-Byte Integers | 63 | 19.6875 | 44.3 KB | 31.169582 |
CUD | Unique Reals | 89 | 27.812498 | 64.7 KB | 45.514343 |
Frame distribution summary:
size | number_of_rows | number_of_chunks_per_column | number_of_chunks | |
172.16.2.61:54321 | 142.2 KB | 10450.0 | 32.0 | 320.0 |
mean | 142.2 KB | 10450.0 | 32.0 | 320.0 |
min | 142.2 KB | 10450.0 | 32.0 | 320.0 |
max | 142.2 KB | 10450.0 | 32.0 | 320.0 |
stddev | 0 B | 0.0 | 0.0 | 0.0 |
total | 142.2 KB | 10450.0 | 32.0 | 320.0 |
Days | start station name | bikes | Month | DayOfWeek | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | Dew Point (C) | |
---|---|---|---|---|---|---|---|---|---|---|
type | int | enum | int | enum | enum | real | int | real | enum | real |
mins | 15979.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.3485 | 0.0 | 9.4 | 2.0 | -2.2 |
mean | 15994.4415311 | NaN | 99.3025837321 | 0.968612440191 | NaN | 0.562374191388 | 0.0 | 16.9630717703 | NaN | 7.77999043062 |
maxs | 16010.0 | 329.0 | 553.0 | 1.0 | 6.0 | 0.8718 | 0.0 | 26.1 | 8.0 | 19.4 |
sigma | 9.23370172444 | NaN | 72.9721964301 | 0.174371128617 | NaN | 0.149631413472 | 0.0 | 4.29746634617 | NaN | 6.49151146664 |
zeros | 0 | 32 | 0 | 328 | 1635 | 0 | 10450 | 0 | -8494 | 0 |
missing | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9134 | 0 |
0 | 15979.0 | 1 Ave & E 15 St | 97.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
1 | 15979.0 | 1 Ave & E 18 St | 75.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
2 | 15979.0 | 1 Ave & E 30 St | 113.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
3 | 15979.0 | 10 Ave & W 28 St | 74.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
4 | 15979.0 | 11 Ave & W 27 St | 139.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
5 | 15979.0 | 11 Ave & W 41 St | 60.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
6 | 15979.0 | 12 Ave & W 40 St | 90.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
7 | 15979.0 | 2 Ave & E 31 St | 88.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
8 | 15979.0 | 2 Ave & E 58 St | 55.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 | |
9 | 15979.0 | 3 Ave & Schermerhorn St | 8.0 | 9 | Mon | 0.4315 | 0.0 | 23.9 | 10.6 |
Days | start station name | bikes | Month | DayOfWeek | Humidity Fraction | Rain (mm) | Temperature (C) | WC1 | Dew Point (C) |
---|---|---|---|---|---|---|---|---|---|
15979 | 1 Ave & E 15 St | 97 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 1 Ave & E 18 St | 75 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 1 Ave & E 30 St | 113 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 10 Ave & W 28 St | 74 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 11 Ave & W 27 St | 139 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 11 Ave & W 41 St | 60 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 12 Ave & W 40 St | 90 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 2 Ave & E 31 St | 88 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 2 Ave & E 58 St | 55 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 | |
15979 | 3 Ave & Schermerhorn St | 8 | 9 | Mon | 0.4315 | 0 | 23.9 | 10.6 |
# 7 - Test/Train split again, model build again, this time with weather
# split_fit_predict is presumably defined earlier in this script (outside this
# view); per the output below it splits the frame into train/test/holdout,
# builds GBM, DRF, GLM and DeepLearning models, and reports R2 and timings.
split_fit_predict(bpd_with_weather)
Training data has 10 columns and 6289 rows, test has 3080 rows, holdout has 1081 gbm Model Build Progress: [##################################################] 100% drf Model Build Progress: [##################################################] 100% glm Model Build Progress: [##################################################] 100% deeplearning Model Build Progress: [##################################################] 100%
Model | R2 TRAIN | R2 TEST | R2 HOLDOUT | Model Training Time (s) |
GBM | 0.9954410 | 0.9255962 | 0.9230051 | 6.706 |
DRF | 0.8491125 | 0.7430226 | 0.7442895 | 6.692 |
GLM | 0.8660565 | 0.8446801 | 0.8673705 | 0.139 |
DL | 0.9617874 | 0.9117793 | 0.9213475 | 7.972 |