In [1]:
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error as mae_
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
bike_rentals = pd.read_csv('bike_rental_hour.csv', parse_dates= ['dteday'])
target = 'cnt'
columns = list(bike_rentals.columns.drop([target,'casual','dteday','registered']))

instant - A unique sequential ID number for each row

dteday - The date of the rentals

season - The season in which the rentals occurred

yr - The year the rentals occurred

mnth - The month the rentals occurred

hr - The hour the rentals occurred

holiday - Whether or not the day was a holiday

weekday - The day of the week (as a number, 0 to 6)

workingday - Whether or not the day was a working day

weathersit - The weather (as a categorical variable)

temp - The temperature, on a 0-1 scale

atemp - The adjusted temperature

hum - The humidity, on a 0-1 scale

windspeed - The wind speed, on a 0-1 scale

casual - The number of casual riders (people who hadn't previously signed up with the bike sharing program)

registered - The number of registered riders (people who had already signed up)

cnt - The total number of bike rentals (casual + registered)

In [3]:
bike_rentals.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
instant       17379 non-null int64
dteday        17379 non-null datetime64[ns]
season        17379 non-null int64
yr            17379 non-null int64
mnth          17379 non-null int64
hr            17379 non-null int64
holiday       17379 non-null int64
weekday       17379 non-null int64
workingday    17379 non-null int64
weathersit    17379 non-null int64
temp          17379 non-null float64
atemp         17379 non-null float64
hum           17379 non-null float64
windspeed     17379 non-null float64
casual        17379 non-null int64
registered    17379 non-null int64
cnt           17379 non-null int64
dtypes: datetime64[ns](1), float64(4), int64(12)
memory usage: 2.3 MB
In [4]:
plt.hist(bike_rentals['cnt'])
plt.show()
In [5]:
bike_rentals.corr()['cnt'].sort_values(ascending = False)*100
Out[5]:
cnt           100.000000
registered     97.215073
casual         69.456408
temp           40.477228
atemp          40.092930
hr             39.407150
instant        27.837869
yr             25.049490
season         17.805573
mnth           12.063776
windspeed       9.323378
workingday      3.028437
weekday         2.689986
holiday        -3.092730
weathersit    -14.242614
hum           -32.291074
Name: cnt, dtype: float64

Rentals correlate very strongly with the registered rider count and strongly with the casual rider count, which is expected since cnt is their sum. Temperature has a moderate positive relationship with rentals, and humidity has a moderate inverse relationship (high humidity, low rental rate).

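Since cnt is literally casual + registered, those two columns would leak the target into the features, which is why the feature list defined in cell 2 drops them. A minimal sanity-check sketch:

# cnt should equal casual + registered on every row, and neither leaky
# column should appear in the feature list.
assert (bike_rentals['casual'] + bike_rentals['registered'] == bike_rentals['cnt']).all()
assert 'casual' not in columns and 'registered' not in columns
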
In [6]:
def hour_category(hour):
    # Bucket the hour of day into four time-of-day labels:
    # 1 = morning (6-11), 2 = afternoon (12-17), 3 = evening (18-23), 4 = night (0-5)
    if hour in range(6, 12):
        return 1
    elif hour in range(12, 18):
        return 2
    elif hour in range(18, 24):
        return 3
    elif hour in range(0, 6):
        return 4
bike_rentals['time_label'] = bike_rentals['hr'].apply(hour_category)
bike_rentals[['hr','time_label']].sample(5)
Out[6]:
       hr  time_label
5157   19           3
1686   21           3
15869  10           1
3769   23           3
13264  21           3

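For reference, the same bucketing can be written more compactly with pd.cut; this is just an equivalence sketch (time_label_alt is an illustrative name), not a change to the pipeline above:

# Intervals (-1, 5], (5, 11], (11, 17], (17, 23] map to night (4),
# morning (1), afternoon (2), evening (3), matching hour_category.
time_label_alt = pd.cut(bike_rentals['hr'], bins=[-1, 5, 11, 17, 23],
                        labels=[4, 1, 2, 3]).astype(int)
assert (time_label_alt == bike_rentals['time_label']).all()
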
For the error metric I'll use mean absolute error (MAE), because it's easy to interpret: if you predict 100 rentals with a mean absolute error of 10, you are expecting 100 give or take 10. It's as straightforward a measure as you could ask for.

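As a toy illustration (made-up numbers, not rows from this dataset), MAE is just the average absolute difference between predictions and true values:

# Toy example: true rentals vs predictions for three hours.
y_true = np.array([100, 150, 200])
y_pred = np.array([90, 165, 205])
# |100-90| + |150-165| + |200-205| = 10 + 15 + 5, mean = 10
print(mae_(y_true, y_pred))  # 10.0
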
In [7]:
np.random.seed(1)
# pandas' sample() draws from NumPy's random state, so seeding NumPy
# (rather than the built-in random module) makes the split reproducible
# and verifiable
train = bike_rentals.sample(frac=0.8)

test = bike_rentals.loc[~bike_rentals.index.isin(train.index)]
# "everything except where bike_rentals index matches train index"
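For reference, an equivalent 80/20 split could be done with scikit-learn's train_test_split; this sketch is an alternative, not what the cells below use:

from sklearn.model_selection import train_test_split

# 80/20 split with a fixed random_state for reproducibility.
train_alt, test_alt = train_test_split(bike_rentals, test_size=0.2, random_state=1)
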
In [8]:
col_corrs = 100*bike_rentals.corr()['cnt']
col_corrs[columns].sort_values(ascending = False)
Out[8]:
temp          40.477228
atemp         40.092930
hr            39.407150
instant       27.837869
yr            25.049490
season        17.805573
mnth          12.063776
windspeed      9.323378
workingday     3.028437
weekday        2.689986
holiday       -3.092730
weathersit   -14.242614
hum          -32.291074
Name: cnt, dtype: float64
In [9]:
def model_test(model):
    # Fit on the training split and report MAE on the held-out test split.
    model.fit(train[columns], train[target])
    predictions = model.predict(test[columns])
    # mean absolute error, (true, predicted)
    error = mae_(test[target], predictions)
    print(error)
    print('--------------')
    print(test[target].describe())
model_test(LinearRegression())
106.31518181294032
--------------
count    3476.000000
mean      191.565305
std       180.493989
min         1.000000
25%        43.000000
50%       146.000000
75%       286.000000
max       977.000000
Name: cnt, dtype: float64

With an error of about 106 against a test-set mean of roughly 192 (and a maximum of 977), our margin of error is quite large at this point. Significant changes are needed before we get any meaningful accuracy.

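To put that number in context, here is a rough sketch expressing the linear regression's MAE as a fraction of the typical rental count (refitting the same model used by model_test):

# Express linear regression's MAE relative to the average test-set count.
lr = LinearRegression()
lr.fit(train[columns], train[target])
lr_mae = mae_(test[target], lr.predict(test[columns]))
print(lr_mae / test[target].mean())  # roughly 0.55, i.e. off by ~55% of the mean
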
In [10]:
model_test(DecisionTreeRegressor())
32.601841196777904
--------------
count    3476.000000
mean      191.565305
std       180.493989
min         1.000000
25%        43.000000
50%       146.000000
75%       286.000000
max       977.000000
Name: cnt, dtype: float64

Evidently, the decision tree algorithm is much more accurate even before any tuning: an error of roughly 33 compared to the 106 of the linear regression.

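An unconstrained decision tree tends to memorize the training data; a quick sketch (not one of the original cells) that would expose this is to compare training and test error:

# A near-zero training MAE alongside a much larger test MAE indicates overfitting.
tree = DecisionTreeRegressor()
tree.fit(train[columns], train[target])
print(mae_(train[target], tree.predict(train[columns])))  # training error
print(mae_(test[target], tree.predict(test[columns])))    # test error
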
In [11]:
model_test(RandomForestRegressor())
26.063521288837745
--------------
count    3476.000000
mean      191.565305
std       180.493989
min         1.000000
25%        43.000000
50%       146.000000
75%       286.000000
max       977.000000
Name: cnt, dtype: float64

The error is now about 26, compared to the decision tree's 33. Random forest appears to be the best method so far.

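Depending on the scikit-learn version, RandomForestRegressor may default to as few as 10 trees; raising n_estimators is usually a cheap improvement worth trying, sketched here (results not shown):

# More trees generally reduce variance at the cost of runtime; random_state
# fixes the forest's internal randomness so runs are comparable.
model_test(RandomForestRegressor(n_estimators=100, random_state=1))
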
In [13]:
def adjustment_(model):
    # Fit on the training split and return the MAE on the test split.
    model.fit(train[columns], train[target])
    predictions = model.predict(test[columns])
    # mean absolute error, (true, predicted)
    return mae_(test[target], predictions)

mae_list = []
for a in range(100):
    margin = adjustment_(RandomForestRegressor(max_depth=1 + a,
                                               min_samples_split=2))
    mae_list.append(margin)
# Report the depth(s) with the lowest error; index a corresponds to max_depth = 1 + a.
min_ = min(mae_list)
print([index for index, value in enumerate(mae_list) if value == min_])

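A more systematic alternative to the manual loop above would be scikit-learn's GridSearchCV with cross-validation on the training split; the parameter grid below is illustrative, not a tuned result:

from sklearn.model_selection import GridSearchCV

# Illustrative grid; the exact values are assumptions, not tuned results.
param_grid = {'max_depth': [5, 10, 20, None], 'min_samples_split': [2, 5, 10]}
search = GridSearchCV(RandomForestRegressor(random_state=1), param_grid,
                      scoring='neg_mean_absolute_error', cv=3)
search.fit(train[columns], train[target])
print(search.best_params_)
print(-search.best_score_)  # cross-validated MAE of the best combination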