from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae_
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the hourly bike-rental data, parsing the date column into datetimes.
bike_rentals = pd.read_csv('bike_rental_hour.csv', parse_dates=['dteday'])

# Features: everything except the target, its two additive components
# (casual + registered = cnt, so they would leak the answer), and the raw date.
target = 'cnt'
_excluded = {target, 'casual', 'dteday', 'registered'}
columns = [col for col in bike_rentals.columns if col not in _excluded]
instant - A unique sequential ID number for each row
dteday - The date of the rentals
season - The season in which the rentals occurred
yr - The year the rentals occurred
mnth - The month the rentals occurred
hr - The hour the rentals occurred
holiday - Whether or not the day was a holiday
weekday - The day of the week (as a number, 0 to 6)
workingday - Whether or not the day was a working day
weathersit - The weather (as a categorical variable)
temp - The temperature, on a 0-1 scale
atemp - The adjusted temperature
hum - The humidity, on a 0-1 scale
windspeed - The wind speed, on a 0-1 scale
casual - The number of casual riders (people who hadn't previously signed up with the bike sharing program)
registered - The number of registered riders (people who had already signed up)
cnt - The total number of bike rentals (casual + registered)
# Structural overview: column dtypes and non-null counts (no missing values)
bike_rentals.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17379 entries, 0 to 17378 Data columns (total 17 columns): instant 17379 non-null int64 dteday 17379 non-null datetime64[ns] season 17379 non-null int64 yr 17379 non-null int64 mnth 17379 non-null int64 hr 17379 non-null int64 holiday 17379 non-null int64 weekday 17379 non-null int64 workingday 17379 non-null int64 weathersit 17379 non-null int64 temp 17379 non-null float64 atemp 17379 non-null float64 hum 17379 non-null float64 windspeed 17379 non-null float64 casual 17379 non-null int64 registered 17379 non-null int64 cnt 17379 non-null int64 dtypes: datetime64[ns](1), float64(4), int64(12) memory usage: 2.3 MB
# Distribution of the target: hourly rental counts
plt.hist(bike_rentals['cnt'])
plt.show()
# Pearson correlation of every numeric column with the target, as percentages
bike_rentals.corr()['cnt'].sort_values(ascending = False)*100
cnt 100.000000 registered 97.215073 casual 69.456408 temp 40.477228 atemp 40.092930 hr 39.407150 instant 27.837869 yr 25.049490 season 17.805573 mnth 12.063776 windspeed 9.323378 workingday 3.028437 weekday 2.689986 holiday -3.092730 weathersit -14.242614 hum -32.291074 Name: cnt, dtype: float64
The strongest correlations are with registered and casual riders, which is expected since cnt is their sum. Temperature has a moderate positive relation, and humidity a moderate inverse relation (high humidity, low rental rate).
def hour_category(hour):
    """Bucket an hour of the day (0-23) into a time-of-day label.

    Returns 1 for morning (6-11), 2 for afternoon (12-17),
    3 for evening (18-23), 4 for night (0-5).
    Hours outside 0-23 fall through and return None.
    """
    # Chained comparisons replace the original `hour in range(a, b)`
    # membership tests; same result for integer hours, clearer intent.
    if 6 <= hour < 12:
        return 1
    elif 12 <= hour < 18:
        return 2
    elif 18 <= hour < 24:
        return 3
    elif 0 <= hour < 6:
        return 4
    return None
# Map each hour to its time-of-day bucket and spot-check a few rows
bike_rentals['time_label'] = bike_rentals['hr'].apply(hour_category)
bike_rentals[['hr','time_label']].sample(5)
hr | time_label | |
---|---|---|
5157 | 19 | 3 |
1686 | 21 | 3 |
15869 | 10 | 1 |
3769 | 23 | 3 |
13264 | 21 | 3 |
For error ratings I'll use mean absolute error, because it's very easy to understand what it means. If you predict 100 rentals with a mean absolute error of 10, then you are expecting 100 give or take 10. As straightforward a measure as you could ask for.
# NOTE: random.seed only seeds Python's `random` module. pandas' DataFrame.sample
# draws from NumPy's generator, so the original call did NOT make the split
# reproducible; random_state below is what actually pins it down.
random.seed(1)
# 80/20 train/test split, reproducible via random_state
train = bike_rentals.sample(frac=0.8, random_state=1)
# "everything except where bike_rentals index matches train index"
test = bike_rentals.loc[~bike_rentals.index.isin(train.index)]
# Correlations (as percentages) for the feature columns only
col_corrs = 100 * bike_rentals.corr()['cnt']
col_corrs[columns].sort_values(ascending=False)
temp 40.477228 atemp 40.092930 hr 39.407150 instant 27.837869 yr 25.049490 season 17.805573 mnth 12.063776 windspeed 9.323378 workingday 3.028437 weekday 2.689986 holiday -3.092730 weathersit -14.242614 hum -32.291074 Name: cnt, dtype: float64
def model_test(model):
    """Fit *model* on the train split, print its test-set MAE and the
    target's summary statistics for context, and return the MAE.

    Uses the module-level `train`, `test`, `columns`, and `target`.
    """
    model.fit(train[columns], train[target])
    predictions = model.predict(test[columns])
    # mean_absolute_error signature: (y_true, y_pred)
    error = mae_(test[target], predictions)
    # Original computed `error` and then recomputed the MAE inside print();
    # reuse the value and return it so callers can compare models.
    print(error)
    print('--------------')
    print(test[target].describe())
    return error
# Baseline: ordinary least-squares linear regression
model_test(LinearRegression())
106.31518181294032 -------------- count 3476.000000 mean 191.565305 std 180.493989 min 1.000000 25% 43.000000 50% 146.000000 75% 286.000000 max 977.000000 Name: cnt, dtype: float64
With the error being 107 compared to even our maximum of 977, our margin of error is quite large at this time. Significant changes need to be made to achieve any meaningful accuracy.
# Nonlinear model: a single decision tree (captures hour/season interactions)
model_test(DecisionTreeRegressor())
32.601841196777904 -------------- count 3476.000000 mean 191.565305 std 180.493989 min 1.000000 25% 43.000000 50% 146.000000 75% 286.000000 max 977.000000 Name: cnt, dtype: float64
Evidently, the decision tree algorithm is much more accurate even before tuning: 33.5 compared to the 107 of the linear regression.
# Ensemble: random forest, averaging many trees to reduce overfitting
model_test(RandomForestRegressor())
26.063521288837745 -------------- count 3476.000000 mean 191.565305 std 180.493989 min 1.000000 25% 43.000000 50% 146.000000 75% 286.000000 max 977.000000 Name: cnt, dtype: float64
Current error is now 26.7 compared to the previous 33.6. Random forest appears to be the correct method.
def adjustmant_(model):
    """Fit *model* on the train split and return the test-set MAE.

    Silent variant of model_test for use inside tuning loops.
    (Name kept from the original; "adjustment" was misspelled.)
    """
    model.fit(train[columns], train[target])
    predictions = model.predict(test[columns])
    # mean_absolute_error signature: (y_true, y_pred)
    return mae_(test[target], predictions)

# Sweep max_depth from 1 to 100 and record the MAE for each setting.
mae_list = []
for a in range(100):
    margin = adjustmant_(RandomForestRegressor(max_depth=1 + a,
                                               min_samples_split=2))
    mae_list.append(margin)

# We are tuning, so report the depth index(es) with the LOWEST error.
# Two fixes from the original: max() selected the *worst* setting, and
# `enum` is not a builtin — the function is enumerate.
best = min(mae_list)
print([index for index, value in enumerate(mae_list) if value == best])
TypeErrorTraceback (most recent call last) <ipython-input-13-288a407c9260> in <module>() 8 for a in range(100): 9 margin = adjustmant_(RandomForestRegressor(max_depth=1+a, ---> 10 min_samples_split=2)) 11 mae_list.append(margin) 12 max_ = max(mae_list) <ipython-input-13-288a407c9260> in adjustmant_(model) 1 def adjustmant_(model): 2 model.fit(train[columns],train[target]) ----> 3 prediction1 = model.predict(test[columns]) 4 # mean absolute error, (true, predicted) 5 return mae_(test[target], prediction1) TypeError: 'function' object is not subscriptable