import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn. metrics import mean_squared_error
def reg_func(dataset, algorithm):
dataset.index=np.random.permutation(dataset.index)
features=dataset.columns.drop('cnt')
rmses_test=[]
mses_test=[]
kf=KFold(n_splits= 10, shuffle=True, random_state=1)
for train_index, test_index in kf.split(dataset):
X_train, X_test= dataset[features].iloc[train_index, :], dataset[features].iloc[test_index, :]
Y_train, Y_test= dataset['cnt'].iloc[train_index], dataset['cnt'].iloc[test_index]
algor=algorithm()
algor.fit(X_train, Y_train)
predictions_test=algor.predict(X_test)
mse_test=mean_squared_error(predictions_test, Y_test)
mses_test.append(mse_test)
rmse_test=mse_test**0.5
rmses_test.append(rmse_test)
print ('Mean of MSES and RMSES for {} is :'.format(algorithm.__name__), ' ', round (np.mean(mses_test), 2), 'and', round (np.mean(rmses_test), 2) )
#import dataset
bike_rentals=pd.read_csv('bike_rental_hour.csv')
bike_rentals.head()
instant | dteday | season | yr | mnth | hr | holiday | weekday | workingday | weathersit | temp | atemp | hum | windspeed | casual | registered | cnt | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2011-01-01 | 1 | 0 | 1 | 0 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.81 | 0.0 | 3 | 13 | 16 |
1 | 2 | 2011-01-01 | 1 | 0 | 1 | 1 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0.0 | 8 | 32 | 40 |
2 | 3 | 2011-01-01 | 1 | 0 | 1 | 2 | 0 | 6 | 0 | 1 | 0.22 | 0.2727 | 0.80 | 0.0 | 5 | 27 | 32 |
3 | 4 | 2011-01-01 | 1 | 0 | 1 | 3 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0.0 | 3 | 10 | 13 |
4 | 5 | 2011-01-01 | 1 | 0 | 1 | 4 | 0 | 6 | 0 | 1 | 0.24 | 0.2879 | 0.75 | 0.0 | 0 | 1 | 1 |
#Check the distribution of target column
bike_rentals['cnt'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f1a0e64e048>
#Correlation analysis bewteen target column and predictors columns
bike_rentals.corr()['cnt'].sort_values()
hum -0.322911 weathersit -0.142426 holiday -0.030927 weekday 0.026900 workingday 0.030284 windspeed 0.093234 mnth 0.120638 season 0.178056 yr 0.250495 instant 0.278379 hr 0.394071 atemp 0.400929 temp 0.404772 casual 0.694564 registered 0.972151 cnt 1.000000 Name: cnt, dtype: float64
#Evaluate 'cnt' distribution during different months, seasons, and weather situations
fig, axes=plt.subplots(3, 1, figsize=(15, 15))
sns.stripplot(ax=axes[0], x='mnth', y='cnt', data=bike_rentals, jitter=0.4, size=3)
sns.stripplot(ax=axes[1], x='season', y='cnt', data=bike_rentals, jitter=0.4)
sns.stripplot(ax=axes[2], x='weathersit', y='cnt', data=bike_rentals, jitter=0.4)
plt.show()
bike_rentals.isnull().sum()
instant 0 dteday 0 season 0 yr 0 mnth 0 hr 0 holiday 0 weekday 0 workingday 0 weathersit 0 temp 0 atemp 0 hum 0 windspeed 0 casual 0 registered 0 cnt 0 dtype: int64
bike_rentals=bike_rentals.drop(columns=['instant', 'dteday'], axis=1)
def change_hr(hr):
if hr >=6 and hr<12:
hr=1
elif hr >=12 and hr<18:
hr=2
elif hr >=18 and hr<24:
hr= 3
elif hr >=0 and hr<6:
hr= 4
return hr
bike_rentals['time_label']=bike_rentals['hr'].apply(change_hr)
#Detect peak hours
plt.figure(figsize=(10, 6))
bike_rentals.groupby('hr')['cnt'].mean().plot.bar()
plt.xlabel('Hour', size=20)
plt.ylabel('cnt', size=20)
<matplotlib.text.Text at 0x7f1a0bf24128>
#Convert 'hr' column to peak_hour column
def change_hr_2(hr):
if hr >=7 or hr<20:
hr=1 #peak_hour
else:
hr=0 #non-peak hour
return hr
bike_rentals['peak_hour']=bike_rentals['hr'].apply(change_hr_2)
#Convert the "temp" column back to the actual temperature values
bike_rentals['temp_real']=(bike_rentals['temp']*47)-8
#Convert 'temp_real' column to 'temp_real_cat' column
def change_temp(temp_real):
if temp_real <=10:
temp_real=0 #cold
elif temp_real >10 and temp_real <=15:
temp_real=1 #cool
elif temp_real>15 and temp_real<=25:
temp_real=2 #good
elif temp_real>25:
temp_real=3 #hot
return temp_real
bike_rentals['temp_real_cat']=bike_rentals['temp_real'].apply(change_temp)
revised_categorical_features=['time_label', 'peak_hour', 'temp_real_cat', 'season', 'yr', 'mnth', 'weekday','workingday']
for feature in revised_categorical_features:
bike_rentals[feature]=bike_rentals[feature].astype('category')
dummy_variable=pd.get_dummies(bike_rentals[feature], prefix=feature)
bike_rentals=pd.concat([bike_rentals, dummy_variable], axis=1).drop(columns=feature, axis=1)
def iqr(dataset, col):
q1=dataset[col].quantile(0.25)
q3=dataset[col].quantile(0.75)
iqr=q3-q1
dataset_new=dataset[~((dataset[col]>(q3+1.5*iqr)) | (dataset[col]<(q1-1.5*iqr)))]
return dataset_new
for item in ['cnt', 'temp', 'atemp']:
bike_rentals=iqr(bike_rentals, item)
#Remove parallel columns
bike_rentals=bike_rentals.drop(columns=['casual', 'registered', 'temp', 'temp_real'], axis=1)
#Final features
bike_rentals.columns
Index(['hr', 'holiday', 'weathersit', 'atemp', 'hum', 'windspeed', 'cnt', 'time_label_1', 'time_label_2', 'time_label_3', 'time_label_4', 'peak_hour_1', 'temp_real_cat_0', 'temp_real_cat_1', 'temp_real_cat_2', 'temp_real_cat_3', 'season_1', 'season_2', 'season_3', 'season_4', 'yr_0', 'yr_1', 'mnth_1', 'mnth_2', 'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'weekday_0', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'workingday_0', 'workingday_1'], dtype='object')
#Check the efficacy of three algorithms
reg_func(bike_rentals, LinearRegression)
reg_func(bike_rentals, DecisionTreeRegressor)
reg_func(bike_rentals, RandomForestRegressor)
Mean of MSES and RMSES for LinearRegression is : 11329.86 and 106.41 Mean of MSES and RMSES for DecisionTreeRegressor is : 2932.97 and 54.11 Mean of MSES and RMSES for RandomForestRegressor is : 1723.38 and 41.48
#Use of GridSearchCV to detect best parameters
from sklearn.model_selection import GridSearchCV
bike_rentals.index=np.random.permutation(bike_rentals.index)
features=bike_rentals.columns.drop('cnt')
rf=RandomForestRegressor()
hyperparameters= {'bootstrap': [True, False],'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'max_features': ['auto', 'sqrt'],
'min_samples_leaf': [1, 2, 4],'min_samples_split': [2, 5, 10],'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
hyperparameters_1= {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None]}
hyperparameters_2= {'max_features': ['auto', 'sqrt']}
hyperparameters_3= {'min_samples_leaf': [1, 2, 4]}
hyperparameters_4= {'min_samples_split': [2, 5, 10]}
hyperparameters_5= {'n_estimators': [200, 400, 600]}
list1=[hyperparameters_1, hyperparameters_2, hyperparameters_3, hyperparameters_4, hyperparameters_5]
for item in list1:
grid=GridSearchCV(rf, param_grid=item, cv=10)
grid.fit(bike_rentals[features], bike_rentals['cnt'])
print (grid.best_params_)
{'max_depth': None} {'max_features': 'auto'} {'min_samples_leaf': 2} {'min_samples_split': 5} {'n_estimators': 400}
#Evaluate model using optimized parameters
bike_rentals.index=np.random.permutation(bike_rentals.index)
features=bike_rentals.columns.drop('cnt')
rmses_test=[]
mses_test=[]
kf=KFold(n_splits= 10, shuffle=True, random_state=1)
for train_index, test_index in kf.split(bike_rentals):
X_train, X_test= bike_rentals[features].iloc[train_index, :], bike_rentals[features].iloc[test_index, :]
Y_train, Y_test= bike_rentals['cnt'].iloc[train_index], bike_rentals['cnt'].iloc[test_index]
rf=RandomForestRegressor(max_depth=90, max_features='auto', min_samples_leaf=2, min_samples_split=5, n_estimators=600)
rf.fit(X_train, Y_train)
predictions_test=rf.predict(X_test)
mse_test=mean_squared_error(predictions_test, Y_test)
mses_test.append(mse_test)
rmse_test=mse_test**0.5
rmses_test.append(rmse_test)
print ('Mean of MSES and RMSES for optimized {} is :'.format(RandomForestRegressor.__name__), ' ', round (np.mean(mses_test), 2), 'and', round (np.mean(rmses_test), 2) )
Mean of MSES and RMSES for optimized RandomForestRegressor is : 1338.25 and 36.58 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1533.69 and 39.08 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1532.44 and 39.09 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1570.95 and 39.59 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1523.21 and 38.97 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1510.73 and 38.82 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1495.36 and 38.62 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1527.5 and 39.03 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1522.84 and 38.97 Mean of MSES and RMSES for optimized RandomForestRegressor is : 1532.52 and 39.1
using the optimized regressor (RandomForestRegressor(max_depth=90, max_features='auto', min_samples_leaf=2, min_samples_split=5, n_estimators=600)), I could reach to a final MSES value of 1532 and RMSES value of 39.1.