In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
import sklearn

# feature engineering
from sklearn.preprocessing import OneHotEncoder

#models
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

#metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Silence FutureWarning messages only; all other warning categories are still shown.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
In [7]:
# Column labels for all 17 fields of hour.csv; passed to read_csv together with
# header=0 so they replace whatever labels the file's first line carries.
names = [
    'instant', 'date', 'season', 'year', 'month', 'hour',
    'holiday', 'weekday', 'workingday', 'weathersit',
    'temp', 'atemp', 'humidity', 'windspeed',
    'casual', 'registered', 'total',
]

# usecols keeps column positions 2..16, i.e. everything except 'instant' and 'date'.
dataset = pd.read_csv(
    'hour.csv',
    header=0,
    names=names,
    usecols=list(range(2, 17)),
)

# Quick look at the first five rows to confirm the column selection.
print(dataset.head(5))
   season  year  month  hour  holiday  weekday  workingday  weathersit  temp  \
0       1     0      1     0        0        6           0           1  0.24   
1       1     0      1     1        0        6           0           1  0.22   
2       1     0      1     2        0        6           0           1  0.22   
3       1     0      1     3        0        6           0           1  0.24   
4       1     0      1     4        0        6           0           1  0.24   

    atemp  humidity  windspeed  casual  registered  total  
0  0.2879      0.81        0.0       3          13     16  
1  0.2727      0.80        0.0       8          32     40  
2  0.2727      0.80        0.0       5          27     32  
3  0.2879      0.75        0.0       3          10     13  
4  0.2879      0.75        0.0       0           1      1  
In [8]:
# One-hot encode the categorical 'weathersit' column and rebuild `dataset` so the
# target column 'total' ends up last.
# NOTE: this cell reassigns `dataset` after dropping 'weathersit', so running it a
# second time without re-running the load cell first raises a KeyError.

# The encoder expects a 2-D (n_samples, 1) array, hence the reshape.
weatherSits = dataset['weathersit'].values.reshape(-1, 1)
total = dataset['total'].values

#One-Hot Encoding
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4. Try the new name
# first and fall back to the old one so the cell runs on either side of the rename.
try:
    OHEr = OneHotEncoder(sparse_output=False)
except TypeError:
    OHEr = OneHotEncoder(sparse=False)
OHEd = OHEr.fit_transform(weatherSits)

#remove unwanted columns
dataset = dataset.drop(columns=['weathersit','casual','registered','total'])

#add new OHE columns and put total back at the end
# Name each indicator column after the category value it encodes (weather1,
# weather2, ...) rather than assuming exactly four categories are present.
for category, indicator in zip(OHEr.categories_[0], OHEd.T):
    dataset['weather' + str(int(category))] = indicator
dataset['total'] = total

print(dataset.head(5))
   season  year  month  hour  holiday  weekday  workingday  temp   atemp  \
0       1     0      1     0        0        6           0  0.24  0.2879   
1       1     0      1     1        0        6           0  0.22  0.2727   
2       1     0      1     2        0        6           0  0.22  0.2727   
3       1     0      1     3        0        6           0  0.24  0.2879   
4       1     0      1     4        0        6           0  0.24  0.2879   

   humidity  windspeed  weather1  weather2  weather3  weather4  total  
0      0.81        0.0       1.0       0.0       0.0       0.0     16  
1      0.80        0.0       1.0       0.0       0.0       0.0     40  
2      0.80        0.0       1.0       0.0       0.0       0.0     32  
3      0.75        0.0       1.0       0.0       0.0       0.0     13  
4      0.75        0.0       1.0       0.0       0.0       0.0      1  
In [9]:
# Split the frame into a feature matrix X and a target vector Y, then hold out
# 20% of the rows for validation with a fixed seed for reproducibility.
array = dataset.values

# 'total' is the last column of `dataset`; slice relative to the end instead of
# hard-coding index 15 so the split stays correct if the number of feature
# columns ever changes.
X = array[:, :-1]
Y = array[:, -1]
validation_size = 0.20
seed = 11

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)
In [14]:
# Ordinary least-squares baseline. fit() returns the estimator itself, so the
# fitted model can be bound in one statement; score() reports R^2 on the
# held-out validation split.
reg_model = LinearRegression().fit(X_train, Y_train)
reg_model.score(X_validation, Y_validation)
Out[14]:
0.41800686189447844
In [33]:
# Random forest of 500 trees, each capped at depth 60, with a fixed seed.
# score() reports R^2 on the held-out validation split.
reg_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=60,
    random_state=0,
).fit(X_train, Y_train)
reg_model.score(X_validation, Y_validation)
Out[33]:
0.9478203512307453
In [17]:
# ElasticNet with its default penalty settings and a fixed seed.
# score() reports R^2 on the held-out validation split.
reg_model = ElasticNet(random_state=0)
reg_model.fit(X_train, Y_train)
reg_model.score(X_validation, Y_validation)
Out[17]:
0.2727850252848387
In [19]:
# Lasso (L1-penalised linear regression) with default regularisation strength
# and a fixed seed. score() reports R^2 on the held-out validation split.
reg_model = Lasso(random_state=0).fit(X_train, Y_train)
reg_model.score(X_validation, Y_validation)
Out[19]:
0.4147073584579234
In [21]:
# Ridge (L2-penalised linear regression) with default regularisation strength
# and a fixed seed. score() reports R^2 on the held-out validation split.
reg_model = Ridge(random_state=0).fit(X_train, Y_train)
reg_model.score(X_validation, Y_validation)
Out[21]:
0.41804293724431285
In [32]:
# Support vector regression with C=400, an epsilon tube of 0.2, and
# gamma='scale'. score() reports R^2 on the held-out validation split.
reg_model = SVR(
    C=400.0,
    epsilon=0.2,
    gamma='scale',
).fit(X_train, Y_train)
reg_model.score(X_validation, Y_validation)
Out[32]:
0.7816133895418931
In [ ]: