Note: at one point I used all of these imports, but over time I got rid of the ones that were unhelpful.
import pandas as pd
from pandas.tools.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import cross_val_score
# import statsmodels.formula.api as smf
import time
from itertools import chain
%matplotlib inline
the complete breakdown of what I plan on doing for my own clarity
# Here I have created helper functions to deal with the data and to create submissions.
# nicedata() is a bit sloppy with the column names and with getting the columns into the order I wanted.
def nicedata(data):
    '''
    Change data into a good format. Use for both train and test.

    Splits the 'datetime' column (expected as 'YYYY-MM-DD HH:00:00') into
    integer 'year', 'month', 'day' and 'time' (hour) columns, drops
    'datetime', and moves the four new columns to the front of the frame.

    The input DataFrame is not modified; a new DataFrame is returned.
    '''
    # Named hour_str (not `time`) so the imported `time` module is not shadowed.
    date_str, hour_str = zip(*[item.split(' ') for item in data['datetime']])
    year, month, day = zip(*[item.split('-') for item in date_str])

    # `axis` must be passed by keyword: the positional form is rejected by
    # recent pandas releases, while the keyword form works everywhere.
    data2 = data.drop('datetime', axis=1)

    # split datetime into year, month, day and time
    data2['year'] = list(year)
    data2['month'] = list(month)
    data2['day'] = list(day)
    data2['time'] = list(hour_str)

    # Rotate the four freshly appended columns to the front:
    # [..., year, month, day, time] -> [year, month, day, time, ...]
    col = data2.columns.tolist()
    col = col[-4:] + col[:-4]
    data2 = data2[col]

    # remove the ':00:00' suffix and convert every date part from str to int
    data2['time'] = data2['time'].str.replace(':00:00', '').astype(int)
    for part in ('year', 'month', 'day'):
        data2[part] = data2[part].astype(int)
    return data2
# data = pd.read_csv('data/train.csv')
# data = nicedata(data)
def submitdata(pred,name='submission'):
    '''
    Write a submission CSV from a set of predictions.

    The 'datetime' column is re-read from data/test.csv (instead of being
    rebuilt from year/month/day/time) and paired column-wise with `pred`.
    Counts are cast to int and the file is saved as
    data/<name><MM-DD>.csv, where MM-DD is today's date.
    '''
    # original timestamps, straight from the untouched test file
    stamps = pd.read_csv('data/test.csv')['datetime']

    # side-by-side: timestamp | predicted count
    submit = pd.concat([stamps, pred], axis=1)
    submit.columns = ['datetime', 'count']
    submit['count'] = submit['count'].astype(int)

    # save to file, tagging the name with the current month-day
    out_path = 'data/' + name + time.strftime("%m-%d") + '.csv'
    submit.to_csv(out_path, index=False)
def normalizedata(data):
    '''
    Min-max scale the weather features to the range [0, 1].

    Rescales 'temp', 'atemp', 'humidity' and 'windspeed' using each
    column's own minimum and maximum. The columns are overwritten on the
    DataFrame that was passed in, and that same DataFrame is returned.

    Missing values are ignored when finding the min/max and stay missing
    in the output. A column whose values are all equal becomes 0.0
    instead of the NaN that 0/0 would produce.

    Raises KeyError if one of the four columns is absent.
    '''
    features_to_norm = ['temp','atemp','humidity','windspeed']
    for x in features_to_norm:
        column = data[x]
        # Series.min()/max() skip NaN; the builtin min()/max() return
        # order-dependent results as soon as a NaN is present.
        lo = column.min()
        span = column.max() - lo
        if span == 0:
            # constant column: avoid dividing by zero
            data[x] = (column - lo).astype(float)
        else:
            # normalized between 0 and 1 but can later change
            data[x] = (column - lo) / span
    return data
# Switch off pandas' chained-assignment warning, which would otherwise be
# raised by the column assignments made in this notebook.
pd.set_option('mode.chained_assignment', None)
Here we will plot the data for 2 days, 1 working and 1 non-working. Days are NOT always 24 hours; sometimes data is missing. On the last day's plot we also overlay the average over the month (in red), just to show that there is some similarity (i.e. the first few days are not so different from the days overall).
# Load both CSVs and split 'datetime' into year/month/day/time columns.
data = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
data = nicedata(data)
test2 = nicedata(test)

# Hard-coded row slices, presumably one per day at the start of the training
# set; they are uneven because a day does not always have 24 rows.
day_1 = data[0:24]
day_2 = data[24:47]
day_3 = data[47:69]
day_4 = data[69:92]

#re add (ax2, ax3) to below command and uncomment out below to make 4 graphs
f, (ax1, ax4) = plt.subplots(2, sharex=True, sharey=True)
ax1.plot(day_1.time, day_1['count'])
ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(day_2.time,day_2['count'])
# ax3.plot(day_3.time,day_3['count'])
ax4.plot(day_4.time, day_4['count'])
f.subplots_adjust(hspace=0)
f.set_size_inches(12,7)

# Every comparison is parenthesised: `&` binds tighter than `==`, so the
# unparenthesised form compared the whole combined mask against True and
# only selected the right rows by accident.
jan2011 = data[(data['month']==1) & (data['year']==2011) & (data['workingday']==True)]

# Mean of every column per hour of day over January 2011 working days,
# drawn in red on the current (last) axes.
grouped = jan2011.groupby('time').mean()
plt.plot(grouped['count'], c='r')
# plt.figure()
# below plot if necessary
# plt.plot(data['time'],data['count'].values)
[<matplotlib.lines.Line2D at 0x10c693bd0>]
As you can see, for working days (the second 2) there is a clear spike during morning and afternoon rushes.
Using two different models, SVR and RandomForestRegressor, then comparing cross-validation scores. Note that you cannot use data from the future (i.e. you can't use data from Feb 2011 for Jan 2011 dates), so I just iterated over each month in each year and created a separate model for each. An interesting idea would be to use all previous months to create a model for each month, but it did not work for me as expected.
# Reload and re-prepare both sets so this cell does not depend on earlier ones.
test = pd.read_csv('data/test.csv')
data = pd.read_csv('data/train.csv')
data = nicedata(data)
test = nicedata(test)
# NOTE(review): train and test are each scaled with their own min/max, so the
# same raw value can map to different scaled values in the two sets.
test = normalizedata(test)
data = normalizedata(data)

years = [2011, 2012]
months = list(range(1, 13))
hours = list(range(0, 24))
features = ['time','workingday','atemp','humidity','windspeed','weather']

# One SVR per (year, month): fit on that month's training rows only and
# predict that month's test rows, so no model sees data from another month.
pred_months_years = []
for y in years:
    for m in months:
        # print(y,m)
        monthSVR = SVR(kernel='rbf', C=1e3, gamma=1)
        muse = data[(data['month']==m) & (data['year']==y)]
        tuse = test[(test['month']==m) & (test['year']==y)]
        monthSVR.fit(muse[features], muse['count'])
        predmonthSVR = monthSVR.predict(tuse[features])
        pred_months_years.append(predmonthSVR)

# Flatten the per-month arrays into one Series, in (year, month) loop order.
z = pred_months_years
merged = list(chain(*z))
z = pd.Series(merged)
z.name = "count"
predictionsSVR = pd.DataFrame(z)

# Floor the predictions at 1. `.loc` replaces `.ix`, which no longer exists
# in current pandas. Values stay floats; submitdata() casts to int on write.
predictionsSVR.loc[predictionsSVR['count'] <= 1, :] = 1
# submitdata(subsub,name='SVRrbf')
compared to random forest
# Reload and re-prepare both sets so this cell does not depend on earlier ones.
test = pd.read_csv('data/test.csv')
data = pd.read_csv('data/train.csv')
data = nicedata(data)
test = nicedata(test)
test = normalizedata(test)
data = normalizedata(data)

features = ['time','workingday','atemp','humidity','windspeed','weather']

# One random forest per (year, month): fit on that month's training rows only
# and predict that month's test rows.
pred_months_years = []
for y in [2011, 2012]:
    for m in range(1, 13):
        # print(y,m)
        monthforest = RandomForestRegressor(n_estimators=250)
        muse = data[(data['month']==m) & (data['year']==y)]
        tuse = test[(test['month']==m) & (test['year']==y)]
        monthforest.fit(muse[features], muse['count'])
        predmonth = monthforest.predict(tuse[features])
        pred_months_years.append(predmonth)

# Flatten the per-month arrays into one Series, in (year, month) loop order.
z = pred_months_years
merged = list(chain(*z))
predictionsRF = pd.Series(merged)

# Predictions are kept as floats here; submitdata() casts them to int
# itself when it writes the CSV.
submitdata(predictionsRF, name='timeinforest')
Here we will plot the random forest predictions and the SVR predictions and compare them to a nearby day from the training data. In the future I would like to make it pick a random day from the predictions and then compare it with the nearest day from train.
# Training rows for 19 Jan 2011, used as the reference curve. This name was
# previously only assigned in commented-out code, so the plot below raised
# a NameError.
day_19 = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]

fig = plt.figure()
fig.set_size_inches(10,7)

# Top row: the two models side by side. Second row: the reference day.
rf1 = plt.subplot2grid((3, 4), (0, 0), colspan=2)
svr2 = plt.subplot2grid((3, 4), (0, 2), colspan=2)
ax3 = plt.subplot2grid((3, 4), (1, 0), colspan=4, rowspan=1)

# First 24 test rows against the first 24 predictions of each model.
rf1.plot(test.time[0:24], predictionsRF[0:24])
svr2.plot(test.time[0:24], predictionsSVR[0:24])
ax3.plot(day_19.time, day_19['count'])

ax3.set_xlabel('Time')
ax3.set_ylabel('Count')
ax3.set_title('Day 19')
rf1.set_title('Day 20 Random Forest')
svr2.set_title('Day 20 SVR')
plt.tight_layout()
cross validation:
# 5-fold cross-validation of both models on `muse`, i.e. whichever month
# the last training loop left behind.
# No `scoring` is passed, so cross_val_score uses each estimator's own
# .score(); for scikit-learn regressors that is R^2, which is the number
# reported as "Accuracy" below.
cv_features = ['time','workingday','atemp','humidity','windspeed','weather']
cv_inputs = muse[cv_features]
cv_target = muse['count']

scoreMF = cross_val_score(monthforest, cv_inputs, cv_target, cv=5)
scoreSVR = cross_val_score(monthSVR, cv_inputs, cv_target, cv=5)

# mean score with a +/- two-standard-deviation spread across the folds
rf_summary = (scoreMF.mean(), scoreMF.std() * 2)
svr_summary = (scoreSVR.mean(), scoreSVR.std() * 2)
print("Accuracy of Random Forest Regressor: %0.2f (+/- %0.2f)" % rf_summary)
print("Accuracy of SVR: %0.2f (+/- %0.2f)" % svr_summary)
Accuracy of Random Forest Regressor: 0.87 (+/- 0.09) Accuracy of SVR: 0.81 (+/- 0.10)
# from sklearn import tree
# dt = tree.ExtraTreeRegressor()
# #set target, train and test. train and test must have same number of features
# df = data
# target = df['count']
# train = df[['time','holiday','season','temp','atemp','windspeed','weather','humidity']]
# test = test2[['time','holiday','season','temp','atemp','windspeed','weather','humidity']]
# dt.fit(train,target)
# predicted_probs = dt.predict(test)
# predicted_probs = pd.Series(predicted_probs)
# predicted_probs = predicted_probs.map(lambda x: int(x))
# keep = pd.read_csv('data/test.csv')
# keep = keep['datetime']
# #save to file
# submit = pd.concat([keep,predicted_probs],axis=1)
# # print(forest.feature_importances_)
# submit.columns=['datetime','count']
# submit.to_csv('data/submissiondtree.csv',index=False)
# plt.figure()
# # pl.scatter(tr, y, c="k", label="data")
# plt.plot(train['time'], target, c="g", label="train", linewidth=2)
# # plt.plot(test['time'], predicted_probs, c="r", label="test", linewidth=2)
# plt.xlabel("data")
# plt.ylabel("target")
# plt.title("Decision Tree Regression")
# plt.legend()
# plt.show()
#removed plot for SVM
# day_19 = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]
# f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
# ax1.plot(day_19.time,day_19['count'])
# # ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(test.time[0:24],subsub[0:24])
# # ax3.plot(day_3.time,day_3['count'])
# # ax4.plot(day_4.time,day_4['count'])
# f.subplots_adjust(hspace=0)
# f.set_size_inches(10,7)
#removed plot for random forest
# day_19rf = data[(data['month']==1) & (data['year']==2011) & (data['day']==19)]
# f, (ax1, ax2) = plt.subplots(2, sharex=True, sharey=True)
# ax1.plot(day_19.time,day_19['count'])
# # ax1.set_title('2 Weekends, 2 Working days')
# ax2.plot(test.time[0:24],z[0:24])
# # ax3.plot(day_3.time,day_3['count'])
# # ax4.plot(day_4.time,day_4['count'])
# f.subplots_adjust(hspace=0)
# f.set_size_inches(10,7)
# submitdata(z,name='timeinforest')