#!/usr/bin/env python
# coding: utf-8

# ### Competition Series 1: Forecast use of a city bike share system
#
# Data come from Kaggle: https://www.kaggle.com/c/bike-sharing-demand.
#
# Leaderboard: https://www.kaggle.com/c/bike-sharing-demand/leaderboard

# In[1]:

import pandas as pd
import numpy as np
import seaborn as sns

sns.set(style="whitegrid", color_codes=True)
get_ipython().run_line_magic('matplotlib', 'inline')
sns.set_context("paper")


# ### 1. Import the data

# In[2]:

train = pd.read_csv('bike_train.csv')
test = pd.read_csv('bike_test.csv')


# ### 2. Add new variables
#
# - Whenever I see a time variable, I construct features such as 'year', 'month', 'day', 'weekday', and 'hour'. Here 'day' is only used to split the dataset, so I do not create 'day' in test.
#
# - By observing the biking behaviour of casual and registered users, I add two features, 'adverse_rush_hour' and 'rush_hour_working'.
#
# - After reading many threads on the Kaggle discussion board, I found that a log transform matters for skewed distributions, especially for casual, registered, and count, our target variables.

# In[3]:

train['datetime'] = pd.to_datetime(train['datetime'])
train['year'] = train['datetime'].map(lambda x: x.year)
train['month'] = train['datetime'].map(lambda x: x.month)
train['hour'] = train['datetime'].map(lambda x: x.hour)
train['day'] = train['datetime'].map(lambda x: x.day)
train['weekday'] = train['datetime'].map(lambda x: x.dayofweek)

test['datetime'] = pd.to_datetime(test['datetime'])
test['year'] = test['datetime'].map(lambda x: x.year)
test['month'] = test['datetime'].map(lambda x: x.month)
test['hour'] = test['datetime'].map(lambda x: x.hour)
test['weekday'] = test['datetime'].map(lambda x: x.dayofweek)

# workingday already excludes weekends and holidays
train['weekend'] = (train['weekday'] == 5) | (train['weekday'] == 6)
test['weekend'] = (test['weekday'] == 5) | (test['weekday'] == 6)


# In[6]:

# non-member (casual) bikers:
# like freelancers, they are more active on non-working days
sns.boxplot(x='hour', y='casual', hue='workingday', data=train);


# In[4]:

# for the casual pattern: distance from the mid-afternoon (14:00) peak
train['adverse_rush_hour'] = abs(train['hour'] - 14)
test['adverse_rush_hour'] = abs(test['hour'] - 14)


# In[8]:

sns.lmplot(x='hour', y='adverse_rush_hour', data=train, lowess=True, scatter_kws={"s": 80});


# In[9]:

# member (registered) bikers:
# on working days they are active around the 8:00 and 17:00-18:00 rush hours;
# the feature rush_hour_working captures this situation.
# on non-working days their behaviour looks like the casual bikers';
# the feature adverse_rush_hour captures that situation.
sns.boxplot(x='hour', y='registered', hue='workingday', data=train);


# In[5]:

# for the registered working-day pattern
train['rush_hour_working'] = ((train['hour'] == 7) | (train['hour'] == 8) | (train['hour'] == 17) |
                              (train['hour'] == 18) | (train['hour'] == 19)) & (train['workingday'] == 1)
test['rush_hour_working'] = ((test['hour'] == 7) | (test['hour'] == 8) | (test['hour'] == 17) |
                             (test['hour'] == 18) | (test['hour'] == 19)) & (test['workingday'] == 1)


# In[27]:

# skewed distribution
sns.distplot(train.casual);


# In[28]:

# after the log transform
sns.distplot(np.log(train.casual + 1));


# In[34]:

sns.distplot(train.registered);


# In[33]:

sns.distplot(np.log(train.registered + 1));


# In[36]:

sns.distplot(train['count']);


# In[37]:

sns.distplot(np.log(train['count'] + 1));
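# The histograms above show the skew visually. As a quick numeric check (a
# sketch I add here, assuming scipy is available), the sample skewness of each
# target should drop sharply after the log(x + 1) transform:

# In[ ]:

from scipy.stats import skew

for col in ['casual', 'registered', 'count']:
    print(col, round(skew(train[col]), 2), '->', round(skew(np.log(train[col] + 1)), 2))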
# ### 3. Train the model

# In[12]:

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LarsCV, LassoLarsCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, ExtraTreesRegressor, \
    AdaBoostRegressor, GradientBoostingRegressor


# In[14]:

# models
lr = LinearRegression()
ridge = RidgeCV()
lasso = LassoCV()
net = ElasticNetCV()
lar = LarsCV()
lassolars = LassoLarsCV()
tree = DecisionTreeRegressor(random_state=0)
knn = KNeighborsRegressor()
svr_l = SVR(kernel='linear')
svr_r = SVR()

n_estimators = 100
bag = BaggingRegressor(DecisionTreeRegressor(), n_estimators=n_estimators, random_state=0)
rf = RandomForestRegressor(n_estimators=n_estimators, random_state=0)
etree = ExtraTreesRegressor(n_estimators=n_estimators, random_state=0)
ada = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=n_estimators, random_state=0)
gbr = GradientBoostingRegressor(n_estimators=n_estimators, random_state=0)


# #### 3.1 Split the train set into train and test

# In[38]:

# days 1 to 14
train_train = train[train.day < 15]
# days 15 to 19
train_test = train[train.day >= 15]


# #### 3.2 Choose the variables
#
# - Here I drop 'datetime': it is neither numeric nor string, and some algorithms only run smoothly on those types. More importantly, I have already extracted its information by creating the variables 'year', 'month', 'weekday', 'weekend', and 'hour'.

# In[42]:

train.columns


# In[55]:

x_1 = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',
       'year', 'month', 'hour', 'weekday', 'weekend', 'adverse_rush_hour', 'rush_hour_working']


# #### 3.3 The error metric for comparing models

# In[49]:

# RMSLE, the competition's evaluation metric
def rmsle(pred_count, actual_count):
    return round(np.sqrt(np.mean((np.log(pred_count + 1) - np.log(actual_count + 1)) ** 2)), 5)


# #### 3.4 Try all kinds of models

# In[58]:

# bagging
pred = np.exp(bag.fit(train_train[x_1], np.log(train_train['casual'] + 1)).predict(train_test[x_1])) + \
       np.exp(bag.fit(train_train[x_1], np.log(train_train['registered'] + 1)).predict(train_test[x_1])) - 2
rmsle(pred, train_test['count'])


# In[60]:

# rf
pred = np.exp(rf.fit(train_train[x_1], np.log(train_train['casual'] + 1)).predict(train_test[x_1])) + \
       np.exp(rf.fit(train_train[x_1], np.log(train_train['registered'] + 1)).predict(train_test[x_1])) - 2
rmsle(pred, train_test['count'])


# In[62]:

# etree
pred = np.exp(etree.fit(train_train[x_1], np.log(train_train['casual'] + 1)).predict(train_test[x_1])) + \
       np.exp(etree.fit(train_train[x_1], np.log(train_train['registered'] + 1)).predict(train_test[x_1])) - 2
rmsle(pred, train_test['count'])


# In[86]:

# gbr
pred = np.exp(gbr.fit(train_train[x_1], np.log(train_train['casual'] + 1)).predict(train_test[x_1])) + \
       np.exp(gbr.fit(train_train[x_1], np.log(train_train['registered'] + 1)).predict(train_test[x_1])) - 2
rmsle(pred, train_test['count'])


# In[87]:

# ada
pred = np.exp(ada.fit(train_train[x_1], np.log(train_train['casual'] + 1)).predict(train_test[x_1])) + \
       np.exp(ada.fit(train_train[x_1], np.log(train_train['registered'] + 1)).predict(train_test[x_1])) - 2
rmsle(pred, train_test['count'])
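# The five cells above repeat one pattern: fit on the log-transformed casual
# and registered counts separately, invert the transform, sum the two
# predictions, and score the total against count. A small refactoring sketch
# (my addition, not part of the original run) wraps that pattern:

# In[ ]:

def score_model(model, features):
    """Fit on log(casual + 1) and log(registered + 1) separately, invert with
    exp, sum the two predictions, and return the holdout RMSLE."""
    pred = np.exp(model.fit(train_train[features], np.log(train_train['casual'] + 1)).predict(train_test[features])) + \
           np.exp(model.fit(train_train[features], np.log(train_train['registered'] + 1)).predict(train_test[features])) - 2
    return rmsle(pred, train_test['count'])

# e.g. score_model(rf, x_1) reproduces the rf cell above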
# #### 3.5 Test the models on the test dataset

# In[98]:

# bagging
# 0.43139 x_1
etree_s_pred2 = np.exp(bag.fit(train[x_1], np.log(train['casual'] + 1)).predict(test[x_1])) + \
                np.exp(bag.fit(train[x_1], np.log(train['registered'] + 1)).predict(test[x_1])) - 2
etree_s_df2 = pd.DataFrame({'datetime': test.datetime, 'count': pd.Series(etree_s_pred2)})
etree_s_df2.to_csv('python_bag_x_1.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")


# In[97]:

# random forest
# predict registered and casual separately
# 0.43000 x_1
etree_s_pred2 = np.exp(rf.fit(train[x_1], np.log(train['casual'] + 1)).predict(test[x_1])) + \
                np.exp(rf.fit(train[x_1], np.log(train['registered'] + 1)).predict(test[x_1])) - 2
etree_s_df2 = pd.DataFrame({'datetime': test.datetime, 'count': pd.Series(etree_s_pred2)})
etree_s_df2.to_csv('python_rf_x_1.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")


# In[96]:

# extra trees
# predict registered and casual separately
# 0.41339 x_1
etree_s_pred2 = np.exp(etree.fit(train[x_1], np.log(train['casual'] + 1)).predict(test[x_1])) + \
                np.exp(etree.fit(train[x_1], np.log(train['registered'] + 1)).predict(test[x_1])) - 2
etree_s_df2 = pd.DataFrame({'datetime': test.datetime, 'count': pd.Series(etree_s_pred2)})
etree_s_df2.to_csv('python_etree_x_1.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")


# In[99]:

# gbr
# 0.42185 x_1
etree_s_pred2 = np.exp(gbr.fit(train[x_1], np.log(train['casual'] + 1)).predict(test[x_1])) + \
                np.exp(gbr.fit(train[x_1], np.log(train['registered'] + 1)).predict(test[x_1])) - 2
etree_s_df2 = pd.DataFrame({'datetime': test.datetime, 'count': pd.Series(etree_s_pred2)})
etree_s_df2.to_csv('python_gbr_x_1.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")


# In[100]:

# ada
# 0.41982 x_1
etree_s_pred2 = np.exp(ada.fit(train[x_1], np.log(train['casual'] + 1)).predict(test[x_1])) + \
                np.exp(ada.fit(train[x_1], np.log(train['registered'] + 1)).predict(test[x_1])) - 2
etree_s_df2 = pd.DataFrame({'datetime': test.datetime, 'count': pd.Series(etree_s_pred2)})
etree_s_df2.to_csv('python_ada_x_1.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")


# #### 3.6 The importance of variables in the models

# In[102]:

imp = etree.fit(train[x_1], np.log(train['registered'] + 1)).feature_importances_
df_imp = pd.DataFrame({'features': x_1, 'importance': imp})
sns.barplot(y='features', x='importance', data=df_imp);


# In[103]:

imp = etree.fit(train[x_1], np.log(train['casual'] + 1)).feature_importances_
df_imp = pd.DataFrame({'features': x_1, 'importance': imp})
sns.barplot(y='features', x='importance', data=df_imp);


# In[104]:

imp = ada.fit(train[x_1], np.log(train['registered'] + 1)).feature_importances_
df_imp = pd.DataFrame({'features': x_1, 'importance': imp})
sns.barplot(y='features', x='importance', data=df_imp);


# In[105]:

imp = ada.fit(train[x_1], np.log(train['casual'] + 1)).feature_importances_
df_imp = pd.DataFrame({'features': x_1, 'importance': imp})
sns.barplot(y='features', x='importance', data=df_imp);


# #### 3.7 Add new features
#
# The features below are introduced in this [link](https://www.analyticsvidhya.com/blog/2015/06/solution-kaggle-competition-bike-sharing-demand/).
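# The explicit .loc masks below make the bin boundaries easy to read. As a
# more compact alternative (a sketch I add here, with edges chosen to
# reproduce the hour_reg cell exactly), the same binning can be done with
# pd.cut using right-open intervals:

# In[ ]:

# [0,7) [7,8) [8,9) [9,16) [16,18) [18,20) [20,24)  ->  codes 1..7
for df in (train, test):
    df['hour_reg'] = pd.cut(df['hour'], bins=[0, 7, 8, 9, 16, 18, 20, 24],
                            right=False, labels=[1, 2, 3, 4, 5, 6, 7]).astype(int)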
# In[6]:

train['hour_reg'] = 0
train.loc[train.hour < 7, 'hour_reg'] = 1
train.loc[train.hour == 7, 'hour_reg'] = 2
train.loc[train.hour == 8, 'hour_reg'] = 3
train.loc[(train.hour > 8) & (train.hour < 16), 'hour_reg'] = 4
train.loc[(train.hour == 16) | (train.hour == 17), 'hour_reg'] = 5
train.loc[(train.hour == 18) | (train.hour == 19), 'hour_reg'] = 6
train.loc[train.hour >= 20, 'hour_reg'] = 7

test['hour_reg'] = 0
test.loc[test.hour < 7, 'hour_reg'] = 1
test.loc[test.hour == 7, 'hour_reg'] = 2
test.loc[test.hour == 8, 'hour_reg'] = 3
test.loc[(test.hour > 8) & (test.hour < 16), 'hour_reg'] = 4
test.loc[(test.hour == 16) | (test.hour == 17), 'hour_reg'] = 5
test.loc[(test.hour == 18) | (test.hour == 19), 'hour_reg'] = 6
test.loc[test.hour >= 20, 'hour_reg'] = 7


# In[7]:

train['hour_cas'] = 0
train.loc[train.hour <= 7, 'hour_cas'] = 1
train.loc[(train.hour == 8) | (train.hour == 9), 'hour_cas'] = 2
train.loc[(train.hour >= 10) & (train.hour < 20), 'hour_cas'] = 3
train.loc[train.hour >= 20, 'hour_cas'] = 4

test['hour_cas'] = 0
test.loc[test.hour <= 7, 'hour_cas'] = 1
test.loc[(test.hour == 8) | (test.hour == 9), 'hour_cas'] = 2
test.loc[(test.hour >= 10) & (test.hour < 20), 'hour_cas'] = 3
test.loc[test.hour >= 20, 'hour_cas'] = 4


# In[8]:

train['temp_reg'] = 0
train.loc[train.temp < 13, 'temp_reg'] = 1
train.loc[(train.temp >= 13) & (train.temp < 23), 'temp_reg'] = 2
train.loc[(train.temp >= 23) & (train.temp < 30), 'temp_reg'] = 3
train.loc[train.temp >= 30, 'temp_reg'] = 4

test['temp_reg'] = 0
test.loc[test.temp < 13, 'temp_reg'] = 1
test.loc[(test.temp >= 13) & (test.temp < 23), 'temp_reg'] = 2
test.loc[(test.temp >= 23) & (test.temp < 30), 'temp_reg'] = 3
test.loc[test.temp >= 30, 'temp_reg'] = 4


# In[9]:

train['temp_cas'] = 0
train.loc[train.temp < 15, 'temp_cas'] = 1
train.loc[(train.temp >= 15) & (train.temp < 23), 'temp_cas'] = 2
train.loc[(train.temp >= 23) & (train.temp < 30), 'temp_cas'] = 3
train.loc[train.temp >= 30, 'temp_cas'] = 4

test['temp_cas'] = 0
test.loc[test.temp < 15, 'temp_cas'] = 1
test.loc[(test.temp >= 15) & (test.temp < 23), 'temp_cas'] = 2
test.loc[(test.temp >= 23) & (test.temp < 30), 'temp_cas'] = 3
test.loc[test.temp >= 30, 'temp_cas'] = 4


# In[10]:

train['year_part'] = 0
train.loc[(train.year == 2011) & (train.month <= 3), 'year_part'] = 1
train.loc[(train.year == 2011) & (train.month > 3) & (train.month <= 6), 'year_part'] = 2
train.loc[(train.year == 2011) & (train.month > 6) & (train.month <= 9), 'year_part'] = 3
train.loc[(train.year == 2011) & (train.month > 9), 'year_part'] = 4
train.loc[(train.year == 2012) & (train.month <= 3), 'year_part'] = 5
train.loc[(train.year == 2012) & (train.month > 3) & (train.month <= 6), 'year_part'] = 6
train.loc[(train.year == 2012) & (train.month > 6) & (train.month <= 9), 'year_part'] = 7
train.loc[(train.year == 2012) & (train.month > 9), 'year_part'] = 8

test['year_part'] = 0
test.loc[(test.year == 2011) & (test.month <= 3), 'year_part'] = 1
test.loc[(test.year == 2011) & (test.month > 3) & (test.month <= 6), 'year_part'] = 2
test.loc[(test.year == 2011) & (test.month > 6) & (test.month <= 9), 'year_part'] = 3
test.loc[(test.year == 2011) & (test.month > 9), 'year_part'] = 4
test.loc[(test.year == 2012) & (test.month <= 3), 'year_part'] = 5
test.loc[(test.year == 2012) & (test.month > 3) & (test.month <= 6), 'year_part'] = 6
test.loc[(test.year == 2012) & (test.month > 6) & (test.month <= 9), 'year_part'] = 7
test.loc[(test.year == 2012) & (test.month > 9), 'year_part'] = 8
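# A quick sanity check (my addition, not part of the original run): every row
# should land in a bin, i.e. none of the new codes should be left at its 0
# default. The year_part check assumes only 2011 and 2012 occur, which holds
# for this dataset.

# In[ ]:

binned = ['hour_reg', 'hour_cas', 'temp_reg', 'temp_cas', 'year_part']
assert (train[binned] > 0).all().all()
assert (test[binned] > 0).all().all()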
# In[11]:

train['day_type'] = 0
train.loc[(train.holiday == 0) & (train.workingday == 0), 'day_type'] = 1
train.loc[(train.holiday == 0) & (train.workingday == 1), 'day_type'] = 2
train.loc[train.holiday == 1, 'day_type'] = 3

test['day_type'] = 0
test.loc[(test.holiday == 0) & (test.workingday == 0), 'day_type'] = 1
test.loc[(test.holiday == 0) & (test.workingday == 1), 'day_type'] = 2
test.loc[test.holiday == 1, 'day_type'] = 3


# In[20]:

x_t3_r = ['season', 'holiday', 'workingday', 'weather', 'atemp', 'humidity', 'windspeed', 'year', 'hour',
          'weekday', 'hour_reg', 'temp_reg', 'year_part', 'day_type', 'rush_hour_working', 'adverse_rush_hour', 'temp']
x_t3_c = ['season', 'holiday', 'workingday', 'weather', 'atemp', 'humidity', 'windspeed', 'year', 'hour',
          'weekday', 'hour_cas', 'temp_cas', 'year_part', 'day_type', 'rush_hour_working', 'adverse_rush_hour', 'temp']


# In[19]:

# extra trees
# predict registered and casual separately, on log targets
# trial scores by feature set / settings:
# 0.41573 x_t2
# 0.42089 x_t3
# 0.41344 x_t3_r, 100 trees, max_features = p/3
# 0.41340 500 trees, p/3
# 0.42489 add temp + month, 500 trees, p/3
# 0.42561 add temp + month, 100 trees, p/3
# 0.42532 add temp + month, 100 trees, sqrt
etree2 = ExtraTreesRegressor(n_estimators=100, random_state=0, max_features='sqrt')
etree_s_pred2 = np.exp(etree2.fit(train[x_t3_c], np.log(train['casual'] + 1)).predict(test[x_t3_c])) + \
                np.exp(etree2.fit(train[x_t3_r], np.log(train['registered'] + 1)).predict(test[x_t3_r])) - 2
etree_s_df2 = pd.DataFrame({'datetime': test.datetime, 'count': pd.Series(etree_s_pred2)})
etree_s_df2.to_csv('python_etree_x_t3.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")


# In[21]:

# random forest
# predict registered and casual separately, on log targets
# trial scores by feature set / settings:
# 0.43148 x_t2
# 0.44282 x_t3 top5
# 0.44282 top5+
# 0.40415 x_t3_r, 250 trees, p/3
# 0.40389 500 trees, sqrt
# 0.41720 add temp + month
# 0.42208 add month
# 0.40146 add temp
rf2 = RandomForestRegressor(n_estimators=500, random_state=0, max_features="sqrt")
rf_pred2 = np.exp(rf2.fit(train[x_t3_c], np.log(train['casual'] + 1)).predict(test[x_t3_c])) + \
           np.exp(rf2.fit(train[x_t3_r], np.log(train['registered'] + 1)).predict(test[x_t3_r])) - 2
rf_df2 = pd.DataFrame({'datetime': test.datetime, 'count': pd.Series(rf_pred2)})
rf_df2.to_csv('python_rf_x_t3.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")


# In[15]:

# AdaBoost
# 0.43519 x_t3_r
ada = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=100, random_state=0)
rf_pred2 = np.exp(ada.fit(train[x_t3_c], np.log(train['casual'] + 1)).predict(test[x_t3_c])) + \
           np.exp(ada.fit(train[x_t3_r], np.log(train['registered'] + 1)).predict(test[x_t3_r])) - 2
rf_df2 = pd.DataFrame({'datetime': test.datetime, 'count': pd.Series(rf_pred2)})
rf_df2.to_csv('python_ada_x_t3.csv', index=False, date_format="%Y-%m-%d %H:%M:%S")
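# The scores recorded in the comments above came from manual trials of tree
# counts and max_features settings. A minimal sketch (my addition; the grid is
# illustrative, not the original search) of running such trials systematically
# against the day-based holdout from section 3.1:

# In[ ]:

# re-split so the holdout frames include the new features
tr = train[train.day < 15]
te = train[train.day >= 15]
for mf in ['sqrt', 0.33, None]:  # 0.33 approximates the "p/3" trials
    cand = RandomForestRegressor(n_estimators=100, random_state=0, max_features=mf)
    pred = np.exp(cand.fit(tr[x_t3_c], np.log(tr['casual'] + 1)).predict(te[x_t3_c])) + \
           np.exp(cand.fit(tr[x_t3_r], np.log(tr['registered'] + 1)).predict(te[x_t3_r])) - 2
    print(mf, rmsle(pred, te['count']))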