Research plan
Hourly rental data spanning two years are provided. For this competition, the training set is comprised of the first 16 days of each month, while the test set is the 17-19th day of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period. That is, predict "count" without using "count" or its components "casual" and "registered".
Data Fields
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import CatBoostRegressor, Pool, cv
from sklearn.metrics import mean_squared_error
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Fixing random seed
np.random.seed(17)
print(os.listdir("../input"))
# Fixing random seed
np.random.seed(17)
# Read data
data_df = pd.read_csv('../input/train_luc.csv')
# Convert to datetime
data_df['datetime'] = pd.to_datetime(data_df['datetime'])
# Sort by datetime
data_df.sort_values(by='datetime')
# Look at the first rows of the training set
data_df.head()
data_df.shape
Train contains 3 target columns: 'casual', 'registered', 'count'. Column 'count' is the sum of columns 'casual' and 'registered'. Check it:
# Verify the identity count == casual + registered: a single value 0 with
# the full row count confirms it holds for every row.
residual = data_df['casual'] + data_df['registered'] - data_df['count']
residual.value_counts()
# Get info by train
data_df.info()
Excellent! There are no NaNs.
# Get summary statistics (count/mean/std/min/quartiles/max) for the numeric columns
data_df.describe()
I split data into train and hold-out samples
# Hold out 30% of the rows as a validation sample (fixed seed for reproducibility).
target_cols = ['casual', 'registered', 'count']
train_df, test_df, y_train, y_test = train_test_split(
    data_df.drop(target_cols, axis=1),
    data_df[target_cols],
    test_size=0.3,
    random_state=17,
    shuffle=True,
)
def draw_train_test_distribution(column):
    """Draw the distribution of *column* on train (left axis) and test (right axis)."""
    _, (ax_train, ax_test) = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))
    sns.distplot(train_df[column], ax=ax_train, label='train')
    sns.distplot(test_df[column], ax=ax_test, label='test')
# The distribution of the indicative (categorical) features
for cat_col in ('season', 'holiday', 'workingday', 'weather'):
    draw_train_test_distribution(cat_col)
The distributions of the indicative features ('season', 'holiday', 'workingday', 'weather') on the train and test coincide. However, 'weather' value 4 appears in the test but is absent from the train.
test_df[test_df['weather'] == 4]['weather'].count()
Only 1 example contains 'weather'=4. This can be neglected.
# The distribution of the numerical features on the train and test
for num_col in ('temp', 'atemp', 'humidity', 'windspeed'):
    draw_train_test_distribution(num_col)
The distributions of the scale features ('temp', 'atemp', 'humidity', 'windspeed') on the train and test coincide.
def transformation(columnName, func=np.log1p):
    """Apply *func* to one column of train/test and plot both transformed distributions."""
    transformed = []
    for source in (train_df, test_df):
        tmp = pd.DataFrame(index=source.index)
        tmp[columnName] = source[columnName].apply(func)
        transformed.append(tmp)
    _, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 3))
    sns.distplot(transformed[0], ax=axes[0])
    sns.distplot(transformed[1], ax=axes[1])
# Inspect the log1p transform of the skewed scale features.
for skewed_col in ('temp', 'atemp', 'humidity'):
    transformation(skewed_col)
After the transformation, the distributions on train_df and test_df resemble each other more closely.
# Persist the log1p-transformed copies of the skewed scale features.
for skewed_col in ('temp', 'atemp', 'humidity'):
    train_df[skewed_col + '_tr'] = train_df[skewed_col].apply(np.log1p)
    test_df[skewed_col + '_tr'] = test_df[skewed_col].apply(np.log1p)
# Spearman (rank) correlation between features and the three targets
corr = train_df.join(y_train).corr('spearman')
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap="YlGnBu");
We can see a strong correlation between target columns (What was expected). Among customers, correlation with registered users is higher than with unregistered ones. In this case, unregistered looking more at the weather.
Correlations temp/temp_tr, atemp/atemp_tr, humidity/humidity_tr are equal. I'll use the *_tr features, because their distributions in train/test are similar.
**Idea: Build an ensemble using 3 models with different target!**
'Holiday' has a low correlation with targets. I'll not use it.
'Workingday' has a lower correlation with 'registered' and 'count' than with 'casual'
'Windspeed' affects both registered and unregistered users
The effect of 'temp' and 'atemp' is the same.
According to the terms of the competition I will use Root Mean Squared Error (RMSE).
The course recommends a Gradient boosting model as one of the most powerful. I want to use the CatBoost library, because I want to understand it. From the catboost library I'll use CatBoostRegressor, because the current task is a regression.
# Drop the raw columns superseded by the *_tr features, the low-signal
# 'holiday', and the raw 'datetime'.
dropped_cols = ['holiday', 'datetime', 'temp', 'atemp', 'humidity']
X_train = train_df.drop(dropped_cols, axis=1)
X_test = test_df.drop(dropped_cols, axis=1)
# Positions of the categorical columns — presumably season, workingday,
# weather after the drop; confirm with X_train.head().
cat_features = [0, 1, 2]
X_train_cbr, X_test_cbr, y_train_cbr, y_test_cbr = train_test_split(
    X_train, y_train, test_size=0.3, random_state=17, shuffle=True)
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import colorama
# the number of random sets of hyperparameters
N_HYPEROPT_PROBES = 100
# hyperparameter sampling algorithm
HYPEROPT_ALGO = tpe.suggest
# Search space: integer tree depth, log-uniform learning rate, L2 leaf reg.
space = {
    'depth': hp.quniform("depth", 4, 10, 1),
    'learning_rate': hp.loguniform('learning_rate', -3.0, -0.7),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
}
def get_catboost_params(space):
    """Convert a hyperopt sample into CatBoost constructor kwargs.

    'depth' arrives as a float from hp.quniform and is cast to int here.
    """
    return {
        'learning_rate': space['learning_rate'],
        'depth': int(space['depth']),
        'l2_leaf_reg': space['l2_leaf_reg'],
    }
def objective(space, target_column='count'):
    """Hyperopt objective: train a CatBoost regressor with the sampled
    hyperparameters and return the hold-out RMSE as the loss.

    Relies on globals: X_train_cbr/X_test_cbr, y_train_cbr/y_test_cbr,
    cat_features, obj_call_count, cur_best_rmse.
    """
    global obj_call_count, cur_best_rmse
    obj_call_count += 1
    # BUG FIX: the progress message said "cur_best_acc", but the tracked
    # metric is RMSE, not accuracy.
    print('\nCatBoost objective call #{} cur_best_rmse={:7.5f}'.format(obj_call_count, cur_best_rmse))
    params = get_catboost_params(space)
    # Stable, alphabetical rendering of the sampled hyperparameters for logs
    sorted_params = sorted(space.items(), key=lambda z: z[0])
    params_str = str.join(' ', ['{}={}'.format(k, v) for k, v in sorted_params])
    print('Params: {}'.format(params_str))
    model = CatBoostRegressor(iterations=2000,
                              cat_features=cat_features,
                              learning_rate=params['learning_rate'],
                              depth=int(params['depth']),
                              use_best_model=True,
                              eval_metric='RMSE',
                              l2_leaf_reg=params['l2_leaf_reg'],
                              early_stopping_rounds=50,
                              random_seed=17,
                              verbose=False)
    model.fit(X_train_cbr, y_train_cbr[target_column],
              eval_set=(X_test_cbr, y_test_cbr[target_column]),
              verbose=False)
    nb_trees = model.get_best_iteration()
    print('nb_trees={}'.format(nb_trees))
    y_pred = model.predict(X_test_cbr)
    rmse = np.sqrt(mean_squared_error(y_test_cbr[target_column], y_pred))
    print('rmse={}, Params:{}, nb_trees={}\n'.format(rmse, params_str, nb_trees))
    # Track and highlight the best score seen so far across objective calls
    if rmse < cur_best_rmse:
        cur_best_rmse = rmse
        print(colorama.Fore.GREEN + 'NEW BEST RMSE={}'.format(cur_best_rmse) + colorama.Fore.RESET)
    return {'loss': rmse, 'status': STATUS_OK}
%%time
# Reset the globals read/updated by objective(), then run the TPE search
# for N_HYPEROPT_PROBES evaluations.
obj_call_count = 0
cur_best_rmse = np.inf
trials = Trials()
best = fmin(fn=objective,
space=space,
algo=HYPEROPT_ALGO,
max_evals=N_HYPEROPT_PROBES,
trials=trials,
verbose=1)
# best maps each hyperparameter name to its best sampled value
print('The best params:')
print( best )
# Refit a single model with the best hyperparameters found by hyperopt.
# BUG FIX: hp.quniform samples floats, so best['depth'] is e.g. 6.0; cast it
# to int for CatBoost, matching the int(params['depth']) used in objective().
cbr = CatBoostRegressor(random_seed=17,
                        eval_metric='RMSE',
                        iterations=2000,
                        max_depth=int(best['depth']),
                        early_stopping_rounds=50,
                        learning_rate=best['learning_rate'],
                        l2_leaf_reg=best['l2_leaf_reg'],
                        use_best_model=True)
cbr.fit(X_train_cbr, y_train_cbr['count'],
        eval_set=(X_test_cbr, y_test_cbr['count']),
        cat_features=cat_features,
        silent=True,
        plot=True);
# Learning curves: RMSE per boosting iteration on the train and validation pools.
rmse_learn = cbr.evals_result_['learn']['RMSE']
rmse_test = cbr.evals_result_['validation_0']['RMSE']
plt.plot(rmse_learn)
plt.plot(rmse_test)
plt.title('RMSE on train/test data')
plt.xlabel('trees count')
plt.ylabel('rmse value')
# BUG FIX: legend label typo 'leanr' -> 'learn'
plt.legend(['learn', 'test']);
# Get important features
cbr.feature_importances_
Most important features are 'humidity_tr', 'atemp_tr' and 'temp_tr'
Add new features from 'datetime'
# Derive calendar features from the timestamp for both samples.
for frame in (train_df, test_df):
    frame['hour'] = frame['datetime'].apply(lambda ts: ts.hour)
    frame['weekday'] = frame['datetime'].apply(lambda ts: ts.isoweekday())
    frame['month'] = frame['datetime'].apply(lambda ts: ts.month)
# Compare the new features' distributions between train and test.
for calendar_col in ('hour', 'weekday', 'month'):
    draw_train_test_distribution(calendar_col)
Distributions in the train and test datasets are equal.
# Mean demand per hour of day for each target.
# PERF FIX: the original re-filtered the joined frame six times per hour
# (O(hours * rows)); a single groupby computes all means in one pass.
temp_train_df = train_df.join(y_train)
hourly_means = temp_train_df.groupby('hour')[['count', 'casual', 'registered']].mean()
# groupby sorts its keys ascending, matching np.unique(train_df['hour'])
hours = hourly_means.index.to_numpy()
print("hours:", hours)
# Same per-hour dictionaries the original cell produced, kept for any later use.
hours_mean_count = hourly_means['count'].to_dict()
hours_mean_casual = hourly_means['casual'].to_dict()
hours_mean_registered = hourly_means['registered'].to_dict()
hours_mean = {h: [hours_mean_count[h], hours_mean_casual[h], hours_mean_registered[h]]
              for h in hours}
hours_df = pd.DataFrame.from_dict(hours_mean, orient='index',
                                  columns=['count', 'casual', 'registered'])
hours_df['hours'] = hours
hours_df.plot(x='hours', y=['count', 'casual', 'registered'], kind='bar',
              title='Measured bike use over 2 years', legend=True);
del temp_train_df
gc.collect()
Ok. It makes sense to make features by the hour: night (23-0-6), day (7-22)
# Binary day/night indicators: night covers 23:00-06:00, day is the exact complement.
def _night_flag(h):
    return 1 if h >= 23 or h <= 6 else 0

for frame in (train_df, test_df):
    frame['night'] = frame['hour'].apply(_night_flag)
    frame['day'] = frame['hour'].apply(lambda h: 1 - _night_flag(h))
def make_harmonic_features(value, period=24):
    """Project a cyclic quantity onto the unit circle.

    Returns the (cos, sin) pair, so that e.g. hour 23 and hour 0 end up
    close together — something a plain integer encoding cannot express.
    """
    angle = value * 2 * np.pi / period
    return np.cos(angle), np.sin(angle)
# Encode the hour cyclically on both samples.
for frame in (train_df, test_df):
    frame['hour_cos'], frame['hour_sin'] = make_harmonic_features(frame['hour'])
train_df.head()
Build new model with new params
# Rebuild the design matrices with the engineered features included.
dropped_cols = ['holiday', 'datetime', 'temp', 'atemp', 'humidity']
X_train = train_df.drop(dropped_cols, axis=1)
X_test = test_df.drop(dropped_cols, axis=1)
X_train.head()
# Categorical positions — presumably season, workingday, weather plus hour,
# weekday, month, night, day; confirm against X_train.head().
cat_features = [0, 1, 2, 7, 8, 9, 10, 11]
X_train_cbr, X_test_cbr, y_train_cbr, y_test_cbr = train_test_split(
    X_train, y_train, test_size=0.3, random_state=17, shuffle=True)
%%time
# Re-run the TPE search on the expanded feature set; objective() picks up
# the refreshed globals (X_train_cbr, cat_features, ...).
obj_call_count = 0
cur_best_rmse = np.inf
trials = Trials()
best = fmin(fn=objective,
space=space,
algo=HYPEROPT_ALGO,
max_evals=N_HYPEROPT_PROBES,
trials=trials,
verbose=1)
# best maps each hyperparameter name to its best sampled value
print('The best params:')
print(best)
# Refit on the engineered feature set with the re-tuned hyperparameters.
# BUG FIX: cast best['depth'] (a float from hp.quniform) to int, as is done
# inside objective().
cbr = CatBoostRegressor(random_seed=17,
                        eval_metric='RMSE',
                        iterations=2000,
                        max_depth=int(best['depth']),
                        early_stopping_rounds=50,
                        learning_rate=best['learning_rate'],
                        l2_leaf_reg=best['l2_leaf_reg'])
cbr.fit(X_train_cbr, y_train_cbr['count'],
        eval_set=(X_test_cbr, y_test_cbr['count']),
        cat_features=cat_features,
        silent=True,
        plot=True);
# Learning curves for the model trained on the engineered features.
rmse_learn = cbr.evals_result_['learn']['RMSE']
rmse_test = cbr.evals_result_['validation_0']['RMSE']
plt.plot(rmse_learn)
plt.plot(rmse_test)
plt.title('RMSE on train/test data')
plt.xlabel('trees count')
plt.ylabel('rmse value')
# BUG FIX: legend label typo 'leanr' -> 'learn'
plt.legend(['learn', 'test']);
%%time
cbr = CatBoostRegressor(random_seed=17,
eval_metric='RMSE',
iterations=2000,
max_depth=best['depth'],
# early_stopping_rounds=50, #because fit without eval_set
learning_rate=best['learning_rate'],
l2_leaf_reg=best['l2_leaf_reg'])
cbr.fit(X_train, y_train['count'],
cat_features=cat_features,
silent=True,
plot=False);
# Score the hold-out sample with the final model.
hold_out_pred = cbr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test['count'], hold_out_pred))
print('RMSE = ', rmse)
RMSE for hold_out samples coincides with the RMSE on cross-validation!!!