%matplotlib inline
from matplotlib import pylab as plt
import matplotlib.dates as mdates
plt.rcParams['figure.figsize'] = (15.0, 8.0)
import pandas as pd
import seaborn as sns
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
Using TensorFlow backend.
Couldn't import dot_parser, loading of dot files will not be possible.
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from bokeh.charts import TimeSeries, output_file, show
from bokeh.io import output_notebook
output_notebook()
Eleven months data from one bed one bath apartment unit in San Jose, CA region was picked for this experiment. The electricity consumption is recorded in 15 minutes interval by the energy supply company. The raw data contains information such as type, date, start time, end time, usage, units, cost and notes fields. The start time and end time is the measurement interval. In this data, the interval is 15 minutes. The usage in 15 minutes interval is provided in kWh unit, and the cost for the consumption is presented in the dollar. Before we deep dive into the data, some quick feature engineering steps are done to enrich the data with more features.
data = pd.read_csv("D202.csv")
data.head(2)
TYPE | DATE | START TIME | END TIME | USAGE | UNITS | COST | NOTES | |
---|---|---|---|---|---|---|---|---|
0 | Electric usage | 11/1/16 | 0:00 | 0:14 | 0.07 | kWh | $0.01 | NaN |
1 | Electric usage | 11/1/16 | 0:15 | 0:29 | 0.05 | kWh | $0.01 | NaN |
data["DATE_TIME"] = pd.to_datetime(data.DATE + " " + data["END TIME"])
data["DAY_TYPE"] = data.DATE_TIME.apply(lambda x: 1 if x.dayofweek > 5 else 0 )
cal = calendar()
holidays = cal.holidays(start = data.DATE_TIME.min(), end = data.DATE_TIME.max())
data["IS_HOLIDAY"] = data.DATE_TIME.isin(holidays)
data.head(3)
TYPE | DATE | START TIME | END TIME | USAGE | UNITS | COST | NOTES | DATE_TIME | DAY_TYPE | IS_HOLIDAY | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Electric usage | 11/1/16 | 0:00 | 0:14 | 0.07 | kWh | $0.01 | NaN | 2016-11-01 00:14:00 | 0 | False |
1 | Electric usage | 11/1/16 | 0:15 | 0:29 | 0.05 | kWh | $0.01 | NaN | 2016-11-01 00:29:00 | 0 | False |
2 | Electric usage | 11/1/16 | 0:30 | 0:44 | 0.06 | kWh | $0.01 | NaN | 2016-11-01 00:44:00 | 0 | False |
for obs in range(1,6):
data["T_" + str(obs)] = data.USAGE.shift(obs)
data.fillna(0.00,inplace=True)
data.head(10)
TYPE | DATE | START TIME | END TIME | USAGE | UNITS | COST | NOTES | DATE_TIME | DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Electric usage | 11/1/16 | 0:00 | 0:14 | 0.07 | kWh | $0.01 | 0 | 2016-11-01 00:14:00 | 0 | False | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
1 | Electric usage | 11/1/16 | 0:15 | 0:29 | 0.05 | kWh | $0.01 | 0 | 2016-11-01 00:29:00 | 0 | False | 0.07 | 0.00 | 0.00 | 0.00 | 0.00 |
2 | Electric usage | 11/1/16 | 0:30 | 0:44 | 0.06 | kWh | $0.01 | 0 | 2016-11-01 00:44:00 | 0 | False | 0.05 | 0.07 | 0.00 | 0.00 | 0.00 |
3 | Electric usage | 11/1/16 | 0:45 | 0:59 | 0.05 | kWh | $0.01 | 0 | 2016-11-01 00:59:00 | 0 | False | 0.06 | 0.05 | 0.07 | 0.00 | 0.00 |
4 | Electric usage | 11/1/16 | 1:00 | 1:14 | 0.03 | kWh | $0.01 | 0 | 2016-11-01 01:14:00 | 0 | False | 0.05 | 0.06 | 0.05 | 0.07 | 0.00 |
5 | Electric usage | 11/1/16 | 1:15 | 1:29 | 0.03 | kWh | $0.01 | 0 | 2016-11-01 01:29:00 | 0 | False | 0.03 | 0.05 | 0.06 | 0.05 | 0.07 |
6 | Electric usage | 11/1/16 | 1:30 | 1:44 | 0.03 | kWh | $0.01 | 0 | 2016-11-01 01:44:00 | 0 | False | 0.03 | 0.03 | 0.05 | 0.06 | 0.05 |
7 | Electric usage | 11/1/16 | 1:45 | 1:59 | 0.06 | kWh | $0.01 | 0 | 2016-11-01 01:59:00 | 0 | False | 0.03 | 0.03 | 0.03 | 0.05 | 0.06 |
8 | Electric usage | 11/1/16 | 2:00 | 2:14 | 0.04 | kWh | $0.01 | 0 | 2016-11-01 02:14:00 | 0 | False | 0.06 | 0.03 | 0.03 | 0.03 | 0.05 |
9 | Electric usage | 11/1/16 | 2:15 | 2:29 | 0.03 | kWh | $0.01 | 0 | 2016-11-01 02:29:00 | 0 | False | 0.04 | 0.06 | 0.03 | 0.03 | 0.03 |
data.IS_HOLIDAY = data.IS_HOLIDAY.astype("int")
data.head(2)
TYPE | DATE | START TIME | END TIME | USAGE | UNITS | COST | NOTES | DATE_TIME | DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Electric usage | 11/1/16 | 0:00 | 0:14 | 0.07 | kWh | $0.01 | 0 | 2016-11-01 00:14:00 | 0 | 0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | Electric usage | 11/1/16 | 0:15 | 0:29 | 0.05 | kWh | $0.01 | 0 | 2016-11-01 00:29:00 | 0 | 0 | 0.07 | 0.0 | 0.0 | 0.0 | 0.0 |
clean_data = data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
clean_data.head(2)
DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | USAGE | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.07 |
1 | 0 | 0 | 0.07 | 0.0 | 0.0 | 0.0 | 0.0 | 0.05 |
all_show = TimeSeries(data,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(all_show)
/Users/jagan/anaconda2/lib/python2.7/site-packages/bokeh/core/json_encoder.py:33: FutureWarning: pandas.tslib is deprecated and will be removed in a future version. You can access Timestamp as pandas.Timestamp if pd and isinstance(obj, pd.tslib.Timestamp):
xmask = (data.DATE_TIME >= pd.to_datetime("12/20/2016")) & (data.DATE_TIME <= pd.to_datetime("12/27/2016"))
xmas_week = data.loc[xmask]
xmas_show = TimeSeries(xmas_week,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(xmas_show)
dmask = (data.DATE_TIME >= pd.to_datetime("01/01/2017")) & (data.DATE_TIME < pd.to_datetime("01/02/2017"))
nyd = data.loc[dmask]
nyd_show = TimeSeries(nyd,x="DATE_TIME",y=["USAGE"],legend=True,plot_width=900, plot_height=350)
show(nyd_show)
training_data = data[data.DATE_TIME < pd.to_datetime("08/01/2017")]
val_mask = (data.DATE_TIME >= pd.to_datetime("08/01/2017")) & (data.DATE_TIME < pd.to_datetime("09/01/2017"))
val_data = data.loc[val_mask]
test_data = data[data.DATE_TIME >= pd.to_datetime("09/01/2017")]
training_data.tail(3)
TYPE | DATE | START TIME | END TIME | USAGE | UNITS | COST | NOTES | DATE_TIME | DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
26006 | Electric usage | 7/31/17 | 23:15 | 23:29 | 0.12 | kWh | $0.02 | 0 | 2017-07-31 23:29:00 | 0 | 0 | 0.15 | 0.10 | 0.27 | 0.44 | 0.12 |
26007 | Electric usage | 7/31/17 | 23:30 | 23:44 | 0.11 | kWh | $0.02 | 0 | 2017-07-31 23:44:00 | 0 | 0 | 0.12 | 0.15 | 0.10 | 0.27 | 0.44 |
26008 | Electric usage | 7/31/17 | 23:45 | 23:59 | 0.10 | kWh | $0.02 | 0 | 2017-07-31 23:59:00 | 0 | 0 | 0.11 | 0.12 | 0.15 | 0.10 | 0.27 |
test_data.head(2)
TYPE | DATE | START TIME | END TIME | USAGE | UNITS | COST | NOTES | DATE_TIME | DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
28984 | Electric usage | 9/1/17 | 0:00 | 0:14 | 0.03 | kWh | $0.01 | 0 | 2017-09-01 00:14:00 | 0 | 0 | 0.03 | 0.03 | 0.04 | 0.04 | 0.05 |
28985 | Electric usage | 9/1/17 | 0:15 | 0:29 | 0.04 | kWh | $0.01 | 0 | 2017-09-01 00:29:00 | 0 | 0 | 0.03 | 0.03 | 0.03 | 0.04 | 0.04 |
clean_train = training_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
clean_test = test_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
clean_val = val_data[['DAY_TYPE', 'IS_HOLIDAY', 'T_1','T_2', 'T_3', 'T_4', 'T_5','USAGE']]
clean_train.head(2)
DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | USAGE | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.07 |
1 | 0 | 0 | 0.07 | 0.0 | 0.0 | 0.0 | 0.0 | 0.05 |
clean_test.head(2)
DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | USAGE | |
---|---|---|---|---|---|---|---|---|
28984 | 0 | 0 | 0.03 | 0.03 | 0.04 | 0.04 | 0.05 | 0.03 |
28985 | 0 | 0 | 0.03 | 0.03 | 0.03 | 0.04 | 0.04 | 0.04 |
clean_val.head(3)
DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | USAGE | |
---|---|---|---|---|---|---|---|---|
26009 | 0 | 0 | 0.1 | 0.11 | 0.12 | 0.15 | 0.10 | 0.10 |
26010 | 0 | 0 | 0.1 | 0.10 | 0.11 | 0.12 | 0.15 | 0.10 |
26011 | 0 | 0 | 0.1 | 0.10 | 0.10 | 0.11 | 0.12 | 0.08 |
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
X_train,y_train = clean_train.drop(["USAGE"],axis=1),clean_train.USAGE
X_test,y_test = clean_test.drop(["USAGE"],axis=1),clean_test.USAGE
X_val,y_val = clean_val.drop(["USAGE"],axis=1),clean_val.USAGE
scaler = StandardScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))
rfr = RandomForestRegressor(random_state=2017,verbose=2,n_jobs=5)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_valid_scaled = scaler.fit_transform(X_val)
rfr.fit(X_train_scaled,y_train)
building tree 1 of 10 building tree 2 of 10 building tree 3 of 10 building tree 4 of 10 building tree 5 of 10 building tree 6 of 10 building tree 7 of 10 building tree 8 of 10 building tree 9 of 10 building tree 10 of 10
[Parallel(n_jobs=5)]: Done 7 out of 10 | elapsed: 0.1s remaining: 0.1s [Parallel(n_jobs=5)]: Done 10 out of 10 | elapsed: 0.1s finished
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=5, oob_score=False, random_state=2017, verbose=2, warm_start=False)
rfr.score(X_val,y_val)
[Parallel(n_jobs=5)]: Done 7 out of 10 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=5)]: Done 10 out of 10 | elapsed: 0.0s finished
0.26352935135691857
rfr.score(X_test,y_test)
[Parallel(n_jobs=5)]: Done 7 out of 10 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=5)]: Done 10 out of 10 | elapsed: 0.0s finished
0.32707009777378615
test_data["RF_PREDICTED"] = rfr.predict(X_test_scaled)
[Parallel(n_jobs=5)]: Done 7 out of 10 | elapsed: 0.0s remaining: 0.0s [Parallel(n_jobs=5)]: Done 10 out of 10 | elapsed: 0.0s finished /Users/jagan/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__':
test_data.head(5)
TYPE | DATE | START TIME | END TIME | USAGE | UNITS | COST | NOTES | DATE_TIME | DAY_TYPE | IS_HOLIDAY | T_1 | T_2 | T_3 | T_4 | T_5 | RF_PREDICTED | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
28984 | Electric usage | 9/1/17 | 0:00 | 0:14 | 0.03 | kWh | $0.01 | 0 | 2017-09-01 00:14:00 | 0 | 0 | 0.03 | 0.03 | 0.04 | 0.04 | 0.05 | 0.021306 |
28985 | Electric usage | 9/1/17 | 0:15 | 0:29 | 0.04 | kWh | $0.01 | 0 | 2017-09-01 00:29:00 | 0 | 0 | 0.03 | 0.03 | 0.03 | 0.04 | 0.04 | 0.060113 |
28986 | Electric usage | 9/1/17 | 0:30 | 0:44 | 0.06 | kWh | $0.01 | 0 | 2017-09-01 00:44:00 | 0 | 0 | 0.04 | 0.03 | 0.03 | 0.03 | 0.04 | 0.048081 |
28987 | Electric usage | 9/1/17 | 0:45 | 0:59 | 0.03 | kWh | $0.01 | 0 | 2017-09-01 00:59:00 | 0 | 0 | 0.06 | 0.04 | 0.03 | 0.03 | 0.03 | 0.061000 |
28988 | Electric usage | 9/1/17 | 1:00 | 1:14 | 0.03 | kWh | $0.01 | 0 | 2017-09-01 01:14:00 | 0 | 0 | 0.03 | 0.06 | 0.04 | 0.03 | 0.03 | 0.028246 |
pred_show = TimeSeries(test_data,x="DATE_TIME",y=["USAGE","RF_PREDICTED"],legend=True,plot_width=800, plot_height=350)
show(pred_show)
sep_30m = test_data[test_data.DATE_TIME >= pd.to_datetime("09/30/2017")]
sep_30rf = TimeSeries(sep_30m,x="DATE_TIME",y=["USAGE","RF_PREDICTED"],legend=True,plot_width=900, plot_height=350)
show(sep_30rf)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
model_k = Sequential()
model_k.add(LSTM(1, input_shape=(1,7)))
model_k.add(Dense(1))
model_k.compile(loss='mean_squared_error', optimizer='adam')
SVG(model_to_dot(model_k).create(prog='dot', format='svg'))
X_t_reshaped = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_resaped = X_valid_scaled.reshape((X_valid_scaled.shape[0], 1, X_valid_scaled.shape[1]))
history = model_k.fit(X_t_reshaped, y_train, validation_data=(X_val_resaped, y_val),\
epochs=10, batch_size=96, verbose=2)
Train on 26009 samples, validate on 2975 samples Epoch 1/10 2s - loss: 0.0757 - val_loss: 0.0165 Epoch 2/10 1s - loss: 0.0605 - val_loss: 0.0191 Epoch 3/10 1s - loss: 0.0506 - val_loss: 0.0166 Epoch 4/10 1s - loss: 0.0421 - val_loss: 0.0207 Epoch 5/10 1s - loss: 0.0370 - val_loss: 0.0267 Epoch 6/10 1s - loss: 0.0339 - val_loss: 0.0307 Epoch 7/10 1s - loss: 0.0321 - val_loss: 0.0357 Epoch 8/10 1s - loss: 0.0309 - val_loss: 0.0394 Epoch 9/10 1s - loss: 0.0303 - val_loss: 0.0396 Epoch 10/10 1s - loss: 0.0298 - val_loss: 0.0413
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
<matplotlib.legend.Legend at 0x1c2e8c6c90>
X_te_reshaped = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))
res = model_k.predict(X_te_reshaped)
test_data["DL_PRED"] = res
/Users/jagan/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__':
keras_show = TimeSeries(test_data,x="DATE_TIME",y=["USAGE","RF_PREDICTED","DL_PRED"],legend=True,plot_width=900, plot_height=350)
show(keras_show)
sep_30m = test_data[test_data.DATE_TIME >= pd.to_datetime("09/30/2017")]
sep_30 = TimeSeries(sep_30m,x="DATE_TIME",y=["USAGE","RF_PREDICTED","DL_PRED"],legend=True,plot_width=900, plot_height=350)
show(sep_30)
from numpy import sqrt
sqrt(mean_squared_error(test_data.USAGE,test_data.DL_PRED))
0.16743906314666301
sqrt(mean_squared_error(test_data.USAGE,test_data.RF_PREDICTED))
0.18412616700088405