import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
# CuDNNLSTM was removed in TF2; tf.keras.layers.LSTM dispatches to the fused
# cuDNN kernel automatically on GPU when its default arguments are kept
from tensorflow.keras.layers import LSTM, Activation, Bidirectional, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

# configurations
%matplotlib inline
%config InlineBackend.figure_format='retina'  # 2x retina display rendering
# https://github.com/ipython/ipython/pull/3381

# let `pd.Timestamp` values be used in matplotlib plots without casting them to another type
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.plotting.register_matplotlib_converters.html
pd.plotting.register_matplotlib_converters()

sns.set(style='whitegrid', palette='muted', font_scale=1.5)
plt.rcParams['figure.figsize'] = 20, 9

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# download the dataset from Kaggle (expects an API token uploaded as kaggle.json)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# https://www.kaggle.com/hmavrodiev/london-bike-sharing-dataset
!kaggle datasets download -d hmavrodiev/london-bike-sharing-dataset
!unzip /content/london-bike-sharing-dataset.zip

df_pbs = pd.read_csv('/content/london_merged.csv',
                     parse_dates=['timestamp'],
                     index_col='timestamp')
df_pbs

df_pbs.info()

df_pbs.describe().T

# check for missing days between the first and last timestamp
pd.date_range(df_pbs.index.min(), df_pbs.index.max()).difference(df_pbs.index)

# derive calendar features from the timestamp index
df_pbs['hour'] = df_pbs.index.hour          # hour of the day
df_pbs['day_of_month'] = df_pbs.index.day   # day of month
df_pbs['day_of_week'] = df_pbs.index.dayofweek  # day of week
df_pbs['month'] = df_pbs.index.month        # month

sns.lineplot(x=df_pbs.index, y='cnt', data=df_pbs);

df_pbs.resample('M').sum()

df_pbs.resample('M').sum().plot(y='cnt')
plt.xlabel('Month')
plt.ylabel('Bike Share Count (millions)')
plt.suptitle('Bike Share Monthly Trend')
plt.show()

# equivalent seaborn version:
# df_pbs_month = df_pbs.resample('M').sum()
# sns.lineplot(x=df_pbs_month.index, y='cnt', data=df_pbs_month);

pd.plotting.autocorrelation_plot(df_pbs.cnt);

pd.plotting.autocorrelation_plot(df_pbs.resample('M').sum().cnt)
plt.suptitle('Monthly Bike Share Autocorrelation')
plt.show()

df_pbs.hum.hist()
plt.xlabel('Humidity (%)')
plt.ylabel('Frequency')
plt.suptitle('Humidity (%) Distribution')
plt.show()

# temperature in degrees Celsius
df_pbs.t1.plot(kind='kde')
plt.xlabel('Temperature (°C)')
plt.ylabel('Density')
plt.suptitle('Temperature (°C) Distribution')
plt.show()

df_pbs.wind_speed.hist()
plt.xlabel('Wind Speed (km/h)')
plt.ylabel('Frequency')
plt.suptitle('Wind Speed (km/h) Distribution')
plt.show()

# hourly usage patterns, split by holiday, weekend, season and weather
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(nrows=5)
fig.set_size_inches(18, 28)
sns.pointplot(data=df_pbs, x='hour', y='cnt', ax=ax1)
sns.pointplot(data=df_pbs, x='hour', y='cnt', hue='is_holiday', ax=ax2)
sns.pointplot(data=df_pbs, x='hour', y='cnt', hue='is_weekend', ax=ax3)
sns.pointplot(data=df_pbs, x='hour', y='cnt', hue='season', ax=ax4)
sns.pointplot(data=df_pbs, x='hour', y='cnt', hue='weather_code', ax=ax5);

fig, (ax1, ax2) = plt.subplots(nrows=2)
fig.set_size_inches(18, 14)
sns.pointplot(data=df_pbs, x='day_of_week', y='cnt', ax=ax1)
sns.pointplot(data=df_pbs, x='day_of_week', y='cnt', hue='season', ax=ax2);

fig, (ax1, ax2) = plt.subplots(nrows=2)
fig.set_size_inches(18, 14)
sns.pointplot(data=df_pbs, x='day_of_month', y='cnt', ax=ax1)
sns.pointplot(data=df_pbs, x='day_of_month', y='cnt', hue='season', ax=ax2);
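# Aside (not in the original notebook): a minimal sketch of what RobustScaler,
# used in the next cells, does with a feature. It centers on the median and
# scales by the IQR (quantile_range defaults to 25%-75%), so the occasional
# extreme counts visible in the plots above do not dominate the scaling the
# way they would with min-max or z-score scaling.
demo = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])  # toy values with one outlier
demo_scaler = RobustScaler().fit(demo)
print(demo_scaler.center_, demo_scaler.scale_)  # median [3.] and IQR [2.]
print(demo_scaler.transform(demo).ravel())      # [-1.  -0.5  0.   0.5  48.5]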
# chronological 90/10 train/test split (no shuffling for time series);
# .copy() avoids SettingWithCopyWarning when scaling the slices in place
train_size = int(len(df_pbs) * 0.9)
test_size = len(df_pbs) - train_size
train, test = df_pbs.iloc[:train_size].copy(), df_pbs.iloc[train_size:].copy()
print(len(train), len(test))

# scale the continuous weather features; fit on train only to avoid leakage
float_cols = ['t1', 't2', 'hum', 'wind_speed']
float_col_scaler = RobustScaler()
train.loc[:, float_cols] = float_col_scaler.fit_transform(train[float_cols])
test.loc[:, float_cols] = float_col_scaler.transform(test[float_cols])

# scale the target with its own scaler so predictions can be inverted later
cnt_transformer = RobustScaler()
train['cnt'] = cnt_transformer.fit_transform(train[['cnt']])
test['cnt'] = cnt_transformer.transform(test[['cnt']])

def create_dataset(X, y, time_steps=1):
    """Slice the feature frame into overlapping windows of `time_steps` rows,
    each labelled with the target value immediately after the window."""
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

time_steps = 10  # 10 data points from the history per sample

X_train, y_train = create_dataset(train, train.cnt, time_steps)
X_test, y_test = create_dataset(test, test.cnt, time_steps)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# plot model architecture
def plot_model(model):
    model.summary()
    return tf.keras.utils.plot_model(
        model,
        to_file='model.png',
        show_shapes=True,
        show_dtype=False,
        show_layer_names=True,
        rankdir='TB',
        expand_nested=True,
        dpi=96,
        layer_range=None,
    )

# evaluate model: loss curves, test-set metrics and prediction plots
def evaluate_model(model, history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

    print('Evaluation loss', model.evaluate(X_test, y_test))
    y_pred = model.predict(X_test)

    # invert the target scaling; the scaler expects shape (n_samples, 1)
    y_train_inv = cnt_transformer.inverse_transform(y_train.reshape(-1, 1))
    y_test_inv = cnt_transformer.inverse_transform(y_test.reshape(-1, 1))
    y_pred_inv = cnt_transformer.inverse_transform(y_pred)

    rmse = np.sqrt(mean_squared_error(y_test_inv.flatten(), y_pred_inv.flatten()))
    r2 = r2_score(y_test_inv.flatten(), y_pred_inv.flatten())
    print('rmse is : {}\nr2 is : {}'.format(rmse, r2))

    plt.plot(np.arange(0, len(y_train)), y_train_inv.flatten(), 'g', label='history')
    plt.plot(np.arange(len(y_train), len(y_train) + len(y_test)), y_test_inv.flatten(), marker='.', label='true')
    plt.plot(np.arange(len(y_train), len(y_train) + len(y_test)), y_pred_inv.flatten(), 'r', label='prediction')
    plt.ylabel('Bike Count')
    plt.xlabel('Time Step')
    plt.legend()
    plt.show()

    plt.plot(y_test_inv.flatten(), marker='.', label='true')
    plt.plot(y_pred_inv.flatten(), 'r', label='prediction')
    plt.ylabel('Bike Count')
    plt.xlabel('Time Step')
    plt.legend()
    plt.show()

# setting up early stopping on validation loss
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='min')
callbacks_list = [early_stop]

def build_model_lstm_1():
    # single bidirectional LSTM layer; the output activation must be added
    # *before* compile, not after it as in the original cell
    model = Sequential()
    model.add(Bidirectional(LSTM(128), input_shape=(X_train.shape[1], X_train.shape[-1])))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.add(Activation('linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

lstm_vanilla = build_model_lstm_1()
plot_model(lstm_vanilla)

history_1 = lstm_vanilla.fit(X_train, y_train,
                             epochs=30,
                             batch_size=128,
                             validation_split=0.1,
                             # callbacks=callbacks_list,
                             shuffle=False)

evaluate_model(lstm_vanilla, history_1)
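# Aside (a sanity check, not part of the original notebook): a naive persistence
# baseline predicts each count as the previous observed count; any LSTM worth
# keeping should beat its RMSE. Computed on the inverse-scaled test targets,
# reusing y_test and cnt_transformer from above.
baseline_true = cnt_transformer.inverse_transform(y_test[1:].reshape(-1, 1)).flatten()
baseline_pred = cnt_transformer.inverse_transform(y_test[:-1].reshape(-1, 1)).flatten()
baseline_rmse = np.sqrt(mean_squared_error(baseline_true, baseline_pred))
print('persistence baseline rmse is : {}'.format(baseline_rmse))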
def build_model_lstm_2():
    # three stacked bidirectional LSTM layers sized relative to the window length
    model = Sequential()
    model.add(Bidirectional(
        LSTM(X_train.shape[1], return_sequences=True),
        input_shape=(X_train.shape[1], X_train.shape[-1])
    ))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(X_train.shape[1] * 2, return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(X_train.shape[1])))
    model.add(Dense(units=1))
    model.add(Activation('linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

lstm_stacked_1 = build_model_lstm_2()
plot_model(lstm_stacked_1)

history_2 = lstm_stacked_1.fit(X_train, y_train,
                               epochs=30,
                               batch_size=64,
                               validation_split=0.1,
                               shuffle=False)

evaluate_model(lstm_stacked_1, history_2)

def build_model_lstm_3():
    # stacked variant with fixed layer sizes instead of window-relative ones
    model = Sequential()
    model.add(Bidirectional(
        LSTM(32, return_sequences=True),
        input_shape=(X_train.shape[1], X_train.shape[-1])
    ))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(32, return_sequences=True)))
    model.add(Dropout(0.2))
    model.add(Bidirectional(LSTM(16)))
    model.add(Dense(units=1))
    model.add(Activation('linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

lstm_stacked_2 = build_model_lstm_3()
plot_model(lstm_stacked_2)

history_3 = lstm_stacked_2.fit(X_train, y_train,
                               epochs=40,
                               batch_size=64,
                               validation_split=0.1,
                               shuffle=False)

evaluate_model(lstm_stacked_2, history_3)
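# Aside (a minimal sketch, not in the original notebook): a one-step-ahead forecast
# with a trained model. It feeds the final `time_steps` rows of the scaled test frame
# through the network and inverts the target scaling to recover an actual ride count.
# Which of the three models to use is an open choice; lstm_stacked_2 is picked here
# only because it was trained last.
last_window = test.iloc[-time_steps:].values.reshape(1, time_steps, test.shape[1])
next_scaled = lstm_stacked_2.predict(last_window)          # shape (1, 1), scaled space
next_cnt = cnt_transformer.inverse_transform(next_scaled)  # back to ride counts
print('predicted next-hour bike share count: {:.0f}'.format(next_cnt.flatten()[0]))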