import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, r2_score
# In TF2, keras.layers.LSTM automatically uses the fast cuDNN kernel when running
# on GPU, so the separate CuDNNLSTM layer (removed in TF2) is no longer needed
from tensorflow.keras.layers import LSTM, Activation, Bidirectional, Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
# configurations
%matplotlib inline
# render inline figures at 2x resolution for retina displays
# https://github.com/ipython/ipython/pull/3381
%config InlineBackend.figure_format='retina'
# registering converters lets `pd.Timestamp` values be used in matplotlib plots without casting them to another type
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.plotting.register_matplotlib_converters.html
pd.plotting.register_matplotlib_converters()
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
plt.rcParams['figure.figsize'] = 20, 9
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# https://www.kaggle.com/hmavrodiev/london-bike-sharing-dataset
!kaggle datasets download -d hmavrodiev/london-bike-sharing-dataset
!unzip /content/london-bike-sharing-dataset.zip
Downloading london-bike-sharing-dataset.zip to /content
  0% 0.00/165k [00:00<?, ?B/s]
100% 165k/165k [00:00<00:00, 38.6MB/s]
Archive:  /content/london-bike-sharing-dataset.zip
  inflating: london_merged.csv
Metadata
timestamp - timestamp field for grouping the data
cnt - the count of new bike shares
t1 - real temperature in °C
t2 - "feels like" temperature in °C
hum - humidity in percentage
wind_speed - wind speed in km/h
weather_code - category of the weather
is_holiday - boolean field - 1 if holiday, 0 otherwise
is_weekend - boolean field - 1 if the day is a weekend, 0 otherwise
season - category field for meteorological seasons: 0 = spring, 1 = summer, 2 = fall, 3 = winter
weather_code category description:
1 = Clear; mostly clear but may include haze, fog patches, or fog in the vicinity
2 = Scattered clouds / few clouds
3 = Broken clouds
4 = Cloudy
7 = Rain / light rain shower / light rain
10 = Rain with thunderstorm
26 = Snowfall
94 = Freezing fog
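For later exploratory plots, these codes can be collected in a lookup table. A small illustrative dict (an addition for convenience, not something the notebook relies on):
# hypothetical label map for the weather_code column, e.g. for plot legends
WEATHER_LABELS = {
    1: 'Clear', 2: 'Scattered clouds', 3: 'Broken clouds', 4: 'Cloudy',
    7: 'Rain', 10: 'Rain with thunderstorm', 26: 'Snowfall', 94: 'Freezing fog',
}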
df_pbs = pd.read_csv('/content/london_merged.csv',
parse_dates=['timestamp'],
index_col='timestamp')
df_pbs
| timestamp | cnt | t1 | t2 | hum | wind_speed | weather_code | is_holiday | is_weekend | season |
|---|---|---|---|---|---|---|---|---|---|
| 2015-01-04 00:00:00 | 182 | 3.0 | 2.0 | 93.0 | 6.0 | 3.0 | 0.0 | 1.0 | 3.0 |
| 2015-01-04 01:00:00 | 138 | 3.0 | 2.5 | 93.0 | 5.0 | 1.0 | 0.0 | 1.0 | 3.0 |
| 2015-01-04 02:00:00 | 134 | 2.5 | 2.5 | 96.5 | 0.0 | 1.0 | 0.0 | 1.0 | 3.0 |
| 2015-01-04 03:00:00 | 72 | 2.0 | 2.0 | 100.0 | 0.0 | 1.0 | 0.0 | 1.0 | 3.0 |
| 2015-01-04 04:00:00 | 47 | 2.0 | 0.0 | 93.0 | 6.5 | 1.0 | 0.0 | 1.0 | 3.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-01-03 19:00:00 | 1042 | 5.0 | 1.0 | 81.0 | 19.0 | 3.0 | 0.0 | 0.0 | 3.0 |
| 2017-01-03 20:00:00 | 541 | 5.0 | 1.0 | 81.0 | 21.0 | 4.0 | 0.0 | 0.0 | 3.0 |
| 2017-01-03 21:00:00 | 337 | 5.5 | 1.5 | 78.5 | 24.0 | 4.0 | 0.0 | 0.0 | 3.0 |
| 2017-01-03 22:00:00 | 224 | 5.5 | 1.5 | 76.0 | 23.0 | 4.0 | 0.0 | 0.0 | 3.0 |
| 2017-01-03 23:00:00 | 139 | 5.0 | 1.0 | 76.0 | 22.0 | 2.0 | 0.0 | 0.0 | 3.0 |
17414 rows × 9 columns
df_pbs.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17414 entries, 2015-01-04 00:00:00 to 2017-01-03 23:00:00
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -------
 0   cnt           17414 non-null  int64
 1   t1            17414 non-null  float64
 2   t2            17414 non-null  float64
 3   hum           17414 non-null  float64
 4   wind_speed    17414 non-null  float64
 5   weather_code  17414 non-null  float64
 6   is_holiday    17414 non-null  float64
 7   is_weekend    17414 non-null  float64
 8   season        17414 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 1.3 MB
df_pbs.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| cnt | 17414.0 | 1143.101642 | 1085.108068 | 0.0 | 257.0 | 844.0 | 1671.75 | 7860.0 |
| t1 | 17414.0 | 12.468091 | 5.571818 | -1.5 | 8.0 | 12.5 | 16.00 | 34.0 |
| t2 | 17414.0 | 11.520836 | 6.615145 | -6.0 | 6.0 | 12.5 | 16.00 | 34.0 |
| hum | 17414.0 | 72.324954 | 14.313186 | 20.5 | 63.0 | 74.5 | 83.00 | 100.0 |
| wind_speed | 17414.0 | 15.913063 | 7.894570 | 0.0 | 10.0 | 15.0 | 20.50 | 56.5 |
| weather_code | 17414.0 | 2.722752 | 2.341163 | 1.0 | 1.0 | 2.0 | 3.00 | 26.0 |
| is_holiday | 17414.0 | 0.022051 | 0.146854 | 0.0 | 0.0 | 0.0 | 0.00 | 1.0 |
| is_weekend | 17414.0 | 0.285403 | 0.451619 | 0.0 | 0.0 | 0.0 | 1.00 | 1.0 |
| season | 17414.0 | 1.492075 | 1.118911 | 0.0 | 0.0 | 1.0 | 2.00 | 3.0 |
# list the daily timestamps missing from the hourly index
pd.date_range(df_pbs.index.min(), df_pbs.index.max()).difference(df_pbs.index)
DatetimeIndex(['2015-09-12', '2015-11-18', '2016-02-08', '2016-06-24', '2016-07-31', '2016-09-02', '2016-09-03'], dtype='datetime64[ns]', freq=None)
The index has 17,414 hourly rows out of the 17,544 expected for this two-year span, so less than 1% of the timestamps are missing and no action is taken to fill them.
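Had filling been necessary, one sketch (not part of this notebook; it assumes interpolation is acceptable for these columns) would be to reindex to a complete hourly range and interpolate:
# hypothetical gap filling: insert missing hourly rows as NaN, then
# fill them with time-weighted interpolation
full_idx = pd.date_range(df_pbs.index.min(), df_pbs.index.max(), freq='H')
df_filled = df_pbs.reindex(full_idx).interpolate(method='time')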
# add hour of the day
df_pbs['hour'] = df_pbs.index.hour
# add day of month
df_pbs['day_of_month'] = df_pbs.index.day
# add day of week
df_pbs['day_of_week'] = df_pbs.index.dayofweek
# add month
df_pbs['month'] = df_pbs.index.month
sns.lineplot(x=df_pbs.index, y="cnt", data=df_pbs);
The hourly data is too dense to read; the trend is clearer when plotted on a monthly basis.
df_pbs.resample('M').sum()
| timestamp | cnt | t1 | t2 | hum | wind_speed | weather_code | is_holiday | is_weekend | season |
|---|---|---|---|---|---|---|---|---|---|
| 2015-01-31 | 546639 | 4101.000000 | 2255.500000 | 51005.00 | 12502.000000 | 2211.0 | 0.0 | 192.0 | 2013.0 |
| 2015-02-28 | 543594 | 3721.500000 | 1868.000000 | 51950.00 | 9873.500000 | 2409.0 | 0.0 | 192.0 | 2013.0 |
| 2015-03-31 | 695934 | 6093.500000 | 4558.500000 | 51832.25 | 12938.250000 | 1940.0 | 0.0 | 215.0 | 0.0 |
| 2015-04-30 | 831582 | 7873.000000 | 7112.500000 | 48468.00 | 10988.500000 | 1642.0 | 48.0 | 191.0 | 0.0 |
| 2015-05-31 | 895413 | 10091.500000 | 9803.500000 | 47939.00 | 14268.500000 | 1900.0 | 48.0 | 240.0 | 0.0 |
| 2015-06-30 | 1033252 | 12151.500000 | 12073.000000 | 44371.50 | 12398.000000 | 1457.0 | 0.0 | 192.0 | 717.0 |
| 2015-07-31 | 1120687 | 14107.000000 | 14060.000000 | 46632.50 | 13067.000000 | 1710.0 | 0.0 | 192.0 | 740.0 |
| 2015-08-31 | 1033951 | 13830.500000 | 13816.500000 | 51913.50 | 10326.000000 | 1948.0 | 24.0 | 240.0 | 744.0 |
| 2015-09-30 | 892478 | 10371.000000 | 10357.500000 | 52002.00 | 9394.000000 | 1683.0 | 0.0 | 185.0 | 1422.0 |
| 2015-10-31 | 865046 | 9474.000000 | 9321.500000 | 57887.00 | 8671.166667 | 2078.0 | 0.0 | 215.0 | 1472.0 |
| 2015-11-30 | 677332 | 8259.500000 | 7605.000000 | 56344.50 | 14191.000000 | 2418.0 | 0.0 | 216.0 | 1422.0 |
| 2015-12-31 | 602838 | 9000.500000 | 8660.000000 | 56448.50 | 15452.500000 | 2367.0 | 48.0 | 188.0 | 2220.0 |
| 2016-01-31 | 582518 | 5459.500000 | 3872.000000 | 58230.50 | 12996.500000 | 2424.0 | 24.0 | 240.0 | 2232.0 |
| 2016-02-29 | 592910 | 4695.500000 | 3033.000000 | 49847.00 | 13058.500000 | 2054.0 | 0.0 | 190.0 | 2064.0 |
| 2016-03-31 | 656527 | 5344.000000 | 3684.500000 | 53722.50 | 11778.000000 | 2177.0 | 48.0 | 177.0 | 0.0 |
| 2016-04-30 | 768845 | 6804.500000 | 5708.500000 | 48643.50 | 11067.000000 | 1901.0 | 0.0 | 216.0 | 0.0 |
| 2016-05-31 | 1001935 | 10663.000000 | 10537.500000 | 50815.50 | 11001.500000 | 1852.0 | 48.0 | 216.0 | 0.0 |
| 2016-06-30 | 933878 | 11602.500000 | 11578.500000 | 53668.50 | 9745.000000 | 2204.0 | 0.0 | 192.0 | 705.0 |
| 2016-07-31 | 1165527 | 14467.000000 | 14453.500000 | 48719.50 | 11847.000000 | 1619.0 | 0.0 | 239.0 | 741.0 |
| 2016-08-31 | 1137314 | 14699.000000 | 14686.500000 | 47944.00 | 11860.500000 | 1511.0 | 24.0 | 192.0 | 740.0 |
| 2016-09-30 | 998619 | 12813.000000 | 12800.000000 | 48022.00 | 9580.000000 | 1622.0 | 0.0 | 180.0 | 1366.0 |
| 2016-10-31 | 935706 | 9363.333333 | 9239.833333 | 56942.00 | 9328.666667 | 1919.0 | 0.0 | 239.0 | 1486.0 |
| 2016-11-30 | 703859 | 5827.000000 | 4499.500000 | 56243.00 | 10499.000000 | 2128.0 | 0.0 | 191.0 | 1438.0 |
| 2016-12-31 | 651908 | 5967.500000 | 4925.000000 | 63803.50 | 9216.500000 | 2005.0 | 48.0 | 216.0 | 2232.0 |
| 2017-01-31 | 37680 | 339.000000 | 114.000000 | 6071.50 | 1061.500000 | 235.0 | 24.0 | 24.0 | 216.0 |
df_pbs.resample('M').sum().plot(y='cnt')
plt.xlabel('Month')
plt.ylabel('Bike Share Count')
plt.suptitle('Monthly Bike Share Trend')
plt.show()
# df_pbs_month = df_pbs.resample('M').sum()
# sns.lineplot(x=df_pbs_month.index, y='cnt', data=df_pbs_month);
The plot above shows a strong seasonal component: demand peaks in the summer months, which are the best period for the bike share business.
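To check the seasonality claim quantitatively, the monthly totals can be decomposed into trend, seasonal, and residual components. A minimal sketch using statsmodels (an optional addition, assuming statsmodels >= 0.11 for the period argument):
# optional seasonality check: additive decomposition of the monthly totals
from statsmodels.tsa.seasonal import seasonal_decompose
monthly_cnt = df_pbs['cnt'].resample('M').sum()
seasonal_decompose(monthly_cnt, model='additive', period=12).plot();  # 12 months per seasonal cycle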
We can quantify the strength and type of relationship between observations and their lags. In statistics this is called correlation, and when it is calculated against lagged values of the same time series it is called autocorrelation (self-correlation).
A correlation computed between two sequences of numbers, such as observations and their lag-1 values, is a number between -1 and 1. The sign indicates whether the relationship is negative or positive; a value close to zero suggests a weak correlation, whereas a value close to -1 or 1 indicates a strong one.
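As a concrete illustration (an addition to the notebook), pandas can compute single-lag autocorrelations directly:
# Series.autocorr returns the Pearson correlation between the series and a lagged copy of itself
print(df_pbs['cnt'].autocorr(lag=1))   # adjacent hours
print(df_pbs['cnt'].autocorr(lag=24))  # same hour on consecutive days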
pd.plotting.autocorrelation_plot(df_pbs.cnt);