# Importing libraries
import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(rc={'figure.figsize':(12,6)}, style='whitegrid')
# Loading and reading dataset
df_bicycle = pd.read_csv('data/london.csv')
df_bicycle
 | timestamp | cnt | t1 | t2 | hum | wind_speed | weather_code | is_holiday | is_weekend | season
---|---|---|---|---|---|---|---|---|---|---
0 | 2015-01-04 00:00:00 | 182 | 3.0 | 2.0 | 93.0 | 6.0 | 3.0 | 0.0 | 1.0 | 3.0 |
1 | 2015-01-04 01:00:00 | 138 | 3.0 | 2.5 | 93.0 | 5.0 | 1.0 | 0.0 | 1.0 | 3.0 |
2 | 2015-01-04 02:00:00 | 134 | 2.5 | 2.5 | 96.5 | 0.0 | 1.0 | 0.0 | 1.0 | 3.0 |
3 | 2015-01-04 03:00:00 | 72 | 2.0 | 2.0 | 100.0 | 0.0 | 1.0 | 0.0 | 1.0 | 3.0 |
4 | 2015-01-04 04:00:00 | 47 | 2.0 | 0.0 | 93.0 | 6.5 | 1.0 | 0.0 | 1.0 | 3.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
17409 | 2017-01-03 19:00:00 | 1042 | 5.0 | 1.0 | 81.0 | 19.0 | 3.0 | 0.0 | 0.0 | 3.0 |
17410 | 2017-01-03 20:00:00 | 541 | 5.0 | 1.0 | 81.0 | 21.0 | 4.0 | 0.0 | 0.0 | 3.0 |
17411 | 2017-01-03 21:00:00 | 337 | 5.5 | 1.5 | 78.5 | 24.0 | 4.0 | 0.0 | 0.0 | 3.0 |
17412 | 2017-01-03 22:00:00 | 224 | 5.5 | 1.5 | 76.0 | 23.0 | 4.0 | 0.0 | 0.0 | 3.0 |
17413 | 2017-01-03 23:00:00 | 139 | 5.0 | 1.0 | 76.0 | 22.0 | 2.0 | 0.0 | 0.0 | 3.0 |
17414 rows × 10 columns
# Investigating dataset information
def overview(df):
    print(" DATA INFO ".center(125, '-'))
    print(df.info())
    print(" SHAPE OF DATASET ".center(125, '-'))
    print('Rows: {}'.format(df.shape[0]))
    print('Columns: {}'.format(df.shape[1]))
    print(" DATA TYPES ".center(125, '-'))
    print(df.dtypes)
    print(" STATISTICS OF DATA ".center(125, '-'))
    print(df.describe(include="all"))
    print(" MISSING VALUES ".center(125, '-'))
    print(df.isnull().sum()[df.isnull().sum() > 0].sort_values(ascending=False))
    print(" DUPLICATED VALUES ".center(125, '-'))
    print(df.duplicated().sum())
overview(df_bicycle)
--------------------------------------------------------- DATA INFO ---------------------------------------------------------
RangeIndex: 17414 entries, 0 to 17413; 10 columns, all non-null
dtypes: float64(8), int64(1), object(1); memory usage: 1.3+ MB
------------------------------------------------------ SHAPE OF DATASET -----------------------------------------------------
Rows: 17414
Columns: 10
--------------------------------------------------------- DATA TYPES --------------------------------------------------------
timestamp: object; cnt: int64; t1, t2, hum, wind_speed, weather_code, is_holiday, is_weekend, season: float64
----------------------------------------------------- STATISTICS OF DATA ----------------------------------------------------
cnt:          mean 1143.1, std 1085.1, min 0, median 844, max 7860
t1:           mean 12.5, min -1.5, max 34.0
t2:           mean 11.5, min -6.0, max 34.0
hum:          mean 72.3, min 20.5, max 100.0
wind_speed:   mean 15.9, min 0.0, max 56.5
weather_code: mean 2.7, min 1, max 26
is_holiday:   mean 0.022; is_weekend: mean 0.285; season: min 0, max 3
------------------------------------------------------- MISSING VALUES ------------------------------------------------------
Series([], dtype: int64)
----------------------------------------------------- DUPLICATED VALUES -----------------------------------------------------
0
Data description:
- timestamp - date and time (hourly resolution)
- cnt - number of bicycle rentals during that hour
- t1 - air temperature, °C
- t2 - "feels like" temperature, °C
- hum - humidity, %
- wind_speed - wind speed, km/h
- weather_code - weather conditions:
  - 1 - clear (SKC)
  - 2 - mostly clear / scattered clouds (SCT)
  - 3 - broken clouds / significant cloud cover (BKN)
  - 4 - overcast (OVC)
  - 7 - light rain / light rain shower
  - 10 - rain with thunderstorm
  - 26 - snowfall
  - 94 - ice fog (yes, it happens!)
- is_holiday - whether the day is a holiday (1 - holiday, 0 - not)
- is_weekend - whether the day is a weekend (1 - weekend, 0 - not)
- season - season of the year (0 - spring, 1 - summer, 2 - fall, 3 - winter)
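The weather and season codes are stored as bare numbers, so for readable plot legends one could map them to labels taken from the description above. A minimal sketch (the label mappings are illustrative and are not used in the rest of the analysis):

# Hypothetical label mappings built from the data description
weather_labels = {1: 'clear', 2: 'scattered clouds', 3: 'broken clouds',
                  4: 'overcast', 7: 'light rain', 10: 'rain with thunderstorm',
                  26: 'snowfall', 94: 'ice fog'}
season_labels = {0: 'spring', 1: 'summer', 2: 'fall', 3: 'winter'}
df_bicycle['weather_code'].map(weather_labels).head()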
REMARK: There are no missing values or duplicates in the dataset.
# Converting timestamp to datetime type
df_bicycle.timestamp = pd.to_datetime(df_bicycle.timestamp)
df_bicycle.dtypes
timestamp       datetime64[ns]
cnt                      int64
t1                     float64
t2                     float64
hum                    float64
wind_speed             float64
weather_code           float64
is_holiday             float64
is_weekend             float64
season                 float64
dtype: object
# Plotting the number of rentals by date and time
sns.lineplot(x=df_bicycle.timestamp, y=df_bicycle.cnt)
[Figure: line plot of hourly rental counts over the full two-year period]
REMARK: This graph is far too noisy to interpret, so the data needs to be aggregated before trends become visible.
# Sorting chronologically and indexing by timestamp so the data can be resampled
df_bicycle.sort_values(by='timestamp', inplace=True)
df_bicycle.set_index('timestamp', drop=False, inplace=True)
# Aggregating to daily totals and plotting
df_bicycle.resample('D').agg({'cnt': 'sum'}).plot()
sns.despine()
# Aggregating data by day and calculating a 3-day moving average
df_by_day = (df_bicycle
.resample('D')
.cnt
.sum()
.reset_index()
)
df_by_day_rolling = (df_bicycle
.resample('D')
.cnt
.sum()
.rolling(window=3).mean()
.reset_index()
)
sns.lineplot(data=df_by_day_rolling, x='timestamp', y='cnt')
[Figure: daily rental totals smoothed with a 3-day moving average]
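The 3-day window leaves the first two days of the series as NaN (visible in the table further below). If an unbroken smoothed line is preferred, min_periods=1 averages over however many days are available at the start; this variant is an assumption on my part, not part of the original analysis:

# Variant with min_periods=1: partial-window averages instead of leading NaNs
df_by_day_rolling_filled = (df_bicycle
                            .resample('D')
                            .cnt
                            .sum()
                            .rolling(window=3, min_periods=1)
                            .mean()
                            .reset_index()
                            )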
# Calculating the standard deviation of the residuals and a 99% confidence interval
std = np.std(df_by_day['cnt'] - df_by_day_rolling['cnt'])
lower_bound = df_by_day_rolling.cnt - (2.576 * std)  # 2.576: z-score for a 99% level
upper_bound = df_by_day_rolling.cnt + (2.576 * std)
df_by_day = df_by_day.assign(lower_bound=lower_bound, upper_bound=upper_bound)
df_by_day
 | timestamp | cnt | lower_bound | upper_bound
---|---|---|---|---
0 | 2015-01-04 | 9234 | NaN | NaN |
1 | 2015-01-05 | 20372 | NaN | NaN |
2 | 2015-01-06 | 20613 | 4996.747479 | 28482.585854 |
3 | 2015-01-07 | 21064 | 8940.080812 | 32425.919188 |
4 | 2015-01-08 | 15601 | 7349.747479 | 30835.585854 |
... | ... | ... | ... | ... |
726 | 2016-12-30 | 11566 | 939.080812 | 24424.919188 |
727 | 2016-12-31 | 11424 | 604.414146 | 24090.252521 |
728 | 2017-01-01 | 6421 | -1939.252521 | 21546.585854 |
729 | 2017-01-02 | 11823 | -1853.585854 | 21632.252521 |
730 | 2017-01-03 | 19436 | 817.080812 | 24302.919188 |
731 rows × 4 columns
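The constant 2.576 used above is the two-sided z-score for a 99% confidence level. It can be recovered from the standard normal quantile function (scipy.stats is already imported as ss):

# 0.5% of probability mass in each tail for a two-sided 99% interval
z_99 = ss.norm.ppf(0.995)
z_99   # ≈ 2.5758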
# Abnormally high value
index_of_max_difference = (df_by_day['cnt'] - df_by_day['upper_bound']).idxmax()
row_with_max_difference = df_by_day.loc[index_of_max_difference]
row_with_max_difference
timestamp      2015-07-09 00:00:00
cnt                          72504
lower_bound           38962.080812
upper_bound           62447.919188
Name: 186, dtype: object
The likely cause of this anomaly is the London Underground strike of 9 July 2015, which caused severe congestion and pushed commuters onto bicycles.
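A small extension, not part of the original analysis, would list every day whose total falls outside the 99% interval rather than only the single most extreme one:

# All days outside the 99% confidence interval (both directions)
anomalies = df_by_day[(df_by_day.cnt > df_by_day.upper_bound) |
                      (df_by_day.cnt < df_by_day.lower_bound)]
anomalies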
# Abnormally low value
index_of_min_difference = (df_by_day['cnt'] - df_by_day['lower_bound']).idxmin()
row_with_min_difference = df_by_day.loc[index_of_min_difference]
row_with_min_difference
timestamp      2016-09-02 00:00:00
cnt                              0
lower_bound           14343.414146
upper_bound           37829.252521
Name: 607, dtype: object
A count of exactly 0 most likely means that no rental records were captured for that day (missing or malformed data in the source dataset) rather than a genuine absence of trips.
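A quick way to check this hypothesis, assuming the timestamp index set earlier is still in place, is to look at the hourly rows for that day; an empty result would mean the records are simply absent (resample('D').sum() turns a missing day into a 0 total):

# Hourly records for the anomalous day; empty output means no data was recorded
df_bicycle.loc['2016-09-02']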