In this first part, we load the data and perform some initial exploration on it. The main goal of this step is to acquire some basic knowledge about the data, how the various features are distributed, if there are missing values in it and so on.
### imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# load hourly data
hourly_data = pd.read_csv('https://raw.githubusercontent.com/PacktWorkshops/The-Data-Analysis-Workshop/master/Chapter01/data/hour.csv')
Check data format, number of missing values in the data and general statistics:
# print some generic statistics about the data
print(f"Shape of data: {hourly_data.shape}")
print(f"Number of missing values in the data: {hourly_data.isnull().sum().sum()}")
# get statistics on the numerical columns
hourly_data.describe().T
Shape of data: (17379, 17) Number of missing values in the data: 0
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
instant | 17379.0 | 8690.000000 | 5017.029500 | 1.00 | 4345.5000 | 8690.0000 | 13034.5000 | 17379.0000 |
season | 17379.0 | 2.501640 | 1.106918 | 1.00 | 2.0000 | 3.0000 | 3.0000 | 4.0000 |
yr | 17379.0 | 0.502561 | 0.500008 | 0.00 | 0.0000 | 1.0000 | 1.0000 | 1.0000 |
mnth | 17379.0 | 6.537775 | 3.438776 | 1.00 | 4.0000 | 7.0000 | 10.0000 | 12.0000 |
hr | 17379.0 | 11.546752 | 6.914405 | 0.00 | 6.0000 | 12.0000 | 18.0000 | 23.0000 |
holiday | 17379.0 | 0.028770 | 0.167165 | 0.00 | 0.0000 | 0.0000 | 0.0000 | 1.0000 |
weekday | 17379.0 | 3.003683 | 2.005771 | 0.00 | 1.0000 | 3.0000 | 5.0000 | 6.0000 |
workingday | 17379.0 | 0.682721 | 0.465431 | 0.00 | 0.0000 | 1.0000 | 1.0000 | 1.0000 |
weathersit | 17379.0 | 1.425283 | 0.639357 | 1.00 | 1.0000 | 1.0000 | 2.0000 | 4.0000 |
temp | 17379.0 | 0.496987 | 0.192556 | 0.02 | 0.3400 | 0.5000 | 0.6600 | 1.0000 |
atemp | 17379.0 | 0.475775 | 0.171850 | 0.00 | 0.3333 | 0.4848 | 0.6212 | 1.0000 |
hum | 17379.0 | 0.627229 | 0.192930 | 0.00 | 0.4800 | 0.6300 | 0.7800 | 1.0000 |
windspeed | 17379.0 | 0.190098 | 0.122340 | 0.00 | 0.1045 | 0.1940 | 0.2537 | 0.8507 |
casual | 17379.0 | 35.676218 | 49.305030 | 0.00 | 4.0000 | 17.0000 | 48.0000 | 367.0000 |
registered | 17379.0 | 153.786869 | 151.357286 | 0.00 | 34.0000 | 115.0000 | 220.0000 | 886.0000 |
cnt | 17379.0 | 189.463088 | 181.387599 | 1.00 | 40.0000 | 142.0000 | 281.0000 | 977.0000 |
# create a copy of the original data
preprocessed_data = hourly_data.copy()
# tranform seasons
seasons_mapping = {1: 'winter', 2: 'spring', 3: 'summer', 4: 'fall'}
preprocessed_data['season'] = preprocessed_data['season'].apply(lambda x: seasons_mapping[x])
# transform yr
yr_mapping = {0: 2011, 1: 2012}
preprocessed_data['yr'] = preprocessed_data['yr'].apply(lambda x: yr_mapping[x])
# transform weekday
weekday_mapping = {0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'}
preprocessed_data['weekday'] = preprocessed_data['weekday'].apply(lambda x: weekday_mapping[x])
# transform weathersit
weather_mapping = {1: 'clear', 2: 'cloudy', 3: 'light_rain_snow', 4: 'heavy_rain_snow'}
preprocessed_data['weathersit'] = preprocessed_data['weathersit'].apply(lambda x: weather_mapping[x])
# transorm hum and windspeed
preprocessed_data['hum'] = preprocessed_data['hum']*100
preprocessed_data['windspeed'] = preprocessed_data['windspeed']*67
# visualize preprocessed columns
cols = ['season', 'yr', 'weekday', 'weathersit', 'hum', 'windspeed']
preprocessed_data[cols].sample(10, random_state=123)
season | yr | weekday | weathersit | hum | windspeed | |
---|---|---|---|---|---|---|
5792 | summer | 2011 | Saturday | clear | 74.0 | 8.9981 |
7823 | fall | 2011 | Sunday | clear | 43.0 | 31.0009 |
15426 | fall | 2012 | Tuesday | cloudy | 77.0 | 6.0032 |
15028 | fall | 2012 | Sunday | clear | 51.0 | 22.0028 |
12290 | spring | 2012 | Friday | cloudy | 89.0 | 12.9980 |
3262 | spring | 2011 | Friday | clear | 64.0 | 7.0015 |
10763 | spring | 2012 | Thursday | clear | 42.0 | 23.9994 |
12384 | spring | 2012 | Tuesday | light_rain_snow | 82.0 | 11.0014 |
6051 | summer | 2011 | Wednesday | clear | 52.0 | 19.0012 |
948 | winter | 2011 | Saturday | clear | 80.0 | 0.0000 |
# assert that total numer of rides is equal to the sum of registered and casual ones
assert (preprocessed_data.casual + preprocessed_data.registered == preprocessed_data.cnt).all(), \
'Sum of casual and registered rides not equal to total number of rides'
# plot distributions of registered vs casual rides
sns.distplot(preprocessed_data['registered'], label='registered')
sns.distplot(preprocessed_data['casual'], label='casual')
plt.legend()
plt.xlabel('rides')
plt.ylabel("frequency")
plt.title("Rides distributions")
plt.savefig('figs/rides_distributions.png', format='png')
# plot evolution of rides over time
plot_data = preprocessed_data[['registered', 'casual', 'dteday']]
ax = plot_data.groupby('dteday').sum().plot(figsize=(10,6))
ax.set_xlabel("time");
ax.set_ylabel("number of rides per day");
plt.savefig('figs/rides_daily.png', format='png')
# create new dataframe with necessary for plotting columns, and
# obtain number of rides per day, by grouping over each day
plot_data = preprocessed_data[['registered', 'casual', 'dteday']]
plot_data = plot_data.groupby('dteday').sum()
# define window for computing the rolling mean and standard deviation
window = 7
rolling_means = plot_data.rolling(window).mean()
rolling_deviations = plot_data.rolling(window).std()
# create a plot of the series, where we first plot the series of rolling means,
# then we color the zone between the series of rolling means
# +- 2 rolling standard deviations
ax = rolling_means.plot(figsize=(10,6))
ax.fill_between(rolling_means.index, \
rolling_means['registered'] + 2*rolling_deviations['registered'], \
rolling_means['registered'] - 2*rolling_deviations['registered'], \
alpha = 0.2)
ax.fill_between(rolling_means.index, \
rolling_means['casual'] + 2*rolling_deviations['casual'], \
rolling_means['casual'] - 2*rolling_deviations['casual'], \
alpha = 0.2)
ax.set_xlabel("time");
ax.set_ylabel("number of rides per day");
plt.savefig('figs/rides_aggregated.png', format='png')
# select relevant columns
plot_data = preprocessed_data[['hr', 'weekday', 'registered', 'casual']]
# transform the data into a format, in number of entries are computed as count,
# for each distinct hr, weekday and type (registered or casual)
plot_data = plot_data.melt(id_vars=['hr', 'weekday'], var_name='type', value_name='count')
# create FacetGrid object, in which a grid plot is produced.
# As columns, we have the various days of the week,
# as rows, the different types (registered and casual)
grid = sns.FacetGrid(plot_data, row='weekday', col='type', height=2.5,\
aspect=2.5, row_order=['Monday', 'Tuesday', \
'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
# populate the FacetGrid with the specific plots
grid.map(sns.barplot, 'hr', 'count', alpha=0.5)
grid.savefig('figs/weekday_hour_distributions.png', format='png')
/Users/kosta/projects/packt-book/book-env/lib/python3.6/site-packages/seaborn/axisgrid.py:715: UserWarning: Using the barplot function without specifying `order` is likely to produce an incorrect plot. warnings.warn(warning)
# select subset of the data
plot_data = preprocessed_data[['hr', 'season', 'registered', 'casual']]
# unpivot data from wide to long format
plot_data = plot_data.melt(id_vars=['hr', 'season'], var_name='type', \
value_name='count')
# define FacetGrid
grid = sns.FacetGrid(plot_data, row='season', \
col='type', height=2.5, aspect=2.5, \
row_order=['winter', 'spring', 'summer', 'fall'])
# apply plotting function to each element in the grid
grid.map(sns.barplot, 'hr', 'count', alpha=0.5)
# save figure
grid.savefig('figs/exercise_1_02_a.png', format='png')
plot_data = preprocessed_data[['weekday', 'season', 'registered', 'casual']]
plot_data = plot_data.melt(id_vars=['weekday', 'season'], var_name='type', value_name='count')
grid = sns.FacetGrid(plot_data, row='season', col='type', height=2.5, aspect=2.5,
row_order=['winter', 'spring', 'summer', 'fall'])
grid.map(sns.barplot, 'weekday', 'count', alpha=0.5,
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
# save figure
grid.savefig('figs/exercise_1_02_b.png', format='png')
# compute population mean of registered rides
population_mean = preprocessed_data.registered.mean()
# get sample of the data (summer 2011)
sample = preprocessed_data[(preprocessed_data.season == "summer") &\
(preprocessed_data.yr == 2011)].registered
# perform t-test and compute p-value
from scipy.stats import ttest_1samp
test_result = ttest_1samp(sample, population_mean)
print(f"Test statistic: {test_result[0]:.03f}, p-value: {test_result[1]:.03f}")
# get sample as 5% of the full data
import random
random.seed(111)
sample_unbiased = preprocessed_data.registered.sample(frac=0.05)
test_result_unbiased = ttest_1samp(sample_unbiased, population_mean)
print(f"Unbiased test statistic: {test_result_unbiased[0]:.03f}, p-value: {test_result_unbiased[1]:.03f}")
Test statistic: -3.492, p-value: 0.000 Unbiased test statistic: -0.460, p-value: 0.645
# define mask, indicating if the day is weekend or work day
weekend_days = ['Saturday', 'Sunday']
weekend_mask = preprocessed_data.weekday.isin(weekend_days)
workingdays_mask = ~preprocessed_data.weekday.isin(weekend_days)
# select registered rides for the weekend and working days
weekend_data = preprocessed_data.registered[weekend_mask]
workingdays_data = preprocessed_data.registered[workingdays_mask]
# perform ttest
from scipy.stats import ttest_ind
test_res = ttest_ind(weekend_data, workingdays_data)
print(f"Statistic value: {test_res[0]:.03f}, p-value: {test_res[1]:.03f}")
# plot distributions of registered rides for working vs weekend days
sns.distplot(weekend_data, label='weekend days')
sns.distplot(workingdays_data, label='working days')
plt.legend()
plt.xlabel('rides')
plt.ylabel('frequency')
plt.title("Registered rides distributions")
plt.savefig('figs/exercise_1_04_a.png', format='png')
Statistic value: -16.004, p-value: 0.000
# select casual rides for the weekend and working days
weekend_data = preprocessed_data.casual[weekend_mask]
workingdays_data = preprocessed_data.casual[workingdays_mask]
# perform ttest
test_res = ttest_ind(weekend_data, workingdays_data)
print(f"Statistic value: {test_res[0]:.03f}, p-value: {test_res[1]:.03f}")
# plot distributions of casual rides for working vs weekend days
sns.distplot(weekend_data, label='weekend days')
sns.distplot(workingdays_data, label='working days')
plt.legend()
plt.xlabel('rides')
plt.ylabel('frequency')
plt.title("Casual rides distributions")
plt.savefig('figs/exercise_1_04_b.png', format='png')
Statistic value: 41.077, p-value: 0.000