#!/usr/bin/env python
# coding: utf-8

# # Understanding the data
#
# In this first part, we load the data and perform some initial exploration on it. The main goal of this step is to acquire some basic knowledge about the data: how the various features are distributed, whether there are missing values, and so on.

# In[ ]:


### imports
import os

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

get_ipython().run_line_magic('matplotlib', 'inline')

# create the output directory for figures, if it does not exist yet
os.makedirs('figs', exist_ok=True)

# load hourly data
hourly_data = pd.read_csv('https://raw.githubusercontent.com/PacktWorkshops/The-Data-Analysis-Workshop/master/Chapter01/data/hour.csv')


# Check the data format, the number of missing values in the data and general statistics:

# In[ ]:


# print some generic statistics about the data
print(f"Shape of data: {hourly_data.shape}")
print(f"Number of missing values in the data: {hourly_data.isnull().sum().sum()}")

# get statistics on the numerical columns
hourly_data.describe().T


# In[ ]:


# create a copy of the original data
preprocessed_data = hourly_data.copy()

# transform season from numeric codes to labels
seasons_mapping = {1: 'winter', 2: 'spring', 3: 'summer', 4: 'fall'}
preprocessed_data['season'] = preprocessed_data['season'].apply(lambda x: seasons_mapping[x])

# transform yr
yr_mapping = {0: 2011, 1: 2012}
preprocessed_data['yr'] = preprocessed_data['yr'].apply(lambda x: yr_mapping[x])

# transform weekday
weekday_mapping = {0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday'}
preprocessed_data['weekday'] = preprocessed_data['weekday'].apply(lambda x: weekday_mapping[x])

# transform weathersit
weather_mapping = {1: 'clear', 2: 'cloudy', 3: 'light_rain_snow', 4: 'heavy_rain_snow'}
preprocessed_data['weathersit'] = preprocessed_data['weathersit'].apply(lambda x: weather_mapping[x])

# transform hum and windspeed back to their original scales
preprocessed_data['hum'] = preprocessed_data['hum'] * 100
preprocessed_data['windspeed'] = preprocessed_data['windspeed'] * 67

# visualize preprocessed columns
cols = ['season', 'yr', 'weekday', 'weathersit', 'hum', 'windspeed']
preprocessed_data[cols].sample(10, random_state=123)


# ### Registered vs casual use analysis

# In[ ]:


# assert that the total number of rides is equal to the sum of registered and casual ones
assert (preprocessed_data.casual + preprocessed_data.registered == preprocessed_data.cnt).all(), \
    'Sum of casual and registered rides not equal to total number of rides'


# In[ ]:


# plot distributions of registered vs casual rides
# (note: sns.distplot is deprecated in recent seaborn releases; sns.histplot(..., kde=True) is the modern equivalent)
sns.distplot(preprocessed_data['registered'], label='registered')
sns.distplot(preprocessed_data['casual'], label='casual')
plt.legend()
plt.xlabel('rides')
plt.ylabel('frequency')
plt.title('Rides distributions')
plt.savefig('figs/rides_distributions.png', format='png')


# In[ ]:


# plot evolution of rides over time
plot_data = preprocessed_data[['registered', 'casual', 'dteday']]
ax = plot_data.groupby('dteday').sum().plot(figsize=(10, 6))
ax.set_xlabel("time")
ax.set_ylabel("number of rides per day")
plt.savefig('figs/rides_daily.png', format='png')
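
# The daily series plotted above is quite noisy. As a complementary view (a minimal
# sketch, not part of the original workshop code), we can also resample the daily
# totals to monthly totals with pandas; this assumes the `dteday` column parses
# cleanly as a date, which it does for this dataset.

# In[ ]:


# hedged example: aggregate registered and casual rides per month
monthly_data = preprocessed_data[['registered', 'casual', 'dteday']].copy()
monthly_data['dteday'] = pd.to_datetime(monthly_data['dteday'])
monthly_totals = monthly_data.set_index('dteday').resample('MS').sum()
ax = monthly_totals.plot(figsize=(10, 6))
ax.set_xlabel("time")
ax.set_ylabel("number of rides per month")
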
# In[ ]:


# create a new dataframe with the columns needed for plotting, and
# obtain the number of rides per day by grouping over each day
plot_data = preprocessed_data[['registered', 'casual', 'dteday']]
plot_data = plot_data.groupby('dteday').sum()

# define the window for computing the rolling mean and standard deviation
window = 7
rolling_means = plot_data.rolling(window).mean()
rolling_deviations = plot_data.rolling(window).std()

# create a plot of the series, where we first plot the series of rolling means,
# then we color the zone between the series of rolling means
# +- 2 rolling standard deviations
ax = rolling_means.plot(figsize=(10, 6))
ax.fill_between(rolling_means.index,
                rolling_means['registered'] + 2 * rolling_deviations['registered'],
                rolling_means['registered'] - 2 * rolling_deviations['registered'],
                alpha=0.2)
ax.fill_between(rolling_means.index,
                rolling_means['casual'] + 2 * rolling_deviations['casual'],
                rolling_means['casual'] - 2 * rolling_deviations['casual'],
                alpha=0.2)
ax.set_xlabel("time")
ax.set_ylabel("number of rides per day")
plt.savefig('figs/rides_aggregated.png', format='png')


# In[ ]:


# select relevant columns
plot_data = preprocessed_data[['hr', 'weekday', 'registered', 'casual']]

# unpivot the data into long format, so that the number of entries can be counted
# for each distinct hr, weekday and type (registered or casual)
plot_data = plot_data.melt(id_vars=['hr', 'weekday'], var_name='type', value_name='count')

# create a FacetGrid object, in which a grid plot is produced:
# as columns, we have the different types (registered and casual),
# as rows, the various days of the week
grid = sns.FacetGrid(plot_data, row='weekday', col='type', height=2.5, aspect=2.5,
                     row_order=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                'Friday', 'Saturday', 'Sunday'])

# populate the FacetGrid with the specific plots
grid.map(sns.barplot, 'hr', 'count', alpha=0.5)
grid.savefig('figs/weekday_hour_distributions.png', format='png')


# ### Exercise 1.02: Analyzing season impact on rides

# In[ ]:


# select a subset of the data
plot_data = preprocessed_data[['hr', 'season', 'registered', 'casual']]

# unpivot data from wide to long format
plot_data = plot_data.melt(id_vars=['hr', 'season'], var_name='type',
                           value_name='count')

# define FacetGrid
grid = sns.FacetGrid(plot_data, row='season', col='type', height=2.5, aspect=2.5,
                     row_order=['winter', 'spring', 'summer', 'fall'])

# apply plotting function to each element in the grid
grid.map(sns.barplot, 'hr', 'count', alpha=0.5)

# save figure
grid.savefig('figs/exercise_1_02_a.png', format='png')


# In[ ]:


# repeat the analysis at the weekday level
plot_data = preprocessed_data[['weekday', 'season', 'registered', 'casual']]
plot_data = plot_data.melt(id_vars=['weekday', 'season'], var_name='type',
                           value_name='count')

grid = sns.FacetGrid(plot_data, row='season', col='type', height=2.5, aspect=2.5,
                     row_order=['winter', 'spring', 'summer', 'fall'])
grid.map(sns.barplot, 'weekday', 'count', alpha=0.5,
         order=['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                'Friday', 'Saturday', 'Sunday'])

# save figure
grid.savefig('figs/exercise_1_02_b.png', format='png')
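
# As a compact numeric complement to the seasonal plots above (a sketch, not part of
# the original workshop code), we can also compare the average number of registered
# and casual rides per hour across seasons directly:

# In[ ]:


# hedged example: mean registered and casual rides per hour, grouped by season
preprocessed_data.groupby('season')[['registered', 'casual']].mean().round(1)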