#!/usr/bin/env python
# coding: utf-8

# # Ford Gobike System data exploration
# ## by Sheila Alwanga Olayo
# 
# ## Introduction
# > This dataset includes information about individual rides made in a bike-sharing system covering the greater San Francisco Bay area, for the month of February, 2019.
# 
# 
# ## Preliminary Wrangling
# 

# In[50]:


# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

get_ipython().run_line_magic('matplotlib', 'inline')


# In[51]:


bike = pd.read_csv('201902-fordgobike-tripdata.csv')
bike.head()


# In[52]:


#checking for column datatypes and missing values.
bike.info()


# In[53]:


#checking for duplicate records
print(sum(bike.duplicated()))


# In[54]:


#changing data types of 'start_time' and 'end_time' columns
bike.start_time = pd.to_datetime(bike.start_time)
bike.end_time = pd.to_datetime(bike.end_time)

print(bike.start_time.dtype)
print(bike.end_time.dtype)


# In[55]:


#dropping all rows with null values
bike.dropna(inplace = True)


# In[56]:


#changing data type of 'member_birth_year' column
bike.member_birth_year = bike.member_birth_year.astype(int)


# In[57]:


bike.info()


# In[58]:


#extracting 'day' and 'date' from 'start_time' column
bike['date'] = bike['start_time'].dt.date
bike['day'] = bike['start_time'].dt.day_name()


# In[59]:


#converting 'day' to an ordered categorical datatype
day = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
wk_day = pd.api.types.CategoricalDtype(ordered = True, categories = day)
bike.day = bike.day.astype(wk_day)


# In[60]:


#creating an 'age' column
bike['age'] = 2019 - bike.member_birth_year 


# In[61]:


#checking unique 'age' values
print(np.sort(bike.age.unique())) 


# In[62]:


#filtering out rows with age 99 and over
bike = bike.loc[bike.age < 93].copy()


# In[63]:


#creating a duration in minutes column
bike['duration_min'] = bike.duration_sec/60


# Creating a categorical time_period column in the following cells

# In[64]:


#creating a dictionary of time periods
time_periods = [{'period':'morning','start':'5:59:01','end':'11:59:00'},
{'period':'afternoon','start':'11:59:01','end':'15:59:00'},
{'period':'evening','start':'15:59:01','end':'23:59:00'},
{'period':'late_night','start':'23:59:01','end':'5:59:00'}]


# In[65]:


#creating a function to create new dataframes according to daytime periods
def day_period(start_time,end_time):
    '''creates new dataframes according to day time periods'''
    period_df = bike.between_time(start_time,end_time)
    return period_df.copy()


# In[66]:


#setting 'bike' df's start_time as index, so as to be able to use the between_time() function on it
bike = bike.set_index('start_time')


# In[67]:


#creating time _period data frames
i = 0
while i < 4:
    morning = day_period(time_periods[i]['start'],time_periods[i]['end'])
    i += 1
    afternoon = day_period(time_periods[i]['start'],time_periods[i]['end'])
    i += 1
    evening = day_period(time_periods[i]['start'],time_periods[i]['end'])
    i += 1
    late_night = day_period(time_periods[i]['start'],time_periods[i]['end'])
    i += 1


# In[68]:


#creating a function to add a day_period column to the dataframes
def add_column(df,value):
    df['day_period'] = value


# In[69]:


#adding day_period column to the dataframes
day_period = [(morning,'morning'),(afternoon,'afternoon'),(evening,'evening'),(late_night,'late_night')]
for period in day_period:
    add_column(period[0],period[1])


# In[70]:


#merging the four datasets to replace original 'bike' dataset
bike = morning.append([afternoon,evening,late_night],ignore_index  = False)


# In[71]:


#resetting bike df's index
bike = bike.reset_index()


# ### Structure of my dataset
# 
# > My dataset initially had 183,412 records and 16 columns. However after some wrangling which involved dropping records with null or inaccurate values and some feature engineering, the dataset now has 174,877 records and 21 columns. The five additional columns include **'date', 'day', 'day_period'** derived from the **'start_time'** column, **'age'** derived from **'member_birth_year'** column and **'duration_min'** derived from the **'duration_sec'** column.
# 
# ### My main feature(s) of interest in the dataset
# 
# > My main features of interest are time and demographic information features with focus on how they affect the number of bike rides.
# 
# ### Other features in the dataset that will help support my investigation into my feature(s) of interest
# 
# > I think that the time features (**'duration_sec', 'start_time', 'end_time','start_date', 'start_day'**) and demographic data features (**'user_type','member_birth_year', 'member_gender','age'**) will help support my investigation into my features of interest.

# ## Univariate Exploration

# In[72]:


bike.head()


# In[73]:


#creating a fuction that plots seaborn countplots
def Countplot(df, xvar, huevar=None, color=0, palette=None, order=None):
    #set plot dimentions
    plt.figure(figsize = [14,6])
    #plot
    sb.countplot(data=df, x=xvar, hue=huevar, color=sb.color_palette()[color], palette=palette, order=order, edgecolor='black')
    #clean up variable names
    xvar = xvar.replace("_"," ")
    if huevar:
        huevar = huevar.replace("_"," ")
    #add title and format it
    plt.title(f"Distribution of bike rides: {xvar} {'by' if huevar else ''} {huevar if huevar else ''}" .title(), fontsize=14, weight='bold')
    #add x label and format it
    plt.xlabel(xvar.title(), fontsize=10, weight='bold')
    #add y label and format it
    plt.ylabel('Frequency', fontsize=10, weight='bold')


# #### Distribution of the length of bike rides

# In[74]:


bike.duration_min.describe()


# In[75]:


#checking distribution of 'duration_min' column
plt.figure(figsize = [14,6])
plt.hist(bike.duration_min);
plt.title('Distribution of bike trip duration'.title(),fontsize=14, weight='bold')
plt.xlabel('duration in minutes',fontsize=10, weight='bold')
plt.ylabel('number of rides',fontsize=10, weight='bold');


# In[76]:


#plotting a histogram of 'duration_min' focusing on the 0-80 minute range
plt.figure(figsize = [14,6])
bins = np.arange(0,80,5)
plt.hist(bike.duration_min, bins = bins, edgecolor = 'black')
plt.xlim(0,80)
plt.title('Distribution of bike trip duration'.title(),fontsize=14, weight='bold')
plt.xlabel('duration in minutes',fontsize=10, weight='bold')
plt.ylabel('number of rides',fontsize=10, weight='bold');


#  I plotted a second histogram with focus on the 0-80 minute range for a closer look because that is where majority of the data is concentrated. The distribution is right skewed, with a steep rise in the 5-10 minute range where majority of the values lie, and an equally steep decline in the 10-15 minute range as frequencies go on decreasing.
# Majority of rides are therefore 5-10 minutes long.

# #### Frequency of rides as per period of the day

# In[77]:


#plotting a bargraph for the 'day_period' variable
Countplot(bike,'day_period')


# The period of the day with the highest number of bike rides is the evening, that is between 4 pm and 11 pm. Followed closely by the morning period. The late night period however has a very low frequency of bike rides.

# #### Frequency of rides as pertaining to the day of the week

# In[78]:


#plotting a bargraph for the 'day' column
Countplot(bike,'day')


# The plot shows that Thursdays tend to have the most number of rides,closely followed by Tuesdays, while Saturdays and Sundays register the least number rides.

# ####  Customers compared to subscribed members.

# In[79]:


#plotting a bargraph for the 'user_type' column
Countplot(bike,'user_type')


#  The plot shows a very large difference in frequency of the categories of bike share system users. Subscribed members appear to log approximately eight times more rides than regular customers.

# #### Distribution of frequency of  rides for different age groups

# In[80]:


#plotting a histogram for the 'age' column
plt.figure(figsize = [14,6])
plt.hist(bike.age,edgecolor = 'black')
plt.xlabel('ages', fontsize=10, weight = 'bold')
plt.ylabel('number of rides', fontsize=10, weight='bold')
plt.title('Distribution of age groups vs rides', fontsize=14, weight='bold');


# The plot reveals a slightly right skewed distribution with a majoriy number of the rides lying in the mid 20's to early 30's age group. There's a sharp drop in the number of rides at the age 40 mark and this drop continuously tapers off, declining as the ages increase up until the 90's.

# #### Distribution of the frequency of rides according to gender

# In[81]:


#plotting a bargraph for the 'member_gender' column
Countplot(bike,'member_gender')


# The plot shows that a majority of rides are taken by male members, with the number of rides by female members equating to approximately a third of that of male members.

# #### Distribution of the kind of usage

# In[82]:


Countplot(bike, 'bike_share_for_all_trip')


# The plot shows that majority of the rides are to supplement other means of transport when moving from one point to another

# #### Frequency of bike rides throughout the month

# In[83]:


plt.figure(figsize = [14,6])
bike.groupby('date').date.count().plot()
plt.xticks(rotation = 90)
plt.xlabel('Date', fontsize=10, weight='bold')
plt.ylabel('Count', fontsize=10, weight='bold')
plt.title('Frequency of rides per date', fontsize=14, weight='bold');


# The plot shows an up and down trend in the number of rides throughout the month. There are wide peaks and sharp valleys. I would attribute this to the distribution shown earlier in the days of the week plot, where weekends tend to have a lower number of bike rides logged, compared to weekdays.

# ### Distribution(s) of my variable(s) of interest, unusual points and transformations
# 
# > Starting off with the time and date variables: For the bike **duration_min** variable, there was a very large range in the distribution of values. However, majority of the values were concentrated on the extreme lower end. On plotting these on a smaller scale, an extremely right skewed distribution was yielded.
# 
# > The distribution of the **day_period** variable showed the evening as having the most number of rides, followed closely by the morning, then the afternoon period and finally the late night with and extreme low number of rides.
# 
# > As for the **day** feature, I observed that the weekends, Saturdays and Sundays had noteably lower frequencies of rides as compared to the weekdays.
# 
# >For the demographic variables; the **age** feature showed a slight right skewed distribution. With majority of rides lying in the mid 20's to early 30's age group. There's also a noteable sharp drop in the number of rides at the age 40 mark.
# 
# > The **member_gender** feature distribution showed the frequency of rides by male members being alot higher compared to that of female members and others.
# 
# > Finally, the **user_type** distribution revealed that subscribed members log alot more rides than non-subscribed members.
# 
# ### Unusual distributions of investigated features and operations performed on the data to tidy, adjust, or change the form of the data
# > The most unusual distribution was the **duration_min** and **age** features which had a wide range of values and therefore right skewed distributions.
# 
# > I performed a number of transformations on the data including correcting data types of the **start_time** and **end_time** features to *datetime* to enable me derive other features from these.
# 
# > I added three columns **'date', 'day', 'day_period'** derived from the **'start_time'** column  so as to enable me investigate each feature categorically and quantitatively, this would have otherwise been complicated by having these features all under one variable.
# 
# > I also added **'age'** derived from **'member_birth_year'** column and **'duration_min'** derived from the **'duration_sec'** column. 
# 
# >For the **age** variable, I figured it would in this context, make more sense to group users by the age groups rather than the year in which they were born because there was a greater interest in the ages as opposed to the particular years of birth.
# 
# >As for the **duration_min** variable, I converted the available bike ride duration given in seconds, because it is easier to  percieve time in so many minutes compared to so many seconds.
# 
# >I also dropped approximately 10,000 rows from the dataset, which either had null values and a few whose *age* value was 99 and over.

# ## Bivariate Exploration
# 

# In[84]:


bike.head()


# #### Ride duration vs period of the day
# Checking distribution of the ride durations for each period of the day

# In[85]:


# plotting a Facet grid)
bins = np.arange(0,75,5)
g = sb.FacetGrid
g = sb.FacetGrid(data=bike, col = 'day_period')
g.map(plt.hist, 'duration_min', bins = bins, edgecolor = 'black')
g.fig.set_size_inches(14,6)
plt.ylabel('number of rides',fontsize=10, weight='bold');
plt.suptitle('Distribution of bike trip duration by day period'.title(), fontsize=14, weight='bold');


# The distribution of the ride duration values throughout all the periods of the day, except the late night still shows the 5-10 minute rides being a majority. Minor differences are however seen for other time ranges. In the mornings, there are more 10-15 minute rides as opposed to 0-5 minute rides. The opposite is however true for the afternoon and evening. The late night period is an exception to all this because it appears to have an equal number of rides that are in the 0-5 and 5-10 minute ranges, these being the majority.

# #### Ride duration vs day of the week
# Checking distributions of ride durations for each day of the week.

# In[86]:


bins = np.arange(0,75,5)
g = sb.FacetGrid(data=bike, col = 'day', col_wrap = 4)
g.map(plt.hist, 'duration_min', bins = bins, edgecolor = 'black')
g.fig.set_size_inches(14,6)
plt.ylabel('number of rides',fontsize=10, weight='bold');
#plt.suptitle('Distribution of ride duration by day'.title(), fontsize=14, weight='bold');


# The distribution of the ride duration values against days of the week also still shows that 5-10 minute bike rides were the majority across all the days of the week. The variations here also appear in the differences betweeen 0-5 minute and 10-15 minute ranges. On Sundays,Mondays and Saturdays, the number of 0-5 minute bike rides are slightly higher, while on Tuesdays and Thursdays the 10-15 minute bike rides are slightly more. Note that Tuesdays and Thursdays overall have the highest number of bike rides throughout the week. Perhaps they are the busier days, hence why people are taking longer bike rides to supplement regular transport means.

# #### Ride duration vs age
# Checking if there is a correlation between the duration of bike rides and age.
# Due to the large number of values, I used a random sample of 2500 records.

# In[87]:


bike_sample = bike.sample(5000)
plt.figure(figsize = [14,6])
plt.scatter(data = bike_sample, x = 'age', y = 'duration_min', alpha = 0.3, s=5)
plt.yscale('log')
plt.xlabel('age',fontsize=10, weight='bold')
plt.ylabel('ride duration in minutes',fontsize=10, weight='bold')
plt.title('Ride duration vs age',fontsize=14, weight='bold');


#  There is no correlation between age and ride duration. Majority of the rides are in the 5-15 minute range, regardless of age.

# > Curious to see the trend of average ride duration per age

# In[88]:


#line plot
plt.figure(figsize = [14,6])
bike.groupby('age').duration_min.mean().plot(legend = True);
plt.xlabel('Age',fontsize=10, weight='bold')
plt.ylabel('Duration in min', fontsize=10, weight='bold')
plt.title('Average ride duration', fontsize=14, weight='bold');


# For ages, 19 to 70, the average ride duration lies between 10 and 15 minutes. The extreme values seen for 18 year old age group can be attested to the relatively fewer number of records for this age. This can also explain sharp flactuations seen in the older age groups. 

# #### Ride duration vs gender
# Checking the distribution of ride duration for each gender

# In[89]:


#plotting a FacetGrid
bins = np.arange(0,80,5)
g = sb.FacetGrid
g = sb.FacetGrid(data=bike, col = 'member_gender')
g.map(plt.hist, 'duration_min', bins = bins, edgecolor = 'black')
g.fig.set_size_inches(14,6)
plt.ylabel('number of rides',fontsize=10, weight='bold');
plt.suptitle('Distribution of ride duration by gender'.title(), fontsize=14, weight='bold');


# The 5-10 minute ride duration still  has the highest frequency across all genders. However, there are more 10-15 minute rides relative to 0-5 minute rides by female members while the opposite is true for male members.

# #### Ride duration vs user type
# Comparing the distribution of ride duration for each user type.

# In[90]:


plt.figure(figsize = [20, 6])

plt.subplot(1,2,1)
sb.stripplot(data=bike, x='user_type', y='duration_min',size=1, jitter=0.35,color = 'steelblue')

plt.subplot(1,2,2)
sb.stripplot(data=bike.query('duration_min < duration_min.quantile(0.99)'), x='user_type', y='duration_min', size=1, jitter=0.35, color = 'steelblue');


# The plots show that the distributions of ride durations for both customers and subscribed members are very similar, despite the differences in the numbers of both groups.

# #### Period of the day vs day
# Checking for variations in the number of rides for each period of the day across all days.

# In[91]:


#clustered bar chart
Countplot(bike, 'day', huevar='day_period', palette='Pastel2')


# On the week-days, mornings and evenings are seen to have the higher numbers of bike rides. While on the weekends, afternoons and evenings have the higher numbers with each of their counts being nearly equal. Although evenings have been seen to have the highest number of bike rides generally, Friday evenings have a lower number of rides compared to Friday mornings.

# #### Period of the day vs age
# Checking the distribution of ages relative to the period of the day.

# In[92]:


plt.figure(figsize = [14,6])
sb.violinplot(data = bike, x = 'day_period', y = 'age', palette = 'Pastel2')
plt.xlabel('Day period', fontsize=10, weight='bold')
plt.ylabel('Age', fontsize=10, weight='bold')
plt.title('Age distributions by day period', fontsize=14, weight='bold');


# The age distribution is similar across all periods of the day, appearing right skewed for all of them. The modal age for the morning, afternoon and evening periods lies between late 20's and early 30's while the modal age for the late night  period lies between mid and late 20's.

# #### Period of the day vs gender

# In[93]:


#clustered bar chart
Countplot(bike, 'day_period', huevar='member_gender', palette = 'Pastel1')


# The period of the day doesn't really affect the frequency of rides according to gender. It's however notable that the same number of rides by the female gender group is seen for the morning and evening while for males the number is slightly increased in the evening.

# #### Day vs gender

# In[94]:


#clustered bar chart
Countplot(bike, 'day', huevar='member_gender', palette='Pastel1')


# The day and gender variables are independent of each other. The frequency of bike rides by gender is not affected by any particular day of the week.

# ## Relationships observed in this part of the investigation and how the feature(s) of interest varied with other features in the dataset.
# 
# > The features of interest were observed to be largely independent with only slight variations when plotted against each other and in other instances, no variations at all. 
# 
# >The ride duration showed slight variations when plotted against period of day, day of the week and gender and no correlation when plotted against age.
# 
# >There were also slight variations in the number of rides logged for certain values of day period when plotted against days. However, there was no effect on the number of rides when the period of the day was plotted against gender, same thing when the days were plotted against gender.
# 
# ### Interesting relationships observed between the other features (not the main feature(s) of interest)
# 
# > It was interesting to note that the distributions of ride durations for both customers and subscribed members are very similar, despite the differences in the numbers of both groups.

# ## Multivariate Exploration
# 

# Due to the notable largely independent nature of the gender variable, I will be leaving it out in my multivariate exploration. Instead, I will explore how the **age** and **ride duration** features each independently affect the frequency of bike rides with respect to **day** and **period of day**, seeing as they (**age** and **ride duration**) have no correlation.

# #### Age, day, period of day

# In[95]:


plt.figure(figsize = [14,6])
sb.stripplot(data=bike, x='day',y='age', hue='day_period',palette='Pastel2', size=1, jitter=0.35, dodge=True);
plt.legend(loc=6, bbox_to_anchor=(1.0,0.5));


# In[96]:


g = sb.FacetGrid(data = bike, col = 'day', row = 'day_period',margin_titles = True)
bins = np.arange(18,90,10)
g.map(plt.hist, 'age', bins = bins, edgecolor = 'black');


# The day,period of day and age features enhance each other and only further confirm the observations as seen in earlier univariate and bivariate plots. The highest frequency of rides is seen on Thursday evenings among the 30's age group and the lowest frequencies are seen throughout late nights.

# #### Duration, day, period of day

# In[97]:


plt.figure(figsize = [14,6])
sb.stripplot(data=bike.query('duration_min < 90'), x='day',y='duration_min', hue='day_period',palette='Pastel2', size=1, jitter=0.35, dodge=True)
plt.legend(loc=6, bbox_to_anchor=(1.0,0.5));


# In[98]:


g = sb.FacetGrid(data = bike, col = 'day', row = 'day_period',margin_titles = True)
bins = np.arange(0,80,5)
g.map(plt.hist, 'duration_min', bins = bins, edgecolor = 'black');


# A similar observation is noted for the ride duration, day and period of day features. enhance each other and only further .The highest frequency of rides is seen on Thursday evenings for the 5-10 minute bike rides, while the lowest frequencies of rides are generally seen throughout late nights.

# ### Relationships observed in this part of the investigation, features that strengthened each other in terms of my feature(s) of interest.
# 
# > The day,period of day and age features enhance each other and only further confirm the observations as seen in earlier univariate and bivariate plots. The highest frequency of rides is seen on Thursday evenings among the 30's age group.
# 
# > A similar observation is noted for the ride duration, day and period of day features. enhance each other and only further .The highest frequency of rides is seen on Thursday evenings for the 5-10 minute bike rides.
# 
# > Comparing the distribution sizes of the age and ride duration plots, I'd say that age has a larger effect on the frequency of bike rides than ride duration does.
# 
# ### Comments on any interesting or surprising interactions between features.
# 
# > There were no particularly surprising interactions between features. The multivariate plots mostly enhanced what was seen in the earlier univariate and bivariate plots.