#!/usr/bin/env python # coding: utf-8 # ### Import the necessary libraries # In[1]: import pandas as pd import os import glob import matplotlib.pyplot as plt import seaborn as sn from pandas.api.types import CategoricalDtype # ### Merge all twelve csv files into one (January to december 2021) # In[2]: ## Accessing the path to all the files. The files must all have a similar naming structure. All my files ended in "data.csv" joined_files = os.path.join("C:\\Users\\jeyjo\\Desktop\\GOOGLE DATA ANALYTICS CERTIFICATE PROGRAM\\CASE STUDY\\Bikeshare\\CYCLISTIC DATA","*data.csv") # merging the files # In[3]: # A list of all the files is returned joined_list = glob.glob(joined_files) # In[4]: # This appends all the files into one dataframe mergeddata = pd.concat(map(pd.read_csv, joined_list), ignore_index=True) # Finally, the files are joined mergeddata # ## DATA CLEANING & WRANGLING # In[5]: ## To know the number of rows and columns mergeddata.shape # In[6]: ## Checking for null values mergeddata.isnull().sum() # In[7]: ## Dropping columns that are not needed mergeddata.drop(mergeddata.iloc[:, 4:12], inplace= True, axis=1) # In[8]: mergeddata # ### GETTING TO KNOW THE DATA # In[9]: ## Checking for duplicates mergeddata.duplicated().sum() # In[10]: mergeddata.describe() # In[11]: mergeddata.rideable_type.unique() # In[12]: mergeddata.member_casual.unique() # In[13]: mergeddata.dtypes # #### DATA WRANGLING AND MANIPULATION # In[14]: ## Converting into datetime datatypes for the respective columns mergeddata['started_at'] = pd.to_datetime(mergeddata['started_at']) mergeddata['ended_at'] = pd.to_datetime(mergeddata['ended_at']) # In[15]: mergeddata = mergeddata.rename(columns = {'member_casual':'membership'}) # In[16]: ## Feature extraction mergeddata['day'] = mergeddata['started_at'].dt.day_name() mergeddata['month'] = mergeddata['ended_at'].dt.month_name() mergeddata['hour'] = mergeddata['started_at'].dt.hour mergeddata['ride_length'] = (mergeddata['ended_at'] - mergeddata['started_at']).dt.total_seconds()/60 # In[19]: mergeddata # In[20]: ## Sorting the ride length in ascending order mergeddata.sort_values(by=['ride_length'], axis = 0, inplace = True) mergeddata.head(5) # In[21]: ## The total number of negative ride_lengths and those less than 60 seconds. mergeddata[mergeddata['ride_length']<1]['ride_length'].count() # ## OR # In[22]: ## The number of negative values and those less than 60 seconds len(mergeddata[mergeddata['ride_length']<1]) ## 85233 is only 1.5% of the total data thus can be deleted. # In[23]: ## Assign the negative values and those less than 60 seconds to a variable Bad_Data = mergeddata[ (mergeddata['ride_length']<1)].index # In[24]: ## Drop the bad data mergeddata.drop(Bad_Data, inplace=True) # In[27]: ## To reset the index mergeddata.reset_index(drop=True).head(5) # In[28]: ## To ascertain the right number of rows after dropping the bad data. i.e, 5595063 - 85233 mergeddata.shape[0] # In[29]: ## Inspect the data since new fields have been populated mergeddata.isna().sum() # In[30]: mergeddata.dtypes # In[31]: ## A defined and ordered category for month and day ( This allows the months and days to appear in order when plotting) Months = ["Januray",'February','March', 'April', 'May', 'June','July','August','September','October','November','December'] category1_type = CategoricalDtype(categories=Months, ordered=True) ## created an ordered category from the list "months" mergeddata['month'] = mergeddata['month'].astype(category1_type) Days = ["Monday",'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday'] category2_type = CategoricalDtype(categories=Days, ordered=True) mergeddata['day']=mergeddata['day'].astype(category2_type) mergeddata['ride_length'] = mergeddata['ride_length'].astype(int) # In[32]: mergeddata.dtypes # # ANALYSIS AND VISUALIZATIONS # #### ANALYZING THE DIFFERENCE BETWEEN CASUAL RIDERS AND MEMBERS # #### 1. The difference in total rides per each member type # In[33]: mergeddata.groupby(['membership'])['ride_id'].count() # In[34]: sn.set_theme(style="darkgrid") plt.figure(figsize=(12,6)) sn.countplot(y="membership", data = mergeddata, hue="membership") plt.ticklabel_format(style='plain', axis="x") ## to change from scientific notation plt.xlabel("Number of Rides") plt.ylabel("Membership Type") plt.title("TOTAL NUMBER OF RIDES BETWEEN MEMBERS IN 2021 ") plt.xticks(rotation = 45, horizontalalignment = "right") plt.show() # #### 2. The total ride lengths (Casual vs. Members) in 2021 # In[35]: df1 = mergeddata.groupby(['membership'], as_index=False)['ride_length'].sum() # In[36]: sn.set_theme(style="whitegrid") plt.figure(figsize=(12,6)) sn.barplot(data = df1, x='ride_length', y ='membership', hue = 'membership') plt.ticklabel_format(style='plain', axis="x") ## to change from scientific notation plt.title('TOTAL RIDE LENGTH IN 2021 (CASUAL RIDERS VS. MEMBERS)') plt.xlabel('TOTAL RIDE LENGTH IN MINUTES') plt.ylabel('MEMBER TYPE') plt.xticks(rotation = 45, horizontalalignment = "right") plt.show() # #### 3. Total number of rides in each month between casual riders and members # In[37]: pivot_1 = mergeddata.pivot_table( index=["month","membership"], values="ride_id", aggfunc= ['count'], margins=True, margins_name= 'Total Count') pivot_1 # In[38]: pivot1_df = mergeddata.groupby(['month','membership'], as_index=False)[['ride_id']].count() plt.figure(figsize=(18,6)) sn.lineplot(data = pivot1_df , x = "month", y='ride_id', hue = "membership") plt.xlabel("Month") plt.ylabel('No. of Rides') plt.title("Total Monthly Rides Between Casual Riders and Members") plt.show() # #### 4. The most popular type of ride between the members # In[39]: pivot_2 = mergeddata.pivot_table( index=['rideable_type', 'membership'], values=['ride_id'], aggfunc=['count'], margins=True, margins_name= "Total_Count" ) pivot_2 # In[40]: pivot2_df = mergeddata.groupby(['rideable_type','membership'], as_index=False)['ride_id'].count() plt.figure(figsize=(12,6)) sn.barplot(data = pivot2_df, x= "rideable_type", y='ride_id', hue='membership') plt.ticklabel_format(style='plain', axis="y") plt.title('THE MOST POPULAR RIDE TYPES BETWEEN MEMBER TYPES') plt.xlabel('RIDE TYPE') plt.ylabel('NO. OF RIDES') plt.show() # #### 5. Total usage per day between casual riders and members # In[41]: pivot_3 = mergeddata.pivot_table( index=['day','membership'], values=['ride_id'], aggfunc=['count'], margins=True, margins_name= "Total" ) pivot_3 # In[42]: pivot3_df=mergeddata.groupby(['day','membership'], as_index=False)['ride_id'].count() plt.figure(figsize=(12,6)) sn.lineplot(data = pivot3_df, x="day", y='ride_id', hue='membership', markers=True) plt.title('TOTAL RIDES FOR EACH DAY OF THE WEEK (CASUAL VS. MEMBERS)') plt.xlabel('DAYS OF THE WEEK') plt.ylabel('TOTAL RIDES') plt.show() # #### 6. The average / total number of rides per hour # In[43]: pivot_4 = mergeddata.pivot_table( index=['hour','membership'], values=['ride_id'], aggfunc=['count'], margins=True, margins_name="Total Count" ) pivot_4 # In[44]: pivot4_df = mergeddata.groupby(['hour','membership'], as_index=False)['ride_id'].count() plt.figure(figsize=(12,6)) sn.lineplot(data=pivot4_df, x="hour",y='ride_id', hue='membership') plt.title('NO. OF RIDES PER HOUR (CASUAL VS. MEMBERS)') plt.xlabel('HOUR') plt.ylabel('NO. OF RIDES') plt.show() # #### 7. Average ride length per hour # In[45]: pivot_5 = mergeddata.pivot_table( index=['hour','membership'], values=['ride_length'], aggfunc=['mean'] ) pivot_5 # In[46]: pivot5_df = mergeddata.groupby(['hour','membership'], as_index=False)['ride_length'].mean() plt.figure(figsize=(12,6)) sn.lineplot(data = pivot5_df, x='hour', y='ride_length', hue='membership') plt.title('AVERAGE RIDE LENGTH PER HOUR (CASUAL VS. MEMBERS)') plt.xlabel('HOUR') plt.ylabel('AVERAGE RIDE LENGTH') plt.show() # #### 8. The average ride length per day # In[47]: pivot_6 = mergeddata.pivot_table( index=['day','membership'], values=['ride_length'], aggfunc=['mean'], margins=True, margins_name="Total Average" ) pivot_6 # In[48]: pivot6_df = mergeddata.groupby(['day','membership'], as_index=False)['ride_length'].mean() plt.figure(figsize=(12,6)) sn.lineplot(data = pivot6_df, x='day', y='ride_length', hue='membership') plt.title('AVERAGE RIDE LENGTH PER DAY (CASUAL VS. MEMBERS)') plt.xlabel('DAY') plt.ylabel('AVERAGE RIDE LENGTH') plt.show() # #### 9. Average ride length per month # In[49]: pivot_7 = mergeddata.pivot_table( index=['month','membership'], values=['ride_length'], aggfunc=['mean'], margins=True, margins_name="Total Average" ) pivot_7 # In[52]: pivot7_df = mergeddata.groupby(['month','membership'], as_index=False)['ride_length'].mean() plt.figure(figsize=(12,6)) sn.barplot(data = pivot7_df, x='month', y='ride_length', hue='membership') plt.title('AVERAGE RIDE LENGTH PER MONTH (CASUAL VS. MEMBERS)') plt.xlabel('MONTH') plt.ylabel('AVERAGE RIDE LENGTH') plt.show()