#!/usr/bin/env python
# coding: utf-8

# ### Import the necessary libraries

# In[1]:


import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sn 
from pandas.api.types import CategoricalDtype


# ### Merge all twelve csv files into one (January to december 2021)

# In[2]:


## Accessing the path to all the files. The files must all have a similar naming structure. All my files ended in "data.csv"
joined_files = os.path.join("C:\\Users\\jeyjo\\Desktop\\GOOGLE DATA ANALYTICS CERTIFICATE PROGRAM\\CASE STUDY\\Bikeshare\\CYCLISTIC DATA","*data.csv") # merging the files


# In[3]:


# A list of all the files is returned
joined_list = glob.glob(joined_files) 


# In[4]:


# This appends all the files into one dataframe
mergeddata = pd.concat(map(pd.read_csv, joined_list), ignore_index=True) # Finally, the files are joined
mergeddata


# ## DATA CLEANING & WRANGLING

# In[5]:


## To know the number of rows and columns
mergeddata.shape


# In[6]:


## Checking for null values
mergeddata.isnull().sum()


# In[7]:


## Dropping columns that are not needed
mergeddata.drop(mergeddata.iloc[:, 4:12], inplace= True, axis=1)


# In[8]:


mergeddata


# ### GETTING TO KNOW THE DATA

# In[9]:


## Checking for duplicates
mergeddata.duplicated().sum()


# In[10]:


mergeddata.describe()


# In[11]:


mergeddata.rideable_type.unique()


# In[12]:


mergeddata.member_casual.unique()


# In[13]:


mergeddata.dtypes


# #### DATA WRANGLING AND MANIPULATION

# In[14]:


## Converting into datetime datatypes for the respective columns
mergeddata['started_at'] = pd.to_datetime(mergeddata['started_at'])
mergeddata['ended_at'] = pd.to_datetime(mergeddata['ended_at'])


# In[15]:


mergeddata = mergeddata.rename(columns = {'member_casual':'membership'})


# In[16]:


## Feature extraction
mergeddata['day'] = mergeddata['started_at'].dt.day_name()
mergeddata['month'] = mergeddata['ended_at'].dt.month_name()
mergeddata['hour'] = mergeddata['started_at'].dt.hour
mergeddata['ride_length'] = (mergeddata['ended_at'] - mergeddata['started_at']).dt.total_seconds()/60


# In[19]:


mergeddata


# In[20]:


## Sorting the ride length in ascending order
mergeddata.sort_values(by=['ride_length'], axis = 0, inplace = True)
mergeddata.head(5)


# In[21]:


## The total number of negative ride_lengths and those less than 60 seconds.
mergeddata[mergeddata['ride_length']<1]['ride_length'].count()


# ## OR

# In[22]:


## The number of negative values and those less than 60 seconds
len(mergeddata[mergeddata['ride_length']<1])

## 85233 is only 1.5% of the total data thus can be deleted. 


# In[23]:


## Assign the negative values and those less than 60 seconds to a variable
Bad_Data = mergeddata[ (mergeddata['ride_length']<1)].index


# In[24]:


## Drop the bad data
mergeddata.drop(Bad_Data, inplace=True)


# In[27]:


## To reset the index
mergeddata.reset_index(drop=True).head(5)


# In[28]:


## To ascertain the right number of rows after dropping the bad data. i.e, 5595063 - 85233
mergeddata.shape[0]


# In[29]:


## Inspect the data since new fields have been populated
mergeddata.isna().sum()


# In[30]:


mergeddata.dtypes


# In[31]:


## A defined and ordered category for month and day ( This allows the months and days to appear in order when plotting)
Months = ["Januray",'February','March', 'April', 'May', 'June','July','August','September','October','November','December']
category1_type = CategoricalDtype(categories=Months, ordered=True) ## created an ordered category from the list "months"
mergeddata['month'] = mergeddata['month'].astype(category1_type)

Days = ["Monday",'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']
category2_type = CategoricalDtype(categories=Days, ordered=True)
mergeddata['day']=mergeddata['day'].astype(category2_type)

mergeddata['ride_length'] = mergeddata['ride_length'].astype(int)


# In[32]:


mergeddata.dtypes


# # ANALYSIS AND VISUALIZATIONS

# #### ANALYZING THE DIFFERENCE BETWEEN CASUAL RIDERS AND MEMBERS

# #### 1. The difference in total rides per each member type

# In[33]:


mergeddata.groupby(['membership'])['ride_id'].count()


# In[34]:


sn.set_theme(style="darkgrid")
plt.figure(figsize=(12,6))
sn.countplot(y="membership", data = mergeddata, hue="membership")
plt.ticklabel_format(style='plain', axis="x") ## to change from scientific notation
plt.xlabel("Number of Rides")
plt.ylabel("Membership Type")
plt.title("TOTAL NUMBER OF RIDES BETWEEN MEMBERS IN 2021 ")
plt.xticks(rotation = 45, horizontalalignment = "right")
plt.show()


# #### 2. The total ride lengths (Casual vs. Members) in 2021

# In[35]:


df1 = mergeddata.groupby(['membership'], as_index=False)['ride_length'].sum()


# In[36]:


sn.set_theme(style="whitegrid")
plt.figure(figsize=(12,6))
sn.barplot(data = df1, x='ride_length', y ='membership', hue = 'membership')
plt.ticklabel_format(style='plain', axis="x") ## to change from scientific notation
plt.title('TOTAL RIDE LENGTH IN 2021 (CASUAL RIDERS VS. MEMBERS)')
plt.xlabel('TOTAL RIDE LENGTH IN MINUTES')
plt.ylabel('MEMBER TYPE')
plt.xticks(rotation = 45, horizontalalignment = "right")
plt.show()


# #### 3. Total number of rides in each month between casual riders and members

# In[37]:


pivot_1 = mergeddata.pivot_table(
index=["month","membership"],
values="ride_id",
aggfunc= ['count'],
margins=True,
margins_name= 'Total Count')
pivot_1


# In[38]:


pivot1_df = mergeddata.groupby(['month','membership'], as_index=False)[['ride_id']].count()
plt.figure(figsize=(18,6))
sn.lineplot(data = pivot1_df , x = "month", y='ride_id', hue = "membership")
plt.xlabel("Month")
plt.ylabel('No. of Rides')
plt.title("Total Monthly Rides Between Casual Riders and Members")
plt.show()


# #### 4. The most popular type of ride between the members

# In[39]:


pivot_2 = mergeddata.pivot_table(
    index=['rideable_type', 'membership'],
    values=['ride_id'],
    aggfunc=['count'],
    margins=True,
    margins_name= "Total_Count"
)
pivot_2


# In[40]:


pivot2_df = mergeddata.groupby(['rideable_type','membership'], as_index=False)['ride_id'].count()
plt.figure(figsize=(12,6))
sn.barplot(data = pivot2_df, x= "rideable_type", y='ride_id', hue='membership')
plt.ticklabel_format(style='plain', axis="y")
plt.title('THE MOST POPULAR RIDE TYPES BETWEEN MEMBER TYPES')
plt.xlabel('RIDE TYPE')
plt.ylabel('NO. OF RIDES')
plt.show()


# #### 5. Total usage per day between casual riders and members

# In[41]:


pivot_3 = mergeddata.pivot_table(
    index=['day','membership'],
    values=['ride_id'],
    aggfunc=['count'],
    margins=True,
    margins_name= "Total"
)
pivot_3


# In[42]:


pivot3_df=mergeddata.groupby(['day','membership'], as_index=False)['ride_id'].count()
plt.figure(figsize=(12,6))
sn.lineplot(data = pivot3_df, x="day", y='ride_id', hue='membership', markers=True)
plt.title('TOTAL RIDES FOR EACH DAY OF THE WEEK (CASUAL VS. MEMBERS)')
plt.xlabel('DAYS OF THE WEEK')
plt.ylabel('TOTAL RIDES')
plt.show()


# #### 6. The average / total number of rides per hour

# In[43]:


pivot_4 = mergeddata.pivot_table(
    index=['hour','membership'],
    values=['ride_id'],
    aggfunc=['count'],
    margins=True,
    margins_name="Total Count"
)
pivot_4


# In[44]:


pivot4_df = mergeddata.groupby(['hour','membership'], as_index=False)['ride_id'].count()
plt.figure(figsize=(12,6))
sn.lineplot(data=pivot4_df, x="hour",y='ride_id', hue='membership')
plt.title('NO. OF RIDES PER HOUR (CASUAL VS. MEMBERS)')
plt.xlabel('HOUR')
plt.ylabel('NO. OF RIDES')
plt.show()


# #### 7. Average ride length per hour

# In[45]:


pivot_5 = mergeddata.pivot_table(
    index=['hour','membership'],
    values=['ride_length'],
    aggfunc=['mean']
)
pivot_5


# In[46]:


pivot5_df = mergeddata.groupby(['hour','membership'], as_index=False)['ride_length'].mean()
plt.figure(figsize=(12,6))
sn.lineplot(data = pivot5_df, x='hour', y='ride_length', hue='membership')
plt.title('AVERAGE RIDE LENGTH PER HOUR (CASUAL VS. MEMBERS)')
plt.xlabel('HOUR')
plt.ylabel('AVERAGE RIDE LENGTH')
plt.show()


# #### 8. The average ride length per day

# In[47]:


pivot_6 = mergeddata.pivot_table(
    index=['day','membership'],
    values=['ride_length'],
    aggfunc=['mean'],
    margins=True,
    margins_name="Total Average"
)
pivot_6


# In[48]:


pivot6_df = mergeddata.groupby(['day','membership'], as_index=False)['ride_length'].mean()
plt.figure(figsize=(12,6))
sn.lineplot(data = pivot6_df, x='day', y='ride_length', hue='membership')
plt.title('AVERAGE RIDE LENGTH PER DAY (CASUAL VS. MEMBERS)')
plt.xlabel('DAY')
plt.ylabel('AVERAGE RIDE LENGTH')
plt.show()


# #### 9. Average ride length per month

# In[49]:


pivot_7 = mergeddata.pivot_table(
    index=['month','membership'],
    values=['ride_length'],
    aggfunc=['mean'],
    margins=True,
    margins_name="Total Average"
)
pivot_7


# In[52]:


pivot7_df = mergeddata.groupby(['month','membership'], as_index=False)['ride_length'].mean()
plt.figure(figsize=(12,6))
sn.barplot(data = pivot7_df, x='month', y='ride_length', hue='membership')
plt.title('AVERAGE RIDE LENGTH PER MONTH (CASUAL VS. MEMBERS)')
plt.xlabel('MONTH')
plt.ylabel('AVERAGE RIDE LENGTH')
plt.show()