import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sn
from pandas.api.types import CategoricalDtype
## Accessing the path to all the files. The files must all have a similar naming structure. All my files ended in "data.csv"
joined_files = os.path.join("C:\\Users\\jeyjo\\Desktop\\GOOGLE DATA ANALYTICS CERTIFICATE PROGRAM\\CASE STUDY\\Bikeshare\\CYCLISTIC DATA","*data.csv") # merging the files
# A list of all the files is returned
joined_list = glob.glob(joined_files)
# This appends all the files into one dataframe
mergeddata = pd.concat(map(pd.read_csv, joined_list), ignore_index=True) # Finally, the files are joined
mergeddata
ride_id | rideable_type | started_at | ended_at | start_station_name | start_station_id | end_station_name | end_station_id | start_lat | start_lng | end_lat | end_lng | member_casual | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | E19E6F1B8D4C42ED | electric_bike | 2021-01-23 16:14:19 | 2021-01-23 16:24:44 | California Ave & Cortez St | 17660 | NaN | NaN | 41.900341 | -87.696743 | 41.890000 | -87.720000 | member |
1 | DC88F20C2C55F27F | electric_bike | 2021-01-27 18:43:08 | 2021-01-27 18:47:12 | California Ave & Cortez St | 17660 | NaN | NaN | 41.900333 | -87.696707 | 41.900000 | -87.690000 | member |
2 | EC45C94683FE3F27 | electric_bike | 2021-01-21 22:35:54 | 2021-01-21 22:37:14 | California Ave & Cortez St | 17660 | NaN | NaN | 41.900313 | -87.696643 | 41.900000 | -87.700000 | member |
3 | 4FA453A75AE377DB | electric_bike | 2021-01-07 13:31:13 | 2021-01-07 13:42:55 | California Ave & Cortez St | 17660 | NaN | NaN | 41.900399 | -87.696662 | 41.920000 | -87.690000 | member |
4 | BE5E8EB4E7263A0B | electric_bike | 2021-01-23 02:24:02 | 2021-01-23 02:24:45 | California Ave & Cortez St | 17660 | NaN | NaN | 41.900326 | -87.696697 | 41.900000 | -87.700000 | casual |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5595058 | 847431F3D5353AB7 | electric_bike | 2021-12-12 13:36:55 | 2021-12-12 13:56:08 | Canal St & Madison St | 13341 | NaN | NaN | 41.882289 | -87.639752 | 41.890000 | -87.610000 | casual |
5595059 | CF407BBC3B9FAD63 | electric_bike | 2021-12-06 19:37:50 | 2021-12-06 19:44:51 | Canal St & Madison St | 13341 | Kingsbury St & Kinzie St | KA1503000043 | 41.882123 | -87.640053 | 41.889106 | -87.638862 | member |
5595060 | 60BB69EBF5440E92 | electric_bike | 2021-12-02 08:57:04 | 2021-12-02 09:05:21 | Canal St & Madison St | 13341 | Dearborn St & Monroe St | TA1305000006 | 41.881956 | -87.639955 | 41.880254 | -87.629603 | member |
5595061 | C414F654A28635B8 | electric_bike | 2021-12-13 09:00:26 | 2021-12-13 09:14:39 | Lawndale Ave & 16th St | 362.0 | NaN | NaN | 41.860000 | -87.720000 | 41.850000 | -87.710000 | member |
5595062 | 37AC57E34B2E7E97 | classic_bike | 2021-12-13 08:45:32 | 2021-12-13 08:49:09 | Michigan Ave & Jackson Blvd | TA1309000002 | Dearborn St & Monroe St | TA1305000006 | 41.877850 | -87.624080 | 41.881320 | -87.629521 | member |
5595063 rows × 13 columns
## To know the number of rows and columns
mergeddata.shape
(5595063, 13)
## Checking for null values
mergeddata.isnull().sum()
ride_id 0 rideable_type 0 started_at 0 ended_at 0 start_station_name 690809 start_station_id 690806 end_station_name 739170 end_station_id 739170 start_lat 0 start_lng 0 end_lat 4771 end_lng 4771 member_casual 0 dtype: int64
## Dropping columns that are not needed
mergeddata.drop(mergeddata.iloc[:, 4:12], inplace= True, axis=1)
mergeddata
ride_id | rideable_type | started_at | ended_at | member_casual | |
---|---|---|---|---|---|
0 | E19E6F1B8D4C42ED | electric_bike | 2021-01-23 16:14:19 | 2021-01-23 16:24:44 | member |
1 | DC88F20C2C55F27F | electric_bike | 2021-01-27 18:43:08 | 2021-01-27 18:47:12 | member |
2 | EC45C94683FE3F27 | electric_bike | 2021-01-21 22:35:54 | 2021-01-21 22:37:14 | member |
3 | 4FA453A75AE377DB | electric_bike | 2021-01-07 13:31:13 | 2021-01-07 13:42:55 | member |
4 | BE5E8EB4E7263A0B | electric_bike | 2021-01-23 02:24:02 | 2021-01-23 02:24:45 | casual |
... | ... | ... | ... | ... | ... |
5595058 | 847431F3D5353AB7 | electric_bike | 2021-12-12 13:36:55 | 2021-12-12 13:56:08 | casual |
5595059 | CF407BBC3B9FAD63 | electric_bike | 2021-12-06 19:37:50 | 2021-12-06 19:44:51 | member |
5595060 | 60BB69EBF5440E92 | electric_bike | 2021-12-02 08:57:04 | 2021-12-02 09:05:21 | member |
5595061 | C414F654A28635B8 | electric_bike | 2021-12-13 09:00:26 | 2021-12-13 09:14:39 | member |
5595062 | 37AC57E34B2E7E97 | classic_bike | 2021-12-13 08:45:32 | 2021-12-13 08:49:09 | member |
5595063 rows × 5 columns
## Checking for duplicates
mergeddata.duplicated().sum()
0
mergeddata.describe()
ride_id | rideable_type | started_at | ended_at | member_casual | |
---|---|---|---|---|---|
count | 5595063 | 5595063 | 5595063 | 5595063 | 5595063 |
unique | 5595063 | 3 | 4677998 | 4671372 | 2 |
top | E19E6F1B8D4C42ED | classic_bike | 2021-07-16 16:59:57 | 2021-09-16 08:39:10 | member |
freq | 1 | 3251028 | 7 | 17 | 3066058 |
mergeddata.rideable_type.unique()
array(['electric_bike', 'classic_bike', 'docked_bike'], dtype=object)
mergeddata.member_casual.unique()
array(['member', 'casual'], dtype=object)
mergeddata.dtypes
ride_id object rideable_type object started_at object ended_at object member_casual object dtype: object
## Converting into datetime datatypes for the respective columns
mergeddata['started_at'] = pd.to_datetime(mergeddata['started_at'])
mergeddata['ended_at'] = pd.to_datetime(mergeddata['ended_at'])
mergeddata = mergeddata.rename(columns = {'member_casual':'membership'})
## Feature extraction
mergeddata['day'] = mergeddata['started_at'].dt.day_name()
mergeddata['month'] = mergeddata['ended_at'].dt.month_name()
mergeddata['hour'] = mergeddata['started_at'].dt.hour
mergeddata['ride_length'] = (mergeddata['ended_at'] - mergeddata['started_at']).dt.total_seconds()/60
mergeddata
ride_id | rideable_type | started_at | ended_at | membership | day | month | hour | ride_length | |
---|---|---|---|---|---|---|---|---|---|
0 | E19E6F1B8D4C42ED | electric_bike | 2021-01-23 16:14:19 | 2021-01-23 16:24:44 | member | Saturday | January | 16 | 10.416667 |
1 | DC88F20C2C55F27F | electric_bike | 2021-01-27 18:43:08 | 2021-01-27 18:47:12 | member | Wednesday | January | 18 | 4.066667 |
2 | EC45C94683FE3F27 | electric_bike | 2021-01-21 22:35:54 | 2021-01-21 22:37:14 | member | Thursday | January | 22 | 1.333333 |
3 | 4FA453A75AE377DB | electric_bike | 2021-01-07 13:31:13 | 2021-01-07 13:42:55 | member | Thursday | January | 13 | 11.700000 |
4 | BE5E8EB4E7263A0B | electric_bike | 2021-01-23 02:24:02 | 2021-01-23 02:24:45 | casual | Saturday | January | 2 | 0.716667 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5595058 | 847431F3D5353AB7 | electric_bike | 2021-12-12 13:36:55 | 2021-12-12 13:56:08 | casual | Sunday | December | 13 | 19.216667 |
5595059 | CF407BBC3B9FAD63 | electric_bike | 2021-12-06 19:37:50 | 2021-12-06 19:44:51 | member | Monday | December | 19 | 7.016667 |
5595060 | 60BB69EBF5440E92 | electric_bike | 2021-12-02 08:57:04 | 2021-12-02 09:05:21 | member | Thursday | December | 8 | 8.283333 |
5595061 | C414F654A28635B8 | electric_bike | 2021-12-13 09:00:26 | 2021-12-13 09:14:39 | member | Monday | December | 9 | 14.216667 |
5595062 | 37AC57E34B2E7E97 | classic_bike | 2021-12-13 08:45:32 | 2021-12-13 08:49:09 | member | Monday | December | 8 | 3.616667 |
5595063 rows × 9 columns
## Sorting the ride length in ascending order
mergeddata.sort_values(by=['ride_length'], axis = 0, inplace = True)
mergeddata.head(5)
ride_id | rideable_type | started_at | ended_at | membership | day | month | hour | ride_length | |
---|---|---|---|---|---|---|---|---|---|
5044266 | 7CA158F5F050156E | electric_bike | 2021-11-07 01:58:08 | 2021-11-07 01:00:06 | casual | Sunday | November | 1 | -58.033333 |
5129636 | FD8AF7324ABAE9DA | electric_bike | 2021-11-07 01:56:51 | 2021-11-07 01:00:57 | casual | Sunday | November | 1 | -55.900000 |
5113400 | 508B09A5FB0737DC | classic_bike | 2021-11-07 01:54:50 | 2021-11-07 01:00:45 | member | Sunday | November | 1 | -54.083333 |
5248365 | 6F9E76F5EDAAC1B8 | electric_bike | 2021-11-07 01:55:42 | 2021-11-07 01:01:55 | member | Sunday | November | 1 | -53.783333 |
5270218 | 7AECC76D1562B51C | classic_bike | 2021-11-07 01:54:58 | 2021-11-07 01:01:29 | casual | Sunday | November | 1 | -53.483333 |
## The total number of negative ride_lengths and those less than 60 seconds.
mergeddata[mergeddata['ride_length']<1]['ride_length'].count()
85233
## The number of negative values and those less than 60 seconds
len(mergeddata[mergeddata['ride_length']<1])
## 85233 is only 1.5% of the total data thus can be deleted.
85233
## Assign the negative values and those less than 60 seconds to a variable
Bad_Data = mergeddata[ (mergeddata['ride_length']<1)].index
## Drop the bad data
mergeddata.drop(Bad_Data, inplace=True)
## To reset the index
mergeddata.reset_index(drop=True).head(5)
ride_id | rideable_type | started_at | ended_at | membership | day | month | hour | ride_length | |
---|---|---|---|---|---|---|---|---|---|
0 | A18D0099C32F7239 | classic_bike | 2021-08-13 13:40:35 | 2021-08-13 13:41:35 | member | Friday | August | 13 | 1.0 |
1 | 5F884BFB1A6265FD | electric_bike | 2021-05-27 14:14:35 | 2021-05-27 14:15:35 | member | Thursday | May | 14 | 1.0 |
2 | ACD14297A4F64102 | electric_bike | 2021-10-29 23:55:17 | 2021-10-29 23:56:17 | member | Friday | October | 23 | 1.0 |
3 | CFDB24DFFD0A4CA9 | electric_bike | 2021-11-07 22:50:56 | 2021-11-07 22:51:56 | member | Sunday | November | 22 | 1.0 |
4 | B6E3BD1790D2BC64 | electric_bike | 2021-09-18 11:15:51 | 2021-09-18 11:16:51 | member | Saturday | September | 11 | 1.0 |
## To ascertain the right number of rows after dropping the bad data. i.e, 5595063 - 85233
mergeddata.shape[0]
5509830
## Inspect the data since new fields have been populated
mergeddata.isna().sum()
ride_id 0 rideable_type 0 started_at 0 ended_at 0 membership 0 day 0 month 0 hour 0 ride_length 0 dtype: int64
mergeddata.dtypes
ride_id object rideable_type object started_at datetime64[ns] ended_at datetime64[ns] membership object day object month object hour int64 ride_length float64 dtype: object
## A defined and ordered category for month and day ( This allows the months and days to appear in order when plotting)
Months = ["Januray",'February','March', 'April', 'May', 'June','July','August','September','October','November','December']
category1_type = CategoricalDtype(categories=Months, ordered=True) ## created an ordered category from the list "months"
mergeddata['month'] = mergeddata['month'].astype(category1_type)
Days = ["Monday",'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']
category2_type = CategoricalDtype(categories=Days, ordered=True)
mergeddata['day']=mergeddata['day'].astype(category2_type)
mergeddata['ride_length'] = mergeddata['ride_length'].astype(int)
mergeddata.dtypes
ride_id object rideable_type object started_at datetime64[ns] ended_at datetime64[ns] membership object day category month category hour int64 ride_length int32 dtype: object
mergeddata.groupby(['membership'])['ride_id'].count()
membership casual 2494892 member 3014938 Name: ride_id, dtype: int64
sn.set_theme(style="darkgrid")
plt.figure(figsize=(12,6))
sn.countplot(y="membership", data = mergeddata, hue="membership")
plt.ticklabel_format(style='plain', axis="x") ## to change from scientific notation
plt.xlabel("Number of Rides")
plt.ylabel("Membership Type")
plt.title("TOTAL NUMBER OF RIDES BETWEEN MEMBERS IN 2021 ")
plt.xticks(rotation = 45, horizontalalignment = "right")
plt.show()
df1 = mergeddata.groupby(['membership'], as_index=False)['ride_length'].sum()
sn.set_theme(style="whitegrid")
plt.figure(figsize=(12,6))
sn.barplot(data = df1, x='ride_length', y ='membership', hue = 'membership')
plt.ticklabel_format(style='plain', axis="x") ## to change from scientific notation
plt.title('TOTAL RIDE LENGTH IN 2021 (CASUAL RIDERS VS. MEMBERS)')
plt.xlabel('TOTAL RIDE LENGTH IN MINUTES')
plt.ylabel('MEMBER TYPE')
plt.xticks(rotation = 45, horizontalalignment = "right")
plt.show()
pivot_1 = mergeddata.pivot_table(
index=["month","membership"],
values="ride_id",
aggfunc= ['count'],
margins=True,
margins_name= 'Total Count')
pivot_1
count | ||
---|---|---|
ride_id | ||
month | membership | |
Januray | casual | 0 |
member | 0 | |
February | casual | 9957 |
member | 38684 | |
March | casual | 83136 |
member | 142366 | |
April | casual | 134894 |
member | 197459 | |
May | casual | 253253 |
member | 269902 | |
June | casual | 365040 |
member | 352661 | |
July | casual | 435622 |
member | 373781 | |
August | casual | 407903 |
member | 385494 | |
September | casual | 359206 |
member | 385949 | |
October | casual | 253857 |
member | 367474 | |
November | casual | 105423 |
member | 248701 | |
December | casual | 68703 |
member | 174880 | |
Total Count | 5414345 |
pivot1_df = mergeddata.groupby(['month','membership'], as_index=False)[['ride_id']].count()
plt.figure(figsize=(18,6))
sn.lineplot(data = pivot1_df , x = "month", y='ride_id', hue = "membership")
plt.xlabel("Month")
plt.ylabel('No. of Rides')
plt.title("Total Monthly Rides Between Casual Riders and Members")
plt.show()
pivot_2 = mergeddata.pivot_table(
index=['rideable_type', 'membership'],
values=['ride_id'],
aggfunc=['count'],
margins=True,
margins_name= "Total_Count"
)
pivot_2
count | ||
---|---|---|
ride_id | ||
rideable_type | membership | |
classic_bike | casual | 1253395 |
member | 1955673 | |
docked_bike | casual | 310390 |
member | 1 | |
electric_bike | casual | 931107 |
member | 1059264 | |
Total_Count | 5509830 |
pivot2_df = mergeddata.groupby(['rideable_type','membership'], as_index=False)['ride_id'].count()
plt.figure(figsize=(12,6))
sn.barplot(data = pivot2_df, x= "rideable_type", y='ride_id', hue='membership')
plt.ticklabel_format(style='plain', axis="y")
plt.title('THE MOST POPULAR RIDE TYPES BETWEEN MEMBER TYPES')
plt.xlabel('RIDE TYPE')
plt.ylabel('NO. OF RIDES')
plt.show()
pivot_3 = mergeddata.pivot_table(
index=['day','membership'],
values=['ride_id'],
aggfunc=['count'],
margins=True,
margins_name= "Total"
)
pivot_3
count | ||
---|---|---|
ride_id | ||
day | membership | |
Monday | casual | 282545 |
member | 409354 | |
Tuesday | casual | 270713 |
member | 458153 | |
Wednesday | casual | 275188 |
member | 469601 | |
Thursday | casual | 282311 |
member | 444440 | |
Friday | casual | 359117 |
member | 438997 | |
Saturday | casual | 550484 |
member | 425175 | |
Sunday | casual | 474534 |
member | 369218 | |
Total | 5509830 |
pivot3_df=mergeddata.groupby(['day','membership'], as_index=False)['ride_id'].count()
plt.figure(figsize=(12,6))
sn.lineplot(data = pivot3_df, x="day", y='ride_id', hue='membership', markers=True)
plt.title('TOTAL RIDES FOR EACH DAY OF THE WEEK (CASUAL VS. MEMBERS)')
plt.xlabel('DAYS OF THE WEEK')
plt.ylabel('TOTAL RIDES')
plt.show()
pivot_4 = mergeddata.pivot_table(
index=['hour','membership'],
values=['ride_id'],
aggfunc=['count'],
margins=True,
margins_name="Total Count"
)
pivot_4
count | ||
---|---|---|
ride_id | ||
hour | membership | |
0 | casual | 53099 |
member | 32481 | |
1 | casual | 38659 |
member | 21442 | |
2 | casual | 25179 |
member | 12187 | |
3 | casual | 13875 |
member | 7026 | |
4 | casual | 9801 |
member | 8006 | |
5 | casual | 12358 |
member | 29604 | |
6 | casual | 25406 |
member | 80649 | |
7 | casual | 46152 |
member | 147619 | |
8 | casual | 62780 |
member | 170673 | |
9 | casual | 75191 |
member | 128767 | |
10 | casual | 103332 |
member | 127090 | |
11 | casual | 134131 |
member | 152778 | |
12 | casual | 159795 |
member | 177628 | |
13 | casual | 171071 |
member | 174798 | |
14 | casual | 176047 |
member | 171589 | |
15 | casual | 186070 |
member | 197620 | |
16 | casual | 202590 |
member | 253307 | |
17 | casual | 233544 |
member | 315080 | |
18 | casual | 210980 |
member | 267020 | |
19 | casual | 164053 |
member | 191027 | |
20 | casual | 120428 |
member | 129219 | |
21 | casual | 102392 |
member | 95424 | |
22 | casual | 94922 |
member | 73217 | |
23 | casual | 73037 |
member | 50687 | |
Total Count | 5509830 |
pivot4_df = mergeddata.groupby(['hour','membership'], as_index=False)['ride_id'].count()
plt.figure(figsize=(12,6))
sn.lineplot(data=pivot4_df, x="hour",y='ride_id', hue='membership')
plt.title('NO. OF RIDES PER HOUR (CASUAL VS. MEMBERS)')
plt.xlabel('HOUR')
plt.ylabel('NO. OF RIDES')
plt.show()
pivot_5 = mergeddata.pivot_table(
index=['hour','membership'],
values=['ride_length'],
aggfunc=['mean']
)
pivot_5
mean | ||
---|---|---|
ride_length | ||
hour | membership | |
0 | casual | 34.199288 |
member | 13.401496 | |
1 | casual | 37.131638 |
member | 14.591036 | |
2 | casual | 43.410858 |
member | 15.347994 | |
3 | casual | 44.989766 |
member | 16.209081 | |
4 | casual | 46.986124 |
member | 13.251187 | |
5 | casual | 23.533662 |
member | 11.510505 | |
6 | casual | 21.044753 |
member | 11.865367 | |
7 | casual | 20.712754 |
member | 11.991573 | |
8 | casual | 23.822029 |
member | 11.869681 | |
9 | casual | 28.320796 |
member | 12.311291 | |
10 | casual | 33.256397 |
member | 13.243843 | |
11 | casual | 33.803051 |
member | 13.437772 | |
12 | casual | 33.372064 |
member | 13.185686 | |
13 | casual | 34.431452 |
member | 13.413260 | |
14 | casual | 34.729322 |
member | 13.912110 | |
15 | casual | 33.977154 |
member | 13.793695 | |
16 | casual | 31.500844 |
member | 13.835843 | |
17 | casual | 29.669514 |
member | 14.002295 | |
18 | casual | 29.807636 |
member | 13.861512 | |
19 | casual | 30.249048 |
member | 13.648652 | |
20 | casual | 32.024513 |
member | 13.576881 | |
21 | casual | 32.172982 |
member | 13.229376 | |
22 | casual | 32.355745 |
member | 13.406040 | |
23 | casual | 33.459534 |
member | 13.930534 |
pivot5_df = mergeddata.groupby(['hour','membership'], as_index=False)['ride_length'].mean()
plt.figure(figsize=(12,6))
sn.lineplot(data = pivot5_df, x='hour', y='ride_length', hue='membership')
plt.title('AVERAGE RIDE LENGTH PER HOUR (CASUAL VS. MEMBERS)')
plt.xlabel('HOUR')
plt.ylabel('AVERAGE RIDE LENGTH')
plt.show()
pivot_6 = mergeddata.pivot_table(
index=['day','membership'],
values=['ride_length'],
aggfunc=['mean'],
margins=True,
margins_name="Total Average"
)
pivot_6
mean | ||
---|---|---|
ride_length | ||
day | membership | |
Monday | casual | 31.811113 |
member | 12.971457 | |
Tuesday | casual | 27.855685 |
member | 12.495476 | |
Wednesday | casual | 27.538279 |
member | 12.529075 | |
Thursday | casual | 27.574328 |
member | 12.482677 | |
Friday | casual | 30.271313 |
member | 13.052720 | |
Saturday | casual | 34.683591 |
member | 15.048752 | |
Sunday | casual | 37.590295 |
member | 15.451665 | |
Total Average | 21.777986 |
pivot6_df = mergeddata.groupby(['day','membership'], as_index=False)['ride_length'].mean()
plt.figure(figsize=(12,6))
sn.lineplot(data = pivot6_df, x='day', y='ride_length', hue='membership')
plt.title('AVERAGE RIDE LENGTH PER DAY (CASUAL VS. MEMBERS)')
plt.xlabel('DAY')
plt.ylabel('AVERAGE RIDE LENGTH')
plt.show()
pivot_7 = mergeddata.pivot_table(
index=['month','membership'],
values=['ride_length'],
aggfunc=['mean'],
margins=True,
margins_name="Total Average"
)
pivot_7
mean | ||
---|---|---|
ride_length | ||
month | membership | |
February | casual | 47.169529 |
member | 17.923560 | |
March | casual | 37.004282 |
member | 13.671298 | |
April | casual | 37.498169 |
member | 14.398336 | |
May | casual | 36.609130 |
member | 14.399063 | |
June | casual | 36.323291 |
member | 14.444180 | |
July | casual | 33.448024 |
member | 13.982372 | |
August | casual | 30.148324 |
member | 13.850942 | |
September | casual | 27.644040 |
member | 13.450091 | |
October | casual | 27.982116 |
member | 12.212848 | |
November | casual | 24.151210 |
member | 11.047925 | |
December | casual | 23.556206 |
member | 10.690502 | |
Total Average | 21.891146 |
pivot7_df = mergeddata.groupby(['month','membership'], as_index=False)['ride_length'].mean()
plt.figure(figsize=(12,6))
sn.barplot(data = pivot7_df, x='month', y='ride_length', hue='membership')
plt.title('AVERAGE RIDE LENGTH PER MONTH (CASUAL VS. MEMBERS)')
plt.xlabel('MONTH')
plt.ylabel('AVERAGE RIDE LENGTH')
plt.show()