#!/usr/bin/env python # coding: utf-8 # # Part III : Exploratory Data Analyses # > Jump to : # * [Part 1](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part1.ipynb) *Extracting Street Addresses & Coordinates* # * [Part 2](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part2-forUpload.ipynb), *Extracting Foursquare Data* # * [Part 4](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part4.ipynb), *Clustering and Visualising* # * [Part 5](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part5.ipynb), *Conclusion & Discussion* # ## Step 1: Load up environment and data # In[9]: import pandas as pd import numpy as np # In[10]: streetTrikkBussMetroTog = pd.read_csv('./streetData_TrikkBusMetroTog.csv') streetTrikkBussMetroTog.head() # In[11]: streetTrikkBussMetroTog.shape # So, in all we have information on **2460** streets in Oslo, Norway # ## Step 2: Find total transport options # In[22]: #Check column type in the above dataframe streetTrikkBussMetroTog.columns.map(type) # In[35]: #To find total transport options/street, we sum the columns that are NOT null, followed by sorting in descending order to obtain streets with maximum number of transport options #First we must select the columns to sum on busCOLS = list(range(18)) #Now we define a function that can automatically add text to numbers, useful for colns of T-bane def makeColnName (list_of_colnNum): metroCOLSlist = [] for i in list_of_colnNum: #print('T-bane_'+str(i)) metroCOLSlist.append('T-bane_'+str(i))#Choose whatever colnName you want to prepend to the numbers return(metroCOLSlist) #Makes columns of T-bane COLSlist = makeColnName(list(range(1,5))) #Join T-bane and bus lists together with additional list of Trikk+Tog COLSlist = ['Trikk','Train Station']+ COLSlist + busCOLS COLSlist#Total 24 transport colns, out of which max comes 22...wow...a street with 22 options..?? #Check data type in the list #for i in COLSlist: # if (type(i)) == str: # print('YABBA',i) # else: # print('BOO',i) # # #Convert int types to str types def convertType (LIST): newLIST = list() for i in LIST: if type(i) == str: newLIST.append(i) else: #str(i) newLIST.append(str(i)) return newLIST COLSlist_str = convertType(COLSlist) #Check correctly formatted [type(i) for i in COLSlist_str] # In[41]: #Sort in descending order of non-null values i.e. highest transport to lowest streetTrikkBussMetroTog[COLSlist_str].notnull().sum(axis = 'columns').sort_values(ascending = False) #Also store this in a new coln named totalTransport streetTrikkBussMetroTog['Total Transport']= streetTrikkBussMetroTog[COLSlist_str].notnull().sum(axis = 'columns').sort_values(ascending = False) #Sort in descending order of non-null values i.e. highest transport to lowest # In[42]: #Obtain new index of the sorted order newIndex = streetTrikkBussMetroTog[COLSlist_str].notnull().sum(axis = 'columns').sort_values(ascending = False).index newIndex # In[43]: #If you want to see all the values of the original dataframe (not the sorted) streetTrikkBussMetroTog.index.values # In[44]: #Attempt to change old index to new custom ordered index orderedStreetTrikkBussMetroTog = streetTrikkBussMetroTog.iloc[newIndex,] #Right now, the index values will be like 554, 2354 etc, to get them in the order of 0,1,2... #Get range for the index values manualIndex2 = range(orderedStreetTrikkBussMetroTog.shape[0]) #Assign this new index orderedStreetTrikkBussMetroTog.index = manualIndex2 orderedStreetTrikkBussMetroTog.head() # In[48]: #Find total bus options, but first convert the busCOLS to str busCOLS_str = convertType(busCOLS) orderedStreetTrikkBussMetroTog[busCOLS_str].notnull().sum(axis = 'columns').sort_values(ascending = False) #Sort in descending order of non-null values i.e. highest transport to lowest #Add this info as a new column orderedStreetTrikkBussMetroTog['Total Bus'] = orderedStreetTrikkBussMetroTog[busCOLS_str].notnull().sum(axis = 'columns').sort_values(ascending = False) orderedStreetTrikkBussMetroTog.head() # In[54]: #Find total streets with least amount of transport options print ('A total of ', orderedStreetTrikkBussMetroTog[orderedStreetTrikkBussMetroTog['Total Transport'] == 0].shape[0],'streets seem to have NO transport option within 400m') # In[52]: orderedStreetTrikkBussMetroTog.shape # ## Step 6: Exploratory Analysis # Let us explore some of the streets with the most, the least transport options. Let us also review the distribution of transport options across all streets. # ### 6.1 Top 10 streets with most transport options within 400m # In[50]: #sns.set(rc={'figure.figsize':(12,9)}) top_streets = orderedStreetTrikkBussMetroTog.head(10) top_streets.head() # ### 6.2 10 streets with least transport options within 400m # In[55]: bottom_streets = orderedStreetTrikkBussMetroTog.tail(10) bottom_streets.head() # In[61]: #Let us combine the top10 and bottom 10 dataframes into a single dataframe #We will use pd.concat that takes in a LIST of dataframes to concat df_list = [top_streets, bottom_streets] combinedTop10_bottom10streets = pd.concat(df_list) #combinedTop10_bottom10streets #Let us visualise this import seaborn as sns import matplotlib.pyplot as plt sns.set(rc={'figure.figsize':(12,9)}) sns.barplot(x = 'Street', y = 'Total Transport', data = combinedTop10_bottom10streets, palette = 'Blues_d') plt.xticks(rotation = 45) # ### 6.3 Distribution of transport options across all streets # In[129]: sns.set_palette("Blues_r") #sns.set(style="white", palette="dark", color_codes=True) sns.distplot(orderedStreetTrikkBussMetroTog['Total Transport'], kde = False) # In[63]: sns.boxplot(y = orderedStreetTrikkBussMetroTog['Total Transport'], width = 0.1) # In[132]: sns.set(style="whitegrid", palette="muted", color_codes=True) sns.violinplot(x="Total Transport", data=orderedStreetTrikkBussMetroTog) # ### 6.4 Visualise relation between total bus options and total transport # In[64]: sns.jointplot(x = 'Total Transport', y = 'Total Bus', data = orderedStreetTrikkBussMetroTog, height= 9) # In[65]: #Make a density plot version of the above sns.jointplot(x = 'Total Transport', y = 'Total Bus', kind = 'kde', data = orderedStreetTrikkBussMetroTog, height = 9) # In[66]: cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True) sns.kdeplot(orderedStreetTrikkBussMetroTog['Total Transport'], orderedStreetTrikkBussMetroTog['Total Bus'], cmap=cmap, n_levels=60, shade=True); # ### 6.4 Divide streets based on no. of transport options # In[67]: bins = np.linspace(min(orderedStreetTrikkBussMetroTog['Total Transport']), max(orderedStreetTrikkBussMetroTog['Total Transport']), 8)#Provide 1 more than the number of levels to create bins # In[69]: group_names = ['0-3', '4-6', '7-9', '10-12', '13-15', '16-18', '19-22'] # In[70]: orderedStreetTrikkBussMetroTog['Binned Transport'] = pd.cut(orderedStreetTrikkBussMetroTog['Total Transport'], bins, labels = group_names, include_lowest= True) orderedStreetTrikkBussMetroTog[['Binned Transport','Total Transport']].head(20) # #### Let us find out how many streets belong to each category # In[71]: orderedStreetTrikkBussMetroTog['Binned Transport'].value_counts() # #### Let us try to visualise this to get a better perspective... # In[72]: new_df = orderedStreetTrikkBussMetroTog['Binned Transport'].value_counts().to_frame() new_df # In[73]: new_df.reset_index(inplace = True) new_df.columns = ['Transport Options', 'Total Streets'] new_df # In[74]: sns.barplot(x = 'Transport Options', y = 'Total Streets', data = new_df, palette= 'Blues_d') plt.xticks(rotation = 45) # In[75]: #Save to drive orderedStreetTrikkBussMetroTog.to_csv(path_or_buf='./orderedStreetTrikkBusMetroTog.csv', index = False)