#!/usr/bin/env python
# coding: utf-8

# # Part IV: Clustering and Visualising

# > Jump to :
# * [Part 1](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part1.ipynb) *Extracting Street Addresses & Coordinates*
# * [Part 2](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part2-forUpload.ipynb), *Extracting Foursquare Data*
# * [Part 3](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part3.ipynb), *Exploratory Data Analysis*
# * [Part 5](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part5.ipynb), *Conclusion & Discussion*

# ## Step 1: Load environment and data

# In[1]:

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans


def _cluster_palette(n_colors, stop=1.0):
    """Return ``n_colors`` hex colours sampled evenly from ``cm.rainbow`` over [0, stop]."""
    return [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, stop, n_colors))]


def _cluster_rows(frame, label):
    """Return the last three columns of the rows of ``frame`` whose 'Cluster' equals ``label``."""
    return frame[frame['Cluster'] == label].iloc[:, -3:]


def _add_cluster_markers(target_map, frame, group_column, group_caption, palette, radius):
    """Add one circle marker per row of ``frame`` to ``target_map``.

    The popup text is the street name, then ``group_caption`` followed by the
    row's ``group_column`` value, then the cluster label.
    """
    rows = zip(
        frame['Street Latitude'],
        frame['Street Longitude'],
        frame['Street'],
        frame['Cluster'].astype('int'),
        frame[group_column],
    )
    for lat, lng, street, cluster, group in rows:
        label = folium.Popup(
            str(street) + group_caption + str(group) + '\nCluster:' + str(cluster),
            parse_html=True,
        )
        # cluster - 1 sends cluster 0 to the last palette entry (negative
        # indexing); every cluster still receives its own colour.
        shade = palette[cluster - 1]
        folium.CircleMarker(
            [lat, lng],
            radius=radius,
            popup=label,
            color=shade,
            fill=True,
            fill_color=shade,
            fill_opacity=0.7,
        ).add_to(target_map)


# In[7]:

orderedStreetTrikkBussMetroTog = pd.read_csv('./orderedStreetTrikkBusMetroTog.csv')
orderedStreetTrikkBussMetroTog.head()

# ## Step 2: K-Means Clustering

# In[4]:

# K-means clustering on Total Transport
kclusters = 7

# Create the estimator; random_state is fixed so repeated runs give the same labels
kmClust = KMeans(init='k-means++', n_clusters=kclusters, n_init=12, random_state=1)

# Fit (fit() returns the fitted estimator itself)
kmLabels = kmClust.fit(orderedStreetTrikkBussMetroTog[['Total Transport']])

# Check labels
kmLabels.labels_[0:6]

# In[8]:

# Insert the labels into the dataframe. Drop any 'Cluster' column left over
# from a previous round first, because insert() raises on a duplicate name.
if 'Cluster' in orderedStreetTrikkBussMetroTog.columns:
    orderedStreetTrikkBussMetroTog.drop('Cluster', axis='columns', inplace=True)
orderedStreetTrikkBussMetroTog.insert(3, 'Cluster', kmLabels.labels_)
orderedStreetTrikkBussMetroTog.head(20)

# In[10]:

# Make map object
latitude = 59.9133301
longitude = 10.7389701
map_cluster = folium.Map(location=[latitude, longitude], zoom_start=11)

# Set color scheme for the clusters: one colour per cluster
rainbow = _cluster_palette(kclusters)

# Add markers to map
_add_cluster_markers(
    map_cluster,
    orderedStreetTrikkBussMetroTog,
    group_column='Binned Transport',
    group_caption=' Group:',
    palette=rainbow,
    radius=5,
)

map_cluster

# ### From the above map, we see that the cluster definitions don't correspond to our group labels of 'Binned Transport'

# ##### So let us investigate the characteristics of each cluster to try and figure out how the clusters were formed

# NOTE(review): the cluster numbers quoted below are the labels observed in the
# original run; K-means label ids are arbitrary, so they may be numbered
# differently under another seed or library release - confirm before relying on them.

# #### Cluster 3

# In[14]:

# Let us compare Total transport options for various clusters
_cluster_rows(orderedStreetTrikkBussMetroTog, 3)

# So we see all streets with __15+__ transport options get categorised into **Cluster 3**

# #### Cluster 2

# In[16]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 2).head()

# In[15]:

# Let us compare Total transport options for various clusters
_cluster_rows(orderedStreetTrikkBussMetroTog, 2).tail()

# Here, we see all streets with __0__ transport options are put into **Cluster 2**

# #### Cluster 5

# In[17]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 5).head()

# In[18]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 5).tail()

# All streets with __4-5__ transport options are in **Cluster 5**

# #### Cluster 0

# In[19]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 0).head()

# In[20]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 0).tail()

# All streets with __9-14__ transport options are in **Cluster 0**

# #### Cluster 1

# In[21]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 1).head()

# In[22]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 1).tail()

# All streets with __2-3__ transport options are in **Cluster 1**

# #### Cluster 6

# In[23]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 6).head()

# In[24]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 6).tail()

# All streets with __1__ transport option are in **Cluster 6**

# #### Cluster 4

# In[25]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 4).head()

# In[26]:

_cluster_rows(orderedStreetTrikkBussMetroTog, 4).tail()

# All streets with __6-8__ transport options are in **Cluster 4**

# In[27]:

# Let us find out how many streets belong to each cluster:
# one row per cluster label with the number of streets carrying it
cluster_df = (
    orderedStreetTrikkBussMetroTog['Cluster']
    .value_counts()
    .rename_axis('Cluster Label')
    .reset_index(name='Total Streets')
)
cluster_df

# #### We can visualise the distribution as follows:

# In[66]:

sns.set(rc={'figure.figsize': (12, 9)})
sns.barplot(x='Cluster Label', y='Total Streets', data=cluster_df, palette='Blues_d')

# Transport-option range of each cluster, placed at (cluster label, text height).
# Texts and heights are the hand-picked values from the original run.
bar_annotations = [
    ('9 -14', (0, 80)),   # Cluster 0
    ('2 -3', (1, 750)),   # Cluster 1
    ('0', (2, 640)),      # Cluster 2
    ('15+', (3, 30)),     # Cluster 3
    ('6 -8', (4, 170)),   # Cluster 4
    ('4 -5', (5, 220)),   # Cluster 5
    ('1', (6, 650)),      # Cluster 6
]
for text, position in bar_annotations:
    plt.annotate(text, xy=position, xytext=position)

# ### Let us make a new column label that describes the no. of transport options in each cluster

# In[33]:

# Each edge sits just above the largest count that belongs to the bin on its
# left (0.9 closes '0', 1.2 closes '1', 3.2 closes '2-3', ...). The last edge
# is open-ended so that every count above 14 lands in '15+' instead of
# becoming NaN once it exceeds a fixed ceiling.
bins_kmeans = [0, 0.9, 1.2, 3.2, 5.2, 8.2, 14.2, np.inf]
bins_kmeans

# In[34]:

group_names2 = ['0', '1', '2-3', '4-5', '6-8', '9-14', '15+']

# In[37]:

pd.set_option('display.max_rows', None)
orderedStreetTrikkBussMetroTog['Clustered Transport'] = pd.cut(
    orderedStreetTrikkBussMetroTog['Total Transport'],
    bins_kmeans,
    labels=group_names2,
    include_lowest=True,  # keeps a count of exactly 0 inside the first bin
)

# Check that the ordering is correct
orderedStreetTrikkBussMetroTog[['Clustered Transport', 'Total Transport']]

# In[38]:

# Ensure value counts is the same as the cluster labels
orderedStreetTrikkBussMetroTog['Clustered Transport'].value_counts().to_frame()

# In[39]:

# And in Cluster df
cluster_df

# ### Make the Folium map, this time, showing no. of transport options in each cluster

# In[50]:

# Make map object
# NOTE(review): the original author found that the Mapbox tile sets do not
# render; availability of the 'Stamen Toner' tiles may also depend on the
# installed folium release - confirm.
map_cluster = folium.Map(location=[latitude, longitude], zoom_start=11, tiles='Stamen Toner')

# Set color scheme for the clusters (upper end of the colormap trimmed to 0.8)
rainbow = _cluster_palette(kclusters, stop=.8)

# Add markers to map
_add_cluster_markers(
    map_cluster,
    orderedStreetTrikkBussMetroTog,
    group_column='Clustered Transport',
    group_caption=' Transport Options:',
    palette=rainbow,
    radius=4,
)

map_cluster

# In[47]:

# Extract the streets whose total transport options are zero. Selecting on
# the count itself rather than on a cluster number keeps this correct even if
# K-means numbers its clusters differently.
streetsWithNoTransport = orderedStreetTrikkBussMetroTog[
    orderedStreetTrikkBussMetroTog['Total Transport'] == 0
]
streetsWithNoTransport.head()

# In[48]:

# Save to drive
streetsWithNoTransport.to_csv(path_or_buf='./streetsWithNo_Transport.csv', index=False)

# In[ ]: