#!/usr/bin/env python
# coding: utf-8

# # Part III : Exploratory Data Analyses

# > Jump to :  
# * [Part 1](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part1.ipynb) *Extracting Street Addresses & Coordinates* 
# * [Part 2](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part2-forUpload.ipynb), *Extracting Foursquare Data*
# * [Part 4](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part4.ipynb), *Clustering and Visualising*
# * [Part 5](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part5.ipynb), *Conclusion & Discussion*

# ## Step 1: Load up environment and data

# In[9]:


import pandas as pd
import numpy as np


# In[10]:


streetTrikkBussMetroTog = pd.read_csv('./streetData_TrikkBusMetroTog.csv')
streetTrikkBussMetroTog.head()


# In[11]:


streetTrikkBussMetroTog.shape


# So, in all we have information on **2460** streets in Oslo, Norway

# ## Step 2: Find total transport options

# In[22]:


#Check column type in the above dataframe
streetTrikkBussMetroTog.columns.map(type)


# In[35]:


#To find total transport options/street, we sum the columns that are NOT null, followed by sorting in descending order to obtain streets with maximum number of transport options

#First we must select the columns to sum on
busCOLS = list(range(18))

#Now we define a function that can automatically add text to numbers, useful for colns of T-bane
def makeColnName (list_of_colnNum):
    
    metroCOLSlist = []
    for i in list_of_colnNum:
        #print('T-bane_'+str(i))
        metroCOLSlist.append('T-bane_'+str(i))#Choose whatever colnName you want to prepend to the numbers
    return(metroCOLSlist)

#Makes columns of T-bane
COLSlist = makeColnName(list(range(1,5)))

#Join T-bane and bus lists together with additional list of Trikk+Tog
COLSlist = ['Trikk','Train Station']+ COLSlist + busCOLS
COLSlist#Total 24 transport colns, out of which max comes 22...wow...a street with 22 options..??

#Check data type in the list
#for i in COLSlist:
#    if (type(i)) == str:
#        print('YABBA',i)
#    else:
#        print('BOO',i)
#    
#    
#Convert int types to str types
def convertType (LIST):
    
    newLIST = list()
    for i in LIST:
        if type(i) == str:
            newLIST.append(i)
        else:
            #str(i)
            newLIST.append(str(i))
    
    return newLIST

COLSlist_str = convertType(COLSlist)

#Check correctly formatted
[type(i) for i in COLSlist_str]        


# In[41]:


#Sort in descending order of non-null values i.e. highest transport to lowest
streetTrikkBussMetroTog[COLSlist_str].notnull().sum(axis = 'columns').sort_values(ascending = False) 

#Also store this in a new coln named totalTransport
streetTrikkBussMetroTog['Total Transport']= streetTrikkBussMetroTog[COLSlist_str].notnull().sum(axis = 'columns').sort_values(ascending = False) #Sort in descending order of non-null values i.e. highest transport to lowest


# In[42]:


#Obtain new index of the sorted order
newIndex = streetTrikkBussMetroTog[COLSlist_str].notnull().sum(axis = 'columns').sort_values(ascending = False).index 
newIndex


# In[43]:


#If you want to see all the values of the original dataframe (not the sorted)
streetTrikkBussMetroTog.index.values


# In[44]:


#Attempt to change old index to new custom ordered index
orderedStreetTrikkBussMetroTog = streetTrikkBussMetroTog.iloc[newIndex,]

#Right now, the index values will be like 554, 2354 etc, to get them in the order of 0,1,2...

#Get range for the index values
manualIndex2 = range(orderedStreetTrikkBussMetroTog.shape[0])

#Assign this new index
orderedStreetTrikkBussMetroTog.index = manualIndex2
orderedStreetTrikkBussMetroTog.head()


# In[48]:


#Find total bus options, but first convert the busCOLS to str
busCOLS_str = convertType(busCOLS)
orderedStreetTrikkBussMetroTog[busCOLS_str].notnull().sum(axis = 'columns').sort_values(ascending = False) #Sort in descending order of non-null values i.e. highest transport to lowest

#Add this info as a new column
orderedStreetTrikkBussMetroTog['Total Bus'] = orderedStreetTrikkBussMetroTog[busCOLS_str].notnull().sum(axis = 'columns').sort_values(ascending = False)
orderedStreetTrikkBussMetroTog.head()


# In[54]:


#Find total streets with least amount of transport options
print ('A total of ', orderedStreetTrikkBussMetroTog[orderedStreetTrikkBussMetroTog['Total Transport'] == 0].shape[0],'streets seem to have NO transport option within 400m')


# In[52]:


orderedStreetTrikkBussMetroTog.shape


# ## Step 6: Exploratory Analysis 

# Let us explore some of the streets with the most, the least transport options. Let us also review the distribution of transport options across all streets.

# ### 6.1 Top 10 streets with most transport options within 400m

# In[50]:


#sns.set(rc={'figure.figsize':(12,9)})
top_streets = orderedStreetTrikkBussMetroTog.head(10)
top_streets.head()


# ### 6.2 10 streets with least transport options within 400m

# In[55]:


bottom_streets = orderedStreetTrikkBussMetroTog.tail(10)
bottom_streets.head()


# In[61]:


#Let us combine the top10 and bottom 10 dataframes into a single dataframe
#We will use pd.concat that takes in a LIST of dataframes to concat
df_list = [top_streets, bottom_streets]
combinedTop10_bottom10streets = pd.concat(df_list)
#combinedTop10_bottom10streets

#Let us visualise this
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(rc={'figure.figsize':(12,9)})

sns.barplot(x = 'Street', y = 'Total Transport', data = combinedTop10_bottom10streets, palette = 'Blues_d')
plt.xticks(rotation = 45)


# ### 6.3 Distribution of transport options across all streets

# In[129]:


sns.set_palette("Blues_r")
#sns.set(style="white", palette="dark", color_codes=True)
sns.distplot(orderedStreetTrikkBussMetroTog['Total Transport'], kde = False)


# In[63]:


sns.boxplot(y = orderedStreetTrikkBussMetroTog['Total Transport'], width = 0.1)


# In[132]:


sns.set(style="whitegrid", palette="muted", color_codes=True)
sns.violinplot(x="Total Transport", data=orderedStreetTrikkBussMetroTog)


# ### 6.4 Visualise relation between total bus options and total transport

# In[64]:


sns.jointplot(x = 'Total Transport', y = 'Total Bus', data = orderedStreetTrikkBussMetroTog, height= 9)


# In[65]:


#Make a density plot version of the above
sns.jointplot(x = 'Total Transport', y = 'Total Bus', kind = 'kde', data = orderedStreetTrikkBussMetroTog, height = 9)


# In[66]:


cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
sns.kdeplot(orderedStreetTrikkBussMetroTog['Total Transport'], orderedStreetTrikkBussMetroTog['Total Bus'], cmap=cmap, n_levels=60, shade=True);


# ### 6.4 Divide streets based on no. of transport options

# In[67]:


bins = np.linspace(min(orderedStreetTrikkBussMetroTog['Total Transport']), max(orderedStreetTrikkBussMetroTog['Total Transport']), 8)#Provide 1 more than the number of levels to create
bins


# In[69]:


group_names = ['0-3', '4-6', '7-9', '10-12', '13-15', '16-18', '19-22']


# In[70]:


orderedStreetTrikkBussMetroTog['Binned Transport'] = pd.cut(orderedStreetTrikkBussMetroTog['Total Transport'], bins, labels = group_names, include_lowest= True)
orderedStreetTrikkBussMetroTog[['Binned Transport','Total Transport']].head(20)


# #### Let us find out how many streets belong to each category

# In[71]:


orderedStreetTrikkBussMetroTog['Binned Transport'].value_counts()


# #### Let us try to visualise this to get a better perspective...

# In[72]:


new_df = orderedStreetTrikkBussMetroTog['Binned Transport'].value_counts().to_frame()
new_df


# In[73]:


new_df.reset_index(inplace = True)
new_df.columns = ['Transport Options', 'Total Streets']
new_df


# In[74]:


sns.barplot(x = 'Transport Options', y = 'Total Streets', data = new_df, palette= 'Blues_d')
plt.xticks(rotation = 45)


# In[75]:


#Save to drive
orderedStreetTrikkBussMetroTog.to_csv(path_or_buf='./orderedStreetTrikkBusMetroTog.csv', index = False)