#!/usr/bin/env python
# coding: utf-8

# # Part II : Extracting Foursquare Data
# > Jump to :
# * [Part 1](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part1.ipynb) *Extracting Street Addresses & Coordinates*
# * [Part 3](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part3.ipynb) , *Exploratory Data Analysis*
# * [Part 4](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part4.ipynb), *Clustering and Visualising*
# * [Part 5](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part5.ipynb), *Conclusion & Discussion*

# ## 1. Set up environment

# In[9]:

import pandas as pd
import numpy as np
import folium
import requests
import re

# In[2]:

# One row per street; the columns read below are 'Street', 'MidLatitude'
# and 'MidLongitude'.
streetData = pd.read_csv('./streetData_Midcoordinates.csv')

# ## 2. Basic Folium Map visualisation

# In[3]:

streetData.head()

# In[5]:

# 1.2.1 Make map with street info
latitude = 59.9133301
longitude = 10.7389701
map_oslo = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one small circle marker per street, with the street name as its pop-up.
street_points = zip(streetData['MidLatitude'],
                    streetData['MidLongitude'],
                    streetData['Street'])
for lat, lng, street in street_points:
    # Pop-up label to display (neighborhood, borough originally)
    popup = folium.Popup('{}'.format(street), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=2,  # radius of the circle marker
        popup=popup,
        color='blue',
        fill=False,
        # fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_oslo)

map_oslo

# ## 3.
# Use Foursquare API to extract various transport information
# ### 3.1 Find Trikk
# #### Set up Foursquare developer credentials, including Client ID, Client Secret and Version (hidden here)

# In[13]:

# FULL VERSION: Trikk for street
def getTrikkNearby(post, bydelLat, bydelLon, radius, query):
    """Look up the first Foursquare venue matching *query* near each street.

    One ``venues/search`` request is sent per (street, latitude, longitude)
    triple. Only the first venue of each response is kept.

    The credentials ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION`` are read
    from module scope; they are not defined in this file and must exist
    before the function is called with non-empty input.

    Args:
        post: Iterable of street names (used as the row label).
        bydelLat: Iterable of latitudes, aligned with ``post``.
        bydelLon: Iterable of longitudes, aligned with ``post``.
        radius: Search radius forwarded to the API (the script uses 400).
        query: Search term forwarded to the API (the script uses 'Trikk').

    Returns:
        pandas.DataFrame with columns 'Street', 'Street Latitude',
        'Street Longitude', 'Trikk' and 'Trikk Distance', one row per input
        street. Streets without a usable venue get 'NA' in the last two
        columns. Empty input yields an empty frame with the same columns.
    """
    columns = ['Street',            # 'Neighborhood'
               'Street Latitude',   # 'Neighborhood Latitude'
               'Street Longitude',  # 'Neighborhood Longitude'
               'Trikk',
               'Trikk Distance']
    rows = []
    # Iterate the individual values: formatting a whole Series into the
    # request would drag dtype/name information into the URL.
    for street, lat, lng in zip(post, bydelLat, bydelLon):
        print(street)
        # Let requests build and encode the query string.
        results = requests.get(
            'https://api.foursquare.com/v2/venues/search',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'll': '{},{}'.format(lat, lng),
                'v': VERSION,
                'query': query,
                'radius': radius,
            },
        ).json()
        try:
            venue = results['response']['venues'][0]
            row = (street, lat, lng,
                   venue['name'], venue['location']['distance'])
        except (KeyError, IndexError, TypeError):
            # No venue returned, or the payload lacks the expected keys.
            row = (street, lat, lng, 'NA', 'NA')
        rows.append(row)

    # Passing the column names to the constructor keeps the schema intact
    # even when no rows were collected.
    return pd.DataFrame(rows, columns=columns)


# 2.1.
# Run the above code
streetTrikk = getTrikkNearby(post=streetData['Street'],
                             bydelLat=streetData['MidLatitude'],
                             bydelLon=streetData['MidLongitude'],
                             radius=400,
                             query='Trikk')

# In[14]:

# Check shape of df
streetTrikk.shape

# In[15]:

# View
streetTrikk.head()

# In[66]:

# Save to drive
streetTrikk.to_csv(path_or_buf='./streetTrikk.csv', index=False)

# ## Step 2: Obtain & clean Bus info

# In[48]:

# FULL VERSION; CHANGED CODE
# Extend to all postcodes
busPost = {}


def getBusNearby(post, bydelLat, bydelLon, query, radius, count):
    """Store the venues Foursquare returns around each street in ``busPost``.

    One ``venues/search`` request is sent per (street, latitude, longitude)
    triple. For a street that is not yet a key of ``busPost`` the value
    becomes a list of ``(name, distance)`` tuples: the first venue returned
    is always stored, later venues only when their name matches the bus
    pattern. A street with no venues, or whose response lacks the expected
    keys, is stored as ``''``.

    ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION`` are read from module
    scope and must be defined before calling.

    Args:
        post: Iterable of street names (dictionary keys).
        bydelLat: Iterable of latitudes, aligned with ``post``.
        bydelLon: Iterable of longitudes, aligned with ``post``.
        query: Search term forwarded to the API.
        radius: Search radius forwarded to the API.
        count: Starting value of the progress counter that is printed.

    Returns:
        None. Results are written to the module-level ``busPost`` dict.
    """
    bus_name = re.compile('(B|b)uss[the]*?')
    # Iterate the individual values, never the whole Series.
    for street, lat, lng in zip(post, bydelLat, bydelLon):
        count += 1
        print('\n', street, '\t', count)
        results = requests.get(
            'https://api.foursquare.com/v2/venues/search',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'll': '{},{}'.format(lat, lng),
                'v': VERSION,
                'query': query,
                'radius': radius,
            },
        ).json()
        try:
            places = results['response']['venues']
            print('Number of entries in {} is {}'.format(street, len(places)))
            if not places:
                busPost[street] = ''
                continue
            busList = []
            for place in places:
                name = place['name']
                is_bus = bus_name.search(name) is not None
                if is_bus:
                    print(name)
                if street not in busPost:
                    # First venue seen for this street: always stored, so
                    # the street gets an entry even without a bus match.
                    busList.append((name, place['location']['distance']))
                    busPost[street] = busList
                elif is_bus:
                    busList.append((name, place['location']['distance']))
        except (KeyError, TypeError):
            # Response without the expected structure.
            busPost[street] = ''


# ColnNames
# Postcode Bydel/District Latitude Longitude
getBusNearby(post=streetData['Street'],
             bydelLat=streetData['MidLatitude'],
             bydelLon=streetData['MidLongitude'],
             radius=400,
             query='Bus Stops',
             count=0)

# #### Clean the street bus dictionary

# In[49]:

len(busPost)

# In[50]:

list(busPost.items())

# In[54]:

# Check that streets have been properly ascribed buses (by checking against
# some of the ones known to have buses)
if 'Hausmanns gate' in busPost:
    print('Hausmanns gate', busPost['Hausmanns gate'])

# In[55]:

# Make a clean dictionary containing only buses with route numbers on them.
# This helps remove some miscellaneous places named like 'Bussola', which is
# actually a pizza place.
cleanBus = dict()
for street, venues in busPost.items():
    # venues is either '' (nothing found) or a list of (name, distance).
    numbered = [venue[0] for venue in venues if re.search(r'\d', venue[0])]
    if numbered:
        cleanBus[street] = numbered

# In[56]:

print('Length of Clean street Bus dictionary is:', len(cleanBus))

# In[40]:

# Stats
# Out of 100 streets, 46 have bus within 400m
# Out of 1000 streets, 481 have bus within 400m
# Out of 2460 streets, 1191 have bus within 400m

# In[57]:

# Convert to a dataframe
streetBusesClean = pd.DataFrame.from_dict(cleanBus, orient='index')
streetBusesClean.head()

# In[59]:

# Reset index
streetBusesClean.reset_index(inplace=True)
# Change column name
# Rename the first column to 'Street' by assigning a fresh list of labels
# rather than writing into the Index's underlying array.
streetBusesClean.columns = ['Street'] + streetBusesClean.columns.tolist()[1:]
# Verify access to column
streetBusesClean[['Street']].head()

# In[60]:

# Save the cleaned up bus data frame locally
streetBusesClean.to_csv(path_or_buf='./streetBusesClean.csv', index=False)

# In[62]:

# Merge with Street Trikk dataframe
streetTrikkBussClean = pd.merge(streetTrikk, streetBusesClean, on='Street', how='left')
streetTrikkBussClean.head()

# In[65]:

# Check shape of merged Trikk + Buss
streetTrikkBussClean.shape

# ## Step 3: Obtain T-Bane info

# In[68]:


def _collect_stations(post, bydelLat, bydelLon, radius, query, count,
                      store, category, show_venues=False):
    """Fill *store* with the venues of one Foursquare category per street.

    One ``venues/search`` request is sent per (street, latitude, longitude)
    triple. Venues whose first category has ``shortName == category`` are
    kept as ``(name, distance)`` tuples.

    * A street not yet in *store* gets the list of matches (possibly empty).
    * A street already in *store* keeps its existing entry.
    * A street with no venues, or a response lacking the expected keys, is
      set to a fresh empty list.

    ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION`` are read from module
    scope and must be defined before calling.
    """
    # Iterate the individual values, never the whole Series.
    for street, lat, lng in zip(post, bydelLat, bydelLon):
        count += 1
        print('\n', street, '\t', count)
        results = requests.get(
            'https://api.foursquare.com/v2/venues/search',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'll': '{},{}'.format(lat, lng),
                'v': VERSION,
                'query': query,
                'radius': radius,
            },
        ).json()
        try:
            places = results['response']['venues']
            print('Number of entries in {} is {}'.format(street, len(places)))
            if show_venues:
                print(places)
            if not places:
                print('<>')
                # A fresh list per street: never share another street's list.
                store[street] = []
                continue
            stations = [
                (place['name'], place['location']['distance'])
                for place in places
                if place['categories']
                and place['categories'][0]['shortName'] == category
            ]
        except (KeyError, IndexError, TypeError):
            # Response without the expected structure.
            print('We have this situation')
            store[street] = []  # Keeping blank list as value
            continue
        if street not in store:
            store[street] = stations


# Extend to run through all postcodes
metroPost = dict()


def getMetroNearby(post, bydelLat, bydelLon, radius, query, count):
    """Record venues of category 'Metro' near each street in ``metroPost``.

    Args:
        post: Iterable of street names (dictionary keys).
        bydelLat: Iterable of latitudes, aligned with ``post``.
        bydelLon: Iterable of longitudes, aligned with ``post``.
        radius: Search radius forwarded to the API.
        query: Search term forwarded to the API.
        count: Starting value of the progress counter that is printed.

    Returns:
        None. ``metroPost`` maps each street to a list of
        ``(name, distance)`` tuples (empty when nothing matched).
    """
    _collect_stations(post, bydelLat, bydelLon, radius, query, count,
                      store=metroPost, category='Metro')


getMetroNearby(post=streetData['Street'],
               bydelLat=streetData['MidLatitude'],
               bydelLon=streetData['MidLongitude'],
               radius=400,
               query='T-bane',
               count=0)

# In[69]:

len(metroPost)

# In[70]:

# Convert the dictionary to a dataframe
streetMetro = pd.DataFrame.from_dict(metroPost, orient='index')
streetMetro.head()

# In[71]:

# Reset index
streetMetro.reset_index(inplace=True)
# Change column names from 0,1,2,3 to something better. The names are
# generated from the actual width, so any number of stations per street works.
streetMetro.columns = ['Street'] + [
    'T-bane_{}'.format(n) for n in range(1, streetMetro.shape[1])
]
streetMetro.head()

# In[72]:

# Save to disk
streetMetro.to_csv(path_or_buf='streetMetro.csv', index=False)

# In[73]:

# Merge with street Trikk Buss
streetTrikkBussMetro = pd.merge(streetTrikkBussClean, streetMetro, on='Street', how='left')
streetTrikkBussMetro.head()

# In[74]:

streetTrikkBussMetro.shape

# ## Step 4: Obtain Train info

# In[75]:

# Extend to run through all postcodes
trainPost = dict()


def getTrainsNearby(post, bydelLat, bydelLon, radius, query, count):
    """Record venues of category 'Train Station' near each street in ``trainPost``.

    Also prints the raw venue list of every response.

    Args:
        post: Iterable of street names (dictionary keys).
        bydelLat: Iterable of latitudes, aligned with ``post``.
        bydelLon: Iterable of longitudes, aligned with ``post``.
        radius: Search radius forwarded to the API.
        query: Search term forwarded to the API.
        count: Starting value of the progress counter that is printed.

    Returns:
        None. ``trainPost`` maps each street to a list of
        ``(name, distance)`` tuples (empty when nothing matched).
    """
    _collect_stations(post, bydelLat, bydelLon, radius, query, count,
                      store=trainPost, category='Train Station',
                      show_venues=True)


getTrainsNearby(post=streetData['Street'],
                bydelLat=streetData['MidLatitude'],
                bydelLon=streetData['MidLongitude'],
                radius=400,
                query='Train Station',
                count=0)

# In[2747]:

# sorted( [(k,v) for k,v in trainPost.items()] )

# In[76]:

print('Length of dictionary is {}'.format(len(trainPost)))

# In[ ]:

# Manually add Nationaltheatret Stasjon to Ruseløkkveien

# In[2698]:

# for k,v in trainPost.items():
#     if k == 'Ruseløkkveien':
#         trainPost[k].append('Nationaltheatret stasjon')
#         print(k,v)
# See below for
# 1. Finding streets within 400m of Ruseløkkveien that has Nationaltheatret stasjon
# 2. Adding station info to those streets

# ### Important: Foursquare API completely misses the Nationaltheatret station
# I manually found it to be located on the street named: Ruseløkkveien. This info needs to be added.
# Additionally, to _approximately_ find other streets in the 400m vicinity of this station, I will try to find all streets within 400m of Ruseløkkveien and assign the station to those streets as well.

# ### Try to find streets that are within 400m of Ruseløkkveien

# In[77]:

# Import the library that allows one to estimate distance from two geo-coordinate sets.
import geopy.distance

# In[78]:

# We will work on a copy of the original Street Data df, just to ensure the
# original remains intact. An explicit .copy() is needed: plain assignment
# would only create a second name for the same dataframe.
streetDataModified = streetData.copy()
streetDataModified.head()

# In[79]:

# Find geo-coordinates of Ruseløkkveien
streetDataModified[streetDataModified.Street == 'Ruseløkkveien']  # dataframe

# In[80]:

# Try to see how to access the latitude/longitude
streetDataModified[streetDataModified.Street == 'Ruseløkkveien'].iloc[0, 2]  # This accesses the longitude

# In[81]:

# Create a function that will estimate the distances.
# Here we estimate the distance of each street in the dataframe to
# Ruseløkkveien and output those that are within 400m (and skip
# Ruseløkkveien itself).

# Empty list to store streets within 400m
streetsNearby = list()


def distanceFromStreet(dataframe, fullData, max_distance=400):
    """Collect the streets whose mid-point lies close to a reference street.

    Columns are read by position: column 0 is the street name, column 1 the
    latitude and column 2 the longitude.

    Args:
        dataframe: Dataframe whose first row is the reference street.
        fullData: Dataframe with all candidate streets, same column layout.
        max_distance: Largest distance in metres that still counts as
            nearby. Defaults to 400.

    Returns:
        List of the nearby street names found by this call. The same names
        are also appended to the module-level ``streetsNearby`` list. Rows at
        distance 0 (the reference street itself) are skipped.

    Raises:
        ValueError: If ``dataframe`` has no rows.
    """
    if dataframe.shape[0] == 0:
        raise ValueError('dataframe must contain the reference street row')

    # Load up the coordinates of the reference street
    origin = (dataframe.iloc[0, 1], dataframe.iloc[0, 2])

    found = []
    candidates = zip(fullData.iloc[:, 0], fullData.iloc[:, 1], fullData.iloc[:, 2])
    for position, (street, lat2, lon2) in enumerate(candidates):
        metres = geopy.distance.distance(origin, (lat2, lon2)).m
        if 0 < metres <= max_distance:
            # The one-row slice is only needed for the printed report.
            print(fullData.iloc[position:position + 1], round(metres, 2))
            found.append(street)

    streetsNearby.extend(found)
    return found


distanceFromStreet(streetDataModified[streetDataModified.Street == 'Ruseløkkveien'],
                   fullData=streetDataModified)

# In[82]:

# Have a look at streets whose mid points are within 400m from Ruseløkkveien
streetsNearby

# In[83]:

# Add info about Nationaltheatret stasjon to the streets found above.
# Ruseløkkveien is included explicitly: the station is located on that
# street, but the distance search skips it (distance 0).
stationStreets = set(streetsNearby) | {'Ruseløkkveien'}
for k, v in trainPost.items():
    if k in stationStreets:
        trainPost[k].append('Nationaltheatret stasjon')
        print(k, v)

# In[84]:

# Convert the updated dictionary to a dataframe
streetTrain = pd.DataFrame.from_dict(trainPost, orient='index')
streetTrain.head()
# Reset index
streetTrain.reset_index(inplace=True)
# Change column names. The first station column keeps the name
# 'Train Station'; any further ones are numbered so the assignment works for
# every frame width.
streetTrain.columns = ['Street'] + [
    'Train Station' if n == 1 else 'Train Station_{}'.format(n)
    for n in range(1, streetTrain.shape[1])
]
streetTrain.head()

# In[85]:

# Save to local drive
streetTrain.to_csv(path_or_buf='./streetTrain.csv', index=False)

# In[86]:

# Merge on full street Trikk, Buss, T-Bane df
streetTrikkBussMetroTog = pd.merge(streetTrikkBussMetro, streetTrain, on='Street', how='left')
streetTrikkBussMetroTog.shape

# In[87]:

# View your accomplishment and hard work!
streetTrikkBussMetroTog.head()

# In[2760]:

# Save to hard drive
streetTrikkBussMetroTog.to_csv(index=False, path_or_buf='./streetData_TrikkBusMetroTog.csv')