#!/usr/bin/env python
# coding: utf-8

# # Street-level analysis of public transport options in Oslo

# The project aims to:
# * Extract the geo-coordinates of each street address
#   * Street start address to street end address
# * Use the Foursquare API to obtain:
#   * Bus routes
#   * Trams
#   * Light rail
#   * Trains
# * Cluster the transport options down to the street level and draw conclusions
#
# ----
# > Jump to:
# * [Part 2](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part2-forUpload.ipynb), *Extracting Foursquare Data*
# * [Part 3](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part3.ipynb), *Exploratory Data Analysis*
# * [Part 4](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part4.ipynb), *Clustering and Visualising*
# * [Part 5](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part5.ipynb), *Conclusion & Discussion*

# # Part I: Web Scraping of postcodes and streets

# ## 1. Set up environment

# In[1]:

import pandas as pd
import numpy as np
import folium
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re

# ## 2. Obtain and parse URL

# #### 2.1 Ignore SSL errors

# In[2]:

cntxt = ssl.create_default_context()
cntxt.check_hostname = False
cntxt.verify_mode = ssl.CERT_NONE

# #### 2.2 Obtain URL

# In[4]:

url = input('Please enter the website to obtain data from: ')
if len(url) < 1:
    url = 'https://www.erikbolstad.no/postnummer-koordinatar/kommune.php?kommunenummer=301'
print('You want data from >>>\n', url, '\n<<<')

# #### 2.3 Parse URL

# In[5]:

# Open the URL with a file-handle-like object; .read() slurps in the whole document at once
html = urlopen(url, context=cntxt).read()

# Use BeautifulSoup to parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# ## 3. Explore and obtain relevant data

# For each postcode (postnummer):
# * Ignore postcodes assigned to PO boxes ("postboksar"), service postcodes ("servicepostnummer"), and mixed street addresses and PO boxes ("gateadresser og postboksar")
# * Obtain the latitude and longitude of each street-address postcode ("gate-/veg-adresse") and save them in table 1
# * Obtain all street names (Gate-/veg-adresse)
# * For each street name:
#   * Obtain the coordinates (koordinatar) of all its addresses
#   * Extract the coordinates of the start address/minimum
#   * Extract the coordinates of the end address/last
#   * ~~Estimate street length~~
#   * Establish the mid-point
#   * Save in table 2

# In[6]:

# The data is contained in the <tr> tags
#tags = soup('tr')

# In[8]:

# Extract only the table rows for postcodes that are in use
tagsinUse = soup.find_all('tr', class_='ibruk')

dictn = {}  # Postcode as key, everything else as a list of values
lisT = []

for i in tagsinUse:
    # Useful parts of each row, found by exploration (each row has a constant
    # length of 9 elements, counting from 0):
    #   i.th.a.text    -> postcode and city, e.g. '0018 OSLO'
    #   i.contents[5]  -> latitude, longitude
    #   i.contents[7]  -> district (bydel)
    # Converting i.contents to str is important to be able to run a regex over it
    if re.search('Gate-/veg-adresse', str(i.contents)):
        # i.e. keep the row only if the postcode belongs to a street address (gate-/veg-adresse)
        pinCity = i.th.a.text.split(' ')  # e.g. '0018 OSLO': there is a space in between, split on it
        # Also split the latitude and longitude
        latlon = i.contents[5].a.text.split(',')
        lisT.append(i.contents[7].text), lisT.append(latlon[0]), lisT.append(latlon[1])
        dictn[pinCity[0]] = lisT
    else:
        continue  # Ignore all other categories: postboksar / gateadresser og postboksar / servicepostnummer
    lisT = []
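# Before converting the dictionary to a dataframe, one could sanity-check that every
# parsed coordinate converts cleanly to a float and lies near Oslo. A minimal sketch,
# assuming rough, hand-picked bounding-box limits (not values from the source page):

# In[ ]:

for postcode, (district, lat, lon) in dictn.items():
    lat_f, lon_f = float(lat), float(lon)
    # Generous assumed bounding box around the Oslo municipality
    assert 59.6 < lat_f < 60.2 and 10.4 < lon_f < 11.1, \
        'Suspicious coordinates for postcode {}: {}, {}'.format(postcode, lat, lon)
print('All {} postcodes parsed with plausible coordinates'.format(len(dictn)))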
# In[9]:

# Convert to a dataframe
data = pd.DataFrame.from_dict(dictn, orient='index')
data.head()

# In[10]:

# Make the table prettier: reset the index so the postcodes become a column
data.reset_index(inplace=True)
data.head()

# In[11]:

# Assign column names
data.columns = ['Postcode', 'Bydel/District', 'Latitude', 'Longitude']
data.head()

# In[249]:

data.shape

# In[12]:

data.to_csv(path_or_buf='./postCode_Bydel.csv', index=False)
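# folium was imported in section 1; a minimal sketch of plotting the postcode table
# for a visual check (the marker styling and zoom level are arbitrary choices):

# In[ ]:

# Centre the map on the mean postcode location and drop one marker per postcode
oslo_map = folium.Map(location=[data['Latitude'].astype(float).mean(),
                                data['Longitude'].astype(float).mean()],
                      zoom_start=12)
for _, row in data.iterrows():
    folium.CircleMarker(location=[float(row['Latitude']), float(row['Longitude'])],
                        radius=3,
                        popup='{} ({})'.format(row['Postcode'], row['Bydel/District']),
                        fill=True).add_to(oslo_map)
oslo_map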
# ## 4. Obtain street-level information

# In[7]:

def postCheck(pass_the_url):
    """Return False for known-problematic postcode URLs, True otherwise."""
    exceptions = ['0018', '0045']
    if [post for post in exceptions if re.search(post, pass_the_url)]:  # Using a list comprehension
        print('Exceptional post found, skipping...')
        return False
    else:
        print(pass_the_url)
        return True

# In[8]:

import requests
# This library is superior to urllib for handling non-ASCII characters
# such as those found in Norwegian, e.g.:
#r = requests.get('https://no.wikipedia.org/wiki/Jonas_Gahr_Støre')
#print(r.text)

# In[9]:

# Small version, to test things out on the first few postcodes

# Extract only the table rows for postcodes that are in use
tagsinUse = soup.find_all('tr', class_='ibruk')

count = 0  # Initiate counter
midStreetDict = dict()
midStreetLoc = list()
failedStreet = list()

for i in tagsinUse:
    if re.search('Gate-/veg-adresse', str(i.contents)):
        # i.e. process the row only if the postcode belongs to a street address
        # For initial testing, stop after six postcodes
        count = count + 1
        if count > 6: break
        print('\nCount: {}'.format(count))
        print('Working on postcode {}...'.format(i.th.a.text))
        posturl = i.th.a.get('href', None)  # Postcode link
        if not postCheck(posturl):  # Custom function defined above
            continue

        # Open and parse the postcode link
        html2 = urllib.request.urlopen(posturl, context=cntxt)  # File-handle-like object
        soup2 = BeautifulSoup(html2, 'html.parser')

        # Canonical link that combines with the street-address URL to form a new URL
        canonical = 'https://www.erikbolstad.no/postnummer-koordinatar/'

        # Extract the relevant table in the postcode page
        tagsinUse2 = soup2.find_all('table', style="margin-top: 4rem;")
        for j in tagsinUse2:
            td = j.find_all('td', itemtype="https://schema.org/StreetAddress")  # The street addresses
            for tag in td:
                streetUrl = canonical + tag.a.get('href', None)
                streetName = tag.a.text
                print(streetUrl)
                print('Working on:', streetName)
                # Open and parse the street URL
                try:
                    html3 = requests.get(streetUrl).text
                    soup3 = BeautifulSoup(html3, 'html.parser')
                    # Extract all anchor tags, since they hold the street addresses + geo-coordinates
                    streets = soup3.find_all('a')
                    streetList = []
                    latlonList = []
                    for datas in streets:
                        text = datas.text
                        # Only keep text that is a street name or a lat,lon pair
                        if re.search(re.escape(streetName) + r'\s[0-9]+', text):
                            streetList.append(text)
                        elif re.search('[0-9]+,', text):
                            coordinates = text.split(',')
                            # Make the coordinates a (lat, lon) tuple so it can be appended to the list
                            latlonList.append(tuple(coordinates))
                    combined = dict(zip(streetList, latlonList))

                    # Establish the mid-point of the street
                    lat2 = []
                    long2 = []
                    for l in latlonList:
                        lat2.append(float(l[0]))
                        long2.append(float(l[1]))
                    midStreetLat = sum(lat2) / len(lat2)
                    midStreetLon = sum(long2) / len(long2)
                    print(streetName, midStreetLat, midStreetLon, '\n')
                except Exception:
                    print('Can\'t open URL, possible UnicodeEncode error. Skipping...\n')
                    failedStreet.append(streetName)

# In[10]:

print(failedStreet)
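# The mid-point arithmetic in the loop above is repeated verbatim in the long
# version that follows; a small helper (a sketch, not wired into the cells here)
# captures it. A plain arithmetic mean of latitudes and longitudes is a reasonable
# approximation over distances as short as a single street.

# In[ ]:

def street_midpoint(latlon_pairs):
    """Mean latitude/longitude of a list of (lat, lon) string tuples."""
    lats = [float(pair[0]) for pair in latlon_pairs]
    lons = [float(pair[1]) for pair in latlon_pairs]
    return sum(lats) / len(lats), sum(lons) / len(lons)

# Example: street_midpoint([('59.91', '10.75'), ('59.92', '10.76')]) -> (59.915, 10.755)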
# #### NOTE: THE FOLLOWING CODE TAKES ~45 MINUTES TO RUN

# In[21]:

# LONG CODE - RUN ONLY IF ABSOLUTELY REQUIRED
# Same logic as the small version above, but over all postcodes

# Extract only the table rows for postcodes that are in use
tagsinUse = soup.find_all('tr', class_='ibruk')

count = 0  # Initiate counter
midStreetDict = dict()
midStreetLoc = list()
failedStreet = list()

for i in tagsinUse:
    if re.search('Gate-/veg-adresse', str(i.contents)):
        # i.e. process the row only if the postcode belongs to a street address
        count = count + 1
        #if count > 4: break  # Uncomment for initial testing
        print('\nCount: {}'.format(count))
        print('Working on postcode {}...'.format(i.th.a.text))
        posturl = i.th.a.get('href', None)  # Postcode link
        if not postCheck(posturl):  # Custom function defined above
            continue

        # Open and parse the postcode link
        html2 = urllib.request.urlopen(posturl, context=cntxt)
        soup2 = BeautifulSoup(html2, 'html.parser')

        # Canonical link that combines with the street-address URL to form a new URL
        canonical = 'https://www.erikbolstad.no/postnummer-koordinatar/'

        # Extract the relevant table in the postcode page
        tagsinUse2 = soup2.find_all('table', style="margin-top: 4rem;")
        for j in tagsinUse2:
            td = j.find_all('td', itemtype="https://schema.org/StreetAddress")
            for tag in td:
                streetUrl = canonical + tag.a.get('href', None)
                streetName = tag.a.text
                print(streetUrl)
                print('Working on:', streetName)
                try:
                    html3 = requests.get(streetUrl).text
                    soup3 = BeautifulSoup(html3, 'html.parser')
                    # Extract all anchor tags, since they hold the street addresses + geo-coordinates
                    streets = soup3.find_all('a')
                    streetList = []
                    latlonList = []
                    for datas in streets:
                        text = datas.text
                        # Only keep text that is a street name or a lat,lon pair
                        if re.search(re.escape(streetName) + r'\s[0-9]+', text):
                            streetList.append(text)
                        elif re.search('[0-9]+,', text):
                            coordinates = text.split(',')
                            latlonList.append(tuple(coordinates))
                    combined = dict(zip(streetList, latlonList))

                    # Establish the mid-point of the street
                    lat2 = []
                    long2 = []
                    for l in latlonList:
                        lat2.append(float(l[0]))
                        long2.append(float(l[1]))
                    midStreetLat = sum(lat2) / len(lat2)
                    midStreetLon = sum(long2) / len(long2)
                    midStreetLoc.append(midStreetLat), midStreetLoc.append(midStreetLon)
                    print(streetName, midStreetLat, midStreetLon, '\n')
                    midStreetDict[streetName] = midStreetLoc
                    midStreetLoc = []
                except Exception:
                    print('Can\'t open URL, possible UnicodeEncode error. Skipping...\n')
                    failedStreet.append(streetName)  # Track failures, as in the test version above
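# Since the scrape above takes roughly 45 minutes, it is worth writing midStreetDict
# to disk as soon as it is built. A minimal sketch using the standard json module
# (the checkpoint filename is an arbitrary choice):

# In[ ]:

import json

# Persist the scraped mid-points so the ~45-minute run need not be repeated
with open('./midStreetDict_checkpoint.json', 'w') as fh:
    json.dump(midStreetDict, fh)

# Reload later with:
#with open('./midStreetDict_checkpoint.json') as fh:
#    midStreetDict = json.load(fh)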
# In[22]:

# Create a backup of the dictionary created above, just in case
midStreetDict_backUp = midStreetDict

# Check the backup dictionary
for k, v in midStreetDict_backUp.items():
    print(k, v)

# In[12]:

# Check a small sample of streets: 'combined' holds the address -> coordinate
# mapping of the last street processed
streetDataSmall = pd.DataFrame.from_dict(combined, orient='index')

# Reset the row index to numerical values
streetDataSmall.reset_index(inplace=True)

# Assign column names
streetDataSmall.columns = ['StreetAddress', 'Latitude', 'Longitude']

# View the first 5 rows
streetDataSmall.head()

# In[13]:

# Arrange the above dataframe in ascending (natural) order of address; a plain
#streetDataSmall.sort_values('StreetAddress', ascending=True, axis=0)
# would put e.g. 'Gate 10' before 'Gate 2'
from natsort import order_by_index, index_natsorted
streetDataSmall.reindex(index=order_by_index(streetDataSmall.index, index_natsorted(streetDataSmall.StreetAddress))).reset_index()

# In[14]:

from natsort import natsorted
natsorted((k, v) for k, v in combined.items())  # Sorts on k; to keep v with it, bracket them together

# In[28]:

# Obtain the first and last address
# Find the length of the list
#listLen = len(natsorted((k,v) for k,v in combined.items()))
#print(listLen)

# If we want to extract the coordinates:
#count = 0
#for i in natsorted((k,v) for k,v in combined.items()):
#    count = count + 1
#    if count == 1:
#        coords1 = i[1][0], i[1][1]  # Can index within a list of a list in Python
#        print(coords1)  # The geopy distance estimator takes in a tuple
#    elif count == listLen:
#        coords2 = i[1][0], i[1][1]
#        print(coords2)

# In[24]:

#midStreetDict

# In[25]:

streetData = pd.DataFrame.from_dict(midStreetDict, orient='index')

# Reset the index
streetData.reset_index(inplace=True)

# Rename the columns
streetData.columns = ['Street', 'MidLatitude', 'MidLongitude']
streetData.head()

# In[26]:

streetData.shape

# In[27]:

streetData.isnull().sum()

# In[ ]:

# Save the file to disk
streetData.to_csv(path_or_buf='./streetData_Midcoordinates.csv', index=False)
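# The commented-out cell In[28] above sketches how to get the first and last address;
# a runnable version of that idea, which also revisits the descoped street-length
# estimate. A minimal sketch, assuming geopy is installed (an extra dependency not
# imported earlier) and that 'combined' is non-empty; the geodesic distance between
# the first and last address is only a lower bound on the true length of a curved street.

# In[ ]:

from geopy.distance import geodesic

# Natural-sort the last street's addresses and take the first and last entries
orderedAddresses = natsorted(combined.items())
firstCoords = tuple(float(c) for c in orderedAddresses[0][1])
lastCoords = tuple(float(c) for c in orderedAddresses[-1][1])

# Straight-line distance between the first and last address, in metres
print('Approximate street length: {:.0f} m'.format(geodesic(firstCoords, lastCoords).meters))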