#!/usr/bin/env python
# coding: utf-8
# # Street-level analysis of public transport options in Oslo
# The project aims to:
# * Extract geo-coordinates of each street address
#     * From each street's start address to its end address
# * Use the Foursquare API to obtain:
#     * Bus routes
#     * Tram routes
#     * Light rail routes
#     * Train routes
# * Cluster the transport options down to the street level and draw conclusions
#
# ----
# > Jump to:
# * [Part 2](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part2-forUpload.ipynb), *Extracting Foursquare Data*
# * [Part 3](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part3.ipynb) , *Exploratory Data Analysis*
# * [Part 4](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part4.ipynb), *Clustering and Visualising*
# * [Part 5](https://github.com/Niladri-B/Coursera_Captstone/blob/master/wk4/Capstone_part5.ipynb), *Conclusion & Discussion*
# # Part I: Web Scraping of postcodes and streets
# ## 1. Set up environment
# In[1]:
import pandas as pd
import numpy as np
import folium
import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
# ## 2. Obtain and parse URL
# #### 2.1 Ignore SSL errors
# In[2]:
cntxt = ssl.create_default_context()
cntxt.check_hostname = False
cntxt.verify_mode = ssl.CERT_NONE
# #### 2.2 Obtain URL
# In[4]:
url = input('Please enter the website to obtain data from: ')
if len(url) < 1: url = 'https://www.erikbolstad.no/postnummer-koordinatar/kommune.php?kommunenummer=301'
print('You want data from >>>\n', url, '\n<<<')
# #### 2.3 Parse URL
# In[5]:
#Open the URL with a file-handle-like object
html = urlopen(url, context=cntxt).read()  #.read() slurps in the entire page at once
#Use BeautifulSoup to parse
soup = BeautifulSoup(html, 'html.parser')
# ## 3. Explore and obtain relevant data
# For each postcode (postnummer):
# * Ignore postcodes for PO boxes (postboksar), service postcodes (servicepostnummer), and combined street addresses and PO boxes (gateadresser og postboksar)
# * Obtain the latitude and longitude of each street-address (gate-/veg-adresse) postcode, save in table 1
# * Obtain all street names (Gate-/veg-adresse)
# * For each street name:
#     * Obtain the coordinates (koordinatar) of all its addresses
#     * Extract the coordinates of the start address/minimum
#     * Extract the coordinates of the end address/last
#     * ~~Estimate street length~~
#     * Establish the mid-point (see the sketch just below this list)
#     * Save in table 2
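# A minimal sketch of the mid-point step above, using toy coordinates rather than scraped data:
# the mid-point is taken as the arithmetic mean of the coordinates of all addresses on the street.
# In[ ]:
coords = [(59.91, 10.75), (59.92, 10.76), (59.93, 10.77)]  #Hypothetical (lat, lon) tuples
mid_lat = sum(lat for lat, lon in coords) / len(coords)
mid_lon = sum(lon for lat, lon in coords) / len(coords)
print(mid_lat, mid_lon)  #Roughly (59.92, 10.76)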
# In[6]:
#Data is contained in the <tr> tags
#tags = soup('tr')
# In[8]:
#Extract the table rows that are in use
tagsinUse = soup.find_all('tr', class_='ibruk')
dictn = {}  #Postcode as key, [district, latitude, longitude] as value
for i in tagsinUse:
    #Keep only rows whose postcode belongs to a street address (Gate-/veg-adresse);
    #converting the contents to str is important to be able to run the regex
    if re.search('Gate-/veg-adresse', str(i.contents)):
        #The <th> anchor holds e.g. '0018 OSLO', so split on the space to isolate the postcode
        pinCity = i.th.a.text.split(' ')
        #i.contents has a constant length of 9 (counting from 0): element 5 holds the
        #'lat, lon' pair, element 7 the district (bydel)
        latlon = i.contents[5].a.text.split(',')
        dictn[pinCity[0]] = [i.contents[7].text, latlon[0], latlon[1]]
    #All other categories (Postboksar, Gateadresser og postboksar, Servicepostnummer) are ignored
#print('\n<<<----The dictionary follows------>>>>>')
#for k, v in dictn.items():
#    print(k, v)
# In[9]:
#Convert to dataframe
data = pd.DataFrame.from_dict(dictn, orient='index')
data.head()
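# For reference, orient='index' turns each dictionary key into a row label, which is why the
# index is reset and the columns renamed in the next cells (a toy example, not scraped data):
# In[ ]:
pd.DataFrame.from_dict({'0001': ['Sentrum', '59.91', '10.75']}, orient='index')
#Gives one row labelled '0001' with unnamed columns 0, 1, 2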
# In[10]:
#Make modifications to the table to make it prettier: reset the index and assign column names
#Reset index
data.reset_index(inplace= True)
data.head()
# In[11]:
#Change column names
data.columns = ['Postcode', 'Bydel/District', 'Latitude', 'Longitude']
data.head()
# In[249]:
data.shape
# In[12]:
data.to_csv(path_or_buf= './postCode_Bydel.csv', index = False)
# ## 4. Obtain street-level information
# In[7]:
def postCheck(pass_the_url):
    '''Return True if the URL contains a known exceptional postcode that should be skipped.'''
    exceptions = ['0018', '0045']
    if [post for post in exceptions if re.search(post, pass_the_url)]:  #List comprehension: a non-empty list is truthy
        print('Exceptional post found, skipping...')
        return True
    print(pass_the_url)
    return False
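# A quick sanity check of postCheck on hypothetical link strings (only the embedded postcode
# matters to the regex):
# In[ ]:
print(postCheck('postnummer.php?postnummer=0018'))  #-> True: exceptional, to be skipped
print(postCheck('postnummer.php?postnummer=0150'))  #-> False: processed normally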
# In[8]:
import requests
#requests handles non-ASCII characters, such as those found in Norwegian street names, better than urllib
#r = requests.get('https://no.wikipedia.org/wiki/Jonas_Gahr_Støre')
#print(r.text)
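# Why this matters, briefly: urllib sends the request line as ASCII, so a raw 'ø' in a URL can
# raise UnicodeEncodeError, while requests percent-encodes the URL first. urllib can be made to
# cope by quoting manually (a hedged illustration; behaviour can vary with library versions):
# In[ ]:
from urllib.parse import quote
raw_url = 'https://no.wikipedia.org/wiki/Jonas_Gahr_Støre'
print(quote(raw_url, safe=':/'))  #Percent-encodes 'ø' but leaves the scheme and slashes intact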
# In[9]:
#Small version, to test things out
#Extract the table rows that are in use
tagsinUse = soup.find_all('tr', class_='ibruk')
count = 0  #Initiate counter
failedStreet = list()
for i in tagsinUse:
    if re.search('Gate-/veg-adresse', str(i.contents)):  #Only postcodes that belong to a street address
        #For initial testing, stop after six postcodes
        count = count + 1
        if count > 6: break
        print('\nCount: {}'.format(count))
        print('Working on postcode {}...'.format(i.th.a.text))
        posturl = i.th.a.get('href', None)  #Postcode link
        if postCheck(posturl):  #Custom function defined above
            continue  #Skip the exceptional postcodes
        #Open and parse the postcode link
        html2 = urllib.request.urlopen(posturl, context=cntxt)
        soup2 = BeautifulSoup(html2, 'html.parser')
        #Base link that combines with a relative street URL to form the full street URL
        canonical = 'https://www.erikbolstad.no/postnummer-koordinatar/'
        #Extract the street table on the postcode page
        tagsinUse2 = soup2.find_all('table', style="margin-top: 4rem;")
        for j in tagsinUse2:
            td = j.find_all('td', itemtype="https://schema.org/StreetAddress")  #Find the street addresses
            for tag in td:
                streetUrl = canonical + tag.a.get('href', None)
                streetName = tag.a.text
                print(streetUrl)
                print('Working on:', streetName)
                #Open and parse the street URL
                try:
                    html3 = requests.get(streetUrl).text
                    soup3 = BeautifulSoup(html3, 'html.parser')
                    #The anchor tags hold the house addresses and their geo-coordinates
                    streets = soup3.find_all('a')
                    streetList = []
                    latlonList = []
                    for datas in streets:
                        text = datas.text
                        #Keep only text that is a house address or a 'lat, lon' pair;
                        #re.escape guards against regex metacharacters in street names
                        if re.search(re.escape(streetName) + r'\s[0-9]+', text):
                            streetList.append(text)
                        elif re.search('[0-9]+,', text):
                            coordinates = text.split(',')
                            latlonList.append(tuple(coordinates))  #A (lat, lon) tuple can be appended to the list
                    combined = dict(zip(streetList, latlonList))
                    #Establish the street's mid-point as the mean of all its address coordinates
                    lat2 = []
                    long2 = []
                    for l in latlonList:
                        lat2.append(float(l[0]))
                        long2.append(float(l[1]))
                    midStreetLat = sum(lat2) / len(lat2)
                    midStreetLon = sum(long2) / len(long2)
                    print(streetName, midStreetLat, midStreetLon, '\n')
                except Exception:
                    print("Can't open URL, possible UnicodeEncodeError. Skipping...\n")
                    failedStreet.append(streetName)
# In[10]:
print(failedStreet)
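# The failures could be retried later; a minimal sketch, assuming the loop above were adapted
# to record (streetName, streetUrl) pairs so the URL is still known at retry time:
# In[ ]:
from urllib.parse import quote

def retryStreet(street_url):
    #Percent-encode non-ASCII characters before re-requesting the page
    resp = requests.get(quote(street_url, safe=':/?=&'))
    return resp.text if resp.ok else None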
# #### NOTE: The following code takes ~45 minutes to run
# In[21]:
#LONG CODE # RUN ONLY IF ABSOLUTELY REQUIRED
#Extract the table rows that are in use
tagsinUse = soup.find_all('tr', class_='ibruk')
count = 0  #Initiate counter
midStreetDict = dict()
for i in tagsinUse:
    if re.search('Gate-/veg-adresse', str(i.contents)):  #Only postcodes that belong to a street address
        count = count + 1
        #if count > 4: break  #Uncomment for initial testing on a small sample
        print('\nCount: {}'.format(count))
        print('Working on postcode {}...'.format(i.th.a.text))
        posturl = i.th.a.get('href', None)  #Postcode link
        if postCheck(posturl):  #Custom function defined above
            continue  #Skip the exceptional postcodes
        #Open and parse the postcode link
        html2 = urllib.request.urlopen(posturl, context=cntxt)
        soup2 = BeautifulSoup(html2, 'html.parser')
        #Base link that combines with a relative street URL to form the full street URL
        canonical = 'https://www.erikbolstad.no/postnummer-koordinatar/'
        #Extract the street table on the postcode page
        tagsinUse2 = soup2.find_all('table', style="margin-top: 4rem;")
        for j in tagsinUse2:
            td = j.find_all('td', itemtype="https://schema.org/StreetAddress")  #Find the street addresses
            for tag in td:
                streetUrl = canonical + tag.a.get('href', None)
                streetName = tag.a.text
                print(streetUrl)
                print('Working on:', streetName)
                #Open and parse the street URL
                try:
                    html3 = requests.get(streetUrl).text
                    soup3 = BeautifulSoup(html3, 'html.parser')
                    #The anchor tags hold the house addresses and their geo-coordinates
                    streets = soup3.find_all('a')
                    streetList = []
                    latlonList = []
                    for datas in streets:
                        text = datas.text
                        #Keep only text that is a house address or a 'lat, lon' pair;
                        #re.escape guards against regex metacharacters in street names
                        if re.search(re.escape(streetName) + r'\s[0-9]+', text):
                            streetList.append(text)
                        elif re.search('[0-9]+,', text):
                            coordinates = text.split(',')
                            latlonList.append(tuple(coordinates))  #A (lat, lon) tuple can be appended to the list
                    combined = dict(zip(streetList, latlonList))
                    #Establish the street's mid-point as the mean of all its address coordinates
                    lat2 = []
                    long2 = []
                    for l in latlonList:
                        lat2.append(float(l[0]))
                        long2.append(float(l[1]))
                    midStreetLat = sum(lat2) / len(lat2)
                    midStreetLon = sum(long2) / len(long2)
                    midStreetDict[streetName] = [midStreetLat, midStreetLon]
                    print(streetName, midStreetLat, midStreetLon, '\n')
                except Exception:
                    print("Can't open URL, possible UnicodeEncodeError. Skipping...\n")
# In[22]:
#Create a backup copy of the dictionary created above, just in case
#(plain assignment would only alias the same dict, so use .copy())
midStreetDict_backUp = midStreetDict.copy()
#Check that the backup dictionary formed
for k, v in midStreetDict_backUp.items():
    print(k, v)
# In[12]:
#Check a small sample of streets
streetDataSmall = pd.DataFrame.from_dict(combined, orient='index')
#Reset row index to numerical values
streetDataSmall.reset_index(inplace = True)
#Change column names
streetDataSmall.columns = ['StreetAddress', 'Latitude', 'Longitude']
#View first 5 rows
streetDataSmall.head()
# In[13]:
#Arrange the above df in natural ascending order of address, so that e.g. '2' sorts before '10'
#streetDataSmall.sort_values('StreetAddress', ascending=True, axis=0)  #A plain lexical sort would put '10' before '2'
from natsort import order_by_index, index_natsorted
streetDataSmall.reindex(index=order_by_index(streetDataSmall.index, index_natsorted(streetDataSmall.StreetAddress))).reset_index()
# In[14]:
from natsort import natsorted
natsorted((k, v) for k, v in combined.items())  #Sorts on the key; bracketing k with v keeps the coordinates attached
# In[28]:
#Obtain the first and last address
#Find the length of the sorted list
#listLen = len(natsorted((k, v) for k, v in combined.items()))
#print(listLen)
#If we want to extract the end-point coordinates:
#count = 0
#for i in natsorted((k, v) for k, v in combined.items()):
#    count = count + 1
#    if count == 1:
#        coords1 = i[1][0], i[1][1]  #A list within a list can be indexed like this in Python
#        print(coords1)  #The geopy distance estimator takes a tuple
#    elif count == listLen:
#        coords2 = i[1][0], i[1][1]
#        print(coords2)
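# If street length were wanted after all, geopy could estimate it from the first and last
# address coordinates. A hedged sketch with hypothetical coordinates (geopy is not used
# elsewhere in this notebook):
# In[ ]:
from geopy.distance import distance
coords1, coords2 = (59.911, 10.752), (59.915, 10.760)  #Hypothetical street end-points
print(round(distance(coords1, coords2).km, 3))  #Geodesic distance in kilometres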
# In[24]:
#midStreetDict
# In[25]:
streetData = pd.DataFrame.from_dict(midStreetDict, orient= 'index')
#Reset index
streetData.reset_index(inplace = True)
#Rename columns
streetData.columns = ['Street', 'MidLatitude', 'MidLongitude']
streetData.head()
# In[26]:
streetData.shape
# In[27]:
streetData.isnull().sum()
# In[ ]:
#Save file to disk
streetData.to_csv(path_or_buf='./streetData_Midcoordinates.csv', index = False)