import urllib.request, urllib.parse, urllib.error
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import re
# TLS context with certificate and hostname checks switched off.
# NOTE(review): this accepts any certificate, so the download is open to
# interception; tolerable only for a throwaway scrape of public data.
cntxt = ssl.create_default_context()
cntxt.check_hostname = False  # has to be disabled before verify_mode is relaxed
cntxt.verify_mode = ssl.CERT_NONE

# Ask which page to scrape; an empty answer falls back to the default page.
default_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
url = input('Please enter the website to obtain data from: ') or default_url
print('You want data from >>>\n', url, '\n<<<')
You want data from >>> https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M <<<
# Download the page. The response is used as a context manager so the
# underlying connection is closed as soon as the body has been read.
with urlopen(url, context=cntxt) as response:
    html = response.read()  # the whole document, as raw bytes
# Parse the markup with Python's built-in HTML parser.
soup = BeautifulSoup(html, 'html.parser')
type(soup)
bs4.BeautifulSoup
# Gather every table row (<tr>) found in the parsed page.
tags = soup.find_all('tr')
print('Total tags extracted: ', len(tags),'\n')
# Build the postcode table: for every postcode, a list whose first item is the
# borough and whose remaining items are its neighbourhoods.
#
# NOTE: the name `dict` shadows the builtin. It is kept only because later
# cells read the result under that name.
max_rows = 290  # stop here: the page yields more <tr> tags than postcode rows
count = 0       # number of rows visited so far
dict = {}       # postcode -> [borough, neighbourhood, neighbourhood, ...]

for row in tags:
    count += 1
    if count == max_rows:
        break

    # A postcode row is indexed at positions 1, 3 and 5 (postcode, borough,
    # neighbourhood); anything shorter cannot be one, so skip it instead of
    # failing with IndexError.
    if len(row.contents) < 6:
        continue

    postcode = row.contents[1].text
    borough = row.contents[3].text

    # Ignore postcodes without a borough, and the header row.
    if borough == 'Not assigned' or postcode == 'Postcode':
        continue

    # The last cell carries a trailing newline.
    neighbourhood = row.contents[5].text.rstrip()

    if postcode in dict:
        # Postcode seen before (e.g. M5A, M6A): just add the neighbourhood.
        dict[postcode].append(neighbourhood)
    else:
        # A borough without a neighbourhood uses the borough name for both.
        # This is an exact comparison: the earlier pattern 'No.*' also matched
        # real names such as "Northwest" and replaced them with the borough.
        if neighbourhood == 'Not assigned':
            neighbourhood = borough
        # A fresh list per postcode, so entries never share state.
        dict[postcode] = [borough, neighbourhood]

print('\nTotal post-codes with borough info:', len(dict))
Total tags extracted: 294 Total post-codes with borough info: 103
# 6.1 pandas supplies the tabular structure used from here on.
import pandas as pd

# 6.2 Turn the dictionary into a table: each key becomes a row label
# (orient='index') and the list stored under it is spread over numbered columns.
data = pd.DataFrame.from_dict(dict, orient='index')
data.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
M3A | North York | Parkwoods | None | None | None | None | None | None | None |
M1G | Scarborough | Woburn | None | None | None | None | None | None | None |
M4N | Central Toronto | Lawrence Park | None | None | None | None | None | None | None |
M1L | Scarborough | Clairlea | Golden Mile | Oakridge | None | None | None | None | None |
M5G | Downtown Toronto | Central Bay Street | None | None | None | None | None | None | None |
# 6.3 Label the row index, which holds the postcodes.
data.index.names = ['PostalCode']
data.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|
PostalCode | |||||||||
M3A | North York | Parkwoods | None | None | None | None | None | None | None |
M1G | Scarborough | Woburn | None | None | None | None | None | None | None |
M4N | Central Toronto | Lawrence Park | None | None | None | None | None | None | None |
M1L | Scarborough | Clairlea | Golden Mile | Oakridge | None | None | None | None | None |
M5G | Downtown Toronto | Central Bay Street | None | None | None | None | None | None | None |
# 6.4 Move the postcodes out of the index into an ordinary 'PostalCode' column;
# rows are then numbered from 0.
data = data.reset_index()
data.head()
PostalCode | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | M3A | North York | Parkwoods | None | None | None | None | None | None | None |
1 | M1G | Scarborough | Woburn | None | None | None | None | None | None | None |
2 | M4N | Central Toronto | Lawrence Park | None | None | None | None | None | None | None |
3 | M1L | Scarborough | Clairlea | Golden Mile | Oakridge | None | None | None | None | None |
4 | M5G | Downtown Toronto | Central Bay Street | None | None | None | None | None | None | None |
# 6.5 Name the column right after the postcode 'Borough'.
# rename() creates a proper new column index. Writing into
# `data.columns.values` instead changes the label array behind pandas' back,
# and selecting the column by its new name can then fail with KeyError.
data.rename(columns={data.columns[1]: 'Borough'}, inplace=True)
data.head()
PostalCode | Borough | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | M3A | North York | Parkwoods | None | None | None | None | None | None | None |
1 | M1G | Scarborough | Woburn | None | None | None | None | None | None | None |
2 | M4N | Central Toronto | Lawrence Park | None | None | None | None | None | None | None |
3 | M1L | Scarborough | Clairlea | Golden Mile | Oakridge | None | None | None | None | None |
4 | M5G | Downtown Toronto | Central Bay Street | None | None | None | None | None | None | None |
#6.6 Rebuild the column index from a plain list of its labels.
# This is needed when a label was changed by writing into `columns.values`:
# the labels look right when printed, but selection by name fails until the
# index is recreated (presumably a stale lookup table rather than hidden
# characters). It is harmless when the labels were set any other way.
data.columns = data.columns.tolist()
data[['Borough']].head() # select by label; data[[5]] would pick the column labelled 5
Borough | |
---|---|
0 | North York |
1 | Scarborough |
2 | Central Toronto |
3 | Scarborough |
4 | Downtown Toronto |
# 6.7 Replace missing values with empty strings.
# Doing this now, while each cell holds a single value, is far simpler than
# cleaning up after the neighbourhood columns have been merged into one.
row_is_empty = data.isnull().all(axis=1)  # True where every cell in the row is missing
data_clean = data[~row_is_empty].fillna('')
data_clean.head()
PostalCode | Borough | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | M3A | North York | Parkwoods | |||||||
1 | M1G | Scarborough | Woburn | |||||||
2 | M4N | Central Toronto | Lawrence Park | |||||||
3 | M1L | Scarborough | Clairlea | Golden Mile | Oakridge | |||||
4 | M5G | Downtown Toronto | Central Bay Street |
# 6.8 Merge the neighbourhood columns into one comma-separated column.
# Position of 'Borough'; the neighbourhood columns are everything after it.
source_col_loc = data_clean.columns.get_loc('Borough')  # positions start at 0
source_col_loc
# Take every column after 'Borough'. A fixed-width slice (+1 to +8) stopped one
# column short and lost the last neighbourhood of the longest row. The merged
# column itself is excluded so that re-running this cell does not feed it back in.
neighbourhood_cols = [
    label for label in data_clean.columns[source_col_loc + 1:]
    if label != 'Neighbourhood'
]
# Join only the cells that hold a value, so short rows do not end in a run of
# commas ("Parkwoods,,,,,,").
data_clean['Neighbourhood'] = data_clean[neighbourhood_cols].apply(
    lambda row: ",".join(str(v) for v in row if pd.notna(v) and v != ''),
    axis=1)
data_clean.head()
PostalCode | Borough | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Neighbourhood | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | M3A | North York | Parkwoods | Parkwoods,,,,,, | |||||||
1 | M1G | Scarborough | Woburn | Woburn,,,,,, | |||||||
2 | M4N | Central Toronto | Lawrence Park | Lawrence Park,,,,,, | |||||||
3 | M1L | Scarborough | Clairlea | Golden Mile | Oakridge | Clairlea,Golden Mile,Oakridge,,,, | |||||
4 | M5G | Downtown Toronto | Central Bay Street | Central Bay Street,,,,,, |
# 6.9 Drop the numbered neighbourhood columns now that they have been merged.
import numpy as np
# The numbered columns are exactly those whose label is an integer. Reading
# them from the frame, rather than hard-coding 1..8, avoids a KeyError (or
# left-over columns) when the scraped table has a different maximum number of
# neighbourhoods per postcode.
cols_to_remove = [
    label for label in data_clean.columns
    if isinstance(label, (int, np.integer))
]
data_clean.drop(columns=cols_to_remove, inplace=True)
data_clean.head()
PostalCode | Borough | Neighbourhood | |
---|---|---|---|
0 | M3A | North York | Parkwoods,,,,,, |
1 | M1G | Scarborough | Woburn,,,,,, |
2 | M4N | Central Toronto | Lawrence Park,,,,,, |
3 | M1L | Scarborough | Clairlea,Golden Mile,Oakridge,,,, |
4 | M5G | Downtown Toronto | Central Bay Street,,,,,, |
# Spot-check one postcode that appears on several rows of the source table.
data_clean.loc[data_clean['PostalCode'] == 'M5A']
PostalCode | Borough | Neighbourhood | |
---|---|---|---|
34 | M5A | Downtown Toronto | Harbourfront,Regent Park,,,,, |
# (rows, columns) of the cleaned table.
data_clean.shape
(103, 3)
# Geocoding via the `geocoder` package was attempted first; the package was not
# installed and, once tried, did not return results, so the attempt is kept
# below only as a record.
# Installation, from a terminal:
#   sudo python3 -m pip install geopy
#   sudo python3 -m pip install geocoder   # NOTE: a different package from geopy
#from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
import folium # plotting library
#import geocoder
#g = geocoder.google('Mountain View, CA')
#print(g.latlng)
# Start with no coordinates ...
#lat_lng_coords = None
# ... and retry until the service returns some.
#while(lat_lng_coords is None):
#    g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
#    lat_lng_coords = g.latlng
#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]
# Fallback: read the coordinates from the provided CSV file.
# NOTE(review): the path is absolute and specific to one machine.
import pandas as pd
latlang = pd.read_csv('/Users/peaceful_warrior/Downloads/Geospatial_Coordinates.csv')
latlang.head()
Postal Code | Latitude | Longitude | |
---|---|---|---|
0 | M1B | 43.806686 | -79.194353 |
1 | M1C | 43.784535 | -79.160497 |
2 | M1E | 43.763573 | -79.188711 |
3 | M1G | 43.770992 | -79.216917 |
4 | M1H | 43.773136 | -79.239476 |
# Column labels as read from the CSV.
latlang.columns
Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')
# Give the first column the same label as in the scraped table so the two can
# be merged on it. rename() creates a proper new column index; writing into
# `latlang.columns.values` changes the label array behind pandas' back, after
# which latlang[['PostalCode']] fails with KeyError.
latlang.rename(columns={latlang.columns[0]: 'PostalCode'}, inplace=True)
latlang.head()
PostalCode | Latitude | Longitude | |
---|---|---|---|
0 | M1B | 43.806686 | -79.194353 |
1 | M1C | 43.784535 | -79.160497 |
2 | M1E | 43.763573 | -79.188711 |
3 | M1G | 43.770992 | -79.216917 |
4 | M1H | 43.773136 | -79.239476 |
# Reminder of what the scraped table looks like before it is sorted and merged.
data_clean.head()
PostalCode | Borough | Neighbourhood | |
---|---|---|---|
0 | M3A | North York | Parkwoods,,,,,, |
1 | M1G | Scarborough | Woburn,,,,,, |
2 | M4N | Central Toronto | Lawrence Park,,,,,, |
3 | M1L | Scarborough | Clairlea,Golden Mile,Oakridge,,,, |
4 | M5G | Downtown Toronto | Central Bay Street,,,,,, |
# Order the rows by postcode (ascending is the default); the original row
# labels are kept.
data_clean_sorted = data_clean.sort_values('PostalCode')
data_clean_sorted.head()
PostalCode | Borough | Neighbourhood | |
---|---|---|---|
6 | M1B | Scarborough | Rouge,Malvern,,,,, |
86 | M1C | Scarborough | Highland Creek,Rouge Hill,Port Union,,,, |
49 | M1E | Scarborough | Guildwood,Morningside,West Hill,,,, |
1 | M1G | Scarborough | Woburn,,,,,, |
66 | M1H | Scarborough | Cedarbrae,,,,,, |
# Column labels of the sorted table.
data_clean_sorted.columns
Index(['PostalCode', 'Borough', 'Neighbourhood'], dtype='object')
# Selecting the merge key by label works on this table.
data_clean_sorted[['PostalCode']].head()
PostalCode | |
---|---|
6 | M1B |
86 | M1C |
49 | M1E |
1 | M1G |
66 | M1H |
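#latlang[['PostalCode']].head()  # Raised the KeyError recorded below. The label had been changed by writing into columns.values, which does not update the column index's name lookup; this is not a hidden character. Reassigning the columns from a list rebuilds the index.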
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-54-400fad942f23> in <module> ----> 1 latlang[['PostalCode']].head() /opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/frame.py in __getitem__(self, key) 2932 key = list(key) 2933 indexer = self.loc._convert_to_indexer(key, axis=1, -> 2934 raise_missing=True) 2935 2936 # take() does not accept boolean indexers /opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter, raise_missing) 1352 kwargs = {'raise_missing': True if is_setter else 1353 raise_missing} -> 1354 return self._get_listlike_indexer(obj, axis, **kwargs)[1] 1355 else: 1356 try: /opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing) 1159 self._validate_read_indexer(keyarr, indexer, 1160 o._get_axis_number(axis), -> 1161 raise_missing=raise_missing) 1162 return keyarr, indexer 1163 /opt/local/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing) 1244 raise KeyError( 1245 u"None of [{key}] are in the [{axis}]".format( -> 1246 key=key, axis=self.obj._get_axis_name(axis))) 1247 1248 # We (temporarily) allow for some missing keys with .loc, except in KeyError: "None of [Index(['PostalCode'], dtype='object')] are in the [columns]"
# The labels print as expected even while selection by name is failing.
latlang.columns
Index(['PostalCode', 'Latitude', 'Longitude'], dtype='object')
# Rebuild the column index from a plain list of its labels. This is needed when
# a label was changed by writing into `columns.values`: selection by name fails
# until the index is recreated. It is harmless when the labels were set any
# other way.
latlang.columns = latlang.columns.tolist()
latlang[['PostalCode']].head() # selection by label now works
PostalCode | |
---|---|
0 | M1B |
1 | M1C |
2 | M1E |
3 | M1G |
4 | M1H |
# Attach latitude/longitude to each postcode. The join is the default inner
# join, so only postcodes present in both tables are kept.
data_merged = data_clean_sorted.merge(latlang, on='PostalCode')
data_merged.head()
PostalCode | Borough | Neighbourhood | Latitude | Longitude | |
---|---|---|---|---|---|
0 | M1B | Scarborough | Rouge,Malvern,,,,, | 43.806686 | -79.194353 |
1 | M1C | Scarborough | Highland Creek,Rouge Hill,Port Union,,,, | 43.784535 | -79.160497 |
2 | M1E | Scarborough | Guildwood,Morningside,West Hill,,,, | 43.763573 | -79.188711 |
3 | M1G | Scarborough | Woburn,,,,,, | 43.770992 | -79.216917 |
4 | M1H | Scarborough | Cedarbrae,,,,,, | 43.773136 | -79.239476 |
# Spot-check the merged row for one postcode.
data_merged.loc[data_merged['PostalCode'] == 'M5A']
PostalCode | Borough | Neighbourhood | Latitude | Longitude | |
---|---|---|---|---|---|
53 | M5A | Downtown Toronto | Harbourfront,Regent Park,,,,, | 43.65426 | -79.360636 |