import pandas as pd
import requests
#http://stackoverflow.com/a/22320207/454773
#https://github.com/schmijos/html-table-parser-python3/blob/master/html_table_parser/parser.py
import urllib.request
from html.parser import HTMLParser
class HTMLTableParser(HTMLParser):
    """Parse every <table> in an HTML document.

    Feed HTML via .feed(); completed tables accumulate in the .tables
    attribute as a list of tables, each table a list of rows, each row a
    list of cell strings.  The href of any <a> found inside a cell is
    appended to that cell's text as a '::href::' marker so callers can
    split the link target back out later.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.__in_td = False        # currently inside a <td> cell
        self.__in_th = False        # currently inside a <th> cell
        self.__in_a = False         # href of the enclosing <a>, or False
        self.__current_table = []   # rows collected for the open <table>
        self.__current_row = []     # cells collected for the open <tr>
        self.__current_cell = []    # text fragments for the open cell
        self.tables = []            # all completed tables

    def handle_starttag(self, tag, attrs):
        """Remember when we enter a cell (<td>/<th>) or a link (<a>).

        The other tags (<tr>, <table>) are only handled at the closing
        point.
        """
        if tag == 'td':
            self.__in_td = True
        if tag == 'th':
            self.__in_th = True
        if tag == "a":
            for name, value in attrs:
                if name == "href":
                    self.__in_a = value

    def handle_data(self, data):
        """Save content to the current cell while inside <td> or <th>."""
        if self.__in_td ^ self.__in_th:
            self.__current_cell.append(data.strip())
            # Record the link target alongside the link text.
            if self.__in_a:
                self.__current_cell.append('::' + self.__in_a + '::')

    def handle_endtag(self, tag):
        """Exit tags, closing out cells, rows and tables.

        If the closing tag is </td> or </th> we finish the current cell
        and append it to the row; </tr> saves the row to the current
        table and prepares for a new row; </table> saves the current
        table to .tables and prepares for a new one.
        """
        if tag == 'td':
            self.__in_td = False
        if tag == 'th':
            self.__in_th = False
        if tag == 'a':
            self.__in_a = False
        if (tag == 'td') ^ (tag == 'th'):
            final_cell = " ".join(self.__current_cell).strip()
            self.__current_row.append(final_cell)
            self.__current_cell = []
        if tag == 'tr':
            self.__current_table.append(self.__current_row)
            self.__current_row = []
        if tag == 'table':
            self.tables.append(self.__current_table)
            self.__current_table = []
# NOTE(review): removed notebook scratch code that fetched `url` (a name
# never defined at this point in the file) and probed `p.tables[1][0]`.
# As written it raised NameError on import and duplicated grabber(),
# which is defined immediately below.
def grabber(url):
    """Fetch *url* and parse the HTML tables it contains.

    Returns the HTMLTableParser instance; the parsed tables are
    available on its .tables attribute.
    """
    req = urllib.request.Request(url=url)
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection).
    with urllib.request.urlopen(req) as f:
        # The target site serves Latin-1 rather than UTF-8.
        xhtml = f.read().decode('latin-1')
    # Instantiate the parser and feed it the page source.
    p = HTMLTableParser()
    p.feed(xhtml)
    return p
# --- Scrape the British Academy learned-societies listing, A to Z ------
import pandas as pd
import string

allTheLetters = string.ascii_uppercase

data = []
for letter in allTheLetters:
    # One index page per initial letter.
    url = 'http://www.britac.ac.uk/links/uksahss.asp?Letter=' + letter
    p = grabber(url)
    # The listing is the second table on the page; each society is one
    # cell whose text already carries '::href::' markers from the parser.
    for entry in p.tables[1]:
        if entry[0] != '':
            # Normalise the labelled phone/email fields onto the same
            # '::' delimiter, then split into individual columns.
            text = entry[0].replace('Telephone:', '::').replace('Email:', '::')
            fields = [part.strip() for part in text.split('::')]
            data.append(fields)

df = pd.DataFrame(data)
# Discard any overflow columns beyond the five we care about.
df.drop([5, 6, 7, 8], axis=1, inplace=True)
df.columns = ['Name', 'URL', 'Contact', 'Phone', 'email']
df = df.dropna(axis=1, how='all')
df.to_csv('societies.csv', index=False)