"""Scrape the British Academy's directory of UK learned societies into
``societies.csv``.

The directory is paged one letter at a time; each page carries the societies
in an HTML table whose first cell packs together the name, URL, contact,
phone and email.  A small ``HTMLParser`` subclass collects every table on a
page; the script then splits the packed first cell apart and writes the
result with pandas.

Adapted from:
  http://stackoverflow.com/a/22320207/454773
  https://github.com/schmijos/html-table-parser-python3/blob/master/html_table_parser/parser.py
"""
import string
import urllib.request
from html.parser import HTMLParser

import pandas as pd


class HTMLTableParser(HTMLParser):
    """Parse every ``<table>`` in an HTML document.

    Feed the parser a document via ``feed``; afterwards ``self.tables`` is a
    list of tables, each table a list of rows, each row a list of cell
    strings.  The href of any ``<a>`` inside a cell is appended to the cell
    text as ``::href::`` so the link target survives flattening to text.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # State flags: inside <td>/<th>; __in_a holds the current <a> href
        # (a truthy string) while inside a link, False otherwise.
        self.__in_td = False
        self.__in_th = False
        self.__in_a = False
        # Accumulators, innermost first: cell fragments -> row -> table.
        self.__current_table = []
        self.__current_row = []
        self.__current_cell = []
        self.tables = []

    def handle_starttag(self, tag, attrs):
        """Record entry into a cell, and capture the href of any link.

        Only the opening of <td>, <th> and <a> matters here; the other
        structural tags (</tr>, </table>) are handled at their closing point.
        """
        if tag == 'td':
            self.__in_td = True
        if tag == 'th':
            self.__in_th = True
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.__in_a = value

    def handle_data(self, data):
        """Accumulate text while inside exactly one of <td>/<th>."""
        if self.__in_td ^ self.__in_th:
            self.__current_cell.append(data.strip())
            if self.__in_a:
                # Inline the link target so it survives text flattening.
                self.__current_cell.append('::' + self.__in_a + '::')

    def handle_endtag(self, tag):
        """Close out the current cell, row or table as its tag ends.

        On </td> or </th> the buffered fragments become one cell string; on
        </tr> the row is appended to the current table; on </table> the
        table is appended to ``self.tables``.
        """
        if tag == 'td':
            self.__in_td = False
        if tag == 'th':
            self.__in_th = False
        if tag == 'a':
            self.__in_a = False
        if (tag == 'td') ^ (tag == 'th'):
            final_cell = " ".join(self.__current_cell).strip()
            self.__current_row.append(final_cell)
            self.__current_cell = []
        if tag == 'tr':
            self.__current_table.append(self.__current_row)
            self.__current_row = []
        if tag == 'table':
            self.tables.append(self.__current_table)
            self.__current_table = []


def grabber(url):
    """Fetch *url* and return an ``HTMLTableParser`` fed with its content.

    The site serves latin-1, so the body is decoded accordingly before being
    handed to the parser.
    """
    req = urllib.request.Request(url=url)
    f = urllib.request.urlopen(req)
    xhtml = f.read().decode('latin-1')
    p = HTMLTableParser()
    p.feed(xhtml)
    return p


def main():
    """Scrape every letter page and write the combined listing to CSV."""
    data = []
    for letter in string.ascii_uppercase:
        url = 'http://www.britac.ac.uk/links/uksahss.asp?Letter=' + letter
        p = grabber(url)
        # Table 1 on each page holds the listing; each record is packed into
        # the first cell of its row.
        for row in p.tables[1]:
            if row[0] != '':
                # Normalise the 'Telephone:'/'Email:' labels to the same
                # '::' separator the link hrefs already use, then split.
                packed = row[0].replace('Telephone:', '::').replace('Email:', '::')
                data.append([item.strip() for item in packed.split('::')])

    df = pd.DataFrame(data)
    # Assumes records split into nine fields; columns 5-8 are overflow
    # fragments we do not need.  KeyError here means the page layout changed.
    df.drop([5, 6, 7, 8], axis=1, inplace=True)
    df.columns = ['Name', 'URL', 'Contact', 'Phone', 'email']
    df = df.dropna(axis=1, how='all')
    df.to_csv('societies.csv', index=False)


if __name__ == '__main__':
    # BUG FIX: the original module ran the scrape (and a stray fragment that
    # referenced an undefined `url`, raising NameError) at import time; the
    # side-effecting work is now guarded behind __main__.
    main()