#!/usr/bin/env python
# coding: utf-8

# # Simple Scraper
#
# Work out how to split the problem up.
#
# Do simple tests of each part, if necessary, then assemble them later on... (an assembled version is sketched at the end of this notebook).

# In[2]:

# Load the web page we want to scrape
import requests

url = 'https://www.ssepd.co.uk/Powertrack/'
r = requests.get(url)


# In[7]:

from bs4 import BeautifulSoup

soup = BeautifulSoup(r.text, 'html.parser')


# Find a div of interest...

# In[68]:

powertracksummary = soup.find_all('div', {'class': 'power-track-summary'})
# How many are there? There should be just one
len(powertracksummary)


# In[69]:

# Grab the single matching div
powertracksummary = powertracksummary[0]


# In[105]:

# "halfrows" because there is another block of divs that contains data for the same data row
halfrows = powertracksummary.find_all('div', {'class': 'row'})


# In[96]:

len(halfrows)


# Okay - there's something wrong with that tag, because there are *not* 21 rows in the table. The `row` class is not uniquely identifying the thing we are interested in, which makes it less than useful...

# ## Make sure you know what you're working with...
#
# (I didn't do this when you were here - I should have checked!)
#
# Let's just look at some of the `row` classed rows...

# In[72]:

# Preview one of them
halfrows[0]


# In[102]:

# Let's look at the last one...
halfrows[-1]


# In[103]:

# For reference, the class list of the last row...
halfrows[-1]['class']


# This is not good, and it can be a massive distractor / time waster if we try to scrape the page assuming the `row` classed rows all have the same structure. In fact, the `row` class is actually grabbing us different sorts of row, so it's not very useful...
#
# We really need to look for an alternative, because the main aim of scraping is to look for repeatable patterns that we can parse information from in a regular, repeated way...
#
# If the `row` class pulls back rows that are all structured the same, we can write a scraper for one row that will work on them all; if the class pulls back two or more sorts of row, we need to detect which sort each one is and scrape each sort separately, which just gets messy. (A quick sketch of that detection idea is appended at the end of this notebook.)

# ## A unique identifier
#
# The `accordion-group` class does identify rows properly:

# In[128]:

accordionrows = powertracksummary.find_all('div', {'class': 'accordion-group'})
len(accordionrows)


# So use that...
#
# Let's try and scrape one row:

# In[130]:

accordionrow = accordionrows[0]


# In[131]:

# The date/time for the row lives in a div with class "date"
accordionrow.find('div', {'class': 'date'})


# In[134]:

# ...which itself contains two divs: one for the date, one for the time
divs = accordionrow.find('div', {'class': 'date'}).find_all('div')
divs


# In[137]:

_date = divs[0].text.strip()
_date


# In[138]:

_time = divs[1].text.strip()
_time


# Example of finding all the dates and times:

# In[139]:

# Create a list to store data from each row, one row per list item
records = []

# Assemble the recipe from the ingredients we started to prepare above
for accordionrow in accordionrows:
    divs = accordionrow.find('div', {'class': 'date'}).find_all('div')
    _date = divs[0].text.strip()
    _time = divs[1].text.strip()
    record = {'time': _time, 'date': _date}
    records.append(record)

records
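# ---
#
# ## Appendix: sketches
#
# As promised above, here's a minimal sketch of how we might detect the different sorts of `row` classed div: group the rows on their full class list and count how many of each kind there are. Only the grouping logic matters here - the actual class values are whatever the page happens to use.

# In[ ]:

from collections import Counter

# A tag's 'class' attribute is a list of class names; use it as the
# grouping key (converted to a tuple so it's hashable)
row_types = Counter(tuple(r.get('class', [])) for r in halfrows)
row_types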
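# The intro said to test each part simply and then assemble the parts later on. Here's a minimal sketch of that assembly as a single function, assuming the page still uses the `power-track-summary` / `accordion-group` / `date` structure explored above. (The function name `scrape_powertrack` is just for illustration.)

# In[ ]:

import requests
from bs4 import BeautifulSoup

def scrape_powertrack(url='https://www.ssepd.co.uk/Powertrack/'):
    """Scrape the date and time of each row on the Powertrack summary page."""
    r = requests.get(url)
    r.raise_for_status()  # fail loudly if the page didn't load
    soup = BeautifulSoup(r.text, 'html.parser')
    summary = soup.find('div', {'class': 'power-track-summary'})
    records = []
    for row in summary.find_all('div', {'class': 'accordion-group'}):
        divs = row.find('div', {'class': 'date'}).find_all('div')
        records.append({'date': divs[0].text.strip(),
                        'time': divs[1].text.strip()})
    return records

#scrape_powertrack()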
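# A list of dicts like `records` also drops straight into a *pandas* dataframe, which makes the scraped data much easier to inspect, sort, and save (this assumes you have pandas installed; it isn't used anywhere else in this notebook).

# In[ ]:

import pandas as pd

df = pd.DataFrame(records)
df


# In[ ]: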