#!/usr/bin/env python
# coding: utf-8

# http://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/weekly-ae-sitreps-2014-15/
# http://www.england.nhs.uk/statistics/statistical-work-areas/winter-daily-sitreps/winter-daily-sitrep-2014-15-data/

# ## Harvesting

# One frequently encountered task when working with regularly produced datasets is the downloading of 'latest release' spreadsheet or CSV data files. These can typically take one of three forms:
# 
# 1) a new weekly or monthly file is released containing just the figures for that week or month;
# 2) a refreshed release of the current state of an often time-sensitive dataset, for example a list of current roadworks where no historical data is accessible;
# 3) an updated longitudinal dataset that contains historical data, organised by date/time, as well as the most recent data.
# 
# In each case, there is a requirement to regularly download the newly released file. In the first case, when starting a longitudinal analysis, there may also be a requirement to download historical data files (the monthly releases for previous months, for example) from which a longitudinal dataset may be created.
# 
# *Harvesting* data refers to the collection of data from a known location. Harvesting may occur over time - for example, according to a schedule, collecting a monthly dataset when it has been published - or it may involve collecting a set of data from multiple files or locations in order to seed a larger dataset.
# 
# Where regularly released datafiles are posted to a known web location with a long-lived URL, in the form of a link on that page to the datafile itself, possibly along with a set of previously posted links to historical datasets, we can often automate the collection of the data by *scraping* the links to the datafile or datafiles and then using those links as the basis of an automatic downloader.

# ### Simple Screenscraper and Link Harvester

# In[ ]:

#import sys
#reload(sys)
#sys.setdefaultencoding("utf-8")


# In[7]:

import pandas as pd

#requests is a library that simplifies grabbing web pages and downloading files from web addresses / URLs
import requests

#BeautifulSoup is a library that helps parse HTML web pages and XML files
from bs4 import BeautifulSoup


# In[5]:

url='http://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/weekly-ae-sitreps-2014-15/'

#Grab the contents of a file of an arbitrary type that is published at a particular web location using an HTTP GET
page = requests.get(url)

#The file we have grabbed in this case is a web page - that is, an HTML file
#We can get the content of the page and parse it
soup=BeautifulSoup(page.content)

#BeautifulSoup has a routine - find_all() - that will find all the HTML tags of a particular sort
#Links are represented in HTML pages in the form <a href="URL">link text</a>
#Grab all the <a> (anchor) tags...
links=soup.find_all('a')


# In[11]:

#We can iterate through all the links one at a time, calling each link "link" temporarily as we do so
for link in links:
    #Do a logical test to see if the linked URL ends in .csv, the conventional suffix for a CSV file
    if link['href'].endswith('.csv'):
        #If the link does look like it's a link to a CSV file, print the URL out
        print(link['href'],link.text)

# So we've seen how we can grab a list of links to CSV files that are published via a single web page. How does that help us harvest these files?
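# One practical detail worth checking before we start downloading: the scraped `href` values are only usable as download addresses if they are absolute URLs. The NHS pages above do publish full URLs, but as a defensive step we can resolve each `href` against the page it was scraped from. The following is a minimal sketch using the standard library `urljoin` (the import path shown is for Python 3; on Python 2 it lives in `urlparse`).

# In[ ]:

from urllib.parse import urljoin
#from urlparse import urljoin #Python 2 equivalent

#Resolve each CSV link against the page URL, so relative links also become downloadable addresses
csv_urls=[urljoin(url, link.get('href','')) for link in links if link.get('href','').endswith('.csv')]
csv_urls[:3]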
# In[7]:

import os

directory='AEdata'

#Create a new directory - AEdata - if it doesn't already exist
if not os.path.exists(directory):
    os.makedirs(directory)


# In[14]:

##Via http://stackoverflow.com/a/16696317/454773
#TH - added path
#The following function seems to be a pretty robust, generic function for downloading an arbitrary file to our local machine
import requests

def download_file(url,path):
    ''' Download a file from a particular URL into a local directory, keeping the original filename '''
    local_filename = url.split('/')[-1]
    local_filename='/'.join([path,local_filename])
    # NOTE the stream=True parameter
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
    return local_filename


# In[15]:

#In the same way we iterated through a list of links extracted from a web page so we could then print them out,
#we can iterate through the list of links and download each CSV file in turn
for link in links:
    if link['href'].endswith('.csv'):
        fn=download_file(link['href'],directory)
        print("Downloaded {}".format(fn))


# In[190]:

#The os.listdir(PATH) command returns a list of files contained in the directory at path PATH
os.listdir('./AEdata')[:3]


# In[ ]:

#ACTIVITY - How would you identify whether or not a link to a file you haven't already downloaded has been posted?


# In[19]:

#Automation
#Once we have worked out a recipe of one or more steps that might be usefully reusable,
#we might create a function that combines those steps so they can be called from a single command

#So for example, we could combine the command that gets the links from a page
#with a line that finds the ones corresponding to a particular filetype

#Bear in mind the function may be dependent on some imported libraries or other functions
#If we put the function into a file, we should make sure we also import or declare any other required libraries and functions

def linkscraper(url,path,filetypes):
    ''' url: a web page containing a list of links
        path: a path to a local download directory
        filetypes: a list of filetypes we want to check for'''
    if not os.path.exists(path):
        os.makedirs(path)
    page = requests.get(url)
    soup=BeautifulSoup(page.content)
    links=soup.find_all('a')
    for link in links:
        filetype=link['href'].split('.')[-1]
        if filetype in filetypes:
            fn=download_file(link['href'],path)
            print("Downloaded {}".format(fn))


# In[21]:

#We now have a single command that will download files of a particular type linked to from a given URL
url='http://www.england.nhs.uk/statistics/statistical-work-areas/winter-daily-sitreps/winter-daily-sitrep-2014-15-data/'
linkscraper(url,directory,['xlsx'])

# We might want to further refine the download for future use so that we only download files we haven't already downloaded (one possible approach is sketched after the directory listing below). Note that this might further require checking against an exceptions list, e.g. for spreadsheets or CSV files that retain a filename but that get updated on a regular basis.

# In[6]:

get_ipython().system('ls AEdata/')
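# As a minimal sketch of one possible answer to the activity above: before downloading a linked file, check whether a file with the same name already exists in the download directory. This assumes the filename alone identifies a release; files that keep the same name but are updated in place would need the exceptions list mentioned above. The `download_new_files` function name is just illustrative.

# In[ ]:

def download_new_files(url,path,filetypes):
    ''' Download only those linked files of the given filetypes that we do not already hold locally '''
    if not os.path.exists(path):
        os.makedirs(path)
    #Filenames we already have in the download directory
    already_here=set(os.listdir(path))
    page = requests.get(url)
    soup=BeautifulSoup(page.content)
    for link in soup.find_all('a'):
        href=link.get('href','')
        #download_file() saves a file under the last component of its URL, so compare against that
        if href.split('.')[-1] in filetypes and href.split('/')[-1] not in already_here:
            print("Downloaded {}".format(download_file(href,path)))

#download_new_files(url,directory,['xlsx'])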
# ## Parsing Complex Spreadsheets
# 
# When working with third party spreadsheets, it is not uncommon to find layouts that include metadata elements in the first sheet or at the top of a sheet; hierarchically declared column headings, where, for example, a heading spans several columns in one row and is then subdivided into subheadings that appear in the next row; footnotes that appear at the bottom of a spreadsheet; and even sidenotes that are pasted into additional columns.
# 
# Even simpler CSV files may include hierarchical indices and leading and trailing metadata or additional notes.
# 
# In the following example we'll start to look at the *WSitAE* reports, followed by the *DailySR* reports.

# In[178]:

get_ipython().system('head AEdata/2015.02.08-WSitAE-Extractions-AE4df65w.csv')


# In[214]:

#The following recipe will open a file and then read the first few lines
with open('AEdata/2015.02.08-WSitAE-Extractions-AE4df65w.csv','r') as f:
    for i in range(0,7):
        print('>>{}<<'.format(f.readline()))

# Inspecting the first few rows of the file, we notice that we need to find some way of representing a hierarchical index. In *pandas*, this is referred to as a `MultiIndex`. Working with such structures can be confusing: it is worth keeping notes of any useful recipes you manage to work out, along with a set of worked examples that demonstrate principles and techniques you may want to reuse.

# In[204]:

get_ipython().run_line_magic('pinfo', 'pd.read_csv')


# In[250]:

df=pd.read_csv("AEdata/2015.02.08-WSitAE-Extractions-AE4df65w.csv", header=[4,5])
df.head(3)


# In[251]:

#Flatten hierarchical column names
#http://stackoverflow.com/a/14508355/454773
df.columns = ['::'.join(col).strip() for col in df.columns.values]
df.head(3)


# In[252]:

df.head(3)


# In[253]:

df_long=pd.melt(df,id_vars=["Unnamed: 0_level_0::Unnamed: 0_level_1","Unnamed: 1_level_0::Unnamed: 1_level_1"])
df_long.head(3)


# In[254]:

df_long.columns=['Code','Name',"Type",'Count']
df_long.head(3)


# In[255]:

df_long['Type0']=df_long['Type'].map(lambda x: x.split('::')[0])
df_long['Type1']=df_long['Type'].map(lambda x: x.split('::')[1])


# In[256]:

df_long.head()


# In[262]:

df_long['Type0'].unique()


# In[266]:

#There's more than one way to do it...

#use .replace()
#df_long['Type0']=df_long['Type0'].replace({'Unnamed: 3_level_0':'Number of A&E Attendances'})
#etc

#use .map()
colmap={'Number of A&E Attendances':'Number of A&E Attendances',
        'Unnamed: 3_level_0':'Number of A&E Attendances',
        'Unnamed: 4_level_0':'Number of A&E Attendances',
        'Number of patients spending >4 hours in A&E from arrival to discharge, transfer or admission':'Number of patients spending >4 hours in A&E from arrival to discharge, transfer or admission',
        'Unnamed: 6_level_0':'Number of patients spending >4 hours in A&E from arrival to discharge, transfer or admission',
        'Unnamed: 7_level_0':'Number of patients spending >4 hours in A&E from arrival to discharge, transfer or admission',
        'Number of patients spending >4 hours but <12 hours from decision to admit to admission':'Number of patients spending >4 hours but <12 hours from decision to admit to admission',
        'Unnamed: 9_level_0':'Number of patients spending >4 hours but <12 hours from decision to admit to admission',
        'Unnamed: 10_level_0':'Number of patients spending >4 hours but <12 hours from decision to admit to admission',
        'Unnamed: 11_level_0':'Number of patients spending >4 hours but <12 hours from decision to admit to admission',
        'Number of patients spending >12 hours from decision to admit to admission':'Number of patients spending >12 hours from decision to admit to admission',
        'Unnamed: 13_level_0':'Number of patients spending >12 hours from decision to admit to admission',
        'Unnamed: 14_level_0':'Number of patients spending >12 hours from decision to admit to admission',
        'Unnamed: 15_level_0':'Number of patients spending >12 hours from decision to admit to admission'}


# In[268]:

df_long['Type0']=df_long['Type0'].map(colmap)
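# A minimal alternative sketch (not part of the original recipe): rather than typing out the `colmap` dictionary by hand, we could blank out the placeholder 'Unnamed: ...' labels that *pandas* generates and fill forwards from the last real heading. This relies on `pd.melt` preserving the original column order, so that each placeholder label directly follows the heading it belongs to.

# In[ ]:

import numpy as np

#Recover the raw level-0 labels from the flattened Type strings
type0=df_long['Type'].map(lambda x: x.split('::')[0])
#Replace placeholder labels with NaN, then fill forwards from the preceding real heading
type0=type0.where(~type0.str.startswith('Unnamed'), np.nan).fillna(method='ffill')
type0.unique()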
# ### Adding metadata back in...

# One of the things we might want to do to this dataset is add back in some metadata relating to the date the data applies to. One source of this data is the filename; another is the metadata rows contained within the file itself. The functions below use both: the date string is used to locate the file by name, and the metadata rows are parsed to recover the period and year.
# 
# ### WSitAE CSV API Demo

# ### WSitAE-Extractions-AE

# In[247]:

import os

def WSitAE_AE_read(date,path='AEData'):
    ls=os.listdir(path)
    fn=None
    for item in ls:
        if item.find(date)!=-1 and item.find('WSitAE-Extractions-AE')!=-1:
            fn=item
    if fn is None: return pd.DataFrame()
    ffn='{path}/{fn}'.format(path=path,fn=fn)

    #Parse the first two metadata rows into a dict of "key: value" pairs
    z={}
    with open(ffn,'r') as f:
        for i in range(0,2):
            s=f.readline()
            for a in s.split(','):
                z[a.strip('"').split(':')[0]]=a.strip('"').split(':')[1]

    #Read the file in without setting any headers
    dfw=pd.read_csv(ffn, header=None,skiprows=4)
    #Get rid of any empty columns
    dfw.dropna(how='all',axis=1,inplace=True)
    #The contents of the first two rows will be our column headers
    #Fill across so blank columns are named according to the previously named column
    dfw[:2].fillna(method='ffill', axis=1,inplace=True)
    #Generate a multiindex in the columns from the first two rows
    dfw.columns=pd.MultiIndex.from_arrays(dfw[:2].values, names=['Category','SubCategory'])
    #Flatten the columns multi-index to give a single set of flat column names
    dfw.columns = ['Code','Name']+['::'.join(col).strip() for col in dfw.columns.values[2:]]
    ##Hack a date from period - should really cast to period
    dfw['Period Name']=z['Period Name'].strip()#date
    dfw['Date']=z['Period Name'].replace('W/E','').strip()
    dfw['Year']=z['Year'].strip()
    return dfw[2:]

def WSitAE_AE_melt(df):
    #Declare the id_vars
    idx=['Code','Name','Year','Period Name','Date']
    df=pd.melt(df,id_vars=idx, var_name='Type',value_name='Value')
    df['SubType']=df['Type'].map(lambda x: x.split('::')[1])
    df['Type']=df['Type'].map(lambda x: x.split('::')[0])
    return df[idx+['Type','SubType','Value']]

def WSitAE_AE_long(date,path='AEData'):
    df=WSitAE_AE_read(date,path)
    return WSitAE_AE_melt(df)


# In[2]:

get_ipython().system('ls AEdata')


# In[249]:

dfw=WSitAE_AE_long('2015.02.08')
dfw[:3]

# ### WSitAE-Extractions-Adm

# In[250]:

def WSitAE_Adm_read(date,path='AEData'):
    ls=os.listdir(path)
    fn=None
    for item in ls:
        if item.find(date)!=-1 and item.find('WSitAE-Extractions-Adm')!=-1:
            fn=item
    if fn is None: return pd.DataFrame()
    ffn='{path}/{fn}'.format(path=path,fn=fn)
    dfw=pd.read_csv(ffn, header=4)
    dfw.dropna(how='all',axis=1,inplace=True)
    dfw['Date']=dfw['Period Name'].str.replace('W/E','').str.strip()
    cols=dfw.columns.tolist()
    cols=cols[:3]+cols[-1:]+cols[3:-1]
    return dfw[cols]


# In[251]:

WSitAE_Adm_read('2015.02.15')[:3]


# In[257]:

#Alternatively, we can load the data contained in the two CSV files from a single sheet of a spreadsheet
dfx=pd.read_excel('AEData/2015.03.08-AEHLnoU.xls',header=None,skiprows=14)
dfx.dropna(how='all',axis=1,inplace=True)
dfx[:1].fillna(method='ffill', axis=1,inplace=True)
dfx.columns=pd.MultiIndex.from_arrays(dfx[:2].values, names=['Category','SubCategory'])
dfx.columns = ['Area Team','Code','Name']+['::'.join(col).strip() for col in dfx.columns.values[3:]]
dfx[:3]


# In[ ]:

#Alternatively, we could have separated out the A&E data from the Emergency Admissions data


# ## Daily SitRep

# In[15]:

import pandas as pd

def DailySR_read(f):
    return pd.ExcelFile(f)

#xl=DailySR_read('AEData/DailySR-Web-file-WE-01.03.15.xlsx')
xl=DailySR_read('AEData/DailySR-Web-file-WE-07.12.14.xlsx')
xl.sheet_names
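# Before writing a parser for a given sheet, it helps to eyeball the raw contents first. As a minimal sketch, we can parse a sheet with no header handling at all and inspect the first few rows to work out how many leading metadata rows to skip and where the (hierarchical) column headings sit; the 'G&A beds' sheet name is taken from the list above.

# In[ ]:

#Read the sheet raw - no header row, nothing skipped - and preview the top of it
xl.parse('G&A beds', header=None).head(20)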
# In[115]:

def DailySR_parse(xl,sheet,skiprows=13,header=None):
    #Classify each sheet by the broad layout it uses
    xlclass={'A&E closures':0,
             'A&E diverts':0,
             'Cancelled operations':1,
             'Critical care transfers':1,
             'Ambulances queuing':0,
             'G&A beds':1,
             'D&V, Norovirus':1,
             'Delayed transfers of care':0,
             'Adult critical care':1,
             'Paediatric intensive care':1,
             'Neonatal intensive care ':1,
             'Trusts with Ops problems':0,
             'A&E Performance':1#,
             #'Macro1':-1
            }
    if sheet not in xlclass: return pd.DataFrame()
    dfx=xl.parse(sheet,skiprows=skiprows,header=header)
    dfx.dropna(how='all',inplace=True)
    dfx.dropna(how='all',axis=1,inplace=True)
    cols=[]
    if xlclass[sheet]==0:
        dfx.drop([2,3],inplace=True)
        dfx.dropna(how='all',axis=1,inplace=True)
        dfx.columns=dfx.ix[1].values
        dfx=dfx[1:]
    elif xlclass[sheet]==1:
        #This is a real hack, trying to think what a sensible shape is and how to get there
        #Drop an empty row and the total count for England row
        dfx.drop([2,3],inplace=True)
        #Get rid of empty columns
        dfx.dropna(how='all',axis=1,inplace=True)
        #Fill across on dates
        dfx[:1].fillna(method='ffill', axis=1,inplace=True)
        #Patch the leading empty columns
        dfx[:1].fillna('Metadata',inplace=True)
        #Generate a multi-index across the columns from the first two rows
        dfx.columns=pd.MultiIndex.from_arrays(dfx[:2].values, names=['Category','SubCategory'])
        #Drop the rows we used to make the multi-index
        dfx=dfx[2:]
        #Transpose rows and columns
        dfx=dfx.T
        #Set the column names to be the hospital codes
        dfx.columns=dfx.ix['Metadata','Code'].values
        #And then throw away the metadata rows
        dfx=dfx[3:]
        #Tidy up by throwing away any empty columns we created
        dfx.dropna(how='all',axis=1,inplace=True)
    return dfx

xx=DailySR_parse(xl,'G&A beds')
xx[:4]


# In[117]:

xx['RNZ'][:,'Escalation beds avail']


# In[119]:

DailySR_parse(xl,'Paediatric intensive care')

# So now I've worked out the steps to get the separate waiting time and sitrep data out of each sheet of the report.

# ## Hierarchical Column Indexes in Excel Spreadsheets
# 
# Where spreadsheets contain multiple heterogeneous sheets, we need to load them in using the *pandas* `ExcelFile` class, as the following examples using a *DailySR* report show.

# In[208]:

dfx=pd.ExcelFile('AEdata/DailySR-Web-file-WE-01.02.15.xlsx')
dfx.sheet_names


# In[214]:

df=dfx.parse('G&A beds',header=None,skiprows=13,parse_cols="B,D:Y")
df.head()


# In[215]:

df.set_index([0,1,2],inplace=True)
df.head(2)


# In[216]:

#http://stackoverflow.com/a/24154138/454773
#fill in Null values in "Headers"
df[:1].fillna(method='ffill', axis=1,inplace=True)
#df[:1].fillna(value='Base',axis=1,inplace=True)
df.head(3)


# In[217]:

#http://stackoverflow.com/a/24154138/454773
#create multiindex column names
df.columns=pd.MultiIndex.from_arrays(df[:2].values, names=['Date','Report'])


# In[218]:

df[4:].head(3).T
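# With the (Date, Report) MultiIndex in place across the columns, cross-sections of the data become straightforward to pull out. A minimal sketch, assuming 'Escalation beds avail' is one of the subheading labels read into the Report level (it appears as a subheading in the G&A beds parse earlier):

# In[ ]:

#Select the 'Escalation beds avail' report for every date in the columns multi-index
df[4:].xs('Escalation beds avail', axis=1, level='Report').head(3)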