#!/usr/bin/env python
# coding: utf-8
# http://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/weekly-ae-sitreps-2014-15/
# http://www.england.nhs.uk/statistics/statistical-work-areas/winter-daily-sitreps/winter-daily-sitrep-2014-15-data/
# ## Harvesting
# One frequently encountered task when working with regularly produced datasets is the downloading of 'latest release' spreadsheet or CSV data files. These can typically take one of three forms:
#
# 1) a new weekly or monthly file is released containing just the figures for that week or month;
# 2) a refreshed release of the current state of an often time-sensitive dataset, for example a list of current roadworks where no historical data is accessible;
# 3) an updated longitudinal dataset that contains historical data, organised by date/time, as well as the most recent data.
#
# In each case, there is a requirement to regularly download the newly released file. In the first case, when starting a longitudinal analysis, there may be a requirement to download historical data files (the monthly releases for previous months, for example) from which a longitudinal dataset may be created.
#
# *Harvesting* data refers to the collection of data from a known location. Harvesting may occur over time, for example according to a schedule - collecting a monthly dataset when it has been published - or it may involve collecting a set of data from multiple files or locations in order to seed a larger dataset.
#
# Where regularly released datafiles are posted to a known web location with a long-lived URL, in the form of a link on that page to the datafile itself, possibly along with a set of previously posted links to historical datasets, we can often automate the collection of the data by *scraping* the links to the datafile or datafiles and then using those links as the basis of an automatic downloader.
# ### Simple Screenscraper and Link Harvester
# In[ ]:
#import sys
#reload(sys)
#sys.setdefaultencoding("utf-8")
# In[7]:
import pandas as pd
#requests is a library that simplifies grabbing web pages and downloading files from web addresses / URLs
import requests
#BeautifulSoup is a library that helps parse HTML web pages and XML files
from bs4 import BeautifulSoup
# In[5]:
url='http://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/weekly-ae-sitreps-2014-15/'
#Grab the contents of a file of an arbitrary type that is published at a particular web location using an HTTP GET
page = requests.get(url)
#The file we have grabbed in this case is a web page - that is, an HTML file
#We can get the content of the page and parse it
soup=BeautifulSoup(page.content, 'html.parser')
#BeautifulSoup has a routine - find_all() - that will find all the HTML tags of a particular sort
#Links are represented in HTML pages in the form <a href="URL">link text</a>
#Grab all the <a> (anchor) tags...
links=soup.find_all('a')
# In[11]:
#We can iterate through all the links one at a time, calling each link "link" temporarily as we do so
for link in links:
#Do a logical test to see if the linked URL ends in .csv, the conventional suffix for a CSV file
if link['href'].endswith('.csv'):
        #If the link does look like it's a link to a CSV file, print the URL out
print(link['href'],link.text)
# So we've seen how we can grab a list of links to CSV files that are published via a single web page. How does that help us harvest these files?
# In[7]:
import os
directory='AEdata'
#Create a new directory - AEdata - if it doesn't already exist
if not os.path.exists(directory):
os.makedirs(directory)
# In[14]:
##Via http://stackoverflow.com/a/16696317/454773
#TH - added path
#The following function seems to be a pretty robust, generic function for downloading an arbitrary file to our local machine
import requests
def download_file(url,path):
    ''' Download a file from a particular URL into the given directory, keeping the original filename '''
local_filename = url.split('/')[-1]
local_filename='/'.join([path,local_filename])
# NOTE the stream=True parameter
r = requests.get(url, stream=True)
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.flush()
return local_filename
# In[15]:
#In the same way that we iterated through a list of links extracted from a web page so we could print them out,
#we can iterate through the list of links so we can download each of the linked CSV files
for link in links:
if link['href'].endswith('.csv'):
fn=download_file(link['href'],directory)
print("Downloaded {}".format(fn))
# In[190]:
#The os.listdir(PATH) command returns a list of the files contained in the directory at path PATH
os.listdir('./AEdata')[:3]
# In[ ]:
#ACTIVITY - How would you identify whether or not a link to a file you haven't already downloaded has been posted?
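# In[ ]:
#One possible approach, sketched here rather than given as a definitive answer: compare the filename
#at the end of each candidate link against the files already in the download directory,
#and flag any link we don't yet have a local copy of.
already_downloaded = set(os.listdir(directory))
for link in links:
    if link['href'].endswith('.csv'):
        #The filename is the last component of the URL path
        if link['href'].split('/')[-1] not in already_downloaded:
            print('Not yet downloaded: {}'.format(link['href']))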
# In[19]:
#Automation
#Once we have worked out a recipe of one or more steps that might be usefully reusable,
#we might create a function that combines those steps so they can be called from a single command
#So for example, we could combine the command that gets the links from a page
#with a line that finds the ones corresponding to a particular filetype
#Bear in mind the function may be dependent on some imported libraries or other functions
#If we put the function into a file, we should make sure we also import or declare any other required libraries and functions
def linkscraper(url,path,filetypes):
''' url: a web page containing a list of links
path: a path to a local download directory
filetypes: a list of filetypes we want to check for'''
    if not os.path.exists(path):
        os.makedirs(path)
page = requests.get(url)
    soup=BeautifulSoup(page.content, 'html.parser')
links=soup.find_all('a')
for link in links:
filetype=link['href'].split('.')[-1]
if filetype in filetypes:
fn=download_file(link['href'],path)
print("Downloaded {}".format(fn))
# In[21]:
#We now have a single command that will download files of a particular type linked to from a given URL
url='http://www.england.nhs.uk/statistics/statistical-work-areas/winter-daily-sitreps/winter-daily-sitrep-2014-15-data/'
linkscraper(url,directory,['xlsx'])
# We might want to further refine the download for future use to only download files we haven't already downloaded. Note that this might further require checking against an exceptions list, eg for spreadsheets or CSV files that retain a filename but that get updated on a regular basis.
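# A minimal sketch of such a refinement, reusing the `download_file()` function defined above. The `always_refresh` list here is a hypothetical exceptions list naming files that keep the same filename but are updated in place, so we re-download them every time.
# In[ ]:
def linkscraper_new_only(url, path, filetypes, always_refresh=None):
    ''' Like linkscraper(), but skip files we already hold locally,
        unless they appear in the always_refresh exceptions list '''
    always_refresh = always_refresh or []
    if not os.path.exists(path):
        os.makedirs(path)
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    already = set(os.listdir(path))
    for link in soup.find_all('a'):
        href = link.get('href', '')
        filename = href.split('/')[-1]
        #Only fetch files of the requested types that are new or on the exceptions list
        if href.split('.')[-1] in filetypes and (filename not in already or filename in always_refresh):
            print("Downloaded {}".format(download_file(href, path)))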
# In[6]:
get_ipython().system('ls AEdata/')
# ## Parsing Complex Spreadsheets
#
# When working with third party spreadsheets, it is not uncommon to find layouts that include metadata elements in the first sheet or at the top of a sheet; hierarchically declared column headings, where, for example, a heading spans several columns in one row and is then subdivided into subheadings that appear in the next row; footnotes that appear at the bottom of a spreadsheet; and even sidenotes that are pasted into additional columns.
#
# Even simpler CSV files may include hierarchical indices and leading and trailing metadata or additional notes.
#
# In the following example we'll start to look at the *WSitAE* reports, followed by the *DailySR* reports.
# In[178]:
get_ipython().system('head AEdata/2015.02.08-WSitAE-Extractions-AE4df65w.csv')
# In[214]:
#The following recipe will open a file and then read the first few lines
with open('AEdata/2015.02.08-WSitAE-Extractions-AE4df65w.csv','r') as f:
for i in range(0,7):
print('>>{}<<'.format(f.readline()))
# Inspecting the first few rows of the file, we notice that we need to find some way of representing a hierarchical index. In *pandas*, this is referred to as a `MultiIndex`. Working with such structures can be confusing: it is worth keeping notes of any useful recipes you manage to work out, and maintaining an up-to-date set of worked examples that demonstrate principles and techniques you find useful.
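# In[ ]:
#A minimal, self-contained MultiIndex recipe (not based on the A&E data) of the kind worth keeping to hand:
#build a two-level column index and then flatten it back down to single column names
mi_demo = pd.DataFrame([[1,2,3],[4,5,6]],
                       columns=pd.MultiIndex.from_arrays([['A','A','B'],['x','y','x']],
                                                         names=['top','sub']))
#Join the two header levels into single '::' separated column names
mi_demo.columns = ['::'.join(col) for col in mi_demo.columns.values]
mi_demo.columns.tolist()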
# In[204]:
get_ipython().run_line_magic('pinfo', 'pd.read_csv')
# In[250]:
df=pd.read_csv("AEdata/2015.02.08-WSitAE-Extractions-AE4df65w.csv", header=[4,5])
df.head(3)
# In[251]:
#Flatten the hierarchical column names
#http://stackoverflow.com/a/14508355/454773
df.columns = ['::'.join(col).strip() for col in df.columns.values]
df.head(3)
# In[253]:
df_long=pd.melt(df,id_vars=["Unnamed: 0_level_0::Unnamed: 0_level_1","Unnamed: 1_level_0::Unnamed: 1_level_1"])
df_long.head(3)
# In[254]:
df_long.columns=['Code','Name',"Type",'Count']
df_long.head(3)
# In[255]:
df_long['Type0']=df_long['Type'].map(lambda x: x.split('::')[0])
df_long['Type1']=df_long['Type'].map(lambda x: x.split('::')[1])
# In[256]:
df_long.head()
# In[262]:
df_long['Type0'].unique()
# In[266]:
#There's more than one way to do it...
#use .replace()
#df_long['Type0']=df_long['Type0'].str.replace('Unnamed: 3_level_0','Number of A&E Attendances')
#etc
#use .map()
colmap={'Number of A&E Attendances':'Number of A&E Attendances',
'Unnamed: 3_level_0':'Number of A&E Attendances',
'Unnamed: 4_level_0':'Number of A&E Attendances',
'Number of patients spending >4 hours in A&E from arrival to discharge, transfer or admission':'Number of patients spending >4 hours in A&E from arrival to discharge, transfer or admission',
'Unnamed: 6_level_0':'Number of patients spending >4 hours in A&E from arrival to discharge, transfer or admission',
'Unnamed: 7_level_0':'Number of patients spending >4 hours in A&E from arrival to discharge, transfer or admission',
'Number of patients spending >4 hours but <12 hours from decision to admit to admission':'Number of patients spending >4 hours but <12 hours from decision to admit to admission',
'Unnamed: 9_level_0':'Number of patients spending >4 hours but <12 hours from decision to admit to admission',
'Unnamed: 10_level_0':'Number of patients spending >4 hours but <12 hours from decision to admit to admission',
'Unnamed: 11_level_0':'Number of patients spending >4 hours but <12 hours from decision to admit to admission',
'Number of patients spending >12 hours from decision to admit to admission':'Number of patients spending >12 hours from decision to admit to admission',
'Unnamed: 13_level_0':'Number of patients spending >12 hours from decision to admit to admission',
'Unnamed: 14_level_0':'Number of patients spending >12 hours from decision to admit to admission',
'Unnamed: 15_level_0':'Number of patients spending >12 hours from decision to admit to admission'}
# In[268]:
df_long['Type0']=df_long['Type0'].map(colmap)
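# In[ ]:
#An alternative sketch to the explicit colmap above: treat the placeholder 'Unnamed: ...' labels
#as missing values and forward-fill them from the genuine heading that precedes them.
#This relies on the melted rows keeping the original column order, so it's worth checking the result.
type0_alt = df_long['Type'].map(lambda x: x.split('::')[0])
type0_alt = type0_alt.where(~type0_alt.str.startswith('Unnamed')).fillna(method='ffill')
#Confirm the forward-filled labels agree with the explicit mapping applied above
(type0_alt == df_long['Type0']).all()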
# ### Adding metadata back in...
# One of the things we might want to do with this dataset is add back some metadata relating to the date the data applies to. One source of this information is the filename:
#
# Another source is the metadata rows contained within the file itself:
#
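# In[ ]:
#A sketch of pulling the reporting date out of a filename such as the one we downloaded above;
#the metadata-row approach is demonstrated inside the WSitAE_AE_read() function below.
example_fn = '2015.02.08-WSitAE-Extractions-AE4df65w.csv'
#The date is the leading, hyphen-delimited component of the filename
file_date = example_fn.split('-')[0]
file_date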
# ### WSitAE CSV API Demo
# ### WSitAE-Extractions-AE
# In[247]:
import os
def WSitAE_AE_read(date,path='AEdata'):
ls=os.listdir(path)
fn=None
for item in ls:
if item.find(date)!=-1 and item.find('WSitAE-Extractions-AE')!=-1:
fn=item
if fn is None: return pd.DataFrame()
ffn='{path}/{fn}'.format(path=path,fn=fn)
z={}
    with open(ffn,'r') as f:
        #The first two lines of the file hold "key: value" metadata pairs
        for i in range(0,2):
            s=f.readline()
            for a in s.split(','):
                z[a.strip('"').split(':')[0]]=a.strip('"').split(':')[1]
#Read the file in without setting any headers
dfw=pd.read_csv(ffn, header=None,skiprows=4)
#Get rid of any empty columns
dfw.dropna(how='all',axis=1,inplace=True)
#The contents of the first two rows will be our column headers
#Fill across so blank columns are named according to the previously named column
dfw[:2].fillna(method='ffill', axis=1,inplace=True)
#Generate a multiindex in the columns from the first two rows
dfw.columns=pd.MultiIndex.from_arrays(dfw[:2].values, names=['Category','SubCategory'])
#Flatten the columns multi-index to give a single set of flat column names
dfw.columns = ['Code','Name']+['::'.join(col).strip() for col in dfw.columns.values[2:]]
    ##Hack a date out of the period name - this should really be cast to a proper date/Period type
    dfw['Period Name']=z['Period Name'].strip()
    dfw['Date']=z['Period Name'].replace('W/E','').strip()
dfw['Year']=z['Year'].strip()
return dfw[2:]
def WSitAE_AE_melt(df):
#Declare the id_vars
idx=['Code','Name','Year','Period Name','Date']
df=pd.melt(df,id_vars=idx, var_name='Type',value_name='Value')
df['SubType']=df['Type'].map(lambda x: x.split('::')[1])
df['Type']=df['Type'].map(lambda x: x.split('::')[0])
return df[idx+['Type','SubType','Value']]
def WSitAE_AE_long(date,path='AEdata'):
df=WSitAE_AE_read(date,path)
return WSitAE_AE_melt(df)
# In[2]:
get_ipython().system('ls AEdata')
# In[249]:
dfw=WSitAE_AE_long('2015.02.08')
dfw[:3]
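# In[ ]:
#As noted in WSitAE_AE_read(), the Date column is just a string pulled out of the period name.
#A sketch of casting it to a proper datetime, assuming the period name uses a day-first
#'W/E dd/mm/yyyy' style date (worth checking against the file's metadata rows):
dfw['Date'] = pd.to_datetime(dfw['Date'], dayfirst=True)
dfw.dtypes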
# ### WSitAE-Extractions-Adm
# In[250]:
def WSitAE_Adm_read(date,path='AEdata'):
ls=os.listdir(path)
fn=None
for item in ls:
if item.find(date)!=-1 and item.find('WSitAE-Extractions-Adm')!=-1:
fn=item
if fn is None: return pd.DataFrame()
ffn='{path}/{fn}'.format(path=path,fn=fn)
dfw=pd.read_csv(ffn, header=4)
dfw.dropna(how='all',axis=1,inplace=True)
dfw['Date']=dfw['Period Name'].str.replace('W/E','').str.strip()
cols=dfw.columns.tolist()
cols=cols[:3]+cols[-1:]+cols[3:-1]
return dfw[cols]
# In[251]:
WSitAE_Adm_read('2015.02.15')[:3]
# In[257]:
#Alternatively, we can load the data contained in the two CSV files from a single sheet of the corresponding spreadsheet
dfx=pd.read_excel('AEdata/2015.03.08-AEHLnoU.xls',header=None,skiprows=14)
dfx.dropna(how='all',axis=1,inplace=True)
dfx[:1].fillna(method='ffill', axis=1,inplace=True)
dfx.columns=pd.MultiIndex.from_arrays(dfx[:2].values, names=['Category','SubCategory'])
dfx.columns = ['Area Team','Code','Name']+['::'.join(col).strip() for col in dfx.columns.values[3:]]
dfx[:3]
# In[ ]:
#Alternatively, we could have separated out the A&E data from the Emergency Admissions Data
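#A sketch of one way to do that, assuming the flattened 'Category::SubCategory' column names
#let us tell the two sets of measures apart by a distinguishing string in the category label
#(the actual labels should be checked against the spreadsheet headings).
base_cols=['Area Team','Code','Name']
adm_cols=[c for c in dfx.columns if 'Admissions' in c]
ae_cols=[c for c in dfx.columns if c not in base_cols+adm_cols]
df_ae=dfx[base_cols+ae_cols]
df_adm=dfx[base_cols+adm_cols]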
# ## Daily SitRep
# In[15]:
import pandas as pd
def DailySR_read(f):
return pd.ExcelFile(f)
#xl=DailySR_read('AEdata/DailySR-Web-file-WE-01.03.15.xlsx')
xl=DailySR_read('AEdata/DailySR-Web-file-WE-07.12.14.xlsx')
xl.sheet_names
# In[115]:
def DailySR_parse(xl,sheet,skiprows=13,header=None):
xlclass={'A&E closures':0,
'A&E diverts':0,
'Cancelled operations':1,
'Critical care transfers':1,
'Ambulances queuing':0,
'G&A beds':1,
'D&V, Norovirus':1,
'Delayed transfers of care':0,
'Adult critical care':1,
'Paediatric intensive care':1,
'Neonatal intensive care ':1,
'Trusts with Ops problems':0,
'A&E Performance':1#,
#'Macro1':-1
}
if sheet not in xlclass: return pd.DataFrame()
dfx=xl.parse(sheet,skiprows=skiprows,header=header)
dfx.dropna(how='all',inplace=True)
dfx.dropna(how='all',axis=1,inplace=True)
cols=[]
if xlclass[sheet]==0:
dfx.drop([2,3],inplace=True)
dfx.dropna(how='all',axis=1,inplace=True)
dfx.columns=dfx.ix[1].values
dfx=dfx[1:]
elif xlclass[sheet]==1:
#This is a real hack, trying to think what a sensible shape is and how to get there
#Drop an empty row and the total count for England row
dfx.drop([2,3],inplace=True)
#Get rid of empty columns
dfx.dropna(how='all',axis=1,inplace=True)
#Fill across on dates
dfx[:1].fillna(method='ffill', axis=1,inplace=True)
#Patch the leading empty columns
dfx[:1].fillna('Metadata',inplace=True)
#Generate a multi-index across the columns from the first two rows
dfx.columns=pd.MultiIndex.from_arrays(dfx[:2].values, names=['Category','SubCategory'])
#Drop the rows we used to make the multi-index
dfx=dfx[2:]
#Transpose rows and columns
dfx=dfx.T
#Set the column names to be the hospital codes
dfx.columns=dfx.ix['Metadata','Code'].values
#And then throw away the metadata rows
dfx=dfx[3:]
#Tidy up by throwing away any empty columns we created
dfx.dropna(how='all',axis=1,inplace=True)
return dfx
xx=DailySR_parse(xl,'G&A beds')
xx[:4]
# In[117]:
xx['RNZ'][:,'Escalation beds avail']
# In[119]:
DailySR_parse(xl,'Paediatric intensive care')
# So now I've worked out the steps to get the separate waiting time data out of the sheets.
# ## Hierarchical Column Indexes in Excel Spreadsheets
#
# Where spreadsheets contain multiple heterogeneous sheets, we need to load them in using the *pandas* `ExcelFile` class, as the following example using a *DailySR* report shows.
# In[208]:
dfx=pd.ExcelFile('AEdata/DailySR-Web-file-WE-01.02.15.xlsx')
dfx.sheet_names
# In[214]:
df=dfx.parse('G&A beds',header=None,skiprows=13,parse_cols="B,D:Y")
df.head()
# In[215]:
df.set_index([0,1,2],inplace=True)
df.head(2)
# In[216]:
#http://stackoverflow.com/a/24154138/454773
#fill in Null values in "Headers"
df[:1].fillna(method='ffill', axis=1,inplace=True)
#df[:1].fillna(value='Base',axis=1,inplace=True)
df.head(3)
# In[217]:
#http://stackoverflow.com/a/24154138/454773
#create multiindex column names
df.columns=pd.MultiIndex.from_arrays(df[:2].values, names=['Date','Report'])
# In[218]:
df[4:].head(3).T