#!/usr/bin/env python
# coding: utf-8

# # Quick Look At TEF Data
# *Tony Hirst, @psychemedia*
#
# Notebook to load and have a quick peek at the Teaching Excellence Framework (TEF) data: [Teaching Excellence Framework](http://www.hefce.ac.uk/lt/tef/).
#
# Example spreadsheet format used to develop the original code can be found here: [Teaching Excellence Framework: year 2 specification](https://www.gov.uk/government/publications/teaching-excellence-framework-year-2-specification).
#
# Final spreadsheets available from: [http://www.hefce.ac.uk/lt/tef/data/](http://www.hefce.ac.uk/lt/tef/data/)

# ## User Settings

# In[10]:

# Set to False for a full run over every downloaded workbook;
# True restricts processing to files prefixed "TEST".
# TEST: True, False
TEST = False

# In[6]:

# Put Excel files into specified (sub)directory and load them from there
directory = 'TEFYearTwo_AllMetrics'  # 'tef'

# ## Utils
#
# Bits and bobs that may or may not be useful...

# In[3]:

# Packages
import os
import pandas as pd

# Support inline plotting
get_ipython().run_line_magic('matplotlib', 'inline')

# In[4]:

# Bind each uppercase letter to its 0-based position in the alphabet
# (A=0, B=1, ..., Z=25) at module level - may be useful for referencing
# spreadsheet columns by letter.
# NOTE: replaces the original exec() string-building hack with an explicit,
# safer assignment into the module namespace.
import string

for k, l in enumerate(string.ascii_uppercase):
    globals()[l] = k

A, D, Z

# In[5]:

# Get some UK HE admin data - may be useful?
# Via http://learning-provider.data.ac.uk/
# lp=pd.read_csv('learning-providers-plus.csv')
lp = pd.read_csv('http://learning-provider.data.ac.uk/data/learning-providers-plus.csv')
lp.head()

# ## Get the Data
#
# Data is published as via [http://www.hefce.ac.uk/lt/tef/data/](http://www.hefce.ac.uk/lt/tef/data/).
#
# Originally it was thought that data files would be provided just as individual Excel spreadsheets ([original example](https://www.gov.uk/government/publications/teaching-excellence-framework-year-2-specification)), one per institution, hence the need for this notebook. In the end, multiple versions of the data were made available, including monolithic CSV documents (one file for all institutions), rendering this notebook superfluous.
# # The final released spreadsheets had some minor differences in layout and sheet labeling from the original example. The original notebook scripts were not intelligent enough to automatically cope with these and nor is this one. For example, cell ranges are hard coded rather than being autodetected.
#

# In[177]:

# Download the zipped Excel workbooks
get_ipython().system('wget http://www.hefce.ac.uk/media/HEFCE,2014/Content/Learning,and,teaching/TEF/TEFYearTwo/data/TEFYearTwo_AllMetrics.zip')

# Put all the spreadsheet files into a single folder; the script will then load them in and generate some monolithic CSV files of various flavours.
#
# Note the processing of the files by the rest of the notebook is not very efficient...

# In[184]:

get_ipython().system('mkdir -p {directory}')
get_ipython().system('unzip -q TEFYearTwo_AllMetrics.zip -d {directory}')

# In[35]:

# Column names are crappy with white space - clean it off
def cleanCols(df):
    """Strip stray whitespace from every column name, in place, and return df."""
    df.rename(columns=lambda x: str(x).strip(), inplace=True)
    return df


# Hack the PRN out of each sheet and use it to key data values with a new prn column in each dataset
def getPRN2(fn, sn):
    """Read (institution, UKPRN) from the header cells of sheet `sn` in workbook `fn`.

    Scans the first two columns of the sheet for rows labelled
    'Institution' and 'UKPRN' and returns both values as stripped strings.
    """
    i = pd.read_excel(fn, sheetname=sn, parse_cols=[0, 1], header=None).head()
    institution = None
    prn = None
    institution = str(i[i[0].notnull() & i[0].str.contains('Institution')][1].iloc[0]).strip()
    prn = str(i[i[0].notnull() & i[0].str.contains('UKPRN')][1].iloc[0]).strip()
    return institution, prn


def getPRN(fn, sn):
    """Derive (institution, UKPRN) from a workbook's file path.

    Filenames look like:
    TEFYearTwo_AllMetrics/10007792_University of Exeter_Metrics.xlsx

    Uses os.path.basename so nested paths are handled (the original
    fn.split('/')[1] broke on anything but a single-level directory), and
    rejoins the middle segments so institution names that themselves contain
    underscores are not truncated.
    (`sn` is unused but kept for interface compatibility with getPRN2.)
    """
    parts = os.path.basename(fn).split('_')
    prn = parts[0]
    institution = '_'.join(parts[1:-1]) if len(parts) > 2 else parts[1]
    return institution, prn

# Quick helper that would let us leave test spreadsheets in the data dir.
def fntest(f):
    """Filter for data workbook filenames.

    Always rejects Excel lock/backup files (leading '~'); when the global
    TEST flag is set, only TEST-prefixed exemplar files pass, otherwise only
    the real (non-TEST) files pass.
    """
    if f.startswith('~'):
        return False
    return f.startswith('TEST') if TEST else not f.startswith('TEST')


# In[167]:

# Peek at the first workbook in the data directory and list its sheet names
fn = os.listdir(directory)[0]
xl = pd.read_excel('{}/{}'.format(directory, fn), sheetname=None)
xl.keys()

# In[169]:

# Get the PRN from the filename
getPRN('{}/{}'.format(directory, fn), list(xl.keys())[1])

# ### Notes on Parsing the spreadsheets
#
# The spreadsheets include various compound tables which need processing to split out the separate subtables, such as data for full-time versus part-time students.
#
# The final spreadsheets also had populated, administrative(?) cells at the extremity of many, if not all, sheets. Using the `pandas.read_excel()` function's `parse_cols` parameter allows the ingest of only the desired columns. (Loading in the whole sheet width often caused the multiple header definition part of the ingest to break.)

# ## Core metrics

# ## Core metrics and splits
#
# Guess that the spreadsheets are regular and use absolute range finding... Which is really dangerous...
# In[104]:

def _tef_mode_slice(xls, rows, prn, mode):
    """Carve one full-time/part-time sub-table out of the combined sheet.

    rows: a slice object selecting the FT or PT rows of the sheet;
    mode: 'FT' or 'PT', recorded in a new 'type' column.
    Returns a frame indexed on ('Topic', 'PRN').
    """
    i = xls[xls.columns[0:27]][rows]
    i.index.names = ['Topic']
    i['PRN'] = prn
    i['type'] = mode
    i = i.reset_index().set_index(['Topic', 'PRN'])
    # The first (unnamed) spreadsheet column is an artefact of the two-row header
    i.drop('Unnamed: 0_level_0', level=0, axis=1, inplace=True)
    return i


def get_coremetricsAndSplits(fn):
    """Parse the 'Core metrics and splits' sheet of one workbook.

    Returns (full_time_frame, part_time_frame), each keyed on ('Topic', 'PRN').
    Cell ranges are hard coded to the released workbook layout.
    The FT/PT extraction was previously duplicated inline; it is now factored
    into the _tef_mode_slice helper.
    """
    sn = 'Core metrics and splits'
    print(fn)
    xls = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=7,
                                  header=[0, 1]).dropna(how='all', axis=0))
    institution, prn = getPRN(fn, sn)
    # Rows 0-5 are the full-time block, rows 7-12 the part-time block
    i_fulltime = _tef_mode_slice(xls, slice(None, 6), prn, 'FT')
    i_parttime = _tef_mode_slice(xls, slice(7, 13), prn, 'PT')
    return i_fulltime, i_parttime


cms_ft = pd.DataFrame()
cms_pt = pd.DataFrame()
cms_complete = pd.DataFrame()

# Loop through all the files in the declared directory, and parse the contents of a particular sheet out
for filename in [f for f in os.listdir(directory) if fntest(f)]:
    cms_ft_tmp, cms_pt_tmp = get_coremetricsAndSplits('{}/{}'.format(directory, filename))
    cms_ft = pd.concat([cms_ft, cms_ft_tmp])
    cms_pt = pd.concat([cms_pt, cms_pt_tmp])
    cms_complete = pd.concat([cms_complete, cms_pt_tmp, cms_ft_tmp])

cms_complete.to_csv('tef_coremetricandsplits.csv')

# In[105]:

cms_complete.head()

# In[106]:

# It may be easier to work with if we put everything into long format?
cms_complete_long = cms_complete.reset_index().melt(id_vars=['Topic', 'PRN', 'type'],
                                                    var_name=['Heading', 'Subheading'])
cms_complete_long.head()

# In[107]:

cms_complete_long['type'].unique()

# In[108]:

# In long format it's easy to write queries... For example, report on ++ rows:
cms_complete_long[cms_complete_long['value'] == '++'].head()

# In[511]:

# A query over more columns...
cms_complete_long[(cms_complete_long['value'] == '++')
                  & (cms_complete_long['Heading'] == 'Disadvantaged')
                  & (cms_complete_long['Subheading'] == 'Yes')]

# In[783]:

# BUGFIX: this previously wrote the *wide* cms_complete frame to the "_long"
# CSV; write the long-format frame instead.
cms_complete_long.to_csv('tef_coremetricandsplits_long.csv', index=False)

# In[943]:

# Would it be easier/more useful to demo SQL queries?
from pandasql import sqldf

pysqldf = lambda q: sqldf(q, globals())

q = '''
SELECT * FROM cms_complete_long
WHERE value='++' AND Heading='Disadvantaged'AND Subheading='Yes'
LIMIT 10;
'''
pysqldf(q)

# ## Contextual data

# In[49]:

def get_contextualdata(fn):
    """Parse the 'Contextual data' sheet of one workbook.

    Returns (demographics_frame, subject_frame), both tagged with the
    provider's PRN. Column positions are hard coded to the released layout.
    """
    sn = 'Contextual data'
    print(fn)
    institution, prn = getPRN(fn, sn)
    _demog = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=11, parse_cols=8))
    _demog.columns = ['Category', 'Dummy', 'Group', 'FT Headcount', 'FT %',
                      'PT Headcount', 'PT %', 'Total Headcount', 'Total %']
    _demog = _demog.drop('Dummy', axis=1)
    # Category labels only appear on the first row of each group - fill them down
    _demog.fillna(method='ffill', inplace=True)
    _demog['PRN'] = prn
    _subj = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=9,
                                    parse_cols=range(10, 17))).dropna(how='all', axis=1).dropna(how='all', axis=0)
    _subj.columns = ['Subject of study', 'FT Headcount', 'FT %',
                     'PT Headcount', 'PT %', 'Total Headcount', 'Total %']
    _subj['PRN'] = prn
    return _demog, _subj


demog = pd.DataFrame()
subj = pd.DataFrame()

for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    demog_tmp, subj_tmp = get_contextualdata('{}/{}'.format(directory, filename))
    demog = pd.concat([demog, demog_tmp])
    subj = pd.concat([subj, subj_tmp])

# In[170]:

demog.head()

# In[585]:

subj.head()

# In[594]:

subj[['Subject of study', 'PRN', 'Total Headcount']].head()

# In[825]:

subj_long = subj.melt(id_vars=['Subject of study', 'PRN'], var_name='Category')
subj_long['value'] = pd.to_numeric(subj_long['value'], errors='coerce')
subj_long.head()

# In[872]:

def longify(df, id_vars=['Topic', 'PRN'], var_name=['Heading', 'Subheading'],
            valname='value', resetIndex=True):
    """Melt a wide TEF frame into long format.

    id_vars / var_name are passed through to DataFrame.melt(); the melted
    value column is named `valname` and coerced to numeric (non-numeric
    flags such as '++' become NaN). Set resetIndex=False when the id_vars
    are already ordinary columns rather than index levels.
    NOTE: the mutable default arguments are shared across calls - safe here
    because the function never mutates them.
    """
    if resetIndex:
        df = df.reset_index()
    df = df.melt(id_vars=id_vars, var_name=var_name, value_name=valname)
    df[valname] = pd.to_numeric(df[valname], errors='coerce')
    return df


# In[873]:

subj_long = longify(subj, id_vars=['Subject of study', 'PRN'], var_name='Category',
                    resetIndex=False)
subj_long.head()

# In[785]:

subj_long.to_csv('tef_subj_long.csv', index=False)

# ## Indicators (a)

# In[143]:

def get_indicators(fn):
    """Parse the 'Indicator (a)' sheet of one workbook.

    Stacks the full-time (rows 1-6) and part-time (rows 8-13) blocks into a
    single frame indexed on ('Topic', 'PRN') with a 'Type' column of FT/PT.
    """
    sn = 'Indicator (a)'
    print(fn)
    institution, prn = getPRN(fn, sn)
    i = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=22,
                                header=[0, 1]).dropna(how='all', axis=0).dropna(how='all', axis=1))
    i['PRN'] = prn
    ift = i[1:7][:]
    ift['Type'] = 'FT'
    ipt = i[8:14][:]
    ipt['Type'] = 'PT'
    i = pd.concat([ift, ipt]).dropna(how='all', axis=1)
    i.index.names = ['Topic']
    i = i.reset_index().set_index(['Topic', 'PRN'])
    return i


# In[144]:

indicators = pd.DataFrame()
for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    indicators = pd.concat([indicators, get_indicators('{}/{}'.format(directory, filename))])

indicators.to_csv('tef_indicators.csv')

# In[171]:

indicators.head(15)

# In[877]:

get_ipython().system('head tef_indicators.csv')

# In[878]:

pd.read_csv('tef_indicators.csv', header=[0, 1], index_col=[0, 1]).head()

# In[122]:

dummyPRN = indicators.index.get_level_values('PRN').unique().tolist()[0]
dummyPRN

# In[123]:

# Example cross-section
indicators.xs(dummyPRN, level='PRN')['Age', 'Young']

# In[881]:

# Display the PRNs in the index
indicators.index.get_level_values('PRN').unique().tolist()

# In[882]:

# Example filter on column and PRN
indicators.xs('JHDH', level='PRN')['Domicile']

# In[883]:

indicators_long = longify(indicators)
indicators_long.tail(50).head(10)

# In[821]:

indicators_long.to_csv('tef_indicators_long.csv', index=False)

# ## Benchmarks (b)

# In[131]:

# This code is similar to that used for Zscores sheet
def get_benchmarks(fn):
    """Parse the 'Benchmark (b)' sheet of one workbook.

    The sheet holds two stacked tables: the benchmarks themselves (from row 8)
    and the provider contributions (from row 30). Each is split into FT/PT
    blocks, tagged via a 'Benchmark type' column, and returned as
    (benchmark_frame, providercontrib_frame).
    """
    sn = 'Benchmark (b)'
    print(fn)
    institution, prn = getPRN(fn, sn)
    z = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=21,
                                header=[0, 1]))
    z['PRN'] = prn
    zft = z[1:7][:]
    zft['Type'] = 'FT'
    zpt = z[8:14][:]
    zpt['Type'] = 'PT'
    z = pd.concat([zft, zpt]).dropna(how='all', axis=1)
    sd = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=30, parse_cols=21,
                                 header=[0, 1]))
    sd['PRN'] = prn
    sdft = sd[1:7][:]
    sdft['Type'] = 'FT'
    sdpt = sd[8:14][:]
    sdpt['Type'] = 'PT'
    sd = pd.concat([sdft, sdpt]).dropna(how='all', axis=1)
    z['Benchmark type'] = 'benchmark'
    sd['Benchmark type'] = 'providercontrib'
    z.index.names = ['Topic']
    sd.index.names = ['Topic']
    return z, sd


# In[132]:

benchmark = pd.DataFrame()
providercontrib = pd.DataFrame()
for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    benchmark_tmp, providercontrib_tmp = get_benchmarks(fn='{}/{}'.format(directory, filename))
    benchmark = pd.concat([benchmark, benchmark_tmp])
    providercontrib = pd.concat([providercontrib, providercontrib_tmp])

# In[172]:

benchmark.head()

# In[861]:

providercontrib.head()

# In[891]:

benchmark_long = longify(benchmark, id_vars=['Topic', 'PRN', 'Benchmark type'], valname='value')
benchmark_long.head()

# In[892]:

providercontrib_long = longify(providercontrib, id_vars=['Topic', 'PRN', 'Benchmark type'], valname='value')
providercontrib_long.head()

# In[937]:

benchmarkprovider = pd.concat([benchmark_long, providercontrib_long])
benchmarkprovider.head()

# In[941]:

benchmarkprovider_wide = benchmarkprovider.pivot_table(index=['Topic', 'PRN', 'Heading', 'Subheading'],
                                                       columns='Benchmark type',
                                                       values='value').reset_index()
benchmarkprovider_wide.head()

# In[ ]:

benchmarkprovider_wide.to_csv('tef_benchmarkprovider_wide.csv', index=False)

# ## Differences (a-b)

# In[149]:

# This code is actually same as for indicators I think?
def get_differences(fn):
    """Parse the 'Difference (a-b)' sheet of one workbook.

    Same layout handling as get_indicators: FT rows 1-6 and PT rows 8-13
    are stacked into one frame indexed on ('Topic', 'PRN') with a 'Type'
    column of FT/PT.
    """
    sn = 'Difference (a-b)'
    print(fn)
    institution, prn = getPRN(fn, sn)
    i = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=22,
                                header=[0, 1]).dropna(how='all', axis=0).dropna(how='all', axis=1))
    i.index.names = ['Topic']
    i['PRN'] = prn
    ift = i[1:7][:]
    ift['Type'] = 'FT'
    ipt = i[8:14][:]
    ipt['Type'] = 'PT'
    i = pd.concat([ift, ipt]).dropna(how='all', axis=1)
    i = i[:].reset_index().set_index(['Topic', 'PRN'])
    return i


# In[150]:

differences = pd.DataFrame()
for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    differences = pd.concat([differences, get_differences('{}/{}'.format(directory, filename))])

differences.head()  # .to_csv('tef_differences.csv')

# In[173]:

differences.head(20)

# In[853]:

# BUGFIX: 'value' was previously passed positionally, landing in longify's
# id_vars parameter (which raises a KeyError inside melt); it is the value
# column name, so pass it as valname=.
differences_long = longify(differences, valname='value')
differences_long.head()

# In[854]:

differences_long.to_csv('tef_differences_long.csv', index=False)

# ## Zscores
#
# These relate to the indicators.

# In[165]:

def get_zscores(fn):
    """Parse the 'Z-score' sheet of one workbook.

    The sheet holds two stacked tables: z-scores (from row 8) and standard
    deviations (from row 30). Each is split into FT/PT blocks and returned
    as (z_frame, sd_frame), both indexed on 'Topic' and tagged with an
    'Error type' column ('z' or 'sd').
    """
    sn = 'Z-score'
    print(fn)
    institution, prn = getPRN(fn, sn)
    z = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=22, header=[0, 1]))
    z['PRN'] = prn
    zft = z[1:7][:]
    zft['Type'] = 'FT'
    zpt = z[8:14][:]
    zpt['Type'] = 'PT'
    sd = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=30, parse_cols=22,
                                 header=[0, 1]))
    sd['PRN'] = prn
    sdft = sd[1:7][:]
    sdft['Type'] = 'FT'
    sdpt = sd[8:14][:]
    sdpt['Type'] = 'PT'
    z = pd.concat([zft, zpt]).dropna(how='all', axis=1)
    sd = pd.concat([sdft, sdpt]).dropna(how='all', axis=1)
    z['Error type'] = 'z'
    sd['Error type'] = 'sd'
    z.index.names = ['Topic']
    sd.index.names = ['Topic']
    return z, sd


# fn='tef/TEST_TEF_Metrics_workbook_exemplar.xlsx'
# z,sd= get_zscores(fn)

z = pd.DataFrame()
sd = pd.DataFrame()
for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    z_tmp, sd_tmp = get_zscores(fn='{}/{}'.format(directory, filename))
    z = pd.concat([z, z_tmp])
    sd = pd.concat([sd, sd_tmp])

# In[174]:

z.head(20)

# In[841]:

sd.head()

# In[855]:

# BUGFIX: as above, 'sd value' is the value column name, not id_vars; the
# merge below expects an 'sd value' column in sd_long.
sd_long = longify(sd, valname='sd value')
sd_long.head()

# In[856]:

# BUGFIX: as above, 'z value' is the value column name, not id_vars.
z_long = longify(z, valname='z value')
z_long.head()

# In[758]:
indicators_long.head()

# In[795]:

# Join the indicator values with their standard deviations and z-scores
# on the shared identifying columns.
join_keys = ['Topic', 'PRN', 'Heading', 'Subheading']
ifull_long = indicators_long.merge(sd_long, on=join_keys).merge(z_long, on=join_keys)
ifull_long.head()

# In[796]:

ifull_long.to_csv('tef_indicators_with_error_long.csv', index=False)

# ## Ethnicity

# In[82]:

def get_ethnicity(fn):
    """Parse the 'BME breakdown' sheet of one workbook into a single frame.

    Rows 0-5 hold the full-time figures and rows 7-12 the part-time figures;
    each block is tagged with a 'Type' column (FT/PT) and the two are stacked.
    """
    sheet = 'BME breakdown'
    print(fn)
    institution, prn = getPRN(fn, sheet)
    breakdown = cleanCols(pd.read_excel(fn, sheetname=sheet, skiprows=9, parse_cols=5))
    breakdown.columns = ['Category', 'NULL', 'BME', 'Black', 'Asian', 'Other']
    breakdown['PRN'] = prn
    blocks = []
    for rows, mode in ((slice(None, 6), 'FT'), (slice(7, 13), 'PT')):
        block = breakdown[rows].dropna(how='all', axis=1)
        block['Type'] = mode
        blocks.append(block)
    return pd.concat(blocks)


# In[175]:

# fn='tef/TEF_Metrics_workbook_exemplar.xlsx'
# get_ethnicity(fn)

ethnicity = pd.DataFrame()
for workbook in os.listdir(directory):
    if workbook.startswith('~'):
        continue
    ethnicity = pd.concat([ethnicity, get_ethnicity('{}/{}'.format(directory, workbook))])

ethnicity.to_csv('tef_ethnicity.csv', index=False)

# In[631]:

get_ipython().system('head tef_ethnicity.csv')

# In[176]:

ethnicity.head()

# In[ ]: