#!/usr/bin/env python
# coding: utf-8

# # Quick Look At TEF Data
# *Tony Hirst, @psychemedia*
#
# Notebook to load and have a quick peek at the Teaching Excellence Framework (TEF) data: [Teaching Excellence Framework](http://www.hefce.ac.uk/lt/tef/).
#
# Example spreadsheet format used to develop the original code can be found here: [Teaching Excellence Framework: year 2 specification](https://www.gov.uk/government/publications/teaching-excellence-framework-year-2-specification).
#
# Final spreadsheets available from: [http://www.hefce.ac.uk/lt/tef/data/](http://www.hefce.ac.uk/lt/tef/data/)

# ## User Settings

# In[10]:

# Set to False for a full run over every downloaded workbook;
# True restricts processing to files prefixed "TEST".
# TEST: True, False
TEST = False

# In[6]:

# Put Excel files into specified (sub)directory and load them from there
directory = 'TEFYearTwo_AllMetrics'  # 'tef'

# ## Utils
#
# Bits and bobs that may or may not be useful...

# In[3]:

# Packages
import os
import pandas as pd

# Support inline plotting
get_ipython().run_line_magic('matplotlib', 'inline')

# In[4]:

# Bind each uppercase letter to its 0-based position in the alphabet
# (A=0, B=1, ..., Z=25) at module level - may be useful for referencing
# spreadsheet columns by letter.
# NOTE: replaces the original exec() string-building hack with an explicit,
# safer assignment into the module namespace.
import string

for k, l in enumerate(string.ascii_uppercase):
    globals()[l] = k

A, D, Z

# In[5]:

# Get some UK HE admin data - may be useful?
# Via http://learning-provider.data.ac.uk/
# lp=pd.read_csv('learning-providers-plus.csv')
lp = pd.read_csv('http://learning-provider.data.ac.uk/data/learning-providers-plus.csv')
lp.head()

# ## Get the Data
#
# Data is published as via [http://www.hefce.ac.uk/lt/tef/data/](http://www.hefce.ac.uk/lt/tef/data/).
#
# Originally it was thought that data files would be provided just as individual Excel spreadsheets ([original example](https://www.gov.uk/government/publications/teaching-excellence-framework-year-2-specification)), one per institution, hence the need for this notebook. In the end, multiple versions of the data were made available, including monolithic CSV documents (one file for all institutions), rendering this notebook superfluous.
# # The final released spreadsheets had some minor differences in layout and sheet labeling from the original example. The original notebook scripts were not intelligent enough to automatically cope with these and nor is this one. For example, cell ranges are hard coded rather than being autodetected.
#

# In[177]:

# Download the zipped Excel workbooks
get_ipython().system('wget http://www.hefce.ac.uk/media/HEFCE,2014/Content/Learning,and,teaching/TEF/TEFYearTwo/data/TEFYearTwo_AllMetrics.zip')

# Put all the spreadsheet files into a single folder; the script will then load them in and generate some monolithic CSV files of various flavours.
#
# Note the processing of the files by the rest of the notebook is not very efficient...

# In[184]:

get_ipython().system('mkdir -p {directory}')
get_ipython().system('unzip -q TEFYearTwo_AllMetrics.zip -d {directory}')

# In[35]:

# Column names are crappy with white space - clean it off
def cleanCols(df):
    """Strip stray whitespace from every column name, in place, and return df."""
    df.rename(columns=lambda x: str(x).strip(), inplace=True)
    return df


# Hack the PRN out of each sheet and use it to key data values with a new prn column in each dataset
def getPRN2(fn, sn):
    """Read (institution, UKPRN) from the header cells of sheet `sn` in workbook `fn`.

    Scans the first two columns of the sheet for rows labelled
    'Institution' and 'UKPRN' and returns both values as stripped strings.
    """
    i = pd.read_excel(fn, sheetname=sn, parse_cols=[0, 1], header=None).head()
    institution = None
    prn = None
    institution = str(i[i[0].notnull() & i[0].str.contains('Institution')][1].iloc[0]).strip()
    prn = str(i[i[0].notnull() & i[0].str.contains('UKPRN')][1].iloc[0]).strip()
    return institution, prn


def getPRN(fn, sn):
    """Derive (institution, UKPRN) from a workbook's file path.

    Filenames look like:
    TEFYearTwo_AllMetrics/10007792_University of Exeter_Metrics.xlsx

    Uses os.path.basename so nested paths are handled (the original
    fn.split('/')[1] broke on anything but a single-level directory), and
    rejoins the middle segments so institution names that themselves contain
    underscores are not truncated.
    (`sn` is unused but kept for interface compatibility with getPRN2.)
    """
    parts = os.path.basename(fn).split('_')
    prn = parts[0]
    institution = '_'.join(parts[1:-1]) if len(parts) > 2 else parts[1]
    return institution, prn

# Quick helper that would let us leave test spreadsheets in the data dir.
def fntest(f):
    """Filter for data workbook filenames.

    Always rejects Excel lock/backup files (leading '~'); when the global
    TEST flag is set, only TEST-prefixed exemplar files pass, otherwise only
    the real (non-TEST) files pass.
    """
    if f.startswith('~'):
        return False
    return f.startswith('TEST') if TEST else not f.startswith('TEST')


# In[167]:

# Peek at the first workbook in the data directory and list its sheet names
fn = os.listdir(directory)[0]
xl = pd.read_excel('{}/{}'.format(directory, fn), sheetname=None)
xl.keys()

# In[169]:

# Get the PRN from the filename
getPRN('{}/{}'.format(directory, fn), list(xl.keys())[1])

# ### Notes on Parsing the spreadsheets
#
# The spreadsheets include various compound tables which need processing to split out the separate subtables, such as data for full-time versus part-time students.
#
# The final spreadsheets also had populated, administrative(?) cells at the extremity of many, if not all, sheets. Using the `pandas.read_excel()` function's `parse_cols` parameter allows the ingest of only the desired columns. (Loading in the whole sheet width often caused the multiple header definition part of the ingest to break.)

# ## Core metrics

# ## Core metrics and splits
#
# Guess that the spreadsheets are regular and use absolute range finding... Which is really dangerous...
# In[104]:

def _tef_mode_slice(xls, rows, prn, mode):
    """Carve one full-time/part-time sub-table out of the combined sheet.

    rows: a slice object selecting the FT or PT rows of the sheet;
    mode: 'FT' or 'PT', recorded in a new 'type' column.
    Returns a frame indexed on ('Topic', 'PRN').
    """
    i = xls[xls.columns[0:27]][rows]
    i.index.names = ['Topic']
    i['PRN'] = prn
    i['type'] = mode
    i = i.reset_index().set_index(['Topic', 'PRN'])
    # The first (unnamed) spreadsheet column is an artefact of the two-row header
    i.drop('Unnamed: 0_level_0', level=0, axis=1, inplace=True)
    return i


def get_coremetricsAndSplits(fn):
    """Parse the 'Core metrics and splits' sheet of one workbook.

    Returns (full_time_frame, part_time_frame), each keyed on ('Topic', 'PRN').
    Cell ranges are hard coded to the released workbook layout.
    The FT/PT extraction was previously duplicated inline; it is now factored
    into the _tef_mode_slice helper.
    """
    sn = 'Core metrics and splits'
    print(fn)
    xls = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=7,
                                  header=[0, 1]).dropna(how='all', axis=0))
    institution, prn = getPRN(fn, sn)
    # Rows 0-5 are the full-time block, rows 7-12 the part-time block
    i_fulltime = _tef_mode_slice(xls, slice(None, 6), prn, 'FT')
    i_parttime = _tef_mode_slice(xls, slice(7, 13), prn, 'PT')
    return i_fulltime, i_parttime


cms_ft = pd.DataFrame()
cms_pt = pd.DataFrame()
cms_complete = pd.DataFrame()

# Loop through all the files in the declared directory, and parse the contents of a particular sheet out
for filename in [f for f in os.listdir(directory) if fntest(f)]:
    cms_ft_tmp, cms_pt_tmp = get_coremetricsAndSplits('{}/{}'.format(directory, filename))
    cms_ft = pd.concat([cms_ft, cms_ft_tmp])
    cms_pt = pd.concat([cms_pt, cms_pt_tmp])
    cms_complete = pd.concat([cms_complete, cms_pt_tmp, cms_ft_tmp])

cms_complete.to_csv('tef_coremetricandsplits.csv')

# In[105]:

cms_complete.head()

# In[106]:

# It may be easier to work with if we put everything into long format?
cms_complete_long = cms_complete.reset_index().melt(id_vars=['Topic', 'PRN', 'type'],
                                                    var_name=['Heading', 'Subheading'])
cms_complete_long.head()

# In[107]:

cms_complete_long['type'].unique()

# In[108]:

# In long format it's easy to write queries... For example, report on ++ rows:
cms_complete_long[cms_complete_long['value'] == '++'].head()

# In[511]:

# A query over more columns...
cms_complete_long[(cms_complete_long['value'] == '++')
                  & (cms_complete_long['Heading'] == 'Disadvantaged')
                  & (cms_complete_long['Subheading'] == 'Yes')]

# In[783]:

# BUGFIX: this previously wrote the *wide* cms_complete frame to the "_long"
# CSV; write the long-format frame instead.
cms_complete_long.to_csv('tef_coremetricandsplits_long.csv', index=False)

# In[943]:

# Would it be easier/more useful to demo SQL queries?
from pandasql import sqldf

pysqldf = lambda q: sqldf(q, globals())

q = '''
SELECT * FROM cms_complete_long
WHERE value='++' AND Heading='Disadvantaged'AND Subheading='Yes'
LIMIT 10;
'''
pysqldf(q)

# ## Contextual data

# In[49]:

def get_contextualdata(fn):
    """Parse the 'Contextual data' sheet of one workbook.

    Returns (demographics_frame, subject_frame), both tagged with the
    provider's PRN. Column positions are hard coded to the released layout.
    """
    sn = 'Contextual data'
    print(fn)
    institution, prn = getPRN(fn, sn)
    _demog = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=11, parse_cols=8))
    _demog.columns = ['Category', 'Dummy', 'Group', 'FT Headcount', 'FT %',
                      'PT Headcount', 'PT %', 'Total Headcount', 'Total %']
    _demog = _demog.drop('Dummy', axis=1)
    # Category labels only appear on the first row of each group - fill them down
    _demog.fillna(method='ffill', inplace=True)
    _demog['PRN'] = prn
    _subj = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=9,
                                    parse_cols=range(10, 17))).dropna(how='all', axis=1).dropna(how='all', axis=0)
    _subj.columns = ['Subject of study', 'FT Headcount', 'FT %',
                     'PT Headcount', 'PT %', 'Total Headcount', 'Total %']
    _subj['PRN'] = prn
    return _demog, _subj


demog = pd.DataFrame()
subj = pd.DataFrame()

for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    demog_tmp, subj_tmp = get_contextualdata('{}/{}'.format(directory, filename))
    demog = pd.concat([demog, demog_tmp])
    subj = pd.concat([subj, subj_tmp])

# In[170]:

demog.head()

# In[585]:

subj.head()

# In[594]:

subj[['Subject of study', 'PRN', 'Total Headcount']].head()

# In[825]:

subj_long = subj.melt(id_vars=['Subject of study', 'PRN'], var_name='Category')
subj_long['value'] = pd.to_numeric(subj_long['value'], errors='coerce')
subj_long.head()

# In[872]:

def longify(df, id_vars=['Topic', 'PRN'], var_name=['Heading', 'Subheading'],
            valname='value', resetIndex=True):
    """Melt a wide TEF frame into long format.

    id_vars / var_name are passed through to DataFrame.melt(); the melted
    value column is named `valname` and coerced to numeric (non-numeric
    flags such as '++' become NaN). Set resetIndex=False when the id_vars
    are already ordinary columns rather than index levels.
    NOTE: the mutable default arguments are shared across calls - safe here
    because the function never mutates them.
    """
    if resetIndex:
        df = df.reset_index()
    df = df.melt(id_vars=id_vars, var_name=var_name, value_name=valname)
    df[valname] = pd.to_numeric(df[valname], errors='coerce')
    return df


# In[873]:

subj_long = longify(subj, id_vars=['Subject of study', 'PRN'], var_name='Category',
                    resetIndex=False)
subj_long.head()

# In[785]:

subj_long.to_csv('tef_subj_long.csv', index=False)

# ## Indicators (a)

# In[143]:

def get_indicators(fn):
    """Parse the 'Indicator (a)' sheet of one workbook.

    Stacks the full-time (rows 1-6) and part-time (rows 8-13) blocks into a
    single frame indexed on ('Topic', 'PRN') with a 'Type' column of FT/PT.
    """
    sn = 'Indicator (a)'
    print(fn)
    institution, prn = getPRN(fn, sn)
    i = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=22,
                                header=[0, 1]).dropna(how='all', axis=0).dropna(how='all', axis=1))
    i['PRN'] = prn
    ift = i[1:7][:]
    ift['Type'] = 'FT'
    ipt = i[8:14][:]
    ipt['Type'] = 'PT'
    i = pd.concat([ift, ipt]).dropna(how='all', axis=1)
    i.index.names = ['Topic']
    i = i.reset_index().set_index(['Topic', 'PRN'])
    return i


# In[144]:

indicators = pd.DataFrame()
for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    indicators = pd.concat([indicators, get_indicators('{}/{}'.format(directory, filename))])

indicators.to_csv('tef_indicators.csv')

# In[171]:

indicators.head(15)

# In[877]:

get_ipython().system('head tef_indicators.csv')

# In[878]:

pd.read_csv('tef_indicators.csv', header=[0, 1], index_col=[0, 1]).head()

# In[122]:

dummyPRN = indicators.index.get_level_values('PRN').unique().tolist()[0]
dummyPRN

# In[123]:

# Example cross-section
indicators.xs(dummyPRN, level='PRN')['Age', 'Young']

# In[881]:

# Display the PRNs in the index
indicators.index.get_level_values('PRN').unique().tolist()

# In[882]:

# Example filter on column and PRN
indicators.xs('JHDH', level='PRN')['Domicile']

# In[883]:

indicators_long = longify(indicators)
indicators_long.tail(50).head(10)

# In[821]:

indicators_long.to_csv('tef_indicators_long.csv', index=False)

# ## Benchmarks (b)

# In[131]:

# This code is similar to that used for Zscores sheet
def get_benchmarks(fn):
    """Parse the 'Benchmark (b)' sheet of one workbook.

    The sheet holds two stacked tables: the benchmarks themselves (from row 8)
    and the provider contributions (from row 30). Each is split into FT/PT
    blocks, tagged via a 'Benchmark type' column, and returned as
    (benchmark_frame, providercontrib_frame).
    """
    sn = 'Benchmark (b)'
    print(fn)
    institution, prn = getPRN(fn, sn)
    z = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=21,
                                header=[0, 1]))
    z['PRN'] = prn
    zft = z[1:7][:]
    zft['Type'] = 'FT'
    zpt = z[8:14][:]
    zpt['Type'] = 'PT'
    z = pd.concat([zft, zpt]).dropna(how='all', axis=1)
    sd = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=30, parse_cols=21,
                                 header=[0, 1]))
    sd['PRN'] = prn
    sdft = sd[1:7][:]
    sdft['Type'] = 'FT'
    sdpt = sd[8:14][:]
    sdpt['Type'] = 'PT'
    sd = pd.concat([sdft, sdpt]).dropna(how='all', axis=1)
    z['Benchmark type'] = 'benchmark'
    sd['Benchmark type'] = 'providercontrib'
    z.index.names = ['Topic']
    sd.index.names = ['Topic']
    return z, sd


# In[132]:

benchmark = pd.DataFrame()
providercontrib = pd.DataFrame()
for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    benchmark_tmp, providercontrib_tmp = get_benchmarks(fn='{}/{}'.format(directory, filename))
    benchmark = pd.concat([benchmark, benchmark_tmp])
    providercontrib = pd.concat([providercontrib, providercontrib_tmp])

# In[172]:

benchmark.head()

# In[861]:

providercontrib.head()

# In[891]:

benchmark_long = longify(benchmark, id_vars=['Topic', 'PRN', 'Benchmark type'], valname='value')
benchmark_long.head()

# In[892]:

providercontrib_long = longify(providercontrib, id_vars=['Topic', 'PRN', 'Benchmark type'], valname='value')
providercontrib_long.head()

# In[937]:

benchmarkprovider = pd.concat([benchmark_long, providercontrib_long])
benchmarkprovider.head()

# In[941]:

benchmarkprovider_wide = benchmarkprovider.pivot_table(index=['Topic', 'PRN', 'Heading', 'Subheading'],
                                                       columns='Benchmark type',
                                                       values='value').reset_index()
benchmarkprovider_wide.head()

# In[ ]:

benchmarkprovider_wide.to_csv('tef_benchmarkprovider_wide.csv', index=False)

# ## Differences (a-b)

# In[149]:

# This code is actually same as for indicators I think?
def get_differences(fn):
    """Parse the 'Difference (a-b)' sheet of one workbook.

    Same layout handling as get_indicators: FT rows 1-6 and PT rows 8-13
    are stacked into one frame indexed on ('Topic', 'PRN') with a 'Type'
    column of FT/PT.
    """
    sn = 'Difference (a-b)'
    print(fn)
    institution, prn = getPRN(fn, sn)
    i = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=22,
                                header=[0, 1]).dropna(how='all', axis=0).dropna(how='all', axis=1))
    i.index.names = ['Topic']
    i['PRN'] = prn
    ift = i[1:7][:]
    ift['Type'] = 'FT'
    ipt = i[8:14][:]
    ipt['Type'] = 'PT'
    i = pd.concat([ift, ipt]).dropna(how='all', axis=1)
    i = i[:].reset_index().set_index(['Topic', 'PRN'])
    return i


# In[150]:

differences = pd.DataFrame()
for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    differences = pd.concat([differences, get_differences('{}/{}'.format(directory, filename))])

differences.head()  # .to_csv('tef_differences.csv')

# In[173]:

differences.head(20)

# In[853]:

# BUGFIX: 'value' was previously passed positionally, landing in longify's
# id_vars parameter (which raises a KeyError inside melt); it is the value
# column name, so pass it as valname=.
differences_long = longify(differences, valname='value')
differences_long.head()

# In[854]:

differences_long.to_csv('tef_differences_long.csv', index=False)

# ## Zscores
#
# These relate to the indicators.

# In[165]:

def get_zscores(fn):
    """Parse the 'Z-score' sheet of one workbook.

    The sheet holds two stacked tables: z-scores (from row 8) and standard
    deviations (from row 30). Each is split into FT/PT blocks and returned
    as (z_frame, sd_frame), both indexed on 'Topic' and tagged with an
    'Error type' column ('z' or 'sd').
    """
    sn = 'Z-score'
    print(fn)
    institution, prn = getPRN(fn, sn)
    z = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=8, parse_cols=22, header=[0, 1]))
    z['PRN'] = prn
    zft = z[1:7][:]
    zft['Type'] = 'FT'
    zpt = z[8:14][:]
    zpt['Type'] = 'PT'
    sd = cleanCols(pd.read_excel(fn, sheetname=sn, skiprows=30, parse_cols=22,
                                 header=[0, 1]))
    sd['PRN'] = prn
    sdft = sd[1:7][:]
    sdft['Type'] = 'FT'
    sdpt = sd[8:14][:]
    sdpt['Type'] = 'PT'
    z = pd.concat([zft, zpt]).dropna(how='all', axis=1)
    sd = pd.concat([sdft, sdpt]).dropna(how='all', axis=1)
    z['Error type'] = 'z'
    sd['Error type'] = 'sd'
    z.index.names = ['Topic']
    sd.index.names = ['Topic']
    return z, sd


# fn='tef/TEST_TEF_Metrics_workbook_exemplar.xlsx'
# z,sd= get_zscores(fn)

z = pd.DataFrame()
sd = pd.DataFrame()
for filename in [f for f in os.listdir(directory) if not f.startswith('~')]:
    z_tmp, sd_tmp = get_zscores(fn='{}/{}'.format(directory, filename))
    z = pd.concat([z, z_tmp])
    sd = pd.concat([sd, sd_tmp])

# In[174]:

z.head(20)

# In[841]:

sd.head()

# In[855]:

# BUGFIX: as above, 'sd value' is the value column name, not id_vars; the
# merge below expects an 'sd value' column in sd_long.
sd_long = longify(sd, valname='sd value')
sd_long.head()

# In[856]:

# BUGFIX: as above, 'z value' is the value column name, not id_vars.
z_long = longify(z, valname='z value')
z_long.head()

# In[758]:
indicators_long.head()

# In[795]:

# Join the indicator values with their standard deviations and z-scores
# on the shared identifying columns.
join_keys = ['Topic', 'PRN', 'Heading', 'Subheading']
ifull_long = indicators_long.merge(sd_long, on=join_keys).merge(z_long, on=join_keys)
ifull_long.head()

# In[796]:

ifull_long.to_csv('tef_indicators_with_error_long.csv', index=False)

# ## Ethnicity

# In[82]:

def get_ethnicity(fn):
    """Parse the 'BME breakdown' sheet of one workbook into a single frame.

    Rows 0-5 hold the full-time figures and rows 7-12 the part-time figures;
    each block is tagged with a 'Type' column (FT/PT) and the two are stacked.
    """
    sheet = 'BME breakdown'
    print(fn)
    institution, prn = getPRN(fn, sheet)
    breakdown = cleanCols(pd.read_excel(fn, sheetname=sheet, skiprows=9, parse_cols=5))
    breakdown.columns = ['Category', 'NULL', 'BME', 'Black', 'Asian', 'Other']
    breakdown['PRN'] = prn
    blocks = []
    for rows, mode in ((slice(None, 6), 'FT'), (slice(7, 13), 'PT')):
        block = breakdown[rows].dropna(how='all', axis=1)
        block['Type'] = mode
        blocks.append(block)
    return pd.concat(blocks)


# In[175]:

# fn='tef/TEF_Metrics_workbook_exemplar.xlsx'
# get_ethnicity(fn)

ethnicity = pd.DataFrame()
for workbook in os.listdir(directory):
    if workbook.startswith('~'):
        continue
    ethnicity = pd.concat([ethnicity, get_ethnicity('{}/{}'.format(directory, workbook))])

ethnicity.to_csv('tef_ethnicity.csv', index=False)

# In[631]:

get_ipython().system('head tef_ethnicity.csv')

# In[176]:

ethnicity.head()

# In[ ]: