#!/usr/bin/env python
# coding: utf-8

# Wrapper for [Local authority housing statistics data returns for 2013 to 2014]
# (https://www.gov.uk/government/statistical-data-sets/local-authority-housing-statistics-data-returns-for-2013-to-2014).
#
# As well as for:
# https://www.gov.uk/government/statistical-data-sets/live-tables-on-dwelling-stock-including-vacants
# See folder: housingstock
#
# NOTE(review): this is a Jupyter-notebook export; the "In[N]:" markers below
# preserve the original cell boundaries.  Bare expressions (e.g. `df[:5]`) were
# notebook display cells — they are harmless no-ops when run as a script.
# The original export was mangled onto three physical lines (one string literal
# was even split across a line break); this version restores valid Python.

# In[9]:

#!~/anaconda/bin/pip install --upgrade pyexcel-ods
#!~/anaconda/bin/pip install --upgrade git+https://github.com/chfw/pyexcel-ods.git

# In[5]:

# Requires an IPython/Jupyter kernel: get_ipython() is undefined in plain Python.
get_ipython().system('~/anaconda/bin/pip install --upgrade pip')

# In[2]:

get_ipython().system('ls housingstock')

# In[1]:

from pyexcel_ods import get_data

data = get_data("housingstock/LiveTable_122.ods")

# In[2]:

data['LT122']

# In[8]:

#import json
##http://stackoverflow.com/a/25895504/454773
#def date_handler(obj):
#    return obj.isoformat() if hasattr(obj, 'isoformat') else obj
#
#print json.dumps(data, default=date_handler)
#print(json.dumps(data))

# In[ ]:

# In[7]:

#import pandas as pd
#pd.DataFrame.from_dict(data).iloc[9].tolist()

# In[3]:

get_ipython().system('wget https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/390701/Local_Authority_Housing_Statistics_dataset_2013-14.xlsx')

# In[2]:

import pandas as pd

dfx = pd.ExcelFile('Local_Authority_Housing_Statistics_dataset_2013-14.xlsx')
dfx.sheet_names

# In[3]:

# Menu sheet parse to identify sheets A-I: each relevant Menu row looks like
# "Section <letter> - <description>".
import re

sd = re.compile(r'Section (\w) - (.*)$')
sheetDetails = {}
for row in dfx.parse('Menu')[[1]].values:
    if str(row[0]).startswith('Section'):
        sheetDetails[sd.match(row[0]).group(1)] = sd.match(row[0]).group(2)
sheetDetails

# In[4]:

dfx.parse('A', header=None)[:15]

# In[5]:

# Locate the header row: the first row containing the literal "DCLG code".
df = dfx.parse('A', header=None)
df = df.dropna(how='all')
row = df[df.apply(lambda x: (x == "DCLG code").any(), axis=1)].index.tolist()[0]  # .values[0] # will be an array
row

# In[6]:

df = dfx.parse('A', header=row).dropna(how='all').dropna(how='all', axis=1)
df = df[df['DCLG code'].notnull()].reset_index(drop=True)
df[:5]

# In[7]:

def dfgrabber(dfx, sheet):
    """Parse one lettered sheet of the LAHS workbook into a clean DataFrame.

    Locates the header row (the row containing "DCLG code"), re-parses the
    sheet with that header, drops all-empty rows/columns and rows without a
    DCLG code, and truncates each column name to its first space-separated
    token (the column code).

    Returns (df, row) where row is the 0-based header-row index.
    """
    # First pass - identify row for headers
    df = dfx.parse(sheet, header=None)
    df = df.dropna(how='all')
    row = df[df.apply(lambda x: (x == "DCLG code").any(), axis=1)].index.tolist()[0]  # .values[0] # will be an array
    # Second pass - generate dataframe
    df = dfx.parse(sheet, header=row).dropna(how='all').dropna(how='all', axis=1)
    df = df[df['DCLG code'].notnull()].reset_index(drop=True)
    df.columns = [c.split(' ')[0] for c in df.columns]
    return df, row

# In[8]:

dfgrabber(dfx, 'A')[:5]

# In[9]:

pd.read_csv('messyindex.csv', header=[0, 1, 2])

# In[10]:

# Next step - decode the column codes
row = 7
df = dfx.parse('A', header=None)
title = df[0][[0]][0]
title

# In[11]:

df = df[1:row + 1].dropna(how='all').dropna(how='all', axis=1)
df

# In[12]:

xx = dfx.parse('A', header=None)[1:row].dropna(how='all')
xx

# In[13]:

# Fill down
xx.fillna(method='ffill', axis=0, inplace=True)
# Fill across
xx = xx.fillna(method='ffill', axis=1)
xx

# In[14]:

xx = xx.append(dfx.parse('A', header=None)[row:row + 1])
xx

# In[15]:

dfx.parse('A', header=None)[row:row + 1]

# In[16]:

# Add empty row trick via
# http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/tutorials/things_in_pandas.ipynb
import numpy as np

def addemptyrow(df):
    """Append one all-NaN row to df and return the extended frame."""
    df = df.append(pd.Series([np.nan] * len(df.columns), index=df.columns),
                   ignore_index=True)
    return df

xx = addemptyrow(xx)

# In[17]:

xx.to_csv('multi_index.csv', header=False, index=False)
mxx = pd.read_csv('multi_index.csv', header=[0, 1, 2])
mxx

# In[18]:

# Drop columns whose top-level label is a pandas "Unnamed: N" placeholder.
for c in mxx.columns.get_level_values(0).tolist():
    if c.startswith('Unnamed'):
        mxx = mxx.drop(c, level=0, axis=1)
mxx

# In[20]:

# NOTE(review): `dd` is undefined at this point in linear execution (it only
# exists inside coldecoder below), so this cell raises NameError if run in
# order — stale notebook state.  Commented out rather than deleted.
#keyx = {}
#for r in dd:
#    keyx[dd[r][0].split(' ')[0]] = r
#keyx

# In[161]:

import collections

def coldecoder(dfx, sheet, row):
    """Decode the multi-row column headers of one lettered sheet.

    Rebuilds the hierarchical header block above `row` (filling merged cells
    down then across), round-trips it through a CSV so pandas reads it as a
    MultiIndex, then maps each column code (first token of the bottom header
    row) to its full header-path tuple.

    Returns (sheet_title, OrderedDict(code -> header tuple), header_depth).
    """
    zz = dfx.parse(sheet, header=None)
    stitle = zz[0][[0]][0]
    xx = zz[1:row].dropna(how='all')
    # Fill down
    xx.fillna(method='ffill', axis=0, inplace=True)
    # Fill across
    xx = xx.fillna(method='ffill', axis=1)
    # How many rows in the header?
    keydepth = len(xx)
    header = [i for i in range(0, keydepth)]
    xx = xx.append(zz[row:row + 1])
    xx.to_csv('multi_index.csv', header=False, index=False, encoding='utf-8')
    mxx = pd.read_csv('multi_index.csv', header=header, encoding='utf-8')
    for c in mxx.columns.get_level_values(0).tolist():
        if c.startswith('Unnamed'):
            mxx = mxx.drop(c, level=0, axis=1)
    ##This returns an unordered dict:-(
    ##dd=mxx.to_dict(orient='list')
    ##keyx=collections.OrderedDict() #{}
    ##for r in dd:
    ##    if not pd.isnull(dd[r][0]):
    ##        print dd[r][0].split(' ')[0]
    ##        keyx[dd[r][0].split(' ')[0]]=r
    # We need to preserve the order of the header columns
    dd = mxx.to_dict(orient='split')
    ddz = zip(dd['columns'], dd['data'][0])
    keyx = collections.OrderedDict()  # {}
    for r in ddz:
        if not pd.isnull(r[1]):
            #print r[1].split(' ')[0]
            keyx[r[1].split(' ')[0]] = r[0]
    return stitle, keyx, keydepth

# In[ ]:

df, row = dfgrabber(dfx, 'B')
sname, skey, kd = coldecoder(dfx, 'B', row)

# In[164]:

sname

# In[165]:

df, row = dfgrabber(dfx, 'D')
sname, skey, kd = coldecoder(dfx, 'D', row)

# In[119]:

skey

# In[26]:

df

# In[117]:

# De-duplicate each header path while preserving order of first appearance.
kq = {}
for k in skey:
    kq[k] = []
    for j in skey[k]:
        if j not in kq[k]:
            kq[k].append(j)
kq

# In[28]:

# NOTE(review): this final cell is TRUNCATED in the source export — it ends
# mid-statement at "if depth".  Preserved as a comment rather than guessed at;
# it appears to build a nested dict tree (colmapper) from the header paths.
#colmapper = {}
#for kkq in kq:
#    curr_level = colmapper
#    depth = 0
#    for path in kq[kkq]:
#        depth = depth + 1
#        if path not in curr_level:
#            if depth  # <-- source cuts off here