#!/usr/bin/env python
# coding: utf-8

# # The plan
#
# * Live "Presentation" with Jupyter Notebooks, guaranteed not to work
# * Pandas
# * You lot trolling me

# # Top Level Funky Stuff
#
# * Data Import / Export functionality
# * *Actually Decent* Datetime Operations
# * Boiled in Statistics
# * Groupby operations
# * Integrations (`seaborn`, `cufflinks`)

# # FYI / Disclaimers
#
# * NI Postcode Data Sucks
# * NI Constituency Boundaries Suck
# * This has not been updated with the corrected DfE RHI claimant data, as that was not released in an open, readable format; the below is shown for educational purposes only, and no political or journalistic decision making should be based on its content...

# # Requirements
#
# * condable (or pip as backup)
#   * pandas matplotlib seaborn beautifulsoup4 lxml xlrd requests
# * pip
#   * cufflinks
# * RISE presentation funkiness (requires python == 3.5)
#   * https://github.com/damianavila/RISE

# # Data Sources
#
# * Election Office Constituency/Postcode data - http://www.eoni.org.uk/Elections/Electoral-areas
# * RHI Data - Friendly Slackers

# In[17]:

import pandas as pd
from matplotlib import pyplot as plt
from IPython.display import display

get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (20.0, 10.0)

# ## Ingress and Formatting

# In[18]:

rhi_csv = 'rhi-nondomestic-beneficiaries-ltd-companies-organisations.csv'
#rhi_csv = 'tabula-RHI-beneficiaries-non-domestic-individuals-companies.csv'
df = pd.read_csv(rhi_csv)
display(df.head())
display(df.dtypes)

# In[19]:

df = pd.read_csv(rhi_csv,
                 names=['Company', 'Date', 'Postcode',
                        'Type', 'Capacity (kWtH)', 'Cash'])
display(df.head())
display(df.dtypes)

# In[20]:

df = pd.read_csv(rhi_csv,
                 names=['Company', 'Date', 'Postcode',
                        'Type', 'Capacity (kWtH)', 'Cash'])
df['Date'] = pd.to_datetime(df['Date'])
display(df.head())
display(df.dtypes)

# In[21]:

df = pd.read_csv(rhi_csv,
                 names=['Company', 'Date', 'Postcode',
                        'Type', 'Capacity (kWtH)', 'Cash'])
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
display(df.head())
display(df.dtypes)

# In[22]:

df = pd.read_csv(rhi_csv,
                 names=['Company', 'Date', 'Postcode',
                        'Type', 'Capacity (kWtH)', 'Cash'])
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Cash'] = df['Cash'].replace('[£,]', '', regex=True).astype(float)
display(df.head())
display(df.dtypes)
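# As an aside (not part of the original walkthrough): the date parsing and
# currency cleaning can be folded into the `read_csv` call itself via
# `parse_dates`, `dayfirst` and `converters`. A minimal sketch, assuming the
# same CSV layout; `_clean_cash` and `_df_oneshot` are illustrative names:

# In[ ]:

def _clean_cash(s):
    # Strip the pound sign and thousands separators, e.g. '£1,234.50' -> 1234.5
    return float(str(s).replace('£', '').replace(',', ''))

_df_oneshot = pd.read_csv(rhi_csv,
                          names=['Company', 'Date', 'Postcode',
                                 'Type', 'Capacity (kWtH)', 'Cash'],
                          parse_dates=['Date'], dayfirst=True,
                          converters={'Cash': _clean_cash})
display(_df_oneshot.dtypes)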
# ## Stats and Basic Grouping

# In[23]:

df.Cash.describe()

# In[24]:

f, ax = plt.subplots()
df.Cash.hist(ax=ax)

# In[25]:

df.Postcode.unique()

# In[26]:

df.Postcode.isnull()

# In[27]:

df.dropna(how='all', inplace=True)
df.Postcode.sort_values().unique()

# In[28]:

# df['Postcode'] = df['Postcode'].apply(str.upper)
df['Postcode'] = df['Postcode'].str.upper()

# In[29]:

df[df.Postcode == 'BA14']
# df.query("Postcode == 'BA14'")

# In[79]:

df.Cash.sum()

# In[30]:

df.groupby('Postcode')['Cash'].sum().sort_values().head(20)

# In[31]:

f, ax = plt.subplots()
_ = df.groupby('Postcode')['Cash'].sum().hist(ax=ax)

# In[32]:

f, ax = plt.subplots()
_ = df.groupby('Postcode')['Cash'].sum().sort_values(ascending=False).plot.pie(ax=ax)

# In[33]:

def make_autopct(total):
    # matplotlib calls autopct with each wedge's percentage; turn that back
    # into an absolute £m figure for the label
    def my_autopct(pct):
        val = int(round(pct * total / 100.0))
        return '{p:.1f}% (£{v:,.1f}m)'.format(p=pct, v=val / 1000000)
    return my_autopct

f, ax = plt.subplots()
_ = df.groupby('Postcode')['Cash'].sum().sort_values(ascending=False)\
    .plot.pie(ax=ax, autopct=make_autopct(df.Cash.sum()))

# ## Web Assets and Joins (AKA BeautifulSoup)

# In[34]:

ea_url = "http://www.eoni.org.uk/Elections/Electoral-areas"

import urllib
_url = urllib.parse.urlparse(ea_url)
_base_url = _url.scheme + '://' + _url.netloc

import requests
from bs4 import BeautifulSoup as bs

r = bs(requests.get(_url.geturl()).text, 'lxml')
for link in r.find_all('a'):
    print(link.contents)

# In[35]:

constituency_urls = {}
for link in r.find_all('a'):
    if str(link.contents[0]).endswith('(XLS)'):
        constituency = ' '.join(link.contents[0].split()[:-4])
        constituency_urls[constituency] = _base_url + link.get('href')
        print(constituency, constituency_urls[constituency])

# In[36]:

pd.read_excel(constituency_urls['North Antrim']).head()

# In[37]:

pd.read_excel(constituency_urls['North Antrim'], skiprows=3).head()

# In[38]:

_cons_df = pd.read_excel(constituency_urls['North Antrim'], skiprows=3)
_cons_df['Base Post Code'] = _cons_df['Post Code'].apply(lambda s: s.split()[0])
_cons_df.head()

# In[40]:

constituency_frames = {}
for _constituency, _url in constituency_urls.items():
    _cons_df = pd.read_excel(_url, skiprows=3)
    _cons_df['Base Post Code'] = _cons_df['Post Code'].apply(lambda s: s.split()[0])
    constituency_frames[_constituency] = _cons_df

# In[41]:

constituency_frames['North Antrim'].head()

# But there's a better way to do this; add the constituency information to *each row* using a single assign
#
# Then simply concatenate all the frames together into one big one

# In[42]:

_cons_df = pd.read_excel(constituency_urls['North Antrim'], skiprows=3)
_cons_df['Base Post Code'] = _cons_df['Post Code'].apply(lambda s: s.split()[0])
_cons_df['Constituency'] = 'North Antrim'
_cons_df.head()

# In[43]:

constituency_frames = {}
for _constituency, _url in constituency_urls.items():
    _cons_df = pd.read_excel(_url, skiprows=3)
    _cons_df['Base Post Code'] = _cons_df['Post Code'].apply(lambda s: s.split()[0])
    _cons_df['Constituency'] = _constituency
    constituency_frames[_constituency] = _cons_df

constituency_frame = pd.concat(constituency_frames.values())
constituency_frame.head()

# We've only got time to look at the postcode-constituency angle, so we'll focus on those columns

# In[44]:

constituency_frame[['Base Post Code', 'Constituency']].head()

# However, because postcodes suck, the mapping from postcode to constituency, especially with only the first part as reported in the RHI stats, is not one-to-one

# In[45]:

constituency_frame[constituency_frame['Base Post Code'] == 'BT41']\
    .Constituency.unique()

# In[46]:

constituency_frame[['Base Post Code', 'Constituency']]\
    .groupby('Base Post Code')['Constituency'].describe()\
    .unstack().sort_values('unique').tail()

# In[47]:

# .max() is an arbitrary-but-deterministic tie-break: each base postcode is
# assigned the alphabetically-last constituency it maps to
postcode_cons_map = constituency_frame[['Base Post Code', 'Constituency']]\
    .groupby('Base Post Code')['Constituency'].max()\
    .to_frame().reset_index()
postcode_cons_map

# SQL-style joins (yes, it does inner, outer, left etc)

# In[48]:

postcode_cons_map = postcode_cons_map.rename(columns={'Base Post Code': 'Postcode'})
con_df = pd.merge(df, postcode_cons_map, on='Postcode', how='left')
display(df.shape)
display(con_df.shape)
con_df
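# To see what `how=` actually changes, here's a toy sketch (the two frames
# below are made up purely for illustration): a left join keeps every row of
# the left frame and fills unmatched keys with NaN, while an inner join drops
# them entirely.

# In[ ]:

_left = pd.DataFrame({'Postcode': ['BT1', 'BT2', 'XX9'], 'Cash': [10, 20, 30]})
_right = pd.DataFrame({'Postcode': ['BT1', 'BT2'], 'Constituency': ['A', 'B']})

display(pd.merge(_left, _right, on='Postcode', how='left'))   # XX9 kept, Constituency is NaN
display(pd.merge(_left, _right, on='Postcode', how='inner'))  # XX9 dropped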
# Pass-through plotting arguments to `matplotlib` (i.e. `autopct`)

# In[49]:

f, ax = plt.subplots()
_ = con_df.groupby('Constituency')['Cash'].sum().sort_values(ascending=False)\
    .plot.pie(ax=ax, autopct='%.2f%%')

# In[50]:

f, ax = plt.subplots()
_ = con_df.groupby('Constituency')['Cash'].sum().sort_values(ascending=False)\
    .plot.pie(ax=ax, autopct=make_autopct(df.Cash.sum()))

# ## DateTime Formatting and Grouping

# In[52]:

# pd.TimeGrouper groups a datetime column by frequency ('M' == month end);
# in later pandas versions the same thing is spelled pd.Grouper(key='Date', freq='M')
con_df.groupby(pd.TimeGrouper(key='Date', freq='M')).size()

# In[53]:

f, ax = plt.subplots()
con_df.groupby(pd.TimeGrouper(key='Date', freq='M'))['Cash'].sum().plot(ax=ax)
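# The same monthly totals can also be had via `resample`, which reads a little
# more directly when time is the only grouping key (a sketch, assuming the
# `con_df` built above):

# In[ ]:

_monthly = con_df.set_index('Date').resample('M')['Cash'].sum()
display(_monthly.head())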
# In[54]:

con_df.groupby([pd.TimeGrouper(key='Date', freq='M'), 'Constituency'])['Cash']\
    .sum().unstack().head()

# In[55]:

from matplotlib import dates
import matplotlib.ticker as tick

def _x_month_formatter(x, y):
    return '{year}-{month}'.format(year=x.year, month=x.month)

f, ax = plt.subplots()
_con_df_month_con = con_df.groupby([pd.TimeGrouper(key='Date', freq='M'), 'Constituency'])['Cash'].sum().unstack()
_con_df_month_con.plot.bar(ax=ax, stacked=True, cmap=plt.get_cmap('Vega20'))
# FuncFormatter is called as func(value, position); format the value (£) in thousands
ax.yaxis.set_major_formatter(tick.FuncFormatter(lambda x, pos: '£{:,.0f}k'.format(x / 1000)))
ax.xaxis.set_major_formatter(tick.FixedFormatter(_con_df_month_con.index.map(lambda d: '{}-{}'.format(d.year, d.month))))
_ = plt.setp(ax.xaxis.get_majorticklabels(), rotation=60)
ax.legend(loc='upper center', bbox_to_anchor=(1.35, 1.05),
          ncol=1, fancybox=True, shadow=True)

# # `seaborn`
#
# Fancy plots and quick analysis

# In[56]:

import seaborn as sns
f, ax = plt.subplots()
sns.boxplot(data=df, x=df.Date.dt.year, y='Cash', ax=ax)

# In[57]:

df.sort_values('Cash', ascending=False)

# In[58]:

df[df.Date >= pd.Timestamp('2015/10/01')].sort_values('Cash', ascending=False)

# In[59]:

con_df.groupby('Company').sum().plot.scatter(x="Capacity (kWtH)", y="Cash")

# In[60]:

f, ax = plt.subplots()
sns.regplot(x="Capacity (kWtH)", y="Cash", data=con_df.groupby('Company').sum(), ax=ax)

# In[61]:

sns.jointplot(x="Capacity (kWtH)", y="Cash", data=con_df.groupby('Company').sum(), kind='reg')

# In[62]:

company_df = con_df.groupby('Company').sum()
company_df['N'] = con_df.groupby('Company').size()
company_df['Cash Per Cap.'] = company_df['Cash'] / company_df['Capacity (kWtH)']
company_df.sort_values('Cash Per Cap.', ascending=False)
## http://www.newsletter.co.uk/news/farmer-who-got-660k-for-13-rhi-boilers-says-only-80k-was-profit-1-7870639

# # `cufflinks`
#
# Fancy interactive plotting with `plotly`

# In[63]:

import cufflinks
con_df.groupby(pd.TimeGrouper(key='Date', freq='M'))['Cash'].sum()\
    .iplot()

# In[64]:

_con_df_month_con = con_df.groupby([pd.TimeGrouper(key='Date', freq='M'), 'Constituency'])['Cash'].sum().unstack()
_con_df_month_con.iplot()

# In[65]:

# https://plot.ly/~bolster/211
_con_df_month_con.iplot(kind='bar', barmode='stack',
                        filename='pybfs-stackedbar', world_readable=True)

# In[66]:

_con_df_month_con.iplot(kind='heatmap', colorscale='spectral')

# In[67]:

_con_df_month_con.head()

# In[68]:

_con_df_month_con.sum().sort_values(ascending=False)

# In[69]:

# Reorder the columns by all-time total so the heatmap reads biggest-first
_con_df_month_con[_con_df_month_con.sum().sort_values(ascending=False).index]\
    .iplot(kind='heatmap', colorscale='spectral',
           filename='pybfs-heatmap', world_readable=True)

# # Extra Fun Bits

# ## TQDM Notebook

# In[70]:

from tqdm import tqdm, tqdm_notebook, tqdm_pandas, tnrange
from time import sleep

for j in tqdm_notebook(range(100), desc='2nd loop'):
    sleep(0.01)

# In[71]:

import numpy as np

_df = pd.DataFrame(np.random.randint(0, 100, (100000, 6)))

tn = tqdm_notebook()
tn.pandas(desc='Wrong kind of bar')

def worker(value):
    sleep(0.1)
    return value.sum()

_df.progress_apply(worker)

# In[72]:

_df = pd.DataFrame(np.random.randint(0, 100, (100000, 6)))

tn = tqdm_notebook()
tn.pandas(desc='Wrong kind of bar')

def worker(value):
    sleep(0.1)
    if not value.sum() % 10:
        tn.write('Teven')
    return value.sum()

_df.progress_apply(worker)

# # Extra Toys

# In[73]:

df[df.Company.str.contains('urch')]

# In[74]:

df.groupby('Company')['Capacity (kWtH)'].sum().sort_values(ascending=False).head()

# In[75]:

df.groupby('Company').size().sort_values(ascending=False).head()

# In[76]:

con_df[con_df.Date > pd.Timestamp('2015/09/28')].groupby('Constituency')['Cash'].sum().sort_values(ascending=False).plot.barh()

# In[77]:

# All-time ratios of how much cash each constituency got
con_ratio = con_df.groupby('Constituency')['Cash'].sum() / con_df['Cash'].sum()

# October/November/December 2015 ratios of how much cash each constituency got
nov_con_ratio = con_df[con_df.Date > pd.Timestamp('2015/09/28')].groupby('Constituency')['Cash'].sum()\
    / con_df[con_df.Date > pd.Timestamp('2015/09/28')]['Cash'].sum()

# How much each constituency increased above their normal application cost ratio in the final months of RHI
pd.concat([con_ratio, nov_con_ratio], keys=['fuggit', 'this'], axis=1).diff(axis=1)['this'].plot.barh()

# In[82]:

con_df.to_hdf('datastore.h5', 'original')
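# Round-tripping the HDF5 store: anything written with `to_hdf` can be pulled
# back by key with `pd.read_hdf` (a quick sanity check on the store we just wrote):

# In[ ]:

_restored = pd.read_hdf('datastore.h5', 'original')
display(_restored.head())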