#!/usr/bin/env python # coding: utf-8 # # CQC and NHS Code Reconciliation # # CQC and NHS both maintain administrative datasets regarding a range of health and social care providers, but the identifier schemes are independent to each of them. # # This notebook explores a rough and ready way of rying to reconcile them. # # NHS administrative data retrieved from NHS Digital and added to a local database using the recipe described here: [NHS and GP Administrative Data.ipynb](https://github.com/psychemedia/openHealthDataDoodles/blob/master/notebooks/NHS%20and%20GP%20Administrative%20Data.ipynb) # # Note: for the purposes of this notebook, we could equally just download the NHS adminstrative data file `epraccurr` from [NHS Digital: GP and GP practice related data](https://digital.nhs.uk/organisation-data-service/data-downloads/gp-data). # In[3]: import pandas as pd # In[ ]: #!mkdir -p data #!wget http://www.cqc.org.uk/sites/default/files/HSCA%20Active%20Locations.xlsx -P data # Download the *CQC Active Locations* bulk data file: # In[8]: cqc_df=pd.read_excel('data/HSCA Active Locations.xlsx', skiprows=6, parse_dates=['HSCA start date','Provider HSCA start date']) cqc_df.head(3) # In[16]: #Identify the name of the postcode column [c for c in cqc_df.columns if 'post' in c.lower()] # Access the NHS GP practice list: # In[9]: import sqlite3 con = sqlite3.connect("nhsadmin.sqlite") EPRACCUR='epraccur' nhs_df=pd.read_sql_query('SELECT * FROM {typ}'.format(typ=EPRACCUR), con) nhs_df.head() # # ## Pass 1 - Postcode + Exact Name Match # # Use the postcode as a crib and then also match on exact name, albeit case insensitive. # # *epraccurr* status code - `A` is *active*. # In[48]: cqc_df['Location Name']=cqc_df['Location Name'].str.upper() merge1=pd.merge(nhs_df[nhs_df['Status Code']=='A'][['Organisation Code','Name','Postcode']], cqc_df[['Location ID','Location Name','Postal Code']], left_on=['Postcode','Name'], right_on=['Postal Code','Location Name'],indicator=True,how='left') m1=merge1 m1.head() # The `left join` means we retain all the current practices from the NHS Digital listing. # # We can filter out the rows that didn't match by searching for records with a `left_only` desginator in the `merge` column of the merged dataframe. # In[57]: print('Number of unmatched rows: {}'.format(len(m1[m1['_merge']=='left_only']))) m1[m1['_merge']=='left_only'].head() # ## Pass 2 - Postcode + Fuzzy Name Match # # Use the postcode as a crib and then try to fuzzy match on name, again case insensitive. # Let's see if we can do some partial/fuzzy matching with a relatively high degree of confidence: # In[75]: from fuzzywuzzy import fuzz m2=pd.merge(m1[m1['_merge']=='left_only'][['Organisation Code','Name','Postcode']], cqc_df[['Location ID','Location Name','Postal Code']], left_on=['Postcode'], right_on=['Postal Code'],indicator=True,how='left') m2['fuzz']=m2.dropna().apply(lambda x: fuzz.ratio(x['Name'],x['Location Name']),axis=1) m2['fuzz2']=m2.dropna().apply(lambda x: fuzz.partial_ratio(x['Name'],x['Location Name']),axis=1) # In[89]: def confdisp(minconf=85,maxconf=100,typ='fuzz',lim=10): return m2[(m2[typ]>=minconf) & (m2[typ]<=maxconf)].head(lim) print(len(confdisp(85))) confdisp(85).head() # We can make an interactive explorer using `ipywidgets` to explore different levels of confidence: # In[90]: from ipywidgets import interact import ipywidgets interact(confdisp, minconf=ipywidgets.IntSlider(min=0,max=100,step=1,value=85), maxconf=ipywidgets.IntSlider(min=0,max=100,step=1,value=100), typ=ipywidgets.RadioButtons(options=['fuzz', 'fuzz2'])); # We can also explore other fuzzy match combinations - note how we might be able to improve matters if we remove subject specific stop words e.g. *Suregry*, *Medical Centre* etc. # In[91]: m2[(m2['fuzz']<=85) & (m2['fuzz2']>=85)].head() # In[ ]: