#!/usr/bin/env python
# coding: utf-8

# # CQC and NHS Code Reconciliation
# 
# CQC and NHS both maintain administrative datasets regarding a range of health and social care providers, but the identifier schemes are independent to each of them.
# 
# This notebook explores a rough and ready way of rying to reconcile them.
# 
# NHS administrative data retrieved from NHS Digital and added to a local database using the recipe described here: [NHS and GP Administrative Data.ipynb](https://github.com/psychemedia/openHealthDataDoodles/blob/master/notebooks/NHS%20and%20GP%20Administrative%20Data.ipynb)
# 
# Note: for the purposes of this notebook, we could equally just download the NHS adminstrative data file `epraccurr` from [NHS Digital: GP and GP practice related data](https://digital.nhs.uk/organisation-data-service/data-downloads/gp-data).

# In[3]:


import pandas as pd


# In[ ]:


#!mkdir -p data
#!wget http://www.cqc.org.uk/sites/default/files/HSCA%20Active%20Locations.xlsx -P data


# Download the *CQC Active Locations* bulk data file:

# In[8]:


cqc_df=pd.read_excel('data/HSCA Active Locations.xlsx',
                 skiprows=6,
                 parse_dates=['HSCA start date','Provider HSCA start date'])
cqc_df.head(3)


# In[16]:


#Identify the name of the postcode column
[c for c in cqc_df.columns if 'post' in c.lower()]


# Access the NHS GP practice list:

# In[9]:


import sqlite3
con = sqlite3.connect("nhsadmin.sqlite")
EPRACCUR='epraccur'
nhs_df=pd.read_sql_query('SELECT * FROM {typ}'.format(typ=EPRACCUR), con)
nhs_df.head()


# 
# ## Pass 1 - Postcode + Exact Name Match
# 
# Use the postcode as a crib and then also match on exact name, albeit case insensitive.
# 
# *epraccurr* status code - `A` is *active*.

# In[48]:


cqc_df['Location Name']=cqc_df['Location Name'].str.upper()
merge1=pd.merge(nhs_df[nhs_df['Status Code']=='A'][['Organisation Code','Name','Postcode']],
                       cqc_df[['Location ID','Location Name','Postal Code']],
                      left_on=['Postcode','Name'], right_on=['Postal Code','Location Name'],indicator=True,how='left')
m1=merge1
m1.head()


# The `left join` means we retain all the current practices from the NHS Digital listing.
# 
# We can filter out the rows that didn't match by searching for records with a `left_only` desginator in the `merge` column of the merged dataframe.

# In[57]:


print('Number of unmatched rows: {}'.format(len(m1[m1['_merge']=='left_only'])))
m1[m1['_merge']=='left_only'].head()


# ## Pass 2 - Postcode + Fuzzy Name Match
# 
# Use the postcode as a crib and then try to fuzzy match on name, again case insensitive.

# Let's see if we can do some partial/fuzzy matching with a relatively high degree of confidence:

# In[75]:


from fuzzywuzzy import fuzz

m2=pd.merge(m1[m1['_merge']=='left_only'][['Organisation Code','Name','Postcode']],
                       cqc_df[['Location ID','Location Name','Postal Code']],
                      left_on=['Postcode'], right_on=['Postal Code'],indicator=True,how='left')

m2['fuzz']=m2.dropna().apply(lambda x: fuzz.ratio(x['Name'],x['Location Name']),axis=1)
m2['fuzz2']=m2.dropna().apply(lambda x: fuzz.partial_ratio(x['Name'],x['Location Name']),axis=1)


# In[89]:


def confdisp(minconf=85,maxconf=100,typ='fuzz',lim=10):
    return m2[(m2[typ]>=minconf) & (m2[typ]<=maxconf)].head(lim)


print(len(confdisp(85)))
      
confdisp(85).head()


# We can make an interactive explorer using `ipywidgets` to explore different levels of confidence:

# In[90]:


from ipywidgets import interact
import ipywidgets


interact(confdisp,
         minconf=ipywidgets.IntSlider(min=0,max=100,step=1,value=85),
         maxconf=ipywidgets.IntSlider(min=0,max=100,step=1,value=100),
         typ=ipywidgets.RadioButtons(options=['fuzz', 'fuzz2']));


# We can also explore other fuzzy match combinations - note how we might be able to improve matters if we remove subject specific stop words e.g. *Suregry*, *Medical Centre* etc.

# In[91]:


m2[(m2['fuzz']<=85) & (m2['fuzz2']>=85)].head()


# In[ ]: