#!/usr/bin/env python
# coding: utf-8

# In[1]:


#Stuff to initalise this notebook...
get_ipython().run_line_magic('load_ext', 'sql')
#This is how we connect to a sql database
#Monolithic VM addressing style
get_ipython().run_line_magic('sql', 'postgresql://postgres:PGPass@postgres:5432/postgres')

#COnnection details for connecting via pandas
from sqlalchemy import create_engine
engine = create_engine("postgresql://postgres:PGPass@postgres:5432/postgres")

import pandas as pd

#Somwtimes it can be handly to create out own network/graph structures on the fly - networkx is good for that
import networkx as nx


# # Where Did You Say You Were From, Again?

# In[25]:


nationalities_df = get_ipython().run_line_magic('sql', 'SELECT nationality FROM sigcontrol WHERE nationality IS NOT NULL')
nationalities_df=nationalities_df.DataFrame()
nationalities_df['nationality'].value_counts()[:10]


# In[36]:


#!pip2 install git+https://github.com/amueller/word_cloud.git
from wordcloud import WordCloud

wc = WordCloud(collocations=False, background_color='white')


# This is one situation where a wordcloud may be useful...

# In[37]:


import matplotlib.pyplot as plt
nationalities_df['nationality']=nationalities_df['nationality'].str.strip().str.replace(' ','_')
wordcloud = wc.generate('\n '.join(nationalities_df['nationality']))
plt.imshow(wordcloud)


# Let's also look at some of the rarer submissions...

# In[40]:


#More rarely...
rarenationalities = get_ipython().run_line_magic('sql', 'SELECT nationality, COUNT(*) AS cnt FROM sigcontrol WHERE nationality IS NOT NULL                                  GROUP BY nationality HAVING COUNT(*) < 3')
rarenationalities= rarenationalities.DataFrame()

rarenationalities['nationality']=rarenationalities['nationality'].str.strip().str.replace(' ','_')
rarenationalities['freq']=rarenationalities['cnt'] / rarenationalities['cnt'].sum()
rarenationalities_dict = {x['nationality']: x['freq'] for x in rarenationalities.to_dict(orient='records')}

wordcloud = wc.generate_from_frequencies(rarenationalities_dict)
plt.imshow(wordcloud)


# # Country matching using Fuzzyset
# 
# [`fuzzyset`](https://github.com/axiak/fuzzyset) is a new-to-me library that looks like it could be useful: given a set of "true" items, give it a scruffy item and it will try to find a match in the known item list.
# 
# It runs in Python2 - we can launch a python2 notebook or change the notebook kernel to the python2 one via the notebook *Kernel -> Change Kernel* menu item.

# In[6]:


get_ipython().system('pip2 install --quiet git+https://github.com/axiak/fuzzyset.git')
from fuzzyset import FuzzySet


# As far as nationalities go, th UK Foreign Office register of country names also includes nationalities:

# In[ ]:


#UK Foreign Office Country Register
get_ipython().system('wget https://country.register.gov.uk/records.csv?page-size=5000')


# In[8]:


nationalities_fco=pd.read_csv('records.csv?page-size=5000')[['country','name','official-name','citizen-names']]
nationalities_fco.head()


# There's a bit of mess in the data...

# In[9]:


nationalities_fco[nationalities_fco['country']=='NL']


# So we need to figure out how to handle those...

# In[10]:


def countrysplit(x):
    if ';' in x:
        for y in x.split(';'):
            y=y.replace('citizen','').strip()
            fuzzycountry.add(y)
    else: fuzzycountry.add(x)


# For the `fuzzyset` masterlist, use all the nationalities that the UK FCO recognise in their register.

# In[11]:


fuzzycountry = FuzzySet()
nationalities_fco.apply(lambda x: countrysplit(x['citizen-names']),axis=1)
fuzzycountry.get("Dutch")


# However...

# In[13]:


unique_nationalities = nationalities_df['nationality'].unique().tolist()
unique_nationalities[:3]


# In[14]:


fuzzycountry.get('English')


# So we need to think about how to cope with things like that... search and replace? Or add some known 'not-nationalities' to the *fuzzycountry* list and then replace those later?
# 
# For now, let's just see if we can try to generate matches into FCO nationaliaties from the nationaliaties identified in the register of interests orientation dataset.

# In[15]:


ff={}
for u in unique_nationalities:
    matches=fuzzycountry.get(str(u))
    if matches:
        if matches[0][1] not in ff:
            ff[matches[0][1]]=[u]
        else:
            ff[matches[0][1]].append(u)


# In[16]:


ff


# In[35]:


#Look up alternatives for a given country
' | '.join(ff['British'])


# In[34]:


#Demo of using a text widget - enter an nationality (case sesnitve - needs to be a dict key) and see near hits
from ipywidgets import interact
import ipywidgets as widgets

inputText=widgets.Text()

def crappyNationality(n):
    if n.value in ff:
        for r in ff[n.value]:
            print(r)

inputText.on_submit(crappyNationality)
inputText


# In[ ]:


#Should probably also try the original recipe with range of alternative similiarlity measures...
#!pip install jellyfish
#import jellyfish

#See also:
#pudo: https://pypi.python.org/pypi/countrynames