#!/usr/bin/env python # coding: utf-8 # In[1]: #Stuff to initalise this notebook... get_ipython().run_line_magic('load_ext', 'sql') #This is how we connect to a sql database #Monolithic VM addressing style get_ipython().run_line_magic('sql', 'postgresql://postgres:PGPass@postgres:5432/postgres') #COnnection details for connecting via pandas from sqlalchemy import create_engine engine = create_engine("postgresql://postgres:PGPass@postgres:5432/postgres") import pandas as pd #Somwtimes it can be handly to create out own network/graph structures on the fly - networkx is good for that import networkx as nx # # Where Did You Say You Were From, Again? # In[25]: nationalities_df = get_ipython().run_line_magic('sql', 'SELECT nationality FROM sigcontrol WHERE nationality IS NOT NULL') nationalities_df=nationalities_df.DataFrame() nationalities_df['nationality'].value_counts()[:10] # In[36]: #!pip2 install git+https://github.com/amueller/word_cloud.git from wordcloud import WordCloud wc = WordCloud(collocations=False, background_color='white') # This is one situation where a wordcloud may be useful... # In[37]: import matplotlib.pyplot as plt nationalities_df['nationality']=nationalities_df['nationality'].str.strip().str.replace(' ','_') wordcloud = wc.generate('\n '.join(nationalities_df['nationality'])) plt.imshow(wordcloud) # Let's also look at some of the rarer submissions... # In[40]: #More rarely... rarenationalities = get_ipython().run_line_magic('sql', 'SELECT nationality, COUNT(*) AS cnt FROM sigcontrol WHERE nationality IS NOT NULL GROUP BY nationality HAVING COUNT(*) < 3') rarenationalities= rarenationalities.DataFrame() rarenationalities['nationality']=rarenationalities['nationality'].str.strip().str.replace(' ','_') rarenationalities['freq']=rarenationalities['cnt'] / rarenationalities['cnt'].sum() rarenationalities_dict = {x['nationality']: x['freq'] for x in rarenationalities.to_dict(orient='records')} wordcloud = wc.generate_from_frequencies(rarenationalities_dict) plt.imshow(wordcloud) # # Country matching using Fuzzyset # # [`fuzzyset`](https://github.com/axiak/fuzzyset) is a new-to-me library that looks like it could be useful: given a set of "true" items, give it a scruffy item and it will try to find a match in the known item list. # # It runs in Python2 - we can launch a python2 notebook or change the notebook kernel to the python2 one via the notebook *Kernel -> Change Kernel* menu item. # In[6]: get_ipython().system('pip2 install --quiet git+https://github.com/axiak/fuzzyset.git') from fuzzyset import FuzzySet # As far as nationalities go, th UK Foreign Office register of country names also includes nationalities: # In[ ]: #UK Foreign Office Country Register get_ipython().system('wget https://country.register.gov.uk/records.csv?page-size=5000') # In[8]: nationalities_fco=pd.read_csv('records.csv?page-size=5000')[['country','name','official-name','citizen-names']] nationalities_fco.head() # There's a bit of mess in the data... # In[9]: nationalities_fco[nationalities_fco['country']=='NL'] # So we need to figure out how to handle those... # In[10]: def countrysplit(x): if ';' in x: for y in x.split(';'): y=y.replace('citizen','').strip() fuzzycountry.add(y) else: fuzzycountry.add(x) # For the `fuzzyset` masterlist, use all the nationalities that the UK FCO recognise in their register. # In[11]: fuzzycountry = FuzzySet() nationalities_fco.apply(lambda x: countrysplit(x['citizen-names']),axis=1) fuzzycountry.get("Dutch") # However... # In[13]: unique_nationalities = nationalities_df['nationality'].unique().tolist() unique_nationalities[:3] # In[14]: fuzzycountry.get('English') # So we need to think about how to cope with things like that... search and replace? Or add some known 'not-nationalities' to the *fuzzycountry* list and then replace those later? # # For now, let's just see if we can try to generate matches into FCO nationaliaties from the nationaliaties identified in the register of interests orientation dataset. # In[15]: ff={} for u in unique_nationalities: matches=fuzzycountry.get(str(u)) if matches: if matches[0][1] not in ff: ff[matches[0][1]]=[u] else: ff[matches[0][1]].append(u) # In[16]: ff # In[35]: #Look up alternatives for a given country ' | '.join(ff['British']) # In[34]: #Demo of using a text widget - enter an nationality (case sesnitve - needs to be a dict key) and see near hits from ipywidgets import interact import ipywidgets as widgets inputText=widgets.Text() def crappyNationality(n): if n.value in ff: for r in ff[n.value]: print(r) inputText.on_submit(crappyNationality) inputText # In[ ]: #Should probably also try the original recipe with range of alternative similiarlity measures... #!pip install jellyfish #import jellyfish #See also: #pudo: https://pypi.python.org/pypi/countrynames