#!/usr/bin/env python
# coding: utf-8

# # Getting Started
# 
# The base containers include:
# 
# - a PostgreSQL database
# - a Jupyter notebook server containing a range of scientific Python packages
# 
# The notebook container lacks some libraries and packages we need to make life easier. The `-qq` and `--quiet` flags keep the installs quiet.

# In[ ]:

get_ipython().system('sudo apt-get update -qq && sudo apt-get install -qq -y libpq-dev python-dev')
get_ipython().system('pip install --quiet ipython-sql psycopg2')

# Jupyter notebooks support SQL magic via the `ipython-sql` extension. This lets us run SQL commands in a code cell.

# In[1]:

get_ipython().run_line_magic('load_ext', 'sql')

#This is how we connect to a SQL database
#Monolithic VM addressing style
get_ipython().run_line_magic('sql', 'postgresql://postgres:PGPass@postgres:5432/postgres')

# We can also run SQL commands on the database using the `pandas` Python package.

# In[2]:

from sqlalchemy import create_engine

engine = create_engine("postgresql://postgres:PGPass@postgres:5432/postgres")

# ## Importing the base orientation data
# 
# Loading CSV data directly into a PostgreSQL database often requires the CSV to be well behaved; *pandas* is a bit more forgiving on the import, so we can use it to load the data from the original CSV file and then insert it into the PostgreSQL database. As *pandas* dataframes are held in memory, we need to load the data in from the source file in chunks of several thousand lines at a time.

# In[ ]:

fn = "companies-with-controlling-entities v6.csv"

# In[ ]:

get_ipython().system('head "$fn"')

# The PostgreSQL database table will be defined based on the dataframe's column names and properties - we may need to force column datatypes when we read in the datafile.

# In[3]:

#Import the data from the CSV file
import pandas as pd

# In[ ]:

#We'll import the base data into a table called: sigcontrol
#Drop the table if it already exists so we start from a blank slate
get_ipython().run_line_magic('sql', 'DROP TABLE IF EXISTS sigcontrol')

chunks = pd.read_csv(fn, chunksize=10000, dtype=str)
#Note that we can also read in from a compressed file - as long as it isn't password protected:
#pd.read_csv(fn, chunksize=10000, dtype=str, compression='gzip')
for chunk in chunks:
    #Pop the data into the database table
    chunk.to_sql('sigcontrol', engine, index=False, if_exists='append')

# In[ ]:

get_ipython().run_line_magic('sql', 'SELECT * FROM sigcontrol LIMIT 3')

# ## Orientation Data Exercises
# 
# To get you started with the data, here are some opening questions, with queries to solve them:

# In[29]:

#Who controls G4S?
get_ipython().run_line_magic('sql', "SELECT * FROM sigcontrol WHERE company_name LIKE '%G4S%' LIMIT 3")

# In[34]:

#Who controls Whitbread?
df = get_ipython().run_line_magic('sql', "SELECT * FROM sigcontrol WHERE company_name LIKE '%WHITBREAD%'")

# We can create a graph that allows us to see how these companies are related to each other. The `networkx` library is a handy tool for this.
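# Before building the graph, note that the object returned by the `%sql` magic is an
# `ipython-sql` result set rather than a *pandas* dataframe; it can be converted with
# the result set's `DataFrame()` method, which we rely on below. A minimal sketch,
# assuming the `sigcontrol` table loaded above:

# In[ ]:

#Convert a %sql result set to a pandas dataframe via its DataFrame() method
result = get_ipython().run_line_magic('sql', 'SELECT * FROM sigcontrol LIMIT 5')
sample_df = result.DataFrame()
sample_df.head()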
# In[41]:

import networkx as nx

def companyInterestsGrapher(row, DG):
    #Add a node for the company, a node for its controlling entity, and an edge between them
    DG.add_node(row['company_number'], Label=row['company_name'])
    DG.add_node(row['controlling_entity_company_number'], Label=row['Controlling Entity Name'])
    DG.add_edge(row['company_number'], row['controlling_entity_company_number'])
    return

# In[42]:

DG = nx.DiGraph()

# In[44]:

#Apply the grapher function to each row of the query result to populate the graph
tmp = df.DataFrame().apply(lambda x: companyInterestsGrapher(x, DG), axis=1)

# In[46]:

DG.number_of_nodes()

# In[60]:

#Grab the node labels from the 'Label' node attribute
node_labels = nx.get_node_attributes(DG, 'Label')
node_labels

# In[65]:

nx.draw(DG, with_labels=True, labels=node_labels)

# In[ ]:

nx.write_gexf(DG, "companytree.gexf")

# In[ ]:

from operator import itemgetter

#Show the most popular named entity nodes
#(in networkx 2.x, use DG.in_degree() in place of DG.in_degree_iter())
for node in sorted(DG.in_degree_iter(), key=itemgetter(1), reverse=True)[:20]:
    print(node)

# A simple count of nationalities reveals 1,725 distinct values, but with a lot of duplicates (for example, there are 23 different ways of spelling/representing the nationality "Zimbabwean"). We might need to remove text after a "/" to strip out dual nationalities in some cleaning. We might then need to match against a complete list of nationalities to make sense of the values.
# 
# How many members of royalty control companies?
# 
# What is the age of the youngest person who controls a company?

# In[ ]:

df = pd.read_csv("companies-with-controlling-entities-v5.csv")
df.dob_year.max()
df[df.dob_year == 2016]

# There are 1097 registered in this year.
# 
# What is the age of the oldest person who controls a company?

# In[ ]:

df = pd.read_csv("companies-with-controlling-entities-v5.csv")
df.dob_year.min()
df[df.dob_year == 1916]

# There are three persons born in 1916. There is also someone born in the future, in the year 9998!
# 
# Which individual controls the most companies?

# In[ ]:

df.apply(lambda x: ' '.join([str(x.given_name), str(x.family_name)]), axis=1).value_counts()

# The individual Gabrielle Southern owns 761 companies (!)

# ## Spend Network Data
# 
# If you try to unzip the data, you may get an error depending on the format of the zip file used:
# 
# - *unzip message need PK compat. v5.1 (can do v4.6)*: install the `p7zip-full` package and use `7z` to unzip.
# 
# Install `p7zip`:

# In[ ]:

get_ipython().system('sudo apt-get install p7zip-full')

# Launch a terminal from the notebooks homepage (*New -> Terminal*) and run the following command (when prompted, the password is *****):
# 
# `7z e PATH_TO_FILE/FILENAME.csv.zip`
# 
# For example:
# 
# `7z e Spend\ Network/central_gov_slim.csv.zip`
# 
# Load the data in (this may take some time).

# In[22]:

fn = 'central_gov_slim.csv'
get_ipython().run_line_magic('sql', 'DROP TABLE IF EXISTS central_gov_slim')

chunks = pd.read_csv(fn, chunksize=1000, dtype=str, encoding='utf-8-sig')
err = 0
#Note that we can also read in from a compressed file - as long as it isn't password protected:
#pd.read_csv(fn, chunksize=10000, dtype=str, compression='gzip')
for chunk in chunks:
    #Pop the data into the database table, counting any chunks that fail to load
    try:
        chunk.to_sql('central_gov_slim', engine, index=False, if_exists='append')
    except:
        err = err + 1
print(err)

# In[24]:

get_ipython().run_line_magic('sql', 'SELECT COUNT(*) FROM central_gov_slim')
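# With the table loaded, we can start running aggregate queries over it. A hedged
# sketch follows: the column names `supplier_name` and `amount` are assumptions
# for illustration, not taken from the source data - check the CSV header (for
# example with `!head "$fn"`) and substitute the real column names before running it.

# In[ ]:

#Total spend per supplier - hypothetical column names, see the note above
#All columns were loaded as text (dtype=str), so cast the amount before summing
spend_df = pd.read_sql_query(
    "SELECT supplier_name, SUM(CAST(amount AS FLOAT)) AS total_spend "
    "FROM central_gov_slim GROUP BY supplier_name "
    "ORDER BY total_spend DESC LIMIT 10",
    engine)
spend_df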