#!/usr/bin/env python
# coding: utf-8

# # Getting Started
# 
# The base containers include:
# 
# - a PostgreSQL database
# - a Jupyter notebook server containing a range of scientific Python packages
# 
# The notebook container lacks some libraries and packages we need to make life easier. The `-qq` and `--quiet` flags keep the installs quiet.

# In[ ]:

get_ipython().system('sudo apt-get update -qq && sudo apt-get install -qq -y libpq-dev python-dev')
get_ipython().system('pip install --quiet ipython-sql psycopg2')

# Jupyter notebooks support SQL magic via the `ipython-sql` extension. This lets us run SQL commands in a code cell.

# In[1]:

get_ipython().run_line_magic('load_ext', 'sql')

#This is how we connect to a SQL database
#Monolithic VM addressing style
get_ipython().run_line_magic('sql', 'postgresql://postgres:PGPass@postgres:5432/postgres')

# We can also run SQL commands on the database using the `pandas` Python package.

# In[2]:

from sqlalchemy import create_engine

engine = create_engine("postgresql://postgres:PGPass@postgres:5432/postgres")

# ## Importing the base orientation data
# 
# Loading CSV data directly into a PostgreSQL database often requires the CSV to be well behaved; *pandas* is a bit more forgiving on the import, so we can use it to load the data from the original CSV file and then insert it into the PostgreSQL database. As *pandas* dataframes are held in memory, we need to load the data in from the source file in chunks of several thousand lines at a time.

# In[ ]:

fn = "companies-with-controlling-entities v6.csv"

# In[ ]:

get_ipython().system('head "$fn"')

# The PostgreSQL database table will be defined based on the dataframe's column names and properties - we may need to force column datatypes when we read in the datafile.

# In[3]:

#Import the data from the CSV file
import pandas as pd

# In[ ]:

#We'll import the base data into a table called: sigcontrol
#Drop the table if it already exists so we start from a blank slate
get_ipython().run_line_magic('sql', 'DROP TABLE IF EXISTS sigcontrol')

chunks = pd.read_csv(fn, chunksize=10000, dtype=str)
#Note that we can also read in from a compressed file - as long as it isn't password protected:
#pd.read_csv(fn, chunksize=10000, dtype=str, compression='gzip')
for chunk in chunks:
    #Pop the data into the database table
    chunk.to_sql('sigcontrol', engine, index=False, if_exists='append')

# In[ ]:

get_ipython().run_line_magic('sql', 'SELECT * FROM sigcontrol LIMIT 3')

# ## Orientation Data Exercises
# 
# To get you started with the data, here are some opening questions, with queries to solve them:

# In[29]:

#Who controls G4S?
get_ipython().run_line_magic('sql', "SELECT * FROM sigcontrol WHERE company_name LIKE '%G4S%' LIMIT 3")

# In[34]:

#Who controls Whitbread?
df = get_ipython().run_line_magic('sql', "SELECT * FROM sigcontrol WHERE company_name LIKE '%WHITBREAD%'")

# We can create a graph that allows us to see how these companies are related to each other. The `networkx` library is a handy tool for this.
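# Before building the graph, note that the object returned by the `%sql` magic is an
# `ipython-sql` result set rather than a *pandas* dataframe; it can be converted with
# the result set's `DataFrame()` method, which we rely on below. A minimal sketch,
# assuming the `sigcontrol` table loaded above:

# In[ ]:

#Convert a %sql result set to a pandas dataframe via its DataFrame() method
result = get_ipython().run_line_magic('sql', 'SELECT * FROM sigcontrol LIMIT 5')
sample_df = result.DataFrame()
sample_df.head()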
# In[41]:

import networkx as nx

def companyInterestsGrapher(row, DG):
    #Add a node for the company, a node for its controlling entity, and an edge between them
    DG.add_node(row['company_number'], Label=row['company_name'])
    DG.add_node(row['controlling_entity_company_number'], Label=row['Controlling Entity Name'])
    DG.add_edge(row['company_number'], row['controlling_entity_company_number'])
    return

# In[42]:

DG = nx.DiGraph()

# In[44]:

#Apply the grapher function to each row of the query result to populate the graph
tmp = df.DataFrame().apply(lambda x: companyInterestsGrapher(x, DG), axis=1)

# In[46]:

DG.number_of_nodes()

# In[60]:

#Grab the node labels from the 'Label' node attribute
node_labels = nx.get_node_attributes(DG, 'Label')
node_labels

# In[65]:

nx.draw(DG, with_labels=True, labels=node_labels)

# In[ ]:

nx.write_gexf(DG, "companytree.gexf")

# In[ ]:

from operator import itemgetter

#Show the most popular named entity nodes
#(in networkx 2.x, use DG.in_degree() in place of DG.in_degree_iter())
for node in sorted(DG.in_degree_iter(), key=itemgetter(1), reverse=True)[:20]:
    print(node)

# A simple count of nationalities reveals 1,725 distinct values, but with a lot of duplicates (for example, there are 23 different ways of spelling/representing the nationality "Zimbabwean"). We might need to remove text after a "/" to strip out dual nationalities in some cleaning. We might then need to match against a complete list of nationalities to make sense of the values.
# 
# How many members of royalty control companies?
# 
# What is the age of the youngest person who controls a company?

# In[ ]:

df = pd.read_csv("companies-with-controlling-entities-v5.csv")
df.dob_year.max()
df[df.dob_year == 2016]

# There are 1097 registered in this year.
# 
# What is the age of the oldest person who controls a company?

# In[ ]:

df = pd.read_csv("companies-with-controlling-entities-v5.csv")
df.dob_year.min()
df[df.dob_year == 1916]

# There are three persons born in 1916. There is also someone born in the future, in the year 9998!
# 
# Which individual controls the most companies?

# In[ ]:

df.apply(lambda x: ' '.join([str(x.given_name), str(x.family_name)]), axis=1).value_counts()

# The individual Gabrielle Southern owns 761 companies (!)

# ## Spend Network Data
# 
# If you try to unzip the data, you may get an error depending on the format of the zip file used:
# 
# - *unzip message need PK compat. v5.1 (can do v4.6)*: install the `p7zip-full` package and use `7z` to unzip.
# 
# Install `p7zip`:

# In[ ]:

get_ipython().system('sudo apt-get install p7zip-full')

# Launch a terminal from the notebooks homepage (*New -> Terminal*) and run the following command (when prompted, the password is *****):
# 
# `7z e PATH_TO_FILE/FILENAME.csv.zip`
# 
# For example:
# 
# `7z e Spend\ Network/central_gov_slim.csv.zip`
# 
# Load the data in (this may take some time).

# In[22]:

fn = 'central_gov_slim.csv'
get_ipython().run_line_magic('sql', 'DROP TABLE IF EXISTS central_gov_slim')

chunks = pd.read_csv(fn, chunksize=1000, dtype=str, encoding='utf-8-sig')
err = 0
#Note that we can also read in from a compressed file - as long as it isn't password protected:
#pd.read_csv(fn, chunksize=10000, dtype=str, compression='gzip')
for chunk in chunks:
    #Pop the data into the database table, counting any chunks that fail to load
    try:
        chunk.to_sql('central_gov_slim', engine, index=False, if_exists='append')
    except:
        err = err + 1
print(err)

# In[24]:

get_ipython().run_line_magic('sql', 'SELECT COUNT(*) FROM central_gov_slim')
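# With the table loaded, we can start running aggregate queries over it. A hedged
# sketch follows: the column names `supplier_name` and `amount` are assumptions
# for illustration, not taken from the source data - check the CSV header (for
# example with `!head "$fn"`) and substitute the real column names before running it.

# In[ ]:

#Total spend per supplier - hypothetical column names, see the note above
#All columns were loaded as text (dtype=str), so cast the amount before summing
spend_df = pd.read_sql_query(
    "SELECT supplier_name, SUM(CAST(amount AS FLOAT)) AS total_spend "
    "FROM central_gov_slim GROUP BY supplier_name "
    "ORDER BY total_spend DESC LIMIT 10",
    engine)
spend_df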