We start by importing all necessary libraries
from py2neo import Graph # to connect to our neo4j Graph database
from pandas import DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
import itertools
#https://gist.github.com/DaniSancas/1d5265fc159a95ff457b940fc5046887 Cypher cheatsheet
Connect to the running instance of Neo4j.
# URL of the local Neo4j REST endpoint (default HTTP port 7474).
graph_url = "http://localhost:7474/db/data"
# py2neo Graph handle; every Cypher query below goes through this object.
graph = Graph(graph_url)
Test the connection by returning, for example, some of the countries in the database.
# Smoke-test query: fetch ten country names to confirm the connection works.
test_query = """ MATCH (n:Country)
WITH n.name AS Country
RETURN Country
LIMIT 10; """
# graph.data() returns a list of dicts; wrap in a DataFrame for display.
DataFrame(graph.data(test_query))
Country | |
---|---|
0 | Australia |
1 | Belgium |
2 | United States of America |
3 | Brazil |
4 | Canada |
5 | Poland |
6 | Germany |
7 | Finland |
8 | People's Republic of China |
9 | Czech Republic |
For another test, let us see the number of publications over the years in the database.
# query: publication label, year, and count per year
date_query = """ MATCH (n:PUBLICATION)
RETURN labels(n), n.year, count(*)
ORDER BY n.year
"""
# Extract publication counts and years; drop the last entry because it's "Null".
# NOTE(review): pandas sorts the dict keys alphabetically, so here
# column 0 = count(*), column 1 = labels(n), column 2 = n.year —
# confirm against the pandas version in use.
matrix = DataFrame(graph.data(date_query)).as_matrix()
dates = matrix[:, 2][0:-1]
numbers = matrix[:, 0][0:-1]
# plot counts over time
plt.title("Number of publications over time")
plt.xlabel("Year")
plt.ylabel("Number of Patents")
plt.plot(dates, numbers)
plt.grid()
plt.show()
The goal of this part is to create a frequency matrix, as mentioned in the first meeting.
This frequency matrix relates the feedstocks, processing technologies, and outputs that co-occur in the same asset.
Furthermore, the matrix should be standardized.
Make query
# Co-occurrence query: for every Asset, pair its Feedstock with its
# ProcessingTech and its Output, counting how often each triple occurs.
# The LIMIT keeps the axis of the matrix to the 50 most frequent triples.
query = """ MATCH p1=(feed:Feedstock)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(proc:ProcessingTech)
MATCH p2=(proc:ProcessingTech)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(out:Output)
WITH feed.term AS Feedstock, proc.term AS Processing_Technology, out.term AS Output, count(p1) AS count
RETURN Feedstock, Processing_Technology, Output, count
ORDER BY count
DESC LIMIT 50""" # alternative: DESC LIMIT 300 for a larger axis
data = DataFrame(graph.data(query)).as_matrix()
Create the axis of the matrix
# Unique terms per column; these become the matrix row/column labels.
feedstock = set(data[:, 0])
processing_technology = set(data[:, 1])
output = set(data[:, 2])
# Concatenate the three term groups into one ordered axis.
matrix_axis = list(feedstock) + list(processing_technology) + list(output)
Build Matrix
# initialize matrix: square co-occurrence matrix, one row/column per axis term
matrix = np.zeros([len(matrix_axis), len(matrix_axis)])
# tiny helper: position of a term on the matrix axis
def find_index(element, in_list):
    """Return the index of element within in_list.

    Raises ValueError when the element is absent (mirrors list.index).
    """
    return in_list.index(element)
# Accumulate symmetric co-occurrence counts from every query row.
for row in data:
    # the last column holds the frequency count
    frequency = row[-1]
    positions = [find_index(term, matrix_axis) for term in row[:-1]]
    # every unordered pair of terms in this row co-occurs with that frequency
    for i, j in itertools.combinations(positions, 2):
        matrix[i, j] += frequency
        matrix[j, i] += frequency
Let us normalize and visualize the matrix as a heatmap
# Standardize the matrix (z-score over all entries).
# NOTE: despite the sklearn link elsewhere, this is standardization,
# not sklearn.preprocessing.normalize.
normalized_matrix = (matrix - np.mean(matrix)) / np.std(matrix)
# plot a heatmap of the matrix
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(normalized_matrix, cmap='hot', interpolation='none')  # plt.cm.Reds also works
# Fix: set the tick positions before the labels; without this matplotlib
# keeps its default tick locations and the labels are misaligned.
ax.set_xticks(range(len(matrix_axis)))
ax.set_yticks(range(len(matrix_axis)))
ax.set_xticklabels(matrix_axis)
ax.set_yticklabels(matrix_axis)
plt.title("Correlation Matrix: Matplotlib Version")
plt.xticks(rotation=90)
plt.show()
Let us see if seaborn can make this prettier.
import seaborn as sns
# plot a heatmap of the matrix with seaborn.
# Fix: pass the labels to sns.heatmap itself so they line up with the cells;
# calling ax.set_xticklabels afterwards left them misaligned with the ticks.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(normalized_matrix, cmap='BuPu', ax=ax,
            xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title("Correlation Matrix: Seaborn Version")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()
The matrix is symmetrical as expected. With empty squares in the middle as expected.
In this part of the exploration we will try to compare the "frequency matrix" of two countries.
We start by creating a function that, given a country, returns its correlation matrix. Here we don't limit the query because the axes have already been determined in the first plot.
def get_the_matrix_from(country):
    """Return the standardized co-occurrence matrix for one country.

    Builds the same Feedstock / ProcessingTech / Output frequency matrix as
    the global one above, restricted to assets LOCATED_IN the given country.
    Terms outside the globally determined ``matrix_axis`` are skipped, so the
    result is always comparable across countries.

    Relies on the module-level ``graph``, ``matrix_axis`` and ``find_index``.
    """
    # NOTE(review): the country name is interpolated directly into the
    # Cypher text — fine for trusted input, but query parameters would be safer.
    query = """ MATCH p1=(feed:Feedstock)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(proc:ProcessingTech)
MATCH p2=(proc:ProcessingTech)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(out:Output)
MATCH (a1:Asset)-[:LOCATED_IN]->(country:Country)
WHERE country.name = "{}"
WITH feed.term AS Feedstock, proc.term AS Processing_Technology, out.term AS Output, count(p1) AS count
RETURN Feedstock, Processing_Technology, Output, count
ORDER BY count """.format(country)
    data = DataFrame(graph.data(query)).as_matrix()
    # initialize matrix over the globally determined axis
    matrix = np.zeros([len(matrix_axis), len(matrix_axis)])
    for row in data:
        # the last column is the frequency
        frequency = row[-1]
        # skip terms outside the global axis (this query is not LIMITed)
        indexes = [find_index(element, matrix_axis)
                   for element in row[0:-1] if element in matrix_axis]
        for pair in itertools.combinations(indexes, 2):
            matrix[pair[0], pair[1]] += frequency
            matrix[pair[1], pair[0]] += frequency
    # standardize (z-score). Fix: the original assigned normalized_matrix
    # twice, and an all-zero matrix (std == 0) produced a NaN matrix.
    std = np.std(matrix)
    if std == 0:
        return matrix - np.mean(matrix)
    return (matrix - np.mean(matrix)) / std
After defining the function, we apply it to four countries.
# Countries whose matrices we want to compare.
country1 = 'United States of America'
country2 = 'Brazil'
country3 = 'Denmark'
country4 = 'Norway'
# One standardized matrix per country, all sharing the same global axis.
country1_matrix = get_the_matrix_from(country1)
country2_matrix = get_the_matrix_from(country2)
country3_matrix = get_the_matrix_from(country3)
country4_matrix = get_the_matrix_from(country4)
After applying the function to the countries, the results are plotted.
# first row: USA vs Brazil side by side
# NOTE(review): plt.subplots(2, 1, ...) builds a 2x1 grid that the following
# plt.subplot(121)/(122) calls replace with a 1x2 grid; the first call is
# effectively only used for its figsize — confirm this is intentional.
plt.subplots(2,1,figsize=(15,15))
plt.subplot(121)
sns.heatmap(country1_matrix, cmap='BuPu', cbar=None, square=True, xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('{}'.format(country1))
plt.subplot(122)
# y tick labels suppressed: shared with the heatmap on the left
sns.heatmap(country2_matrix, cmap='BuPu', cbar=None, square=True, xticklabels=matrix_axis, yticklabels=False)
plt.title('{}'.format(country2))
plt.show()
# second row: Denmark vs Norway
plt.subplots(2,1,figsize=(15,15))
plt.subplot(121)
sns.heatmap(country3_matrix, cmap='BuPu', cbar=None, square=True, xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('{}'.format(country3))
plt.subplot(122)
sns.heatmap(country4_matrix, cmap='BuPu', cbar=None, square=True, xticklabels=matrix_axis, yticklabels=False)
plt.title('{}'.format(country4))
plt.show()
Several differences can be noted in this plot:
- When comparing the USA and Brazil, the USA has a bigger focus on using hydrolysis as a processing technology.
- Brazil and Norway appear to have a rather well-distributed capability.
- Norway and Denmark are less well distributed, with a big focus on certain outputs.
After seeing how the matrix changes from country to country, it is also interesting to see how it changes from year to year. For example, is a country equally focused on certain flows every year?
We start by creating a function that takes two dates and returns the frequency matrix above, reusing much of the code. Here we don't limit the query because the axes have already been determined in the first plot.
def get_the_matrix_from_dates(begin_date, end_date):
    """Return the standardized co-occurrence matrix for assets whose year
    lies within [begin_date, end_date] (both bounds inclusive).

    Reuses the module-level ``graph``, ``matrix_axis`` and ``find_index``;
    terms outside the global axis are skipped so results stay comparable.
    """
    query = """ MATCH p1=(feed:Feedstock)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(proc:ProcessingTech)
MATCH p2=(proc:ProcessingTech)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(out:Output)
WHERE (toInteger(a1.year)<={} AND toInteger(a1.year)>={})
WITH feed.term AS Feedstock, proc.term AS Processing_Technology, out.term AS Output, count(p1) AS count
RETURN Feedstock, Processing_Technology, Output, count
ORDER BY count """.format(end_date, begin_date)
    data = DataFrame(graph.data(query)).as_matrix()
    # initialize matrix over the globally determined axis
    matrix = np.zeros([len(matrix_axis), len(matrix_axis)])
    for row in data:
        # the last column is the frequency
        frequency = row[-1]
        indexes = [find_index(element, matrix_axis)
                   for element in row[0:-1] if element in matrix_axis]
        for pair in itertools.combinations(indexes, 2):
            matrix[pair[0], pair[1]] += frequency
            matrix[pair[1], pair[0]] += frequency
    # standardize (z-score). Fix: the original assigned normalized_matrix
    # twice, and an all-zero matrix (std == 0) produced a NaN matrix.
    std = np.std(matrix)
    if std == 0:
        return matrix - np.mean(matrix)
    return (matrix - np.mean(matrix)) / std
Now, we apply it to two different time intervals, chosen here somewhat arbitrarily.
# Two non-overlapping time intervals (inclusive bounds).
first_interval = [1990, 2005]
second_interval = [2006, 2019]
first_interval_matrix = get_the_matrix_from_dates(first_interval[0], first_interval[1])
second_interval_matrix = get_the_matrix_from_dates(second_interval[0], second_interval[1])
We plot the two frequency matrices corresponding to each time interval.
# One heatmap per interval, sharing the global axis labels;
# the right-hand plot reuses the left plot's y labels.
plt.subplots(2,1,figsize=(15,15))
plt.subplot(121)
sns.heatmap(first_interval_matrix, cmap='YlGnBu', cbar=None, square=True, xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('{} - {}'.format(first_interval[0], first_interval[1]))
plt.subplot(122)
sns.heatmap(second_interval_matrix, cmap='YlGnBu', cbar=None, square=True, xticklabels=matrix_axis, yticklabels=False)
plt.title('{} - {}'.format(second_interval[0], second_interval[1]))
plt.show()
Here, there seem to be some divergences when comparing the intervals:
- For example, in the first, older interval, the output enzymatic hydrolysis seems to be more important than in the more recent interval.
- At the same time, the more recent interval seems to be more concentrated.
Another interesting measure is to understand if a particular year has divergences when compared to other years. For this, let's take a look at the year of 2016. Since most of the code is reused, there's not much to explain here.
# set intervals: compare two single years (begin == end)
first_interval = [2006, 2006]
second_interval = [2016, 2016]
first_interval_matrix = get_the_matrix_from_dates(first_interval[0], first_interval[1])
second_interval_matrix = get_the_matrix_from_dates(second_interval[0], second_interval[1])
# plot both years side by side
plt.subplots(2,1,figsize=(15,15))
# if you only want to plot half of the symmetric matrix:
#mask = np.zeros_like(first_interval_matrix)
#mask[np.triu_indices_from(mask)] = True
# then add the property mask=mask to the heatmap calls below
plt.subplot(121)
sns.heatmap(first_interval_matrix, cmap='YlGnBu', cbar=None, square=True, xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('{} - {}'.format(first_interval[0], first_interval[1]))
plt.subplot(122)
sns.heatmap(second_interval_matrix, cmap='YlGnBu', cbar=None, square=True, xticklabels=matrix_axis, yticklabels=False)
plt.title('{} - {}'.format(second_interval[0], second_interval[1]))
plt.show()
In this part of the exploration, we will take the first matrix that was extracted (the non-normalized, global, and absolute one) and transform it into a network. Let's see what type of properties we get.
We start by importing NetworkX and transforming the adjacency matrix into an undirected graph.
import networkx as nx
# convert the co-occurrence matrix to an undirected graph
G = nx.to_networkx_graph(matrix) # use the raw counts: an adjacency matrix, not the standardized one
We then rename the nodes.
# Map each integer node id to its term name — node ids produced by
# nx.to_networkx_graph are positions in matrix_axis.
relabel_map = {node_id: matrix_axis[node_id] for node_id in G.nodes()}
# rename the nodes in place
nx.relabel_nodes(G, relabel_map, copy=False)
<networkx.classes.graph.Graph at 0x1129ff050>
We color the nodes according to their properties (feedstock, processing technology, output). And we draw the graph
# Color each node by its category (feedstock / processing tech / output).
# Fix: use elif/else so exactly one color is appended per node — the
# original if-chain could append zero or several colors for a node,
# desynchronizing color_map from the node order.
color_map = []
for node in G:
    if node in feedstock:
        color_map.append('red')
    elif node in processing_technology:
        color_map.append('green')
    elif node in output:
        color_map.append('blue')
    else:
        color_map.append('gray')  # fallback for uncategorized nodes
# plot the network using a random layout
plt.figure(figsize=(8,6));
nx.draw_random(G, node_color = color_map, style='solid')
plt.title('Network')
plt.show()
/Users/duarteocarmo/anaconda/lib/python2.7/site-packages/networkx/drawing/nx_pylab.py:126: MatplotlibDeprecationWarning: pyplot.hold is deprecated. Future behavior will be consistent with the long-time default: plot commands add elements without first clearing the Axes and/or Figure. b = plt.ishold() /Users/duarteocarmo/anaconda/lib/python2.7/site-packages/networkx/drawing/nx_pylab.py:138: MatplotlibDeprecationWarning: pyplot.hold is deprecated. Future behavior will be consistent with the long-time default: plot commands add elements without first clearing the Axes and/or Figure. plt.hold(b) /Users/duarteocarmo/anaconda/lib/python2.7/site-packages/matplotlib/__init__.py:917: UserWarning: axes.hold is deprecated. Please remove it from your matplotlibrc and/or style files. warnings.warn(self.msg_depr_set % key) /Users/duarteocarmo/anaconda/lib/python2.7/site-packages/matplotlib/rcsetup.py:152: UserWarning: axes.hold is deprecated, will be removed in 3.0 warnings.warn("axes.hold is deprecated, will be removed in 3.0")
We then extract some basic properties.
# create a function that takes a graph and a name, and outputs properties of that same graph.
def network_prop(network, name):
print '------------------------------------------------------------------------'
average_degree = sum([network.degree()[element] for element in network.degree()]) / len([network.degree()[element] for element in network.degree()])
print 'These are the properties of network {}:'.format(name)
print 'The graph has {} nodes and {} edges.'.format(len(network.nodes()), len(network.edges()))
print 'The average node degree: ', average_degree
print 'Less connected node: ', min(network.degree(), key=network.degree().get)
print 'Most connecte node: ', max(network.degree(), key=network.degree().get)
print 'Average clustering coefficient: ', (nx.average_clustering(network))
print 'Average node connectivity: ', nx.average_node_connectivity(network)
print '------------------------------------------------------------------------'
network_prop(G, 'General')
------------------------------------------------------------------------ These are the properties of network General: The graph has 35 nodes and 86 edges. The average node degree: 4 Less connected node: straw Most connecte node: ethanol Average clustering coefficient: 0.654545818327 Average node connectivity: 2.30588235294 ------------------------------------------------------------------------
Another interesting observation is the number of patents over time, by country.
We start by making a call to the Neo4j database and loading the result into a dataframe.
# query neo4j: patent count per (country, year)
patent_query = """ MATCH(n:PATENT)
RETURN n.country, n.year, count(n)
ORDER BY n.country"""
# convert to a matrix; pandas sorts the columns alphabetically, so
# column 0 = count(n), column 1 = n.country, column 2 = n.year
# (TODO confirm against the pandas version in use)
patent_data = DataFrame(graph.data(patent_query)).as_matrix()
# a list of the years and a list of countries.
# Fix: range() excludes its upper bound, so add 1 to include the last year.
years = range(int(np.amin(patent_data[:, 2])), int(np.amax(patent_data[:, 2])) + 1)
countries = list(set(patent_data[:, 1]))
Create a dictionary where each key is the name of a country and each value is a list with the number of patents in each year.
# Build {country: [patents per year]}.
# Fix (performance): one pass over patent_data into a lookup table instead
# of rescanning every row for every (country, year) pair — O(N + C*Y)
# rather than O(C*Y*N). Results are identical because the Cypher query
# yields one row per (country, year) pair.
patent_dict = {}
counts_by_pair = {}
for data_row in patent_data:
    # column 1 = country, column 2 = year, column 0 = count
    counts_by_pair[(data_row[1], int(data_row[2]))] = int(data_row[0])
for country in countries:
    # 0 for years where the country has no patents in the database
    patent_dict[country] = [counts_by_pair.get((country, year), 0) for year in years]
Finally, we can plot the distribution
# variables to handle.
countries_excluded = ['Null']   # pseudo-country used for missing country data
number_of_countries = 16        # how many countries to draw
years_to_plot = 17              # how many of the most recent years to show
fig_width = 18
fig_height = 7
# plot
fig=plt.figure(figsize=(fig_width, fig_height))
plt.style.use('seaborn')
plt.title('Patents over time', fontsize=13)
plt.xlabel('Years')
plt.ylabel('Number of Patents in Database')
# NOTE(review): Python 2 — dict.keys() returns a sliceable list, but its
# order is arbitrary, so "the last 16 countries" is an arbitrary selection.
for country in patent_dict.keys()[-number_of_countries:]:
    if country in countries_excluded:
        continue
    else:
        plt.plot(years[-years_to_plot:], patent_dict[country][-years_to_plot:], label=country)
plt.legend()
plt.show()
Another interesting question that surfaces in the presence of this database is the following: which countries hold the most assets for a given feedstock type?
We start by designing a query capable of receiving a feedstock type and returning a ranking.
# Feedstock terms to rank countries by.
feedstocks = ['waste', 'sugarcane', 'wood']
# Template query: "{}" is filled with a feedstock term; returns the top 10
# countries by number of assets containing that feedstock.
feedstock_query = """ MATCH (asset:Asset)-[:CONTAINS]-> (f:Feedstock)
MATCH (asset:Asset)-[:LOCATED_IN]-> (c:Country)
WHERE f.term = "{}"
RETURN c.name, count(f)
ORDER BY count(f)
DESC
LIMIT 10"""
After, we iterate over the feedstocks list and make a ranking for the different elements in the list.
# Print a ranked country list for every feedstock (Python 2 prints).
for resource in feedstocks:
    resource_query = feedstock_query.format(resource)
    resource_data = DataFrame(graph.data(resource_query)).as_matrix()
    print 'Ranking for {}: '.format(resource)
    # column 0 = country name, column 1 = asset count (alphabetical column order)
    for i in range(resource_data.shape[0]):
        print '{}. {} with {} assets'.format(i+1, resource_data[i,0], resource_data[i,1])
    print ''
Ranking for waste: 1. United States of America with 718 assets 2. People's Republic of China with 373 assets 3. Spain with 221 assets 4. United Kingdom with 209 assets 5. Germany with 203 assets 6. Italy with 169 assets 7. India with 163 assets 8. Netherlands with 116 assets 9. France with 113 assets 10. Sweden with 112 assets Ranking for sugarcane: 1. Brazil with 459 assets 2. United States of America with 109 assets 3. People's Republic of China with 23 assets 4. India with 21 assets 5. Sweden with 12 assets 6. Australia with 12 assets 7. Netherlands with 12 assets 8. United Kingdom with 11 assets 9. South Africa with 10 assets 10. Thailand with 10 assets Ranking for wood: 1. United States of America with 346 assets 2. Germany with 137 assets 3. Sweden with 105 assets 4. Finland with 86 assets 5. United Kingdom with 63 assets 6. Netherlands with 61 assets 7. Italy with 61 assets 8. People's Republic of China with 57 assets 9. Austria with 56 assets 10. Canada with 55 assets
# todo, make a map visualization for the rankings. Not limited, but with different shades for every country.