#!/usr/bin/env python
# coding: utf-8

# # 1. Data Exploration

# ## 1.1. Establishing a Connection

# We start by importing all necessary libraries.

# In[2]:

from py2neo import Graph  # to connect to our Neo4j graph database
from pandas import DataFrame
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools

# Cypher cheatsheet: https://gist.github.com/DaniSancas/1d5265fc159a95ff457b940fc5046887

# Connect to a running instance of Neo4j.

# In[3]:

graph_url = "http://localhost:7474/db/data"
graph = Graph(graph_url)

# Test the connection by returning, for example, some of the countries in the database.

# In[4]:

test_query = """
MATCH (n:Country)
WITH n.name AS Country
RETURN Country
LIMIT 10;
"""
DataFrame(graph.data(test_query))

# For another test, let us look at the number of publications per year in the database.

# In[5]:

# query
date_query = """
MATCH (n:PUBLICATION)
RETURN labels(n), n.year, count(*)
ORDER BY n.year
"""

# Extract the publication counts and years; drop the last entry because its year is "Null".
# Note: the DataFrame orders the returned columns alphabetically, so
# column 0 is count(*) and column 2 is n.year.
matrix = DataFrame(graph.data(date_query)).as_matrix()
dates = matrix[:, 2][0:-1]
numbers = matrix[:, 0][0:-1]

# plot
plt.title("Number of publications over time")
plt.xlabel("Year")
plt.ylabel("Number of Publications")
plt.plot(dates, numbers)
plt.grid()
plt.show()

# ## 1.2. Creating the Global Matrix

# The goal of this part is to create a frequency matrix, as discussed in the first meeting.
#
# This frequency matrix covers the following items:
# * Feedstock
# * Processing Tech
# * Output
#
# Furthermore, the matrix should be standardized.

# Make the query.

# In[6]:

query = """
MATCH p1=(feed:Feedstock)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(proc:ProcessingTech)
MATCH p2=(proc:ProcessingTech)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(out:Output)
WITH feed.term AS Feedstock, proc.term AS Processing_Technology, out.term AS Output, count(p1) AS count
RETURN Feedstock, Processing_Technology, Output, count
ORDER BY count DESC LIMIT 50"""
# alternative: DESC LIMIT 300

data = DataFrame(graph.data(query)).as_matrix()

# Create the axes of the matrix.

# In[7]:

# Caveat: as in the previous cell, the columns may come back in alphabetical order
# (Feedstock, Output, Processing_Technology, count), which would swap columns 1 and 2.
# This only affects the category sets (and, later, node colours), not the matrix itself,
# since every pair of terms is counted symmetrically.
feedstock = set(list(data[:, 0]))
processing_technology = set(list(data[:, 1]))
output = set(list(data[:, 2]))

matrix_axis = list(feedstock) + list(processing_technology) + list(output)

# Build the matrix.

# In[8]:

# initialize matrix
matrix = np.zeros([len(matrix_axis), len(matrix_axis)])

# find the index of an element in a given list
def find_index(element, in_list):
    return in_list.index(element)

# for every row in the original response
for row in data:
    # the last column is the frequency
    frequency = row[-1]
    indexes = [find_index(element, matrix_axis) for element in row[0:-1]]
    # add the frequency value to each pairwise matrix position, symmetrically
    for pair in itertools.combinations(indexes, 2):
        matrix[pair[0], pair[1]] += frequency
        matrix[pair[1], pair[0]] += frequency

# Let us standardize the matrix and visualize it as a heatmap.

# In[9]:

# standardize the matrix (z-score standardization; see
# http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-normalization
# for the scikit-learn scalers)
normalized_matrix = (matrix - np.mean(matrix)) / np.std(matrix)

# plot a heatmap of the matrix
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(normalized_matrix, cmap='hot', interpolation='none')  # alternative colormap: plt.cm.Reds
# note: normalization affects symmetry
ax.set_xticks(range(len(matrix_axis)))
ax.set_yticks(range(len(matrix_axis)))
ax.set_xticklabels(matrix_axis)
ax.set_yticklabels(matrix_axis)
plt.title("Correlation Matrix: Matplotlib Version")
plt.xticks(rotation=90)
plt.show()
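# As a quick sanity check (a minimal sketch), the construction above should yield a
# symmetric matrix: every (feedstock, processing, output) triple is expanded into
# unordered pairs, and each pair updates both (i, j) and (j, i). The example indexes
# below are hypothetical positions in matrix_axis.

# In[ ]:

# symmetry should hold by construction
assert np.allclose(matrix, matrix.T), "co-occurrence matrix should be symmetric"

# a single triple contributes to three cell pairs, each mirrored across the diagonal:
example_indexes = [0, 1, 2]  # hypothetical positions in matrix_axis
print(list(itertools.combinations(example_indexes, 2)))  # [(0, 1), (0, 2), (1, 2)]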
# Let us see if seaborn can make this prettier.

# In[10]:

import seaborn as sns

# plot a heatmap of the matrix
fig, ax = plt.subplots(figsize=(8, 8))
r = sns.heatmap(normalized_matrix, cmap='BuPu',
                xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title("Correlation Matrix: Seaborn Version")
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

# As expected, the matrix is symmetrical, with empty squares in the middle where items of the
# same category never co-occur.

# ## 1.3. How does the matrix change from country to country?

# In this part of the exploration, we compare the frequency matrices of several countries.
#
# We start by creating a function that, given a country, returns its frequency matrix. Here we
# don't limit the query, because the axes have already been determined for the first plot.

# In[11]:

def get_the_matrix_from(country):
    query = """
    MATCH p1=(feed:Feedstock)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(proc:ProcessingTech)
    MATCH p2=(proc:ProcessingTech)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(out:Output)
    MATCH (a1:Asset)-[:LOCATED_IN]->(country:Country)
    WHERE country.name = "{}"
    WITH feed.term AS Feedstock, proc.term AS Processing_Technology, out.term AS Output, count(p1) AS count
    RETURN Feedstock, Processing_Technology, Output, count
    ORDER BY count
    """.format(country)

    data = DataFrame(graph.data(query)).as_matrix()

    # initialize matrix
    matrix = np.zeros([len(matrix_axis), len(matrix_axis)])

    # for every row in the original response
    for row in data:
        # the last column is the frequency
        frequency = row[-1]
        # keep only terms that appear in the global axes
        indexes = [find_index(element, matrix_axis) for element in row[0:-1] if element in matrix_axis]
        for pair in itertools.combinations(indexes, 2):
            matrix[pair[0], pair[1]] += frequency
            matrix[pair[1], pair[0]] += frequency

    # standardize the matrix (z-score)
    normalized_matrix = (matrix - np.mean(matrix)) / np.std(matrix)
    return normalized_matrix

# After defining the function, we apply it to four countries.

# In[12]:

country1 = 'United States of America'
country2 = 'Brazil'
country3 = 'Denmark'
country4 = 'Norway'

country1_matrix = get_the_matrix_from(country1)
country2_matrix = get_the_matrix_from(country2)
country3_matrix = get_the_matrix_from(country3)
country4_matrix = get_the_matrix_from(country4)

# After applying the function to the countries, the results are plotted.

# In[13]:

# first row
plt.figure(figsize=(15, 15))
plt.subplot(121)
sns.heatmap(country1_matrix, cmap='BuPu', cbar=None, square=True,
            xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('{}'.format(country1))
plt.subplot(122)
sns.heatmap(country2_matrix, cmap='BuPu', cbar=None, square=True,
            xticklabels=matrix_axis, yticklabels=False)
plt.title('{}'.format(country2))
plt.show()

# second row
plt.figure(figsize=(15, 15))
plt.subplot(121)
sns.heatmap(country3_matrix, cmap='BuPu', cbar=None, square=True,
            xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('{}'.format(country3))
plt.subplot(122)
sns.heatmap(country4_matrix, cmap='BuPu', cbar=None, square=True,
            xticklabels=matrix_axis, yticklabels=False)
plt.title('{}'.format(country4))
plt.show()

# > Several differences can be noted in this plot:
# > * When comparing the USA and Brazil, the USA shows a stronger focus on flows that use hydrolysis.
# > * Brazil and Norway appear to have rather well-distributed capabilities.
# > * Norway and Denmark are less evenly distributed, with a strong focus on certain outputs.
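# Beyond eyeballing the heatmaps, the country matrices can also be compared numerically.
# A minimal sketch: the cosine similarity between two flattened matrices, where values
# close to 1 indicate similar flow patterns.

# In[ ]:

def matrix_similarity(m1, m2):
    # flatten both matrices and compute their cosine similarity
    v1, v2 = m1.flatten(), m2.flatten()
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

print('USA vs Brazil:', matrix_similarity(country1_matrix, country2_matrix))
print('Denmark vs Norway:', matrix_similarity(country3_matrix, country4_matrix))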
# ## 1.4. How does the matrix change over time?

# After seeing how the matrix changes from country to country, it is also interesting to see how
# it changes from year to year. For example, do the same flows stay dominant every year?

# We start by creating a function that takes two dates and returns the frequency matrix from
# above, reusing much of the code. Here we don't limit the query, because the axes have already
# been determined for the first plot.

# In[14]:

def get_the_matrix_from_dates(begin_date, end_date):
    query = """
    MATCH p1=(feed:Feedstock)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(proc:ProcessingTech)
    MATCH p2=(proc:ProcessingTech)<-[:CONTAINS]-(a1:Asset)-[:CONTAINS]->(out:Output)
    WHERE (toInteger(a1.year)<={} AND toInteger(a1.year)>={})
    WITH feed.term AS Feedstock, proc.term AS Processing_Technology, out.term AS Output, count(p1) AS count
    RETURN Feedstock, Processing_Technology, Output, count
    ORDER BY count
    """.format(end_date, begin_date)

    data = DataFrame(graph.data(query)).as_matrix()

    # initialize matrix
    matrix = np.zeros([len(matrix_axis), len(matrix_axis)])

    # for every row in the original response
    for row in data:
        # the last column is the frequency
        frequency = row[-1]
        # keep only terms that appear in the global axes
        indexes = [find_index(element, matrix_axis) for element in row[0:-1] if element in matrix_axis]
        for pair in itertools.combinations(indexes, 2):
            matrix[pair[0], pair[1]] += frequency
            matrix[pair[1], pair[0]] += frequency

    # standardize the matrix (z-score)
    normalized_matrix = (matrix - np.mean(matrix)) / np.std(matrix)
    return normalized_matrix

# Now we apply it to two different time intervals, chosen somewhat arbitrarily.

# In[15]:

first_interval = [1990, 2005]
second_interval = [2006, 2019]

first_interval_matrix = get_the_matrix_from_dates(first_interval[0], first_interval[1])
second_interval_matrix = get_the_matrix_from_dates(second_interval[0], second_interval[1])

# We plot the two frequency matrices corresponding to each time interval.

# In[16]:

plt.figure(figsize=(15, 15))
plt.subplot(121)
sns.heatmap(first_interval_matrix, cmap='YlGnBu', cbar=None, square=True,
            xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('{} - {}'.format(first_interval[0], first_interval[1]))
plt.subplot(122)
sns.heatmap(second_interval_matrix, cmap='YlGnBu', cbar=None, square=True,
            xticklabels=matrix_axis, yticklabels=False)
plt.title('{} - {}'.format(second_interval[0], second_interval[1]))
plt.show()

# > There seem to be some divergences between the intervals:
# > * For example, in the older interval, enzymatic hydrolysis appears more prominent than in
# >   the more recent one.
# > * At the same time, the more recent interval appears more concentrated on a few flows.
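# To make the change between intervals easier to see, one option is to plot the element-wise
# difference of the two standardized matrices: positive cells mark flows that gained weight in
# the recent interval. A minimal sketch:

# In[ ]:

difference_matrix = second_interval_matrix - first_interval_matrix
plt.figure(figsize=(8, 8))
sns.heatmap(difference_matrix, cmap='coolwarm', center=0, square=True,
            xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('Change: {}-{} vs {}-{}'.format(second_interval[0], second_interval[1],
                                          first_interval[0], first_interval[1]))
plt.show()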
# ### 1.4.1. Comparing years

# Another interesting measure is whether a particular year diverges from the others. For this,
# let's take a look at the year 2016. Since most of the code is reused, there is not much to
# explain here.

# In[17]:

# set intervals
first_interval = [2006, 2006]
second_interval = [2016, 2016]

first_interval_matrix = get_the_matrix_from_dates(first_interval[0], first_interval[1])
second_interval_matrix = get_the_matrix_from_dates(second_interval[0], second_interval[1])

plt.figure(figsize=(15, 15))

# if you only want to plot half:
# mask = np.zeros_like(first_interval_matrix)
# mask[np.triu_indices_from(mask)] = True
# then pass mask=mask to sns.heatmap

plt.subplot(121)
sns.heatmap(first_interval_matrix, cmap='YlGnBu', cbar=None, square=True,
            xticklabels=matrix_axis, yticklabels=matrix_axis)
plt.title('{} - {}'.format(first_interval[0], first_interval[1]))
plt.subplot(122)
sns.heatmap(second_interval_matrix, cmap='YlGnBu', cbar=None, square=True,
            xticklabels=matrix_axis, yticklabels=False)
plt.title('{} - {}'.format(second_interval[0], second_interval[1]))
plt.show()

# ## 1.5. Network Properties

# In this part of the exploration, we take the first matrix that was extracted (the
# non-normalized, global, absolute one) and transform it into a network. Let's see what kind of
# properties we get.

# We start by importing NetworkX and transforming the adjacency matrix into an undirected graph.

# In[18]:

import networkx as nx

# convert the matrix to a graph; we can't use the normalized matrix
# because we need an adjacency matrix
G = nx.to_networkx_graph(matrix)

# We then rename the nodes.

# In[19]:

# create a dictionary to relabel nodes from numbers to asset names (system req.)
relabel_map = {}
# for every number
for number in G.nodes():
    # the name is the position in the matrix_axis list
    relabel_map[number] = matrix_axis[number]
# use the dictionary to relabel
nx.relabel_nodes(G, relabel_map, copy=False)

# We color the nodes according to their category (feedstock, processing technology, output) and
# draw the graph.

# In[20]:

# create a list of colors, one per node
color_map = []
for node in G:
    if node in feedstock:  # if the node is in this category,
        color_map.append('red')  # add the category's color to the list
    elif node in processing_technology:
        color_map.append('green')
    elif node in output:
        color_map.append('blue')

# plot the network using a random layout
plt.figure(figsize=(8, 6))
nx.draw_random(G, node_color=color_map, style='solid')
plt.title('Network')
plt.show()

# We then extract some basic properties.

# In[21]:

# a function that takes a graph and a name, and prints properties of that graph
def network_prop(network, name):
    print('------------------------------------------------------------------------')
    degrees = dict(network.degree())
    average_degree = sum(degrees.values()) / float(len(degrees))
    print('These are the properties of network {}:'.format(name))
    print('The graph has {} nodes and {} edges.'.format(len(network.nodes()), len(network.edges())))
    print('Average node degree:', average_degree)
    print('Least connected node:', min(degrees, key=degrees.get))
    print('Most connected node:', max(degrees, key=degrees.get))
    print('Average clustering coefficient:', nx.average_clustering(network))
    print('Average node connectivity:', nx.average_node_connectivity(network))
    print('------------------------------------------------------------------------')

network_prop(G, 'General')
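# The minimum/maximum degree above only shows the two extremes. A short additional sketch ranks
# the five most central nodes by degree centrality (a node's degree divided by the number of
# possible neighbours).

# In[ ]:

centrality = nx.degree_centrality(G)
top_nodes = sorted(centrality, key=centrality.get, reverse=True)[:5]
for node in top_nodes:
    print('{}: {:.3f}'.format(node, centrality[node]))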
# ## 1.6. Patents over time

# Another interesting observation is the number of patents over time, by country.

# We start by querying the Neo4j database into a dataframe.

# In[22]:

# query neo4j
patent_query = """
MATCH (n:PATENT)
RETURN n.country, n.year, count(n)
ORDER BY n.country"""

# convert to a matrix; the columns come back in alphabetical order:
# count(n), n.country, n.year
patent_data = DataFrame(graph.data(patent_query)).as_matrix()

# a list of the years and a list of the countries
# (+ 1 so the most recent year is included)
years = list(range(int(np.amin(patent_data[:, 2])), int(np.amax(patent_data[:, 2])) + 1))
countries = list(set(patent_data[:, 1]))

# Create a dictionary where each key is the name of a country and each value is a list with the
# number of patents in each year.

# In[23]:

patent_dict = {}
for country in countries:
    per_year = []
    for year in years:
        found = 0
        for i in range(patent_data.shape[0]):
            if patent_data[i, :][1] == country and int(patent_data[i, :][2]) == year:
                per_year.append(int(patent_data[i, :][0]))
                found = 1
        if found == 0:
            per_year.append(0)
    patent_dict[country] = per_year

# Finally, we can plot the distribution.

# In[24]:

# parameters
countries_excluded = ['Null']
number_of_countries = 16
years_to_plot = 17
fig_width = 18
fig_height = 7

# plot
fig = plt.figure(figsize=(fig_width, fig_height))
plt.style.use('seaborn')
plt.title('Patents over time', fontsize=13)
plt.xlabel('Years')
plt.ylabel('Number of Patents in Database')

for country in list(patent_dict.keys())[-number_of_countries:]:
    if country in countries_excluded:
        continue
    plt.plot(years[-years_to_plot:], patent_dict[country][-years_to_plot:], label=country)

plt.legend()
plt.show()

# ## 1.7. Feedstock Rankings

# Another interesting question that surfaces in the presence of this database is the following:
# * Given a certain natural resource (feedstock), which countries/regions present the highest
#   number of records?

# We start by designing a query that receives a feedstock type and returns a ranking.

# In[40]:

feedstocks = ['waste', 'sugarcane', 'wood']

feedstock_query = """
MATCH (asset:Asset)-[:CONTAINS]->(f:Feedstock)
MATCH (asset:Asset)-[:LOCATED_IN]->(c:Country)
WHERE f.term = "{}"
RETURN c.name, count(f)
ORDER BY count(f) DESC LIMIT 10"""

# Then we iterate over the feedstocks list and print a ranking for each element.

# In[41]:

for resource in feedstocks:
    resource_query = feedstock_query.format(resource)
    resource_data = DataFrame(graph.data(resource_query)).as_matrix()
    print('Ranking for {}:'.format(resource))
    for i in range(resource_data.shape[0]):
        print('{}. {} with {} assets'.format(i + 1, resource_data[i, 0], resource_data[i, 1]))
    print('')

# In[42]:

# TODO: make a map visualization for the rankings. Not limited, but with different shades for
# every country.
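# A possible starting point for that TODO, sketched here under two assumptions: plotly is
# installed, and the country names stored in the database match plotly's 'country names'
# location mode.

# In[ ]:

import plotly.express as px

# re-run the ranking query without the limit, for one example feedstock
resource_query = feedstock_query.replace('LIMIT 10', '').format('waste')
ranking_df = DataFrame(graph.data(resource_query))
ranking_df.columns = ['country', 'count']  # columns come back as c.name, count(f)

# choropleth shaded by the number of assets per country
fig = px.choropleth(ranking_df, locations='country', locationmode='country names',
                    color='count', title='Assets with waste feedstock, by country')
fig.show()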