In the first part of the analysis, we will focus on how global capabilities change over time.
Let's start by importing all of the external libraries that will be useful during the analysis.
# python libraries
from py2neo import Graph
import numpy as np
from pandas import DataFrame
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math
import pandas as pd
import plotly
import plotly.graph_objs as go
import qgrid
from scipy import stats, spatial
from scipy.linalg import pinv
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import cosine_similarity
import operator
from IPython.display import display, HTML
from matplotlib.colors import ListedColormap
# connection to Neo4j
local_connection_url = "http://localhost:7474/db/data"
connection_to_graph = Graph(local_connection_url)
# plotly credentials
plotly_config = json.load(open('plotly_config.json'))
plotly.tools.set_credentials_file(username=plotly_config['username'], api_key=plotly_config['key'])
We start by getting all of the feedstock, processing technology and output terms.
f_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock) RETURN fs.term, count(a)')).as_matrix()[:, 1]))
o_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:Output) RETURN fs.term, count(a)')).as_matrix()[:, 1]))
pt_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:ProcessingTech) RETURN fs.term, count(a)')).as_matrix()[:, 1]))
bbo = list(f_terms + pt_terms + o_terms)
print 'Number of terms:', len(bbo)
axis_names = bbo
print axis_names
We create a function that returns the capability matrix of the whole database.
def get_total_matrix(normalization):
# define queries
    # cross-category part: pairs of terms from different categories
q1 = """
MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
MATCH (a:Asset)-[:CONTAINS]->(out:Output)
MATCH (a:Asset)-[:CONTAINS]->(pt:ProcessingTech)
RETURN fs.term, pt.term, out.term, count(a)
"""
process_variables = ['Feedstock', 'Output', 'ProcessingTech']
    # intersecting part: pairs of terms within the same category
q2 = """
MATCH (a:Asset)-[:CONTAINS]->(fs:{})
MATCH (a:Asset)-[:CONTAINS]->(t:{})
WHERE fs<>t
RETURN fs.term, t.term, count(a)
"""
    # total number of assets in the database
q3 = """
MATCH (n:Asset)
RETURN count(n)
"""
# treat incoming data
total_documents = DataFrame(connection_to_graph.data(q3)).as_matrix()[0][0]
# get data
data_q1 = DataFrame(connection_to_graph.data(q1)).as_matrix()
# create matrix
total_matrix = np.zeros([len(axis_names), len(axis_names)])
    # fill in the cross-category co-occurrences
for row in data_q1:
        # py2neo results become DataFrames with columns sorted alphabetically,
        # so 'count(a)' is the first column and the terms follow
        frequency = row[0]
        indexes = [axis_names.index(element) for element in row[1::]]
        # add the count to both symmetric positions
for pair in itertools.combinations(indexes, 2):
total_matrix[pair[0], pair[1]] += frequency
total_matrix[pair[1], pair[0]] += frequency
    # fill in the same-category co-occurrences
for category in process_variables:
process_data = DataFrame(connection_to_graph.data(q2.format(category, category))).as_matrix()
for row in process_data:
frequency = row[0]
indexes = [axis_names.index(element) for element in row[1::]]
            # add half the count to both symmetric positions
            for pair in itertools.combinations(indexes, 2):
                total_matrix[pair[0], pair[1]] += frequency / 2.0  # halved: the symmetric query counts every pair twice
                total_matrix[pair[1], pair[0]] += frequency / 2.0  # (2.0 avoids Python 2 integer division)
# normalize
norm_total_matrix = total_matrix / total_documents
# dynamic return
    if normalization:
        return norm_total_matrix
    else:
        return total_matrix
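Before visualizing, a quick sanity check we add here: by construction the matrix should be symmetric, since every co-occurrence is written to both (i, j) and (j, i).
# sanity check: the co-occurrence matrix should be symmetric
m = get_total_matrix(normalization=False)
print 'Shape:', m.shape
print 'Symmetric:', np.allclose(m, m.T)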
Let us visualize the normalized and non-normalized versions.
We first create a helper function that draws borders around our plots.
def borders(width, color, size=None):
    # draws a frame around the current heatmap; the default size equals the number
    # of terms, i.e. the capability matrix dimension (avoids a full database pass at definition time)
    if size is None:
        size = len(axis_names)
    plt.axhline(y=0, color='k', linewidth=width)
    plt.axhline(y=size, color=color, linewidth=width)
    plt.axvline(x=0, color='k', linewidth=width)
    plt.axvline(x=size, color=color, linewidth=width)
And we plot.
## call functions
colors = 'binary'
# create a figure with two side-by-side heatmaps
plt.figure(figsize=(17, 17))
# first heatmap
plt.subplot(121)
vmax = 1000
sns.heatmap(get_total_matrix(normalization=False) , cmap=colors, cbar=True,cbar_kws={"shrink": .2}, square=True, xticklabels=False, yticklabels=False, vmax=vmax)
borders(1.5, 'k')
plt.title('Capability Matrix Absolute')
# second heatmap
plt.subplot(122)
vmax = 0.1
sns.heatmap(get_total_matrix(normalization=True) , cmap=colors, cbar=True,cbar_kws={"shrink": .2}, square=True, xticklabels=False, yticklabels=False, vmax=vmax)
borders(1.5, 'k')
plt.title('Capability Matrix Normalized')
plt.show()
whole_database = get_total_matrix(normalization=True)
a = sns.clustermap(whole_database, figsize=(12, 12), xticklabels = False, yticklabels=False, cmap='binary', square=True)
borders(1.5, 'k')
plt.show()
cluster_order = []
for i in a.dendrogram_row.reordered_ind:
cluster_order.append(axis_names[i])
print 'Extract of cluster order:'
print cluster_order[50:70]
Not all years in the Neo4j database contain technological assets. For this reason, two lists will be created: a complete chronological range and a list of only the years present in the database.
# query years
years_available_q = """ MATCH (n:Asset)
WITH n.year as YEAR
RETURN YEAR, count(YEAR)
ORDER BY YEAR ASC """
# create a list with the years where records exist; the last row is dropped because
# it corresponds to assets whose year is the string "Null" (which sorts after all numeric years)
years_available = DataFrame(connection_to_graph.data(years_available_q)).as_matrix()[:, 0][:-1]
years_available = [int(year) for year in years_available]
# create a pure range list
first_year = int(years_available[0])
last_year = int(years_available[-1])
real_years = range(first_year, last_year + 1, 1)
# give information
print 'The database list starts in {}, ends in {} and contains {} years.'.format(years_available[0], years_available[-1], len(years_available))
print 'The real list starts in {}, ends in {} and contains {} years.'.format(real_years[0], real_years[-1], len(real_years))
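For reference, the gap years are simply the set difference of the two lists:
# years in the chronological range with no assets in the database
missing_years = sorted(set(real_years) - set(years_available))
print 'Years with no assets:', missing_years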
Now that we have all of the available years, we can start building the technological capability matrices.
The final list contains 352 terms.
First, an auxiliary function to help us build capability matrices: given a term and a year, it returns the number of assets containing that term in that year.
def number_of_documents(term, year):
q_term_total = """
MATCH (a:Asset)-[:CONTAINS]->(t)
WHERE t.term="{}" AND a.year = "{}"
RETURN count(a)
""".format(term, year)
return DataFrame(connection_to_graph.data(q_term_total)).as_matrix()[0][0]
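An illustrative call (the term and year are arbitrary examples):
# e.g. the number of assets mentioning 'ethanol' in 2016
print number_of_documents('ethanol', 2016)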
We start by creating a function that, given a certain year, returns that year's capability matrix.
def get_year_matrix(year, normalization=True, adjacency=True, diagonal=False):
    # the keyword values above are defaults, used when not specified
q1 = """
MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
MATCH (a:Asset)-[:CONTAINS]->(out:Output)
MATCH (a:Asset)-[:CONTAINS]->(pt:ProcessingTech)
WHERE a.year = "{}"
RETURN fs.term, pt.term, out.term, count(a)
""".format(year)
process_variables = ['Feedstock', 'Output', 'ProcessingTech']
    # intersecting part: pairs of terms within the same category
q2 = """
MATCH (a:Asset)-[:CONTAINS]->(fs:{})
MATCH (a:Asset)-[:CONTAINS]->(t:{})
WHERE fs<>t AND a.year = "{}"
RETURN fs.term, t.term, count(a)
"""
    # asset counts per year (used to normalize by the year's total)
q3 = """
MATCH (n:Asset)
WITH n.year as YEAR
RETURN YEAR, count(YEAR)
ORDER BY YEAR ASC
"""
# treat incoming data
raw_data_q3 = DataFrame(connection_to_graph.data(q3)).as_matrix()
index_of_year = list(raw_data_q3[:, 0]).index('{}'.format(year))
total_documents = raw_data_q3[index_of_year, 1]
# get data
data_q1 = DataFrame(connection_to_graph.data(q1)).as_matrix()
# create empty matrix
year_matrix = np.zeros([len(axis_names), len(axis_names)])
if adjacency:
        # fill in the cross-category co-occurrences
for row in data_q1:
            # as before, 'count(a)' is the first DataFrame column, i.e. the frequency
            frequency = row[0]
            indexes = [axis_names.index(element) for element in row[1::]]
            # add the count to both symmetric positions
for pair in itertools.combinations(indexes, 2):
year_matrix[pair[0], pair[1]] += frequency
year_matrix[pair[1], pair[0]] += frequency
        # fill in the same-category co-occurrences
for category in process_variables:
process_data = DataFrame(connection_to_graph.data(q2.format(category, category, year))).as_matrix()
for row in process_data:
frequency = row[0]
indexes = [axis_names.index(element) for element in row[1::]]
                    # add half the count to both symmetric positions
                    for pair in itertools.combinations(indexes, 2):
                        year_matrix[pair[0], pair[1]] += frequency / 2.0  # halved: the symmetric query counts every pair twice
                        year_matrix[pair[1], pair[0]] += frequency / 2.0  # (2.0 avoids Python 2 integer division)
if diagonal:
for index, name in enumerate(axis_names):
year_matrix[index, index] = number_of_documents(name, year)
# normalize or not
    if normalization:
year_matrix = year_matrix / total_documents
return year_matrix
We finally test our function with the year 2017.
year = 2017
year_matrix = get_year_matrix(year)
print 'The matrix from {} has shape {}, a max value of {}, a min value of {} and a mean of {}.'.format(year, year_matrix.shape, np.amax(year_matrix), np.amin(year_matrix), np.mean(year_matrix))
Let us plot the absolute and normalized capability matrices for 2016.
## call functions
colors = 'binary'
vmin = 0.0000
vmax = 0.05
year_in_focus = 2016
# create a figure with two side-by-side heatmaps
plt.figure(figsize=(17, 17))
# first heatmap
plt.subplot(121)
sns.heatmap(get_year_matrix(year_in_focus, normalization=False, diagonal=True) , cmap=colors, cbar=True,cbar_kws={"shrink": .2}, square=True, xticklabels=False, yticklabels=False)
borders(1.5, 'k')
plt.title('Capability Matrix Absolute: {}'.format(year_in_focus))
# second heatmap
plt.subplot(122)
sns.heatmap(get_year_matrix(year_in_focus, normalization=True, diagonal=True) , cmap=colors, cbar=True,cbar_kws={"shrink": .2}, square=True, xticklabels=False, yticklabels=False, vmin=vmin, vmax=vmax)
borders(1.5, 'k')
plt.title('Capability Matrix Normalized: {}'.format(year_in_focus))
plt.show()
## call functions
color1 = 'Blues'
color3 = 'Reds'
rwhite = ListedColormap(['white', 'red'])
gwhite = ListedColormap(['white', 'green'])
blwhite = ListedColormap(['white', 'blue'])
bwhite = ListedColormap(['white', 'grey'])
year_in_focus = 2017
graph_holder = 0.001
original = get_year_matrix(year_in_focus, normalization=False)
# mask everything except the feedstock-feedstock block
threshold = len(f_terms)
f_mask = np.ones(original.shape)
f_mask[0:threshold, 0:threshold] = 0
# mask everything except the processing-tech block
threshold = len(f_terms) + len(pt_terms)
pt_mask = np.ones(original.shape)
pt_mask[len(f_terms):threshold, len(f_terms):threshold] = 0
# mask everything except the output block
o_mask = np.ones(original.shape)
o_mask[threshold::, threshold::] = 0
plt.subplots(1,1,figsize=(9, 9))
plt.subplot(111)
sns.heatmap(original, cmap=bwhite, center=0.001, cbar=None, square=True, xticklabels=False, yticklabels=False)
sns.heatmap(original, mask = f_mask, cmap=rwhite, center=graph_holder, cbar=None, square=True, xticklabels=False, yticklabels=False)
sns.heatmap(original, mask = pt_mask, cmap=gwhite, center=graph_holder, cbar=None, square=True, xticklabels=False, yticklabels=False)
sns.heatmap(original, mask = o_mask, cmap=blwhite, center=graph_holder, cbar=None, square=True, xticklabels=False, yticklabels=False)
borders(1.5, 'k')
plt.title('Capability Matrix Absolute: {}'.format(year_in_focus))
plt.show()
## call functions
colors = 'binary'
year_in_focus = 2017
# create a subplot
plt.subplots(1,1,figsize=(9, 9))
plt.subplot(111)
sns.heatmap(get_year_matrix(year_in_focus, normalization=True) , cmap=colors, cbar=True,cbar_kws={"shrink": .2}, square=True, xticklabels=False, yticklabels=False, vmin=0.00, vmax=0.05)
borders(1.5, 'k')
plt.title('Capability Matrix Normalized: {}'.format(year_in_focus))
plt.show()
In order to analyse how the years correlate with one another, we will need to transform each year matrix into a list. Since the matrix is symmetrical, we only need the upper triangle. For control purposes, we have written our own upper-triangle extraction function.
def get_list_from(matrix):
    # flattens the strict upper triangle (everything above the diagonal) into a list
    only_valuable = []
    extension = 1
    for row_number in range(matrix.shape[0]):
        # np.triu keeps zeros in place rather than dropping the lower triangle, so we slice manually
        only_valuable.append(matrix[row_number, extension:matrix.shape[0]].tolist())
        extension += 1
    return [element for column in only_valuable for element in column]
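As a control, the function should agree with NumPy's built-in upper-triangle indexing; a minimal check on a random matrix:
# cross-check against numpy's upper-triangle extraction (k=1 skips the diagonal)
m = np.random.rand(5, 5)
print np.allclose(get_list_from(m), m[np.triu_indices(5, k=1)])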
Let us visualize the capability lists of two example years.
# apply the function to two example years
a_list = get_list_from(get_year_matrix(2012, normalization=True))
b_list = get_list_from(get_year_matrix(2013, normalization=True))
# create a matrix where each row is the capability list of a year
correlation = np.vstack((a_list, b_list))
print correlation.shape
# keep only the columns where at least one year has a non-zero entry
good_cols = [i for i in range(correlation.shape[1]) if np.sum(correlation[:, i]) != 0]
good_correlation = correlation[:, good_cols]
print good_correlation.shape
# plot the matrix
plt.subplots(1,1,figsize=(20, 5))
plt.subplot(111)
sns.heatmap(good_correlation, cmap=ListedColormap(['white', 'black']), center=0.00000001, cbar=None, square=False, yticklabels=['2012', '2013'], xticklabels=False)
plt.yticks(rotation=0)
plt.title('Year Capability List Visualization', size=15)
plt.show()
It is already apparent that these two consecutive years are highly correlated.
We now define a function to calculate the similarity between two years.
def rv_coefficient(X, Y):
"""
    Calculates the RV coefficient between two matrices, according to:
http://production.datastore.cvt.dk/filestore?oid=539b95e864506c431f03ba0e&targetid=539b95e864506c431f03ba10
"""
XXYY = np.dot(np.dot(X, np.transpose(X)), np.dot(Y, np.transpose(Y)))
XX2 = np.dot(np.dot(X, np.transpose(X)), np.dot(X, np.transpose(X)))
YY2 = np.dot(np.dot(Y, np.transpose(Y)), np.dot(Y, np.transpose(Y)))
return np.trace(XXYY) / np.sqrt(np.trace(XX2) * np.trace(YY2))
# see more in https://tinyurl.com/machine-learning-dtu
x = np.random.rand(3,3)
y = np.random.rand(3,3)
print x
print y
print '-' * 10, 'RV COEF', '-' * 10
print rv_coefficient(x, y)
print rv_coefficient(y, x)
print rv_coefficient(x, x)
print rv_coefficient(y, y)
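Beyond these spot checks, the RV coefficient has two properties worth confirming: it equals 1 for any matrix with itself, and it is invariant to rescaling either argument (the scale factor cancels between numerator and denominator). A quick check we add, using the random matrices above:
# RV(X, X) = 1 and RV(cX, Y) = RV(X, Y) for any scalar c != 0
print np.isclose(rv_coefficient(x, x), 1.0)
print np.isclose(rv_coefficient(3.7 * x, y), rv_coefficient(x, y))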
As previously done with countries, a year-by-year correlation matrix will be built.
We first define the scope of the matrix by choosing which years will be analyzed.
number_of_years = len(years_available)
years_in_matrix = years_available
years_correlation = np.zeros([number_of_years, number_of_years])
print years_in_matrix
By looping over every pair of years and comparing their capability matrices, we create a correlation matrix.
We first build a dictionary where every key is a year and its value is that year's capability matrix. We do this so that each matrix is computed, and the database queried, only once:
year_matrix_dictionary = {}
for year in years_in_matrix:
    year_matrix_dictionary[year] = get_year_matrix(year)  # <- add parameters here if needed (e.g. normalization)
##############################################################
# To change the matrix parameters, modify the dictionary     #
# above; the loop below only reads from the cache.           #
##############################################################
# for every year A
for row in range(number_of_years):
    year_1 = years_in_matrix[row]
    year_1_matrix = year_matrix_dictionary[year_1]
    print year_1,  # progress indicator
    # for every year B
    for column in range(number_of_years):
        year_2 = years_in_matrix[column]
        year_2_matrix = year_matrix_dictionary[year_2]
        years_correlation[row, column] = rv_coefficient(year_1_matrix, year_2_matrix)
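Since RV(X, X) = 1, the diagonal of the correlation matrix should be all ones (provided no year matrix is identically zero); a quick check we add:
# every year should correlate perfectly with itself
print np.allclose(np.diag(years_correlation), 1.0)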
We now plot the correlation matrix.
plt.subplots(1,1,figsize=(9, 9))
plt.subplot(111)
sns.heatmap(years_correlation,square=True, cbar=True,cbar_kws={"shrink": .2}, yticklabels=years_in_matrix, xticklabels=years_in_matrix)
plt.title('Years Correlation Matrix: Unordered', size=13)
plt.show()
There seems to be a lot of data missing.
Let's plot the number of records in our database over time to get a better sense of how to approach the problem.
# get all of the data
data = DataFrame(connection_to_graph.data(years_available_q)).as_matrix()
raw = [int(a) for a in data[:-1, 0]]  # drop the trailing "Null" year
timeline = range(min(raw), max(raw) + 1)  # + 1 so the final year is included
qtties = []
# build a timeline and number of records.
for year in timeline:
if year not in raw:
qtties.append(0)
else:
idx = list(data[:, 0]).index(str(year))
qtties.append(data[idx, 1])
# re arrange it
amountOfRecords = np.column_stack((timeline, qtties))
# plot the graph
plt.style.use('seaborn-darkgrid')
plt.subplots(1,1,figsize=(16, 5))
plt.subplot(111)
plt.title("Number of assets over time")
plt.xlabel("Year")
plt.ylabel("Number of Available assets")
plt.plot(timeline, qtties)
plt.show()
To counteract the fact that our dataset is not uniformly distributed across the years, we will only consider the last 22 years (1997-2018).
number_of_years = 22
numbers_of_years_available = years_correlation.shape[0] - 1
years_in_matrix = years_available[:-1][-number_of_years:]
years_correlation = years_correlation[numbers_of_years_available - number_of_years:numbers_of_years_available, numbers_of_years_available - number_of_years:numbers_of_years_available]
We now rebuild and plot the heatmap of correlations.
plt.subplots(1,1,figsize=(8, 8))
plt.subplot(111)
sns.heatmap(years_correlation, cbar=True, cbar_kws={"shrink": .5},square=True, yticklabels=years_in_matrix, xticklabels=years_in_matrix)
plt.title('Years Correlation Matrix: Chronologically Ordered, last 22 years', size=13)
plt.show()
We save the matrix to a file:
np.savetxt("Exports/years_correlation.csv", years_correlation, delimiter=";")
Let us reorder the heatmap according to hierarchical clustering.
# plot the clustermap
a = sns.clustermap(years_correlation, figsize=(8, 8), xticklabels = years_in_matrix, yticklabels=years_in_matrix)
plt.show()
Let us see how strongly each year in our matrix correlates with the one before it. This way we can more easily detect discrepancies.
# remove first year
advanced_timeline = years_in_matrix[1::]
corr_with_pre = []
# iterate years and see their correlation
row = 1
col = 0
for year in advanced_timeline:
corr_with_pre.append(years_correlation[row, col])
row = row + 1
col = col + 1
# plot
plt.subplots(1,1,figsize=(15,7))
pal = sns.color_palette("Reds", len(corr_with_pre))  # palette length must match the number of bars
sns.barplot(np.arange(len(corr_with_pre)), corr_with_pre, palette=np.array(pal[::-1])[np.asarray(corr_with_pre).argsort().argsort()])
plt.xticks(np.arange(len(corr_with_pre)), advanced_timeline, rotation=90, fontsize=11)
plt.title('Correlation of year with previous year')
plt.ylabel('RV Coefficient')
plt.show()
Some years, such as 2006 or 2007, appear to have very low correlation with the preceding year. There also seems to be an overall tendency towards increasing correlation over the years.
And we save the result to a file.
np.savetxt("Exports/corellation with previous.csv", corr_with_pre, delimiter=";")
The following part of the analysis will focus on how certain process variables (Feedstocks, Processing Technologies and Outputs) evolve over time.
This can help answer questions such as: how has the relative number of records mentioning a given term changed over the years?
Let's start by creating a function such as:
f(term, type of process variable) = [array with the number of records containing the term in each year]
from __future__ import division
def get_records_of(startYear, endYear, term, process_type):
# make query
yearRangeQuery = """ MATCH (a:Asset)-[:CONTAINS]->(fs:{})
WHERE fs.term = "{}"
AND (toInteger(a.year)>={} AND toInteger(a.year)<={})
AND NOT a.year = "Null"
RETURN a.year, count(a)
ORDER BY a.year """.format(process_type, term, startYear, endYear)
# extract matrix
rawQuery = DataFrame(connection_to_graph.data(yearRangeQuery)).as_matrix()
    # create a matrix to store years, document counts and total document counts
    normalTimeline = np.arange(startYear, endYear + 1)
    completeMatrix = np.transpose(np.vstack((normalTimeline, normalTimeline, normalTimeline)))
    completeMatrix[:, 1::] = 0  # zero out the two count columns
# add number of docs found by query to matrix
for i in range(len(rawQuery[:, 0])):
for j in range(len(completeMatrix[:, 0])):
if int(rawQuery[i, 0]) == completeMatrix[j, 0]:
completeMatrix[j, 1] = rawQuery[i, 1]
# add total number of docs in that year to matrix
for i in range(len(completeMatrix[:, 0])):
for j in range(len(amountOfRecords[:, 0])):
if completeMatrix[i, 0] == amountOfRecords[j, 0]:
completeMatrix[i, 2] = amountOfRecords[j, 1]
# create a list of the normalized results
normalizedRecords = []
for i in range(len(completeMatrix[:, 0])):
if completeMatrix[i, 2] != 0:
normalizedRecords.append(float(completeMatrix[i, 1])/float(completeMatrix[i, 2]))
else:
normalizedRecords.append(0)
    # return a dictionary for easy access to all variables
result = {}
result['range'] = completeMatrix[:, 0].tolist()
result['nominal'] = completeMatrix[:, 1].tolist()
result['total'] = completeMatrix[:, 2].tolist()
result['normalized'] = normalizedRecords
return result
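As an aside, the two quadratic matching loops above can be replaced by one dictionary lookup per year; a sketch of the idea (the helper name is ours, purely illustrative):
# hypothetical helper: map year -> count once, then fill by lookup
def counts_per_year(raw_rows):
    return {int(row[0]): int(row[1]) for row in raw_rows}
# inside get_records_of one could then write, e.g.:
# query_counts = counts_per_year(rawQuery)
# completeMatrix[j, 1] = query_counts.get(completeMatrix[j, 0], 0)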
Now that the function is built, we can plot virtually any evolution.
As an example, let us see the evolution of records for biogas, ethanol and biodiesel.
listOfOutputs = ['biogas', 'ethanol', 'biodiesel']
start_year = 1990
end_year = 2017
# plot the graph
plt.style.use('seaborn-darkgrid')
plt.subplots(1,1,figsize=(16, 5))
plt.subplot(111)
plt.title("Evolution of Records with focus on Output")
plt.xlabel("Year")
plt.ylabel("Normalized Quantity")
for name in listOfOutputs:
nameData = get_records_of(start_year,end_year,name, 'Output')
plt.plot(nameData['range'], nameData['normalized'], label=name)
plt.legend()
plt.show()
Let us apply the same procedure to some processing technologies.
listOfProcTech = ['fermentation','enzymatic hydrolysis','hydrolysis' ]
start_year = 1990
end_year = 2017
# plot the graph
plt.style.use('seaborn-darkgrid')
plt.subplots(1,1,figsize=(16, 5))
plt.subplot(111)
plt.title("Evolution of Records with focus on Processing Technologies")
plt.xlabel("Year")
plt.ylabel("Normalized Quantity")
for name in listOfProcTech:
nameData = get_records_of(start_year,end_year,name, 'ProcessingTech')
plt.plot(nameData['range'], nameData['normalized'], label=name)
plt.legend()
plt.show()
Let us apply the same procedure to some feedstocks.
listOfFeed = ['sugar','wood','paper', 'algae', 'waste']
start_year = 1990
end_year = 2017
# plot the graph
plt.style.use('seaborn-darkgrid')
plt.subplots(1,1,figsize=(16, 5))
plt.subplot(111)
plt.title("Evolution of Records with focus on Feedstocks")
plt.xlabel("Year")
plt.ylabel("Normalized Quantity")
for name in listOfFeed:
nameData = get_records_of(start_year,end_year,name, 'Feedstock')
plt.plot(nameData['range'], nameData['normalized'], label=name)
plt.legend()
plt.show()
We start by comparing the evolution of the outputs studied above with the average oil price per gallon found on the following website.
We import the data and convert the monthly prices to yearly averages with the code below.
# get price per gallon in US dollars
oil_data = pd.read_csv('Data/GasData.csv', delimiter=',', header=None).as_matrix()[1::, :]
gallon = []
oil_years = sorted(set([int(e[0:4]) for e in oil_data[:, 0]]))[:-1]  # sorted: set iteration order is not guaranteed
for year in oil_years:
addition = 0
months = 0
for row in oil_data:
if str(year) in row[0]:
addition += float(row[1])
months += 1
average = addition / months
gallon.append(average)
# get price per barrel data
barrel = pd.read_csv('Data/GasDataNormalized.csv', delimiter=';', header=None).as_matrix()[:, 1].tolist()
oil_index = {'gallon':gallon, 'barrel':barrel}
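The same monthly-to-yearly averaging can be written more compactly with a pandas groupby; a sketch, assuming the first CSV column holds 'YYYY-MM' style dates and the second the price (the column names below are our own):
# equivalent aggregation with pandas
oil_df = pd.read_csv('Data/GasData.csv', delimiter=',', header=None, skiprows=1, names=['month', 'price'])
oil_df['year'] = oil_df['month'].str[0:4].astype(int)
yearly_average = oil_df.groupby('year')['price'].mean()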
Relationship Over Time
Let us visualize how the evolution of the price of gas relates to the normalized quantity of assets over time, in a chronological graph.
# define subplots
fig, ax1 = plt.subplots(figsize=(15,7))
listOfOutputs = ['biogas', 'bioplastic', 'butanol']
colors = ['b', 'y', 'g']
start_year = 1990
end_year = 2017
price_type = 'barrel'
# first axis
for position, outputName in enumerate(listOfOutputs):
nameData = get_records_of(start_year, end_year, outputName, 'Output')
ax1.plot(nameData['range'], nameData['normalized'], label=outputName, color=colors[position], ls='--', alpha=0.5)
ax1.set_xlabel('Years')
ax1.set_ylabel('Relative number of records')
ax1.tick_params('y')
ax1.set_title('Oil Price Vs. Asset Quantity')
ax1.legend(loc=2, frameon=True)
ax1.grid(False)
# second axis
ax2 = ax1.twinx()
ax2.plot(oil_years,oil_index[price_type], color='r', label='Oil Price')
ax2.set_ylabel('Price of {} of oil $US'.format(price_type), color='r')
ax2.tick_params('y', colors='r')
ax2.legend(loc=1, frameon=True)
# expose
plt.show()
Scatter Visualization
To study this relationship in more depth, we create a procedure that, given a certain term, plots its relationship with the price of gas.
# define terms
outPutToCompare = 'butanol'
typeOfProcessVariable = 'Output'
price_type = 'gallon'
# get data
data = get_records_of(1990, 2017, outPutToCompare, typeOfProcessVariable)['normalized']
# plot the figure
fig, ax1 = plt.subplots(figsize=(15,7))
sns.regplot(np.asarray(oil_index[price_type]), np.asarray(data) ,fit_reg=True, marker="+", color = 'g')
plt.title('Gas price relation with quantity of Assets: {}'.format(outPutToCompare))
plt.xlabel('Price of {} of oil in US$ in Year'.format(price_type))
plt.ylabel('Quantity of Asset {} in Year'.format(outPutToCompare))
plt.show()
# get correlation indexes (reusing the series computed above)
correlationIndexes = stats.pearsonr(np.asarray(oil_index[price_type]), np.asarray(data))
print 'Pearson Correlation Index: ', correlationIndexes[0]
print 'P-value: ', correlationIndexes[1]
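Pearson's r only captures linear association; as an additional robustness check (not part of the original analysis), a rank-based Spearman correlation takes one extra line:
# rank-based alternative, less sensitive to outliers and non-linearity
rho, p = stats.spearmanr(np.asarray(oil_index[price_type]), np.asarray(data))
print 'Spearman rho: ', rho
print 'P-value: ', p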
In the above graph each datapoint corresponds to a year.
Biggest Positive Correlations
# query for data
term_names_query = """ MATCH (a:Asset)-[:CONTAINS]->(fs:Output)
WHERE (toInteger(a.year)>=1990 AND toInteger(a.year)<=2017)
AND NOT a.year = "Null"
RETURN fs.term, count(a)
ORDER BY count(a) DESC"""
# get data from past scripts
oil_type = 'gallon'
term_names = list(DataFrame(connection_to_graph.data(term_names_query)).as_matrix()[:, 1].tolist())
correlations = []
p_values = []
# for every term, get its correlation with the price of oil
for term in term_names:
    data = get_records_of(1990, 2017, term, 'Output')['normalized']
    r, p = stats.pearsonr(data, oil_index[oil_type])  # compute once, unpack both values
    correlations.append(r)
    p_values.append(p)
# create a pandas dataframe for pretty printing.
oilDataFrame = pd.DataFrame(
{'Output Name': term_names,
'Pearson Correlation Index': correlations,
'P-value': p_values
})
oilDataFrame = oilDataFrame.sort_values('Pearson Correlation Index', ascending=False)
# print context
print 'The relationship between relative number of documents and price of oil over time:'
top = 10
# print data
print 'TOP {}:'.format(top)
display(oilDataFrame[:top])
Biggest Negative Correlations
# same approach, but now looking at the most negative correlations
term_names_query = """ MATCH (a:Asset)-[:CONTAINS]->(fs:Output)
WHERE (toInteger(a.year)>=1990 AND toInteger(a.year)<=2017)
AND NOT a.year = "Null"
RETURN fs.term, count(a)
ORDER BY count(a) DESC"""
oil_type = 'gallon'
term_names = list(DataFrame(connection_to_graph.data(term_names_query)).as_matrix()[:, 1].tolist())
correlations = []
p_values = []
for term in term_names:
    data = get_records_of(1990, 2017, term, 'Output')['normalized']
    r, p = stats.pearsonr(data, oil_index[oil_type])  # compute once, unpack both values
    correlations.append(r)
    p_values.append(p)
oilDataFrame = pd.DataFrame(
{'Output Name': term_names,
'Pearson Correlation Index': correlations,
'P-value': p_values
})
oilDataFrame = oilDataFrame.sort_values('Pearson Correlation Index', ascending=False)
print 'The relationship between relative number of documents and price of oil over time:'
bottom = -10
print 'BOTTOM {}:'.format(-bottom)
display(oilDataFrame[bottom:])
In this part we will perform the same analysis, this time taking a feedstock as an example: sugar.
Data was obtained here.
We start by importing the data.
sugar_data = pd.read_csv('Data/Sugar_Price.csv', delimiter=';', header=None).as_matrix()
sugar = {}
sugar['years'] = [int(e) for e in sugar_data[:, 0]]
sugar['nominal'] = [e for e in sugar_data[:, 1]]
sugar['real'] = [e for e in sugar_data[:, 2]]
Relationship Over Time
Let us see the evolution of Sugar prices side by side with the evolution of certain feedstocks in our database.
# define subplots
fig, ax1 = plt.subplots(figsize=(15,7))
feedstock_list = ['sugar', 'wood', 'sugarcane', 'sugar beet', 'cellulosic sugars']
colors = ['gold', 'mediumblue', 'm', 'green', 'k']
start_year = 1990
end_year = 2017
sugar_price_type = 'real'
# first axis
for position,feedstock in enumerate(feedstock_list):
data = get_records_of(start_year, end_year, feedstock, 'Feedstock')
ax1.plot(data['range'], data['normalized'], label=feedstock, ls='--', color=colors[position])
ax1.set_xlabel('Years')
ax1.set_ylabel('Relative number of records')
ax1.tick_params('y')
ax1.set_title('Sugar Prices Vs. Asset Quantity')
ax1.legend(loc=3, frameon=True)
ax1.grid(False)
# second axis
ax2 = ax1.twinx()
ax2.plot(sugar['years'], sugar[sugar_price_type], color='r', label='Sugar Price', ls='-')
ax2.set_ylabel('Price per kilo of sugar in $US (inflation adjusted)', color='r')
ax2.tick_params('y', colors='r')
ax2.legend(loc=1, frameon=True)
# expose
plt.show()
Scatter Example
Let us see a scatter plot where each point is a year and the x and y axes correspond to the price of sugar and the quantity of assets, respectively.
outPutToCompare = 'sugarcane'
typeOfProcessVariable = 'Feedstock'
price_type = 'real'
data = get_records_of(1990, 2017, outPutToCompare, typeOfProcessVariable)['normalized']
fig, ax1 = plt.subplots(figsize=(15,7))
sns.regplot(np.asarray(sugar[price_type]), np.asarray(data) ,fit_reg=True, marker="+", color = 'b')
plt.title('Sugar price relation with quantity of Assets: {}'.format(outPutToCompare))
plt.xlabel('Price of sugar US$ per kilo in Year ({})'.format(price_type))
plt.ylabel('Quantity of Asset {} in Year'.format(outPutToCompare))
plt.show()
Biggest Positive Correlations
Which feedstocks are most related to the price of sugar per kilo, in terms of their number of records?
term_names_query = """ MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
WHERE (toInteger(a.year)>=1990 AND toInteger(a.year)<=2017)
AND NOT a.year = "Null"
RETURN fs.term, count(a)
ORDER BY count(a) DESC"""
price_type = 'nominal'
term_names = list(DataFrame(connection_to_graph.data(term_names_query)).as_matrix()[:, 1].tolist())
correlations = []
p_values = []
for term in term_names:
    data = get_records_of(1990, 2017, term, 'Feedstock')['normalized']
    r, p = stats.pearsonr(data, sugar[price_type])  # compute once, unpack both values
    correlations.append(r)
    p_values.append(p)
sugarDataframe = pd.DataFrame(
{'Feedstock Name': term_names,
'Pearson Correlation Index': correlations,
'P-value': p_values
})
sugarDataframe = sugarDataframe.sort_values('Pearson Correlation Index', ascending=False)
print 'The relationship between relative number of documents and price per kilo of sugar:'
top = 10
print 'TOP {}:'.format(top)
display(sugarDataframe[:top])
Biggest Negative Correlations
term_names_query = """ MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
WHERE (toInteger(a.year)>=1990 AND toInteger(a.year)<=2017)
AND NOT a.year = "Null"
RETURN fs.term, count(a)
ORDER BY count(a) DESC"""
price_type = 'nominal'
term_names = list(DataFrame(connection_to_graph.data(term_names_query)).as_matrix()[:, 1].tolist())
correlations = []
p_values = []
for term in term_names:
    data = get_records_of(1990, 2017, term, 'Feedstock')['normalized']
    r, p = stats.pearsonr(data, sugar[price_type])  # compute once, unpack both values
    correlations.append(r)
    p_values.append(p)
sugarDataframe = pd.DataFrame(
{'Feedstock Name': term_names,
'Pearson Correlation Index': correlations,
'P-value': p_values
})
sugarDataframe = sugarDataframe.sort_values('Pearson Correlation Index', ascending=False)
print 'The relationship between relative number of documents and price per kilo of sugar:'
bottom = -10
print 'Bottom {}:'.format(bottom * -1)
display(sugarDataframe[bottom:])
Note: the absence of a true time-series analysis is a limitation of this approach.
In this part of the analysis the goal is to understand exactly which capabilities differ from year to year; more precisely, how one particular capability evolves over the course of two or more years.
For example, if in year X1 a share Y1% of the assets related to sugar, what is the corresponding share Y2% in year X2?
Let us visualize two different years side by side.
## call functions
first_year = 2010
second_year = 2017
colors='binary'
graph_holder = 0.005
fst_year_matrix = get_year_matrix(first_year, normalization=False)
scnd_year_matrix = get_year_matrix(second_year, normalization=False)
# create a figure with two side-by-side heatmaps
plt.figure(figsize=(17,17))
# first heatmap
plt.subplot(121)
sns.heatmap(fst_year_matrix , cmap=colors, cbar=True,cbar_kws={"shrink": .2}, square=True, xticklabels=False, yticklabels=False, vmax=graph_holder)
borders(1.5, 'k')
plt.title('Capability Matrix: {}'.format(first_year))
# second heatmap
plt.subplot(122)
sns.heatmap(scnd_year_matrix , cmap=colors, cbar=True,cbar_kws={"shrink": .2}, square=True, xticklabels=False, yticklabels=False, vmax=graph_holder)
borders(1.5, 'k')
plt.title('Capability Matrix: {}'.format(second_year))
plt.show()