In this part of the analysis, we will focus on organizations as the unit of technological capability.
Let us import all relevant libraries
from py2neo import Graph
import numpy as np
from pandas import DataFrame
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math
import pandas as pd
import plotly
import plotly.graph_objs as go
import qgrid
from scipy import stats, spatial
from sklearn.cluster.bicluster import SpectralBiclustering
import operator
from IPython.display import display, HTML
from matplotlib.colors import ListedColormap
# please add your plotly api credentials to plotly_config in your own machine. Visit https://plot.ly/python/getting-started/
plotly_config = json.load(open('plotly_config.json'))
plotly.tools.set_credentials_file(username=plotly_config['username'], api_key=plotly_config['key'])
local_connection_url = "http://localhost:7474/db/data"
connection_to_graph = Graph(local_connection_url)
Let us create a list of the danish organization that we want to focus on:
dk_organizations = ["TECH UNIV DENMARK",
"NOVOZYMES AS",
"AALBORG UNIV",
"UNIV COPENHAGEN",
"AARHUS UNIV",
"UNIV SO DENMARK",
"RISO DTU",
"DONG ENERGY"]
Then we query all of the organziations in the database.
# query orgs
org_available_q = """ MATCH (n:Asset)
WITH n.owner as ORG
RETURN ORG, count(ORG)
ORDER BY count(ORG) DESC"""
# create a list with the years where records exist
raw_data = DataFrame(connection_to_graph.data(org_available_q)).as_matrix()[:, 0]
organizations = list(raw_data)
# print an example organization
print 'We found {} organizations'.format(len(organizations))
Let us get the names of all of the terms in the biofuel dictionnary
f_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock) RETURN fs.term, count(a)')).as_matrix()[:, 1]))
o_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:Output) RETURN fs.term, count(a)')).as_matrix()[:, 1]))
pt_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:ProcessingTech) RETURN fs.term, count(a)')).as_matrix()[:, 1]))
bbo = list(set(f_terms + pt_terms + o_terms))
print len(bbo)
matrix_axis_names = bbo
def find_index(something, in_list):
return in_list.index(something)
We create a function that given an organization, returns its capability matrix
def get_org_matrix(org, normalization=True):
# define queries
org_no_interestions = """ MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
MATCH (a:Asset)-[:CONTAINS]->(out:Output)
MATCH (a:Asset)-[:CONTAINS]->(pt:ProcessingTech)
WHERE a.owner CONTAINS "{}"
RETURN fs.term, pt.term, out.term, count(a)
""".format(org)
process_variables = ['Feedstock', 'Output', 'ProcessingTech']
org_intersections = """ MATCH (a:Asset)-[:CONTAINS]->(fs:{})
MATCH (a:Asset)-[:CONTAINS]->(t:{})
WHERE fs<>t AND a.owner CONTAINS "{}"
RETURN fs.term, t.term, count(a)
"""
total_documents_q = """ MATCH (n:Asset)
WHERE n.owner contains "{}"
RETURN count(n)""".format(org)
total_docs = DataFrame(connection_to_graph.data(total_documents_q)).as_matrix()[0][0]
# get data
data_no_intersections = DataFrame(connection_to_graph.data(org_no_interestions)).as_matrix()
# create matrix
org_matrix = np.zeros([len(matrix_axis_names), len(matrix_axis_names)])
# for no intersections data
for row in data_no_intersections:
# the last column is the frequency (count)
frequency = row[0]
indexes = [find_index(element, matrix_axis_names) for element in row[1::]]
# add frequency value to matrix position
for pair in itertools.combinations(indexes, 2):
org_matrix[pair[0], pair[1]] += frequency
org_matrix[pair[1], pair[0]] += frequency
# for intersecting data
for category in process_variables:
process_data = DataFrame(connection_to_graph.data(org_intersections.format(category, category, org))).as_matrix()
for row in process_data:
frequency = row[0]
indexes = [find_index(element, matrix_axis_names) for element in row[1::]]
# add frequency value to matrix position
for pair in itertools.combinations(indexes, 2):
org_matrix[pair[0], pair[1]] += frequency / 2 # Divided by two because query not optimized
org_matrix[pair[1], pair[0]] += frequency / 2 # Divided by two because query not optimized
# normalize
normalized_org_matrix = org_matrix / total_docs
# dynamic return
if normalization == True:
return normalized_org_matrix
else:
return org_matrix
Some auxiliary functions to study organizations:
def check_symmetric(a, tol):
return np.allclose(a, a.T, atol=tol)
def basic_stats(a_matrix):
print 'Rows:', a_matrix.shape[0]
print 'Columns:', a_matrix.shape[1]
print 'Mean: ', np.mean(a_matrix)
print 'Standart Deviation', np.std(a_matrix)
print 'Max: ', np.amax(a_matrix)
print 'Min: ', np.amin(a_matrix)
print 'Symmetry: ', check_symmetric(a_matrix, 1e-8)
print ''
Let's look at DTU's matrix:
univ = 'TECH UNIV DENMARK'
univ_matrix = get_org_matrix(univ, normalization=True)
basic_stats(univ_matrix)
plt.subplots(1,1,figsize=(8, 8))
plt.subplot(111)
sns.heatmap(univ_matrix, cmap='binary', cbar_kws={"shrink": .2},cbar=True, square=True, yticklabels=False, xticklabels=False)
plt.show()
i,j = np.unravel_index(univ_matrix.argmax(), univ_matrix.shape)
print 'The maximum value of the {} matrix is in position {} with value {} and concerns {} and {}.'.format(univ, (i, j), univ_matrix[i,j], matrix_axis_names[i], matrix_axis_names[j])
A function that returns the capability list from the capability matrix
def get_list_from(matrix):
only_valuable = []
extension = 1
for row_number in range(matrix.shape[0]):
only_valuable.append(matrix[row_number, extension:matrix.shape[0]].tolist()) # numpy functions keep 0s so I hard coded it.
extension += 1
return [element for column in only_valuable for element in column ]
We now add the list of DK organizations and the organizations with more than 7 assets:
# query orgs
assetLimit = 7
org_available_q = """ MATCH (n:Asset)
WITH n.owner as ORG
RETURN ORG, count(ORG)
ORDER BY count(ORG) DESC"""
# create a list with the years where records exist from 1 to remove null
raw_data = DataFrame(connection_to_graph.data(org_available_q)).as_matrix()[1::]
list_of_organizations = []
for row in raw_data:
if row[1] >= assetLimit:
list_of_organizations.append(row[0])
organizations = list(set(list_of_organizations + dk_organizations))
print 'The list of organizations now has {} organizations.'.format(len(organizations))
Create a dictionnary of organization capability lists for efficiency.
# create dictionnary
org_capability_dict = {}
counter = 0
# iterate through countries
for org in organizations:
counter += 1
org_matrix = get_org_matrix(org.encode('utf-8').strip(), normalization=True)
org_list = get_list_from(org_matrix)
# discart if no information
if np.all(np.isnan(org_matrix)) or sum(org_list)==0.0:
print org
continue
else:
org_capability_dict[org] = get_list_from(org_matrix)
A function that calculates the correlation:
def calculate_org_correlation(org1_list, org2_list, stat):
avg_dif = np.mean(org1_list - org2_list)
abs_avg_dif = abs(avg_dif)
if stat.lower() == 'absolute average difference': # return absolute average difference
return abs_avg_dif
if stat == 'Pearson': # return Pearson coef
return stats.pearsonr(org1_list, org2_list)[0]
if stat == 'P-value': # return P-value
return stats.pearsonr(org1_list, org2_list)[1]
Let's print all the organizations:
org_names = org_capability_dict.keys()
org_names.sort()
number_of_orgs = len(org_names)
print 'There are {} organizations.'.format(number_of_orgs)
print 'List of Organizations:'
print org_names
And we build the correlation matrix
idxs = [org_names.index(e) for e in dk_organizations]
my_label_colors = {}
for name in org_names:
if name in dk_organizations:
my_label_colors[name] = 'r'
else:
my_label_colors[name] = 'black'
org_correlation = np.zeros([number_of_orgs, number_of_orgs])
for row in range(number_of_orgs):
org_1 = org_names[row]
org_1_list = np.asarray(org_capability_dict[org_1])
for column in range(number_of_orgs):
org_2 = org_names[column]
org_2_list = np.asarray(org_capability_dict[org_2])
org_correlation[row, column] = calculate_org_correlation(org_1_list, org_2_list, 'Pearson')
print len(sorted(org_capability_dict.keys()))
print 'Minimum correlation value is {} for organizations {} and {}.'.format(org_correlation[np.unravel_index(org_correlation.argmin(),
org_correlation.shape)[0],
np.unravel_index(org_correlation.argmin(),
org_correlation.shape)[1]],
org_names[np.unravel_index(org_correlation.argmin(),
org_correlation.shape)[0]],
org_names[np.unravel_index(org_correlation.argmin(),
org_correlation.shape)[1]])
plt.subplots(1,1,figsize=(20, 20))
plt.subplot(111)
sns.heatmap(org_correlation, cbar=True, cbar_kws={"shrink": .5}, square=True, yticklabels=org_names, xticklabels=org_names)
ax = plt.gca()
xlbls = ax.get_xmajorticklabels()
for lbl in xlbls:
lbl.set_color(my_label_colors[lbl.get_text()])
ylbls = ax.get_ymajorticklabels()
for lbl in ylbls:
lbl.set_color(my_label_colors[lbl.get_text()])
plt.title('Organization Correlation Matrix: Unordered', size=13)
plt.show()
We apply hierarchical clustering:
# plot the clustermap
g = sns.clustermap(org_correlation, figsize=(20, 20), xticklabels = org_names, yticklabels=org_names)
teste = list(g.dendrogram_col.reordered_ind)
for tick_label in g.ax_heatmap.axes.get_yticklabels():
tick_label.set_color(my_label_colors[tick_label.get_text()])
for tick_label in g.ax_heatmap.axes.get_xticklabels():
tick_label.set_color(my_label_colors[tick_label.get_text()])
plt.show()
We select some organizations:
foc_org = ['TECH UNIV DENMARK', 'TSINGHUA UNIV', 'NOVOZYMES AS', 'UNIV CALIF BERKELEY']
And plot their profiles:
# for each country selected
for org in foc_org:
# find the matrix slice
org_index = find_index(org, org_names)
histogram_data = org_correlation[:, org_index]
# remove the country itself from data and labels
histogram_data = np.delete(histogram_data, org_index)
clean_org_names = np.delete(org_names, org_index)
# sort labels and data
sorted_names = [name for _,name in sorted(zip(histogram_data, clean_org_names))]
histogram_data.sort()
#plot
plt.subplots(1,1,figsize=(20,7))
sns.barplot(np.arange(len(histogram_data)), histogram_data * 100, palette="Reds_d")
plt.xticks(np.arange(len(histogram_data)), sorted_names, rotation=90, fontsize=11)
plt.title('Organization Profile: {}'.format(org))
plt.ylabel('Correlation Percentage')
ax = plt.gca()
xlbls = ax.get_xmajorticklabels()
for lbl in xlbls:
lbl.set_color(my_label_colors[lbl.get_text()])
plt.show()