Micro Level Analysis: Focusing on Organizations

In this part of the analysis, we will focus on organizations as the unit of technological capability.

Let us import all relevant libraries

In [1]:
from py2neo import Graph
import numpy as np 
from pandas import DataFrame
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math
import pandas as pd
import plotly 
import plotly.graph_objs as go
import qgrid
from scipy import stats, spatial
from sklearn.cluster.bicluster import SpectralBiclustering
import operator
from IPython.display import display, HTML
from matplotlib.colors import ListedColormap

# please add your plotly api credentials to plotly_config in your own machine. Visit https://plot.ly/python/getting-started/
plotly_config = json.load(open('plotly_config.json'))
plotly.tools.set_credentials_file(username=plotly_config['username'], api_key=plotly_config['key'])

local_connection_url = "http://localhost:7474/db/data"
connection_to_graph = Graph(local_connection_url)

1.Characterizing organizations

1.1.Preparations

Let us create a list of the danish organization that we want to focus on:

In [2]:
dk_organizations = ["TECH UNIV DENMARK", 
                   "NOVOZYMES AS", 
                   "AALBORG UNIV", 
                   "UNIV COPENHAGEN", 
                   "AARHUS UNIV", 
                   "UNIV SO DENMARK", 
                   "RISO DTU", 
                   "DONG ENERGY"]

Then we query all of the organziations in the database.

In [3]:
# query orgs
org_available_q = """       MATCH (n:Asset)
                            WITH n.owner as ORG
                            RETURN ORG, count(ORG)
                            ORDER BY count(ORG) DESC"""

# create a list with the years where records exist
raw_data = DataFrame(connection_to_graph.data(org_available_q)).as_matrix()[:, 0]
organizations = list(raw_data)

# print an example organization
print 'We found {} organizations'.format(len(organizations))
We found 10638 organizations

Let us get the names of all of the terms in the biofuel dictionnary

In [4]:
f_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock) RETURN  fs.term, count(a)')).as_matrix()[:, 1]))
o_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:Output) RETURN  fs.term, count(a)')).as_matrix()[:, 1]))
pt_terms = list(set(DataFrame(connection_to_graph.data('MATCH (a:Asset)-[:CONTAINS]->(fs:ProcessingTech) RETURN  fs.term, count(a)')).as_matrix()[:, 1]))
bbo = list(set(f_terms + pt_terms + o_terms))
print len(bbo)
matrix_axis_names = bbo
352
In [5]:
def find_index(something, in_list):
    return in_list.index(something)

We create a function that given an organization, returns its capability matrix

In [6]:
def get_org_matrix(org, normalization=True):
    
    # define queries
    org_no_interestions = """   MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
                                    MATCH (a:Asset)-[:CONTAINS]->(out:Output)
                                    MATCH (a:Asset)-[:CONTAINS]->(pt:ProcessingTech)
                                    WHERE a.owner CONTAINS "{}"
                                    RETURN fs.term, pt.term, out.term, count(a)
                                    """.format(org)
    
    process_variables = ['Feedstock', 'Output', 'ProcessingTech']
    
    org_intersections = """     MATCH (a:Asset)-[:CONTAINS]->(fs:{})
                                    MATCH (a:Asset)-[:CONTAINS]->(t:{})
                                    WHERE fs<>t AND a.owner CONTAINS "{}"
                                    RETURN fs.term, t.term, count(a)
                                    """
    
    
    total_documents_q = """ MATCH (n:Asset)
                            WHERE n.owner contains "{}"
                            RETURN  count(n)""".format(org)

    total_docs = DataFrame(connection_to_graph.data(total_documents_q)).as_matrix()[0][0]

    
    # get data
    data_no_intersections = DataFrame(connection_to_graph.data(org_no_interestions)).as_matrix()
    
    # create matrix
    org_matrix = np.zeros([len(matrix_axis_names), len(matrix_axis_names)])
    
    # for no intersections data
    for row in data_no_intersections:
        # the last column is the frequency (count)
        frequency = row[0]
        indexes = [find_index(element, matrix_axis_names) for element in row[1::]]
        # add frequency value to matrix position
        for pair in itertools.combinations(indexes, 2):
            org_matrix[pair[0], pair[1]] += frequency
            org_matrix[pair[1], pair[0]] += frequency
    
    # for intersecting data
    for category in process_variables:
        process_data = DataFrame(connection_to_graph.data(org_intersections.format(category, category, org))).as_matrix()
        for row in process_data:
            frequency = row[0]
            indexes = [find_index(element, matrix_axis_names) for element in row[1::]]
            # add frequency value to matrix position
            for pair in itertools.combinations(indexes, 2):
                org_matrix[pair[0], pair[1]] += frequency / 2 # Divided by two because query not optimized
                org_matrix[pair[1], pair[0]] += frequency / 2 # Divided by two because query not optimized
    
    # normalize
    normalized_org_matrix = org_matrix / total_docs
    
    # dynamic return 
    if normalization == True:
        return normalized_org_matrix
    else: 
        return org_matrix

Some auxiliary functions to study organizations:

In [7]:
def check_symmetric(a, tol):
    return np.allclose(a, a.T, atol=tol)

def basic_stats(a_matrix):
    print 'Rows:', a_matrix.shape[0]
    print 'Columns:', a_matrix.shape[1]
    print 'Mean: ', np.mean(a_matrix)
    print 'Standart Deviation', np.std(a_matrix)
    print 'Max: ', np.amax(a_matrix)
    print 'Min: ', np.amin(a_matrix)
    print 'Symmetry: ', check_symmetric(a_matrix, 1e-8)
    print ''

1.2.Capability Matrix of the Technical University of Dennmark

Let's look at DTU's matrix:

In [8]:
univ = 'TECH UNIV DENMARK'
univ_matrix = get_org_matrix(univ, normalization=True)
basic_stats(univ_matrix)


plt.subplots(1,1,figsize=(8, 8))
plt.subplot(111)
sns.heatmap(univ_matrix, cmap='binary', cbar_kws={"shrink": .2},cbar=True, square=True, yticklabels=False, xticklabels=False)
plt.show()


i,j = np.unravel_index(univ_matrix.argmax(), univ_matrix.shape)
print 'The maximum value of the {} matrix is in position {} with value {} and concerns {} and {}.'.format(univ, (i, j), univ_matrix[i,j], matrix_axis_names[i], matrix_axis_names[j])
Rows: 352
Columns: 352
Mean:  0.0003732728564049587
Standart Deviation 0.007802062471353913
Max:  0.828125
Min:  0.0
Symmetry:  True

The maximum value of the TECH UNIV DENMARK matrix is in position (90, 164) with value 0.828125 and concerns  anaerobic digestion and biogas.

2.Organization correlation matrix and organization profiles

2.1.Correlation Matrices

A function that returns the capability list from the capability matrix

In [9]:
def get_list_from(matrix):
    only_valuable = []
    extension = 1
    for row_number in range(matrix.shape[0]):
        only_valuable.append(matrix[row_number, extension:matrix.shape[0]].tolist()) # numpy functions keep 0s so I hard coded it. 
        extension += 1 
    return [element for column in only_valuable for element in column ]

We now add the list of DK organizations and the organizations with more than 7 assets:

In [10]:
# query orgs
assetLimit = 7
org_available_q = """     MATCH (n:Asset)
                            WITH n.owner as ORG
                            RETURN ORG, count(ORG)
                            ORDER BY count(ORG) DESC"""

# create a list with the years where records exist from 1 to remove null
raw_data = DataFrame(connection_to_graph.data(org_available_q)).as_matrix()[1::] 
list_of_organizations = []
for row in raw_data:
    if row[1] >= assetLimit:
        list_of_organizations.append(row[0])

organizations = list(set(list_of_organizations + dk_organizations))
print 'The list of organizations now has {} organizations.'.format(len(organizations))
The list of organizations now has 115 organizations.

Create a dictionnary of organization capability lists for efficiency.

In [11]:
# create dictionnary
org_capability_dict = {}
counter = 0

# iterate through countries
for org in organizations:
    
    counter += 1
    org_matrix = get_org_matrix(org.encode('utf-8').strip(), normalization=True)
    org_list = get_list_from(org_matrix)
    
    
    # discart if no information
    if np.all(np.isnan(org_matrix)) or sum(org_list)==0.0:
        print org
        continue
        
    else: 
        org_capability_dict[org] = get_list_from(org_matrix)
        
UNIV SOUTH CHINA TECHNOLOGY
PEUGEOT CITROEN AUTOMOBILES SA
L'ORANGE GMBH

A function that calculates the correlation:

In [12]:
def calculate_org_correlation(org1_list, org2_list, stat):
    avg_dif = np.mean(org1_list - org2_list)
    abs_avg_dif = abs(avg_dif)
    
    if stat.lower() == 'absolute average difference': # return absolute average difference
        return abs_avg_dif
    if stat == 'Pearson':                             # return Pearson coef
        return stats.pearsonr(org1_list, org2_list)[0]
    if stat == 'P-value':                             # return P-value
        return stats.pearsonr(org1_list, org2_list)[1]

Let's print all the organizations:

In [13]:
org_names = org_capability_dict.keys()

org_names.sort()
number_of_orgs = len(org_names)
print 'There are {} organizations.'.format(number_of_orgs)
print 'List of Organizations:'
print org_names
There are 112 organizations.
List of Organizations:
['AALBORG UNIV', 'AARHUS UNIV', u'ABELLON CLEANENERGY LTD', u'ABENGOA BIOENERGIA NUEVAS TECNOLOGIAS SA', u'ABENGOA BIOENERGY CORP.', u'AGENCIA ESTATAL CONSEJO SUPERIOR DEINVESTIGACIONES CIENTIFICAS', u'AMYRIS, INC.', u'ARCHER DANIELS MIDLAND', u'ARCHER-DANIELS MIDLAND CO', u'ARGONNE NATL LAB', u'ARISTOTLE UNIV THESSALONIKI', u'ARS', u'BAYER CROPSCIENCE NV', u'BEIJING FORESTRY UNIV', u'BEIJING UNIV CHEM TECHNOL', u'BETA RENEWABLES', u'BUNGE ACUCAR E BIOENERGIA LTDA', u'CHEVRON USA INC;CHEVRON USA INC', u'CHINA AGR UNIV', u'CHINESE ACAD SCI', u'CHINESE ACAD SCI;CHINESE ACAD SCI', u'CHINESE ACAD SCI;UNIV CHINESE ACAD SCI', u'COPERSUCAR S.A.', u'CORNELL UNIV;CORNELL UNIV', u'CSIR', u'CSIR IICT', 'DONG ENERGY', u'E CHINA UNIV SCI & TECHNOL', u'ECOLE POLYTECH FED LAUSANNE', u'EQUOS RES KK', u'FLINT HILLS RESOURCES, LP', u'FOREST CONCEPTS LLC', u'GREEN PLAINS RENEWABLE ENERGY', u'HELIAE DEV LLC', u'INDIAN INST CHEM TECHNOL', u'INDIAN INST TECHNOL', u'INDIAN INST TECHNOL;INDIAN INST TECHNOL', u'IOWA STATE UNIV', u'IOWA STATE UNIV;IOWA STATE UNIV', u'IOWA STATE UNIV;IOWA STATE UNIV;IOWA STATE UNIV', u'JIANGNAN UNIV', u'KOREA ADV INST SCI & TECHNOL', u'KOREA INST ENERGY RES', u'KOREA INST SCI & TECHNOLOGY', u'LOUISIANA STATE UNIV', u'LUND UNIV', u'MAX-PLANCK-GESELLSCHAFT ZUR FORDERUNG DER WISSENSCHAFTEN EV', u'MICHIGAN STATE UNIV', u'MICHIGAN STATE UNIV;MICHIGAN STATE UNIV', u'MONSANTO TECHNOLOGY LLC', u'MONSANTO TECHNOLOGY LLC;MONSANTO TECHNOLOGY LLC', u'N CAROLINA STATE UNIV', u'NANDAN BIOMATRIX LTD', u'NANYANG TECHNOL UNIV', u'NATL CHENG KUNG UNIV;NATL CHENG KUNG UNIV;NATL CHENG KUNG UNIV', u'NATL RENEWABLE ENERGY LAB', u'NATL UNIV SINGAPORE', 'NOVOZYMES AS', u'ODEBRECHT AGROINDUSTRIAL S.A.', u'OHIO STATE UNIV', u'QINGDAO JINXIU SHUIYUAN TRADING CO LTD', u'RAA\xadZEN ENERGIA S.A.', 'RISO DTU', u'ROCKWELL AUTOMATION TECHNOLOGIES INC', u'RURAL DEV ADMINISTRATION', u'S CHINA UNIV TECHNOL', u'SHANGHAI JIAO TONG UNIV', u'SHELL INT RES MIJ BV;SHELL OIL CO', u'SONY CORP;SONY CORP', u'SOUTHEAST UNIV', u'SWEDISH UNIV AGR SCI', u'TECH UNIV DENMARK', u'TEXAS A&M UNIV', u'TIANJIN UNIV', u'TOYOTA JIDOSHA KK', u'TSINGHUA UNIV', u'UNICAMP UNIV ESTADUAL CAMPINAS', u'UNIV ALBERTA', u'UNIV ALMERIA', u'UNIV BRITISH COLUMBIA', u'UNIV CALIF BERKELEY', u'UNIV CALIF DAVIS', u'UNIV CALIFORNIA', u'UNIV CALIFORNIA;UNIV CALIFORNIA', 'UNIV COPENHAGEN', u'UNIV FEDERAL DO PARANA', u'UNIV GEORGIA', u'UNIV GEORGIA RES FOUND INC', u'UNIV KOREA RES & BUSINESS FOUND', u'UNIV MINHO', u'UNIV MINNESOTA', u'UNIV MONTREAL', u'UNIV OVIEDO', u'UNIV PATRAS', u'UNIV SAINS MALAYSIA', 'UNIV SO DENMARK', u'UNIV STELLENBOSCH', u'UNIV TEXAS SYSTEM', u'UNIV UTRECHT', u'UNIV WASHINGTON', u'UNIV WESTERN ONTARIO', u'UOP LLC', u'UOP LLC;UOP LLC', u'USDA ARS', u'USINA DE AA\xa7A\xbaCAR SANTA TEREZINHA LTDA', u'UT-BATTELLE LLC', u'VALERO RENEWABLE FUELS', u'VALERO RENEWABLE FUELS COMPANY, LLC', u'WASHINGTON STATE UNIV', u'WISCONSIN ALUMNI RES FOUND', u'XINAO SCI & TECHNOLOGY DEV CO LTD', u'ZHEJIANG UNIV']

And we build the correlation matrix

In [14]:
idxs = [org_names.index(e) for e in dk_organizations]
my_label_colors = {}
for name in org_names:
        if name in dk_organizations:
            my_label_colors[name] = 'r'
        else:
            my_label_colors[name] = 'black'
In [15]:
org_correlation = np.zeros([number_of_orgs, number_of_orgs])
for row in range(number_of_orgs):
    org_1 = org_names[row]
    org_1_list = np.asarray(org_capability_dict[org_1])
    for column in range(number_of_orgs):
        org_2 = org_names[column]
        org_2_list = np.asarray(org_capability_dict[org_2])

        org_correlation[row, column] = calculate_org_correlation(org_1_list, org_2_list, 'Pearson')
        
In [16]:
print len(sorted(org_capability_dict.keys()))
112
In [17]:
print 'Minimum correlation value is {} for organizations {} and {}.'.format(org_correlation[np.unravel_index(org_correlation.argmin(),
                                             org_correlation.shape)[0],
                            np.unravel_index(org_correlation.argmin(), 
                                             org_correlation.shape)[1]],
        org_names[np.unravel_index(org_correlation.argmin(), 
                                       org_correlation.shape)[0]], 
        org_names[np.unravel_index(org_correlation.argmin(), 
                                       org_correlation.shape)[1]])
Minimum correlation value is -0.00220367057387 for organizations ABENGOA BIOENERGIA NUEVAS TECNOLOGIAS SA and FOREST CONCEPTS LLC.
In [18]:
plt.subplots(1,1,figsize=(20, 20))
plt.subplot(111)
sns.heatmap(org_correlation, cbar=True, cbar_kws={"shrink": .5}, square=True, yticklabels=org_names, xticklabels=org_names)
ax = plt.gca()
xlbls = ax.get_xmajorticklabels()
for lbl in xlbls:
    lbl.set_color(my_label_colors[lbl.get_text()])
ylbls = ax.get_ymajorticklabels()
for lbl in ylbls:
    lbl.set_color(my_label_colors[lbl.get_text()])
plt.title('Organization Correlation Matrix: Unordered', size=13)
plt.show()

We apply hierarchical clustering:

In [19]:
# plot the clustermap
g = sns.clustermap(org_correlation,  figsize=(20, 20), xticklabels = org_names, yticklabels=org_names)
teste = list(g.dendrogram_col.reordered_ind)

for tick_label in g.ax_heatmap.axes.get_yticklabels():
    tick_label.set_color(my_label_colors[tick_label.get_text()])
for tick_label in g.ax_heatmap.axes.get_xticklabels():
    tick_label.set_color(my_label_colors[tick_label.get_text()])

plt.show()

2.2.Organization profiles

We select some organizations:

In [20]:
foc_org = ['TECH UNIV DENMARK', 'TSINGHUA UNIV', 'NOVOZYMES AS', 'UNIV CALIF BERKELEY']

And plot their profiles:

In [21]:
# for each country selected
for org in foc_org:
    
    # find the matrix slice
    org_index = find_index(org, org_names)
    histogram_data = org_correlation[:, org_index]
    
    # remove the country itself from data and labels
    histogram_data = np.delete(histogram_data, org_index)
    clean_org_names = np.delete(org_names, org_index)
    

    # sort labels and data
    sorted_names = [name for _,name in sorted(zip(histogram_data, clean_org_names))]
    histogram_data.sort()
    
    #plot
    plt.subplots(1,1,figsize=(20,7))
    sns.barplot(np.arange(len(histogram_data)), histogram_data * 100, palette="Reds_d")
    plt.xticks(np.arange(len(histogram_data)), sorted_names, rotation=90, fontsize=11)
    plt.title('Organization Profile: {}'.format(org))
    plt.ylabel('Correlation Percentage')
    
    ax = plt.gca()
    xlbls = ax.get_xmajorticklabels()
    for lbl in xlbls:
        lbl.set_color(my_label_colors[lbl.get_text()])
    
    
    plt.show()