Capability Correlations

Let's start by importing all of the necessary libraries to conduct the analysis.

In [1]:
from py2neo import Graph
import numpy as np 
from pandas import DataFrame
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math
import pandas as pd
import plotly 
import plotly.graph_objs as go
import qgrid
from scipy import stats, spatial
from sklearn.cluster.bicluster import SpectralBiclustering
import operator
import math
from IPython.display import display, HTML
import collections

# please add your plotly api credentials to plotly_config in your own machine. Visit https://plot.ly/python/getting-started/
plotly_config = json.load(open('plotly_config.json'))
plotly.tools.set_credentials_file(username=plotly_config['username'], api_key=plotly_config['key'])

1. Patent and Publication differences on term pairs

In order to establish a base for understading the basis of the work, we wish to understand the occurence of severall feedstocks, processing technologies and outputs in our database.

For example, how many assets (patents, papers, etc) contain the mix of processing technology X for output Y?

To understand this in a more general way, the AMICA database will be transformed in an co-occurence matrix. This means, in the above described example that the number of assets that contain that mix will be an entry in a matrix, whyle the corresponfing technology and output will be columns/lines of the matrix.

1.1. Axis building

We start by creating, like previsouly a list of terms (feedstocks, processing technologies and outputs) so these become the axis of the capability matrix.

In [2]:
local_connection_url = "http://localhost:7474/db/data"
connection_to_graph = Graph(local_connection_url)
In [3]:
query_no_interestions = """     MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
                                MATCH (a:Asset)-[:CONTAINS]->(out:Output)
                                MATCH (a:Asset)-[:CONTAINS]->(pt:ProcessingTech)
                                RETURN fs.term, pt.term, out.term, count(a)
                        """
In [4]:
# issue: this query needs to be divided by two when building the matrix -> NON OPTIMIZED
process_variables = ['Feedstock', 'Output', 'ProcessingTech']
query_intersections = """       MATCH (a:Asset)-[:CONTAINS]->(fs:{})
                                MATCH (a:Asset)-[:CONTAINS]->(t:{})
                                WHERE fs<>t 
                                RETURN fs.term, t.term, count(a)
                      """
In [5]:
# Return query as pandas dataframe 
data_no_intersections = DataFrame(connection_to_graph.data(query_no_interestions)).as_matrix()

# Get axis names from columns and append to list
feedstock_names = set(list(data_no_intersections[:, 1]))
processing_technology_names = set(list(data_no_intersections[:, 2]))
output_names = set(list(data_no_intersections[:, 3]))
matrix_axis_names = list(feedstock_names) + list(processing_technology_names) + list(output_names)

# Extra labels that only appear in non-intersection queries
for category in process_variables:
    data_no_intersections = DataFrame(connection_to_graph.data(query_intersections.format(category, category))).as_matrix()
    for column_number in range(1,3):
        column = data_no_intersections[:, column_number]
        for name in column:
            if name not in matrix_axis_names:
                matrix_axis_names.append(name)
                
print len(matrix_axis_names)
342

Our matrix will have a total of 342 rows and 342 columns.

1.2. Function Design

We start by creating a function that given an asset (e.g."Patent", or "Publication"), returns a matrix where each entry correponds to the number of documents containing a certain term. For example:

matrix[i, j] = z

norm_matrix[i, j] = w

There are z documents containing the term i and j.

Where: norm_matrix[i,j] = (matrix[i, j] - mean(matrix)) / standard_deviation(matrix)

In [6]:
def get_asset_matrix(asset, normalization=True):
    
    # define queries
    asset_no_interestions = """     MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
                                    MATCH (a:Asset)-[:CONTAINS]->(out:Output)
                                    MATCH (a:Asset)-[:CONTAINS]->(pt:ProcessingTech)
                                    WHERE a.type = "{}"
                                    RETURN fs.term, pt.term, out.term, count(a)
                                    """.format(asset)
    
    process_variables = ['Feedstock', 'Output', 'ProcessingTech']
    
    asset_intersections = """       MATCH (a:Asset)-[:CONTAINS]->(fs:{})
                                    MATCH (a:Asset)-[:CONTAINS]->(t:{})
                                    WHERE fs<>t AND a.type = "{}"
                                    RETURN fs.term, t.term, count(a)
                                    """
    # get data
    data_no_intersections = DataFrame(connection_to_graph.data(asset_no_interestions)).as_matrix()
    
    # create matrix
    asset_matrix = np.zeros([len(matrix_axis_names), len(matrix_axis_names)])
    
    # for no intersections data
    for row in data_no_intersections:
        # the last column is the frequency (count)
        frequency = row[0]
        indexes = [matrix_axis_names.index(element) for element in row[1::]]
        # add frequency value to matrix position
        for pair in itertools.combinations(indexes, 2):
            asset_matrix[pair[0], pair[1]] += frequency
            asset_matrix[pair[1], pair[0]] += frequency
    
    # for intersecting data
    for category in process_variables:
        process_data = DataFrame(connection_to_graph.data(asset_intersections.format(category, category, asset))).as_matrix()
        for row in process_data:
            frequency = row[0]
            indexes = [matrix_axis_names.index(element) for element in row[1::]]
            # add frequency value to matrix position
            for pair in itertools.combinations(indexes, 2):
                asset_matrix[pair[0], pair[1]] += frequency / 2 # Divided by two because query not optimized
                asset_matrix[pair[1], pair[0]] += frequency / 2 # Divided by two because query not optimized
    
    # normalize
    normalized_asset_matrix = (asset_matrix - np.mean(asset_matrix)) / np.std(asset_matrix)
    
    # dynamic return 
    if normalization == True:
        return normalized_asset_matrix
    else: 
        return asset_matrix

1.3. Patents and Publication Matrixes

We can now create the capability matrixes of publications and patents.

First, a query is created:

In [7]:
norm_mode = False
publication_matrix = get_asset_matrix('PUBLICATION', normalization=norm_mode)
patent_matrix = get_asset_matrix('PATENT', normalization=norm_mode)

Consequently, both matrixes are plotted.

In [8]:
# create subplots
plt.subplots(2,1,figsize=(17,17))

plt.subplot(121)
sns.heatmap(publication_matrix,  cbar=None,cmap='BuPu_r', square=True, xticklabels=False, yticklabels=False)
plt.title('Publications Heatmap')

plt.subplot(122)
sns.heatmap(patent_matrix, cbar=None, square=True,cmap='BuPu_r', xticklabels=False, yticklabels=False)
plt.title('Patents Heatmap')
plt.show()

Due to the scarcity of documents in certain intersepting categories, the matrixes are rather hard to read.

We analyse some basic stats of these two matrixes..

In [9]:
print 'PATENTS:'
print 'Rows:', patent_matrix.shape[0]
print 'Columns:', patent_matrix.shape[1]
print 'Mean: ', np.mean(patent_matrix)
print 'Standart Deviation', np.std(patent_matrix)
print 'Max: ', np.amax(patent_matrix)
print 'Min: ', np.amin(patent_matrix)
PATENTS:
Rows: 342
Columns: 342
Mean:  0.7012927054478301
Standart Deviation 6.683876527537948
Max:  883.0
Min:  0.0
In [10]:
print 'PUBLICATIONS:'
print 'Rows:', publication_matrix.shape[0]
print 'Columns:', publication_matrix.shape[1]
print 'Mean: ', np.mean(publication_matrix)
print 'Standart Deviation', np.std(publication_matrix)
print 'Max: ', np.amax(publication_matrix)
print 'Min: ', np.amin(publication_matrix)
PUBLICATIONS:
Rows: 342
Columns: 342
Mean:  1.963835026161896
Standart Deviation 22.45221872398744
Max:  2169.0
Min:  0.0

Clearly, there are more publications, for instants, the average cell in the matrix has a value of 1.96, which compared to patents (0.7), is more than the double.

Moreover, due to very high max values, (883, 2169) the matrixes are very irregular.

1.4. Analysing the differences

Now, we create a matrix where every entry correponds to the following:

difference[i,j] = patents[i,j] - publications[i,j].

This difference matrix, will give us a feel for what combination of terms are more patented Vs. researched or vice-versa.

We subtract the matrixes:

In [11]:
differences = patent_matrix - publication_matrix

And we plot the diferences matrix.

In [12]:
plt.subplots(1,1,figsize=(9,9))

plt.subplot(111)
sns.heatmap(differences, square=True, xticklabels=False, yticklabels=False)
plt.title('The heatmap of differences')
plt.show()

Due to the high volume of entries, it can be hard to visualize what is happening.

Therefore, let's create a table with the combination of terms that are the most dicrepant.

In [13]:
# list where all the values and indexes of matrix are stored
values = []
indexes = []
no_duplicates = np.abs(np.triu(differences, 1))

# loop through the matrix
for row_n in range(differences.shape[0]):
    for col_n in range(differences.shape[1]):
        values.append(no_duplicates[row_n, col_n])
        indexes.append((row_n, col_n))

Z = [indexes for _,indexes in sorted(zip(values,indexes))]

Let us create a dataframe of the most negative and positive relations for easy visualization.

In [14]:
term_Dataframe = pd.DataFrame(
    {'First Term': [matrix_axis_names[e[0]] for e in Z],
     'Second Term': [matrix_axis_names[e[1]] for e in Z],
     'Patents': [patent_matrix[e[0], e[1]] for e in Z], 
     'Publications': [publication_matrix[e[0], e[1]] for e in Z], 
     'Difference': [no_duplicates[e[0], e[1]] for e in Z]
    })

term_Dataframe = term_Dataframe[['First Term', 'Second Term', 'Patents', 'Publications', 'Difference']]
term_Dataframe = term_Dataframe.sort_values('Difference', ascending=False).head(n=15)
print 'Absolute:'
display(HTML(term_Dataframe.to_html(index=False)))
Absolute:
First Term Second Term Patents Publications Difference
biogas anaerobic digestion 20.0 1889.0 1869.0
ethanol fermentation 883.0 2169.0 1286.0
ethanol hydrolysis 432.0 1569.0 1137.0
bioethanol fermentation 105.0 1066.0 961.0
bio-oil pyrolysis 23.0 942.0 919.0
bioethanol hydrolysis 66.0 775.0 709.0
biodiesel transesterification 122.0 826.0 704.0
waste biogas 59.0 710.0 651.0
ethanol enzymatic hydrolysis 106.0 755.0 649.0
biodiesel catalysis 20.0 613.0 593.0
cellulose ethanol 194.0 755.0 561.0
biogas hydrolysis 48.0 567.0 519.0
ethanol catalysis 13.0 529.0 516.0
waste anaerobic digestion 17.0 510.0 493.0
sugar ethanol 269.0 753.0 484.0
In [15]:
total_patents = 4585
total_publications = 5313

patent_matrix = patent_matrix / total_patents
publication_matrix = publication_matrix / total_publications
differences = patent_matrix - publication_matrix

# list where all the values and indexes of matrix are stored
values = []
indexes = []
no_duplicates = np.abs(np.triu(differences, 1))

# loop through the matrix
for row_n in range(differences.shape[0]):
    for col_n in range(differences.shape[1]):
        values.append(no_duplicates[row_n, col_n])
        indexes.append((row_n, col_n))

Z = [indexes for _,indexes in sorted(zip(values,indexes))]


term_Dataframe = pd.DataFrame(
    {'First Term': [matrix_axis_names[e[0]] for e in Z],
     'Second Term': [matrix_axis_names[e[1]] for e in Z],
     'Patents': [patent_matrix[e[0], e[1]] for e in Z], 
     'Publications': [publication_matrix[e[0], e[1]] for e in Z], 
     'Difference': [no_duplicates[e[0], e[1]] for e in Z]
    })

term_Dataframe = term_Dataframe[['First Term', 'Second Term', 'Patents', 'Publications', 'Difference']]
term_Dataframe = term_Dataframe.sort_values('Difference', ascending=False).head(n=15)
print 'Absolute:'
display(HTML(term_Dataframe.to_html(index=False)))
Absolute:
First Term Second Term Patents Publications Difference
biogas anaerobic digestion 0.004362 0.355543 0.351181
ethanol fermentation 0.192585 0.408244 0.215659
ethanol hydrolysis 0.094220 0.295313 0.201093
bioethanol fermentation 0.022901 0.200640 0.177739
bio-oil pyrolysis 0.005016 0.177301 0.172285
bioethanol hydrolysis 0.014395 0.145869 0.131474
biodiesel transesterification 0.026609 0.155468 0.128859
waste biogas 0.012868 0.133634 0.120766
ethanol enzymatic hydrolysis 0.023119 0.142104 0.118985
biodiesel catalysis 0.004362 0.115377 0.111015
cellulose ethanol 0.042312 0.142104 0.099792
ethanol catalysis 0.002835 0.099567 0.096732
biogas hydrolysis 0.010469 0.106719 0.096250
waste anaerobic digestion 0.003708 0.095991 0.092283
cellulose fermentation 0.034242 0.118577 0.084335

2. Patents and Publication Matrixes: Chronological Evolution

2.1. Absolute Evolution

We start by creating a function that given a certain asset type and a timeline, returns the number of documents of that type for every year.

In [16]:
def getTotalDocuments(assetType, startYear, endYear):
    assetQuery = """MATCH (a:Asset)
                    WHERE  a.type="{}"
                    AND toInteger(a.year)>={} AND toInteger(a.year)<={} 
                    AND NOT a.year = "Null"
                    RETURN  a.year, count(a)
                    ORDER BY a.year""".format(assetType, startYear, endYear)
    
    dataReturn = DataFrame(connection_to_graph.data(assetQuery)).as_matrix()
    timeLine = np.arange(startYear, endYear + 1)
    finalMatrix = np.transpose(np.vstack((timeLine, timeLine)))
    
    for i in range(finalMatrix.shape[0]):
        finalMatrix[i, 1] = 0
        for j in range(dataReturn.shape[0]):
            if finalMatrix[i, 0] == int(dataReturn[j, 0]):
                finalMatrix[i, 1] = dataReturn[j, 1]
                
    toReturn = {}
    
    toReturn['Years'] = finalMatrix[:, 0]
    toReturn['Quantity'] = finalMatrix[:, 1]
                
    return toReturn    

We now test the function

In [17]:
# define data 
startYear = 1990
endYear = 2017
patentTimeline = getTotalDocuments('PATENT', startYear, endYear)
publicationTimeline = getTotalDocuments('PUBLICATION', startYear, endYear)
title = 'Evolution of asset quantity over time'
x_label = 'Years'
y_label = 'Number of Records'

# plot evolution
plt.subplots(1,1,figsize=(16, 5))
plt.subplot(111)
plt.plot(patentTimeline['Years'], patentTimeline['Quantity'], label='Patents')
plt.plot(publicationTimeline['Years'], publicationTimeline['Quantity'], label='Publications')
plt.legend()
plt.xticks(publicationTimeline['Years'])
plt.title('Evolution of asset quantity over time')
plt.xlabel('Years') 
plt.ylabel('Number of Records') 
plt.show()

There is a clear relationship between the volume of patents and publications. Moreover, the number of patents seems to be inferior on average to the number of publications.

However, there is a period where the number of patents is superior to the number of publications. Particularly the period between 2005 and 2011.

2.2. Comparing the chronological evolution of asset types

A function that gives the chronological evolution of a certain asset.

In [18]:
def getDocuments(processType, processTerm, assetType, startYear, endYear):
    assetQuery = """MATCH (a:Asset)-[:CONTAINS]->(fs:{})
                    WHERE  fs.term = "{}" AND a.type="{}"
                    AND toInteger(a.year)>={} AND toInteger(a.year)<={} 
                    AND NOT a.year = "Null"
                    RETURN  a.year, count(a)
                    ORDER BY a.year""".format(processType, processTerm, assetType, startYear, endYear)
    
    assetTotalQuery = """   MATCH (a:Asset)
                            WHERE  a.type="{}"
                            AND toInteger(a.year)>={} AND toInteger(a.year)<={}
                            AND NOT a.year = "Null"
                            RETURN  a.year, count(a)
                            ORDER BY a.year""".format(assetType, startYear, endYear)
    
    dataReturn = DataFrame(connection_to_graph.data(assetQuery)).as_matrix()
    dataNormReturn = DataFrame(connection_to_graph.data(assetTotalQuery)).as_matrix()
     
    timeLine = np.arange(startYear, endYear + 1)
    finalMatrix = np.transpose(np.vstack((timeLine, timeLine, timeLine)))
    
    for i in range(finalMatrix.shape[0]):
        finalMatrix[i, 1] = 0
        finalMatrix[i, 2] = 0
        for j in range(dataReturn.shape[0]):
            if finalMatrix[i, 0] == int(dataReturn[j, 0]):
                finalMatrix[i, 1] = dataReturn[j, 1]
                
        for k in range(dataNormReturn.shape[0]):
            if finalMatrix[i, 0] == int(dataNormReturn[k, 0]):
                    finalMatrix[i, 2] = dataNormReturn[k, 1]
                
    toReturn = {}
    
    toReturn['Years'] = finalMatrix[:, 0]
    toReturn['Quantity'] = finalMatrix[:, 1]
    toReturn['NormQuantity'] = [finalMatrix[e, 1] / float(finalMatrix[e, 2])  if finalMatrix[e, 2] != 0 else 0 for e in range(finalMatrix.shape[0])]
                    
    return toReturn    

2.2.1. Feedstocks

We then test the function for severall different values of Feedstocks, in this case, the ones that are more proiminent in the database.

In [19]:
feedstockList = ['waste', 'algae', 'cellulose', 'sugar', 'paper', 'wood', 'residues', 'corn']

palette = plt.get_cmap('tab20')
plotCounter = 1
colorCounter = 0
plt.subplots(1,1,figsize=(30, 10))

for term in feedstockList:
    termPat = getDocuments('Feedstock', term, 'PATENT', 1990, 2017)
    termPub = getDocuments('Feedstock', term, 'PUBLICATION', 1990, 2017)
    plt.subplot(2,4, plotCounter)
    plt.plot(termPat['Years'], termPat['Quantity'], label = 'Patents', color = palette(colorCounter))
    plt.plot(termPub['Years'], termPub['Quantity'], label = 'Publications', color = palette(colorCounter + 1))
    plt.xlim(1990,2017)
    plt.ylim(-10,310)
    plt.title(term.upper())
    plt.legend()
    plotCounter += 1
    colorCounter += 2
    
plt.show()

It appears that the number of publications is on average, far superior to the number of patents. However, the behaviour of these assets appears to follow the behaviour of the general dataset.(e.g. Small period where patents are more important.)

2.2.2. Outputs

What about outputs as process variables?

In [20]:
outputList = ["ethanol", "biodiesel", "biogas", "bioethanol", "bio-oil", "gasoline", "methanol", "butanol"]

palette = plt.get_cmap('tab20')
plotCounter = 1
colorCounter = 0
plt.subplots(1,1,figsize=(30, 10))

for term in outputList:
    termPat = getDocuments('Output', term, 'PATENT', 1990, 2017)
    termPub = getDocuments('Output', term, 'PUBLICATION', 1990, 2017)

    plt.subplot(2,4, plotCounter)
    plt.plot(termPat['Years'], termPat['Quantity'], label = 'Patents', color = palette(colorCounter))
    plt.plot(termPub['Years'], termPub['Quantity'] , label = 'Publications', color = palette(colorCounter + 1))
    plt.xlim(1990,2017)
    plt.ylim(-10,310)
    plt.title(term.upper())
    plt.legend()
    plotCounter += 1
    colorCounter += 2
    
plt.show()

The behaviour does not appear to differ greatly from the feedstocks.

2.2.3. Normalized Feedstocks

In [21]:
feedstockList = ['waste', 'algae', 'cellulose', 'sugar', 'paper', 'wood', 'residues', 'corn']


palette = plt.get_cmap('tab20')
plotCounter = 1
colorCounter = 0
plt.subplots(1,1,figsize=(30, 10))

for term in feedstockList:
    termPat = getDocuments('Feedstock', term, 'PATENT', 2000, 2017)
    termPub = getDocuments('Feedstock', term, 'PUBLICATION', 2000, 2017)
    plt.subplot(2,4, plotCounter)
    plt.plot(termPat['Years'], termPat['NormQuantity'], label = 'Patents', color = palette(colorCounter))
    plt.plot(termPub['Years'], termPub['NormQuantity'], label = 'Publications', color = palette(colorCounter + 1))
    plt.grid()
    plt.xlim(2000,2017)
    plt.ylim(-0.05,0.5)
    plt.title(term.upper())
    plt.legend()
    plotCounter += 1
    colorCounter += 2
    
plt.show()

3. Comparing the patenting and publication rates of individual terms

3.1. Function design

We create a function that given a certain process type (e.g. Output, ProcTech or Feedstock), returns the total assets in terms of patents and publications.

In [22]:
def get_asset_distribution(processType):
    """
    This function takes a process type, say Feedstocks and returns the total assets in terms of Patents
    and Publications for that same asset. 
    """
    q = """ MATCH (a:Asset)-[:CONTAINS]->(fs:{})
            WHERE a.type="PATENT" OR a.type="PUBLICATION"
            RETURN  fs.term,a.type, count(a)
            ORDER BY fs.term""".format(processType)
    
    q_total = """   MATCH (a:Asset)-[:CONTAINS]->(fs:{})
                    WHERE a.type="PATENT" OR a.type="PUBLICATION"
                    RETURN  a.type, count(a)""".format(processType)
    
    data = DataFrame(connection_to_graph.data(q)).as_matrix()
    patent_total = DataFrame(connection_to_graph.data(q_total)).as_matrix()[0, 1]
    publication_total = DataFrame(connection_to_graph.data(q_total)).as_matrix()[1, 1]

    terms = list(set(data[:, 2]))
    patents = []
    publications = []
    
    
    for term in terms: 
        publications.append(0)
        patents.append(0)
        for data_row in data: 
            if data_row[2] == term and data_row[0] == "PUBLICATION":
                publications = publications[:-1] + [data_row[1]]
            if data_row[2] == term and data_row[0] == "PATENT":
                patents = patents[:-1] + [data_row[1]]
                
    distribution = {}
    distribution['terms'] = terms
    distribution['publications'] = publications
    distribution['patents'] = patents
    distribution['publications_norm'] = [e / float(publication_total) for e in distribution['publications']]
    distribution['patents_norm'] = [e / float(patent_total) for e in distribution['patents']]

    return distribution

3.2. Feedstock

We start by analysing Feedstock terms.

In [23]:
processType = 'Feedstock'
distribution = get_asset_distribution(processType)

def get_closest_in_line(i):
    x0 = distribution['publications_norm'][i]
    y0 = distribution['patents_norm'][i]
    m = 1
    k = 0
    x = (x0 + m * y0 - m * k)/((m ** 2) + 1)
    y = m * ((x0 + m * y0 - m*k)/((m**2) + 1)) + k
    return [x, y]


fig, ax1 = plt.subplots(figsize=(9,9))
plt.scatter(np.asarray(distribution['publications_norm']), np.asarray(distribution['patents_norm']), marker=".", color='purple')
plt.plot([-0.1, 0.11], [-0.1, 0.11], ls='--', color='black')
for index, term in enumerate(distribution['terms']):
    other_ = get_closest_in_line(index)
    if distribution['publications_norm'][index] < distribution['patents_norm'][index]:
        color = 'red'
    else: 
        color = 'blue'
        
    plt.plot([distribution['publications_norm'][index], other_[0]], [distribution['patents_norm'][index], other_[1]], color=color, lw=0.7)


plt.title('{} asset distribution.'.format(processType))
plt.xlabel('{} Publications'.format(processType))
plt.ylabel('{} Patents'.format(processType))
plt.xlim([-0.01, 0.11])
plt.ylim([-0.01, 0.11])
plt.show()

There appears to be a high positive correlation, the more patented a term is, the more researched it is.

Every term has been normalized.

Outlier Detection

We create a function that returns the outliers of a given list.

In [24]:
def distance_to_mean(i):
    x1 = distribution['publications_norm'][i]
    y1 = distribution['patents_norm'][i]
    x2 = get_closest_in_line(i)[0]
    y2 = get_closest_in_line(i)[1]
    
    distance = math.sqrt(((x1 - x2)**2) + ((y1 - y2)**2))
    return distance

def winner(i):
    x1 = distribution['publications_norm'][i]
    y1 = distribution['patents_norm'][i]
    if x1 > y1:
        return 'Publications'
    else:
        return 'Patents'
    
# create dataframe
term_Dataframe = pd.DataFrame(
    {'Name': distribution['terms'],
    'Patent Percentage': distribution['patents_norm'],
     'Publications Percentage': distribution['publications_norm'],
     'Distance to Mean': [distance_to_mean(i) for i in range(len(distribution['terms']))],
     'Bias':  [winner(i) for i in range(len(distribution['terms']))]
    })

# prepare dataframe
term_Dataframe = term_Dataframe[['Name', 'Patent Percentage','Publications Percentage', 'Distance to Mean', 'Bias']]
term_Dataframe = term_Dataframe.sort_values('Distance to Mean', ascending=False).head(n=10)
display(HTML(term_Dataframe.to_html(index=False)))

counter=collections.Counter([winner(i) for i in range(len(distribution['terms']))])

print 'In {} terms, {} appear more in patents and {} appear more in publications.'.format(len(distribution['terms']), counter['Patents'], counter['Publications'])
Name Patent Percentage Publications Percentage Distance to Mean Bias
starch 0.030417 0.008816 0.015274 Patents
grain 0.029412 0.008946 0.014472 Patents
agriculture 0.008798 0.024893 0.011381 Publications
sugar 0.064605 0.050823 0.009745 Patents
waste water 0.006662 0.020420 0.009729 Publications
algae 0.077049 0.063788 0.009376 Patents
paper 0.031674 0.044470 0.009048 Publications
blend 0.023504 0.010956 0.008873 Patents
energy crops 0.004022 0.016271 0.008661 Publications
sewage 0.010935 0.022689 0.008311 Publications
In 170 terms, 80 appear more in patents and 90 appear more in publications.

We then plot the above lists's boxplot.

3.3. Processing Technologies

In [25]:
processType = 'ProcessingTech'
distribution = get_asset_distribution(processType)

fig, ax1 = plt.subplots(figsize=(9,9))
plt.scatter(np.asarray(distribution['publications_norm']), np.asarray(distribution['patents_norm']), marker=".", color='green')
plt.plot([-0.1, 0.15], [-0.1, 0.15], ls='--', color='black')
for index, term in enumerate(distribution['terms']):
    other_ = get_closest_in_line(index)
    if distribution['publications_norm'][index] < distribution['patents_norm'][index]:
        color = 'red'
    else: 
        color = 'blue'
        
    plt.plot([distribution['publications_norm'][index], other_[0]], [distribution['patents_norm'][index], other_[1]], color=color, lw=0.7)


plt.title('{} asset distribution.'.format(processType))
plt.xlabel('{} Publications'.format(processType))
plt.ylabel('{} Patents'.format(processType))
plt.xlim([-0.01, 0.15])
plt.ylim([-0.01, 0.15])
plt.show()

On average, processing technologies are more researched than patented. Which makes sense because they concern technologies and not processes.

Outliers

In [26]:
# create dataframe
term_Dataframe = pd.DataFrame(
    {'Name': distribution['terms'],
    'Patent Percentage': distribution['patents_norm'],
     'Publications Percentage': distribution['publications_norm'],
     'Distance to Mean': [distance_to_mean(i) for i in range(len(distribution['terms']))],
     'Bias':  [winner(i) for i in range(len(distribution['terms']))]
    })

# prepare dataframe
term_Dataframe = term_Dataframe[['Name', 'Patent Percentage','Publications Percentage', 'Distance to Mean', 'Bias']]
term_Dataframe = term_Dataframe.sort_values('Distance to Mean', ascending=False).head(n=10)
display(HTML(term_Dataframe.to_html(index=False)))

counter=collections.Counter([winner(i) for i in range(len(distribution['terms']))])

print 'In {} terms, {} appear more in patents and {} appear more in publications.'.format(len(distribution['terms']), counter['Patents'], counter['Publications'])
Name Patent Percentage Publications Percentage Distance to Mean Bias
pressing 0.134257 0.012747 0.085921 Patents
anaerobic digestion 0.014385 0.130609 0.082183 Publications
fermentation 0.274907 0.188580 0.061042 Patents
catalysis 0.015450 0.077353 0.043772 Publications
hydrolysis 0.116676 0.142483 0.018249 Publications
fast pyrolysis 0.004795 0.029335 0.017352 Publications
pyrolysis 0.072456 0.095512 0.016303 Publications
enzymatic hydrolysis 0.027704 0.049939 0.015723 Publications
gasification 0.052211 0.033525 0.013213 Patents
pyrolysis oil 0.022909 0.009080 0.009779 Patents
In 48 terms, 25 appear more in patents and 23 appear more in publications.

3.4. Output

We take a look at the outputs.

In [27]:
processType = 'Output'
distribution = get_asset_distribution(processType)

fig, ax1 = plt.subplots(figsize=(9,9))
plt.scatter(np.asarray(distribution['publications_norm']), np.asarray(distribution['patents_norm']), marker=".", color='brown')
plt.plot([-0.1, 0.32], [-0.1, 0.32], ls='--', color='black')
for index, term in enumerate(distribution['terms']):
    other_ = get_closest_in_line(index)
    if distribution['publications_norm'][index] < distribution['patents_norm'][index]:
        color = 'red'
        label = 'Patent Bias'
    else: 
        color = 'blue'
        label = 'Publication Bias'
        
    plt.plot([distribution['publications_norm'][index], other_[0]], [distribution['patents_norm'][index], other_[1]], color=color, lw=0.7, label=label)


plt.title('{} asset distribution.'.format(processType))
plt.xlabel('{} Publications'.format(processType))
plt.ylabel('{} Patents'.format(processType))
plt.xlim([-0.01, 0.31])
plt.ylim([-0.01, 0.31])
plt.show()

Outputs appear to have the same behaviour as processing technologies, a higher tendency to be published rather than patented.

Outliers

In [28]:
# create dataframe
term_Dataframe = pd.DataFrame(
    {'Name': distribution['terms'],
    'Patent Percentage': distribution['patents_norm'],
     'Publications Percentage': distribution['publications_norm'],
     'Distance to Mean': [distance_to_mean(i) for i in range(len(distribution['terms']))],
     'Bias':  [winner(i) for i in range(len(distribution['terms']))]
    })

# prepare dataframe
term_Dataframe = term_Dataframe[['Name', 'Patent Percentage','Publications Percentage', 'Distance to Mean', 'Bias']]
term_Dataframe = term_Dataframe.sort_values('Distance to Mean', ascending=False).head(n=10)
display(HTML(term_Dataframe.to_html(index=False)))

counter=collections.Counter([winner(i) for i in range(len(distribution['terms']))])

print 'In {} terms, {} appear more in patents and {} appear more in publications.'.format(len(distribution['terms']), counter['Patents'], counter['Publications'])
Name Patent Percentage Publications Percentage Distance to Mean Bias
biogas 0.053657 0.140277 0.061249 Publications
bio-oil 0.019483 0.087388 0.048016 Publications
butanol 0.091025 0.032547 0.041351 Patents
ethanol 0.303417 0.245403 0.041023 Patents
gasoline 0.088790 0.037266 0.036433 Patents
bioethanol 0.045992 0.077787 0.022483 Publications
biodiesel 0.157458 0.179333 0.015468 Publications
cellulosic ethanol 0.007027 0.023108 0.011371 Publications
cellulosic biofuel 0.003833 0.019528 0.011098 Publications
pellets 0.022996 0.009927 0.009241 Patents
In 45 terms, 25 appear more in patents and 20 appear more in publications.

3.5. Comparison

In [29]:
processTypes = ['Feedstock', 'Output', 'ProcessingTech']
colors = ['purple', 'green', 'red']


fig, ax1 = plt.subplots(figsize=(9,9))
for master_idx, process in enumerate(processTypes):
    distribution = get_asset_distribution(process)
    
    plt.scatter(distribution['publications_norm'], distribution['patents_norm'], color=colors[master_idx], lw=0.7, marker='.', label=process)

    
plt.plot([-0.1, 0.32], [-0.1, 0.32], ls='--', color='black')
plt.legend()
plt.xlabel('Publications')
plt.ylabel('Patents')
plt.show()