Let's start by importing all of the necessary libraries to conduct the analysis.
from py2neo import Graph
import numpy as np
from pandas import DataFrame
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import json
import math
import pandas as pd
import plotly
import plotly.graph_objs as go
import qgrid
from scipy import stats, spatial
from sklearn.cluster.bicluster import SpectralBiclustering
import operator
import math
from IPython.display import display, HTML
import collections
# please add your plotly api credentials to plotly_config in your own machine. Visit https://plot.ly/python/getting-started/
# Load plotly credentials from a local JSON file with keys 'username' and 'key',
# then register them so charts can be pushed to the plotly service.
plotly_config = json.load(open('plotly_config.json'))
plotly.tools.set_credentials_file(username=plotly_config['username'], api_key=plotly_config['key'])
To establish a basis for understanding the work, we wish to understand the occurrence of several feedstocks, processing technologies and outputs in our database.
For example, how many assets (patents, papers, etc.) contain the mix of processing technology X for output Y?
To understand this in a more general way, the AMICA database will be transformed into a co-occurrence matrix. In the example described above, this means that the number of assets containing that mix becomes an entry in a matrix, while the corresponding technology and output become the columns/rows of the matrix.
We start by creating, as previously, a list of terms (feedstocks, processing technologies and outputs) so these become the axes of the capability matrix.
# Connect to the local Neo4j instance that holds the AMICA graph.
local_connection_url = "http://localhost:7474/db/data"
connection_to_graph = Graph(local_connection_url)
# Cross-category query: one row per (feedstock, processing tech, output)
# combination, with the number of assets containing all three terms.
query_no_interestions = """ MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
MATCH (a:Asset)-[:CONTAINS]->(out:Output)
MATCH (a:Asset)-[:CONTAINS]->(pt:ProcessingTech)
RETURN fs.term, pt.term, out.term, count(a)
"""
# issue: this query needs to be divided by two when building the matrix -> NON OPTIMIZED
process_variables = ['Feedstock', 'Output', 'ProcessingTech']
# Within-category query: assets containing two distinct terms of the same
# category. Because `fs<>t` matches both orders, every pair is returned
# twice (hence the divide-by-two note above).
query_intersections = """ MATCH (a:Asset)-[:CONTAINS]->(fs:{})
MATCH (a:Asset)-[:CONTAINS]->(t:{})
WHERE fs<>t
RETURN fs.term, t.term, count(a)
"""
# Return query as pandas dataframe
data_no_intersections = DataFrame(connection_to_graph.data(query_no_interestions)).as_matrix()
# Get axis names from columns and append to list
# NOTE(review): a DataFrame built from py2neo record dicts sorts its columns
# alphabetically -- count(a), fs.term, out.term, pt.term -- so column 2 holds
# out.term and column 3 holds pt.term; the two variable names below look
# swapped. Harmless here because all three sets are concatenated into one
# axis list, but confirm before relying on the individual name sets.
feedstock_names = set(list(data_no_intersections[:, 1]))
processing_technology_names = set(list(data_no_intersections[:, 2]))
output_names = set(list(data_no_intersections[:, 3]))
matrix_axis_names = list(feedstock_names) + list(processing_technology_names) + list(output_names)
# Extra labels that only appear in non-intersection queries
for category in process_variables:
    data_no_intersections = DataFrame(connection_to_graph.data(query_intersections.format(category, category))).as_matrix()
    # columns 1 and 2 are the two term columns of the intersection query
    for column_number in range(1,3):
        column = data_no_intersections[:, column_number]
        for name in column:
            if name not in matrix_axis_names:
                matrix_axis_names.append(name)
print len(matrix_axis_names)
342
Our matrix will have a total of 342 rows and 342 columns.
We start by creating a function that, given an asset type (e.g. "Patent" or "Publication"), returns a matrix where each entry corresponds to the number of documents containing a certain pair of terms. For example:
matrix[i, j] = z
norm_matrix[i, j] = w
There are z documents containing both terms i and j.
Where: norm_matrix[i, j] = (matrix[i, j] - mean(matrix)) / standard_deviation(matrix)
def get_asset_matrix(asset, normalization=True):
    """Build the term co-occurrence matrix for one asset type.

    Entry [i, j] counts the documents of the given asset type that contain
    both term i and term j, with term order given by the module-level
    `matrix_axis_names` list.

    Parameters
    ----------
    asset : str
        Asset type stored on the nodes, e.g. 'PATENT' or 'PUBLICATION'.
    normalization : bool, optional
        When True (default) return the z-score normalized matrix
        (matrix - mean) / std; otherwise return the raw counts.
    """
    # Cross-category query: assets combining a feedstock, an output and a
    # processing technology, restricted to the requested asset type.
    asset_no_interestions = """ MATCH (a:Asset)-[:CONTAINS]->(fs:Feedstock)
    MATCH (a:Asset)-[:CONTAINS]->(out:Output)
    MATCH (a:Asset)-[:CONTAINS]->(pt:ProcessingTech)
    WHERE a.type = "{}"
    RETURN fs.term, pt.term, out.term, count(a)
    """.format(asset)
    process_variables = ['Feedstock', 'Output', 'ProcessingTech']
    # Within-category query: pairs of distinct terms of the same category.
    # `fs<>t` matches both orders, so every pair arrives twice.
    asset_intersections = """ MATCH (a:Asset)-[:CONTAINS]->(fs:{})
    MATCH (a:Asset)-[:CONTAINS]->(t:{})
    WHERE fs<>t AND a.type = "{}"
    RETURN fs.term, t.term, count(a)
    """
    # get data
    data_no_intersections = DataFrame(connection_to_graph.data(asset_no_interestions)).as_matrix()
    # square matrix over every known term
    asset_matrix = np.zeros([len(matrix_axis_names), len(matrix_axis_names)])
    # for no intersections data
    for row in data_no_intersections:
        # Column 0 is count(a): DataFrame sorts the record keys alphabetically,
        # which puts the count first; the remaining columns are the terms.
        frequency = row[0]
        indexes = [matrix_axis_names.index(element) for element in row[1::]]
        # add frequency value to both symmetric matrix positions
        for pair in itertools.combinations(indexes, 2):
            asset_matrix[pair[0], pair[1]] += frequency
            asset_matrix[pair[1], pair[0]] += frequency
    # for intersecting data
    for category in process_variables:
        process_data = DataFrame(connection_to_graph.data(asset_intersections.format(category, category, asset))).as_matrix()
        for row in process_data:
            frequency = row[0]
            indexes = [matrix_axis_names.index(element) for element in row[1::]]
            for pair in itertools.combinations(indexes, 2):
                # Halve because the symmetric query double-counts each pair.
                # Use a float literal: `frequency` is a Python int (the matrix
                # from .as_matrix() is object-dtype), so `frequency / 2` would
                # truncate odd counts under Python 2 integer division.
                asset_matrix[pair[0], pair[1]] += frequency / 2.0
                asset_matrix[pair[1], pair[0]] += frequency / 2.0
    if normalization:
        # z-score normalization over the whole matrix; guard the degenerate
        # all-equal case so we never divide by a zero standard deviation.
        std = np.std(asset_matrix)
        if std == 0:
            return asset_matrix - np.mean(asset_matrix)
        return (asset_matrix - np.mean(asset_matrix)) / std
    return asset_matrix
We can now create the capability matrices of publications and patents.
First, a query is created:
# Build the raw (un-normalized) co-occurrence matrices for each asset type.
norm_mode = False
publication_matrix = get_asset_matrix('PUBLICATION', normalization=norm_mode)
patent_matrix = get_asset_matrix('PATENT', normalization=norm_mode)
Consequently, both matrices are plotted.
# create subplots
plt.subplots(2,1,figsize=(17,17))
plt.subplot(121)
# Publications left, patents right; tick labels suppressed because the
# 342 term names would be unreadable at this size.
sns.heatmap(publication_matrix, cbar=None,cmap='BuPu_r', square=True, xticklabels=False, yticklabels=False)
plt.title('Publications Heatmap')
plt.subplot(122)
sns.heatmap(patent_matrix, cbar=None, square=True,cmap='BuPu_r', xticklabels=False, yticklabels=False)
plt.title('Patents Heatmap')
plt.show()
Due to the scarcity of documents in certain intersecting categories, the matrices are rather hard to read.
We analyse some basic statistics of these two matrices.
print 'PATENTS:'
print 'Rows:', patent_matrix.shape[0]
print 'Columns:', patent_matrix.shape[1]
print 'Mean: ', np.mean(patent_matrix)
print 'Standart Deviation', np.std(patent_matrix)
print 'Max: ', np.amax(patent_matrix)
print 'Min: ', np.amin(patent_matrix)
PATENTS: Rows: 342 Columns: 342 Mean: 0.7012927054478301 Standart Deviation 6.683876527537948 Max: 883.0 Min: 0.0
print 'PUBLICATIONS:'
print 'Rows:', publication_matrix.shape[0]
print 'Columns:', publication_matrix.shape[1]
print 'Mean: ', np.mean(publication_matrix)
print 'Standart Deviation', np.std(publication_matrix)
print 'Max: ', np.amax(publication_matrix)
print 'Min: ', np.amin(publication_matrix)
PUBLICATIONS: Rows: 342 Columns: 342 Mean: 1.963835026161896 Standart Deviation 22.45221872398744 Max: 2169.0 Min: 0.0
Clearly, there are more publications: for instance, the average cell of the publication matrix has a value of 1.96, which compared to patents (0.7) is more than double.
Moreover, due to very high maximum values (883, 2169), the matrices are very irregular.
Now, we create a matrix where every entry corresponds to the following:
difference[i, j] = patents[i, j] - publications[i, j].
This difference matrix will give us a feel for which combinations of terms are more patented vs. researched, or vice-versa.
We subtract the matrixes:
differences = patent_matrix - publication_matrix
And we plot the differences matrix.
# Single panel showing the signed difference matrix.
plt.subplots(1,1,figsize=(9,9))
plt.subplot(111)
sns.heatmap(differences, square=True, xticklabels=False, yticklabels=False)
plt.title('The heatmap of differences')
plt.show()
Due to the high volume of entries, it can be hard to visualize what is happening.
Therefore, let's create a table with the combinations of terms that are the most discrepant.
# Collect every cell of the upper triangle (with absolute values) together
# with its (row, col) position, so the most discrepant term pairs can be
# ranked later.
values = []
indexes = []
no_duplicates = np.abs(np.triu(differences, 1))
n_rows, n_cols = differences.shape
for cell in itertools.product(range(n_rows), range(n_cols)):
    values.append(no_duplicates[cell])
    indexes.append(cell)
# Positions ordered by ascending absolute difference.
Z = [position for _, position in sorted(zip(values, indexes))]
Let us create a dataframe of the most negative and positive relations for easy visualization.
# Table of term pairs with their raw patent/publication counts and the
# absolute difference between them.
term_Dataframe = pd.DataFrame(
    {'First Term': [matrix_axis_names[e[0]] for e in Z],
     'Second Term': [matrix_axis_names[e[1]] for e in Z],
     'Patents': [patent_matrix[e[0], e[1]] for e in Z],
     'Publications': [publication_matrix[e[0], e[1]] for e in Z],
     'Difference': [no_duplicates[e[0], e[1]] for e in Z]
    })
# Fix the column order, then keep the 15 most discrepant pairs.
term_Dataframe = term_Dataframe[['First Term', 'Second Term', 'Patents', 'Publications', 'Difference']]
term_Dataframe = term_Dataframe.sort_values('Difference', ascending=False).head(n=15)
print 'Absolute:'
display(HTML(term_Dataframe.to_html(index=False)))
Absolute:
First Term | Second Term | Patents | Publications | Difference |
---|---|---|---|---|
biogas | anaerobic digestion | 20.0 | 1889.0 | 1869.0 |
ethanol | fermentation | 883.0 | 2169.0 | 1286.0 |
ethanol | hydrolysis | 432.0 | 1569.0 | 1137.0 |
bioethanol | fermentation | 105.0 | 1066.0 | 961.0 |
bio-oil | pyrolysis | 23.0 | 942.0 | 919.0 |
bioethanol | hydrolysis | 66.0 | 775.0 | 709.0 |
biodiesel | transesterification | 122.0 | 826.0 | 704.0 |
waste | biogas | 59.0 | 710.0 | 651.0 |
ethanol | enzymatic hydrolysis | 106.0 | 755.0 | 649.0 |
biodiesel | catalysis | 20.0 | 613.0 | 593.0 |
cellulose | ethanol | 194.0 | 755.0 | 561.0 |
biogas | hydrolysis | 48.0 | 567.0 | 519.0 |
ethanol | catalysis | 13.0 | 529.0 | 516.0 |
waste | anaerobic digestion | 17.0 | 510.0 | 493.0 |
sugar | ethanol | 269.0 | 753.0 | 484.0 |
# NOTE(review): hard-coded dataset totals -- presumably the overall number of
# patents and publications in the database; confirm against a count query.
total_patents = 4585
total_publications = 5313
# Normalize each matrix to a share of its own asset total so the two asset
# classes become comparable despite different volumes.
patent_matrix = patent_matrix / total_patents
publication_matrix = publication_matrix / total_publications
differences = patent_matrix - publication_matrix
# list where all the values and indexes of matrix are stored
values = []
indexes = []
# Upper triangle only, so each unordered pair is considered once.
no_duplicates = np.abs(np.triu(differences, 1))
# loop through the matrix
for row_n in range(differences.shape[0]):
    for col_n in range(differences.shape[1]):
        values.append(no_duplicates[row_n, col_n])
        indexes.append((row_n, col_n))
# Pair positions sorted by ascending absolute difference.
Z = [indexes for _,indexes in sorted(zip(values,indexes))]
# Same discrepancy table as before, now in normalized (share) units.
term_Dataframe = pd.DataFrame(
    {'First Term': [matrix_axis_names[e[0]] for e in Z],
     'Second Term': [matrix_axis_names[e[1]] for e in Z],
     'Patents': [patent_matrix[e[0], e[1]] for e in Z],
     'Publications': [publication_matrix[e[0], e[1]] for e in Z],
     'Difference': [no_duplicates[e[0], e[1]] for e in Z]
    })
term_Dataframe = term_Dataframe[['First Term', 'Second Term', 'Patents', 'Publications', 'Difference']]
term_Dataframe = term_Dataframe.sort_values('Difference', ascending=False).head(n=15)
print 'Absolute:'
display(HTML(term_Dataframe.to_html(index=False)))
Absolute:
First Term | Second Term | Patents | Publications | Difference |
---|---|---|---|---|
biogas | anaerobic digestion | 0.004362 | 0.355543 | 0.351181 |
ethanol | fermentation | 0.192585 | 0.408244 | 0.215659 |
ethanol | hydrolysis | 0.094220 | 0.295313 | 0.201093 |
bioethanol | fermentation | 0.022901 | 0.200640 | 0.177739 |
bio-oil | pyrolysis | 0.005016 | 0.177301 | 0.172285 |
bioethanol | hydrolysis | 0.014395 | 0.145869 | 0.131474 |
biodiesel | transesterification | 0.026609 | 0.155468 | 0.128859 |
waste | biogas | 0.012868 | 0.133634 | 0.120766 |
ethanol | enzymatic hydrolysis | 0.023119 | 0.142104 | 0.118985 |
biodiesel | catalysis | 0.004362 | 0.115377 | 0.111015 |
cellulose | ethanol | 0.042312 | 0.142104 | 0.099792 |
ethanol | catalysis | 0.002835 | 0.099567 | 0.096732 |
biogas | hydrolysis | 0.010469 | 0.106719 | 0.096250 |
waste | anaerobic digestion | 0.003708 | 0.095991 | 0.092283 |
cellulose | fermentation | 0.034242 | 0.118577 | 0.084335 |
We start by creating a function that given a certain asset type and a timeline, returns the number of documents of that type for every year.
def getTotalDocuments(assetType, startYear, endYear):
    """Return yearly document counts for one asset type.

    Gives back a dict with 'Years' (every year from startYear to endYear,
    inclusive) and 'Quantity' (documents per year; 0 for years with no
    records).
    """
    assetQuery = """MATCH (a:Asset)
    WHERE a.type="{}"
    AND toInteger(a.year)>={} AND toInteger(a.year)<={}
    AND NOT a.year = "Null"
    RETURN a.year, count(a)
    ORDER BY a.year""".format(assetType, startYear, endYear)
    raw_counts = DataFrame(connection_to_graph.data(assetQuery)).as_matrix()
    # Full timeline, pre-filled with zero counts.
    years = np.arange(startYear, endYear + 1)
    quantities = np.zeros(len(years), dtype=int)
    # Fill in the count for every year the query actually returned;
    # the query already restricts years to [startYear, endYear].
    for record in range(raw_counts.shape[0]):
        year = int(raw_counts[record, 0])
        quantities[year - startYear] = raw_counts[record, 1]
    return {'Years': years, 'Quantity': quantities}
We now test the function
# define data
startYear = 1990
endYear = 2017
patentTimeline = getTotalDocuments('PATENT', startYear, endYear)
publicationTimeline = getTotalDocuments('PUBLICATION', startYear, endYear)
# NOTE(review): these three variables are unused -- the plt.title/xlabel/ylabel
# calls below repeat the literals instead.
title = 'Evolution of asset quantity over time'
x_label = 'Years'
y_label = 'Number of Records'
# plot evolution
plt.subplots(1,1,figsize=(16, 5))
plt.subplot(111)
plt.plot(patentTimeline['Years'], patentTimeline['Quantity'], label='Patents')
plt.plot(publicationTimeline['Years'], publicationTimeline['Quantity'], label='Publications')
plt.legend()
plt.xticks(publicationTimeline['Years'])
plt.title('Evolution of asset quantity over time')
plt.xlabel('Years')
plt.ylabel('Number of Records')
plt.show()
There is a clear relationship between the volume of patents and publications. Moreover, the number of patents seems to be inferior on average to the number of publications.
However, there is a period where the number of patents is superior to the number of publications. Particularly the period between 2005 and 2011.
A function that gives the chronological evolution of a certain asset.
def getDocuments(processType, processTerm, assetType, startYear, endYear):
    """Yearly counts of assets containing a term, raw and as a share of totals.

    Returns a dict with:
      'Years'        -- every year from startYear to endYear, inclusive
      'Quantity'     -- assets of `assetType` containing `processTerm`
      'NormQuantity' -- Quantity divided by the total assets of that type
                        for the year (0 when the yearly total is zero)
    """
    assetQuery = """MATCH (a:Asset)-[:CONTAINS]->(fs:{})
    WHERE fs.term = "{}" AND a.type="{}"
    AND toInteger(a.year)>={} AND toInteger(a.year)<={}
    AND NOT a.year = "Null"
    RETURN a.year, count(a)
    ORDER BY a.year""".format(processType, processTerm, assetType, startYear, endYear)
    assetTotalQuery = """ MATCH (a:Asset)
    WHERE a.type="{}"
    AND toInteger(a.year)>={} AND toInteger(a.year)<={}
    AND NOT a.year = "Null"
    RETURN a.year, count(a)
    ORDER BY a.year""".format(assetType, startYear, endYear)
    term_counts = DataFrame(connection_to_graph.data(assetQuery)).as_matrix()
    total_counts = DataFrame(connection_to_graph.data(assetTotalQuery)).as_matrix()
    # Full timeline pre-filled with zeros for years absent from the results.
    years = np.arange(startYear, endYear + 1)
    quantity = np.zeros(len(years), dtype=int)
    totals = np.zeros(len(years), dtype=int)
    for j in range(term_counts.shape[0]):
        quantity[int(term_counts[j, 0]) - startYear] = term_counts[j, 1]
    for k in range(total_counts.shape[0]):
        totals[int(total_counts[k, 0]) - startYear] = total_counts[k, 1]
    # Guard against years with zero total documents (avoid division by zero).
    norm = [quantity[e] / float(totals[e]) if totals[e] != 0 else 0 for e in range(len(years))]
    return {'Years': years, 'Quantity': quantity, 'NormQuantity': norm}
We then test the function for several different values of feedstocks — in this case, the ones that are most prominent in the database.
# Eight of the most prominent feedstock terms in the database.
feedstockList = ['waste', 'algae', 'cellulose', 'sugar', 'paper', 'wood', 'residues', 'corn']
palette = plt.get_cmap('tab20')
plotCounter = 1
colorCounter = 0
plt.subplots(1,1,figsize=(30, 10))
for term in feedstockList:
    # Raw yearly counts of patents vs. publications mentioning the term.
    termPat = getDocuments('Feedstock', term, 'PATENT', 1990, 2017)
    termPub = getDocuments('Feedstock', term, 'PUBLICATION', 1990, 2017)
    plt.subplot(2,4, plotCounter)
    plt.plot(termPat['Years'], termPat['Quantity'], label = 'Patents', color = palette(colorCounter))
    plt.plot(termPub['Years'], termPub['Quantity'], label = 'Publications', color = palette(colorCounter + 1))
    # Shared axis limits keep the eight subplots directly comparable.
    plt.xlim(1990,2017)
    plt.ylim(-10,310)
    plt.title(term.upper())
    plt.legend()
    # Advance palette by two so each term gets a paired patent/publication color.
    plotCounter += 1
    colorCounter += 2
plt.show()
It appears that the number of publications is on average, far superior to the number of patents. However, the behaviour of these assets appears to follow the behaviour of the general dataset.(e.g. Small period where patents are more important.)
What about outputs as process variables?
# Same per-term timeline grid, now for the most prominent output terms.
outputList = ["ethanol", "biodiesel", "biogas", "bioethanol", "bio-oil", "gasoline", "methanol", "butanol"]
palette = plt.get_cmap('tab20')
plotCounter = 1
colorCounter = 0
plt.subplots(1,1,figsize=(30, 10))
for term in outputList:
    termPat = getDocuments('Output', term, 'PATENT', 1990, 2017)
    termPub = getDocuments('Output', term, 'PUBLICATION', 1990, 2017)
    plt.subplot(2,4, plotCounter)
    plt.plot(termPat['Years'], termPat['Quantity'], label = 'Patents', color = palette(colorCounter))
    plt.plot(termPub['Years'], termPub['Quantity'] , label = 'Publications', color = palette(colorCounter + 1))
    # Same axis limits as the feedstock grid so the two figures are comparable.
    plt.xlim(1990,2017)
    plt.ylim(-10,310)
    plt.title(term.upper())
    plt.legend()
    plotCounter += 1
    colorCounter += 2
plt.show()
The behaviour does not appear to differ greatly from the feedstocks.
# Feedstock timelines again, but plotting NormQuantity: the share of each
# year's documents that mention the term, which removes the effect of the
# overall growth in document volume.
feedstockList = ['waste', 'algae', 'cellulose', 'sugar', 'paper', 'wood', 'residues', 'corn']
palette = plt.get_cmap('tab20')
plotCounter = 1
colorCounter = 0
plt.subplots(1,1,figsize=(30, 10))
for term in feedstockList:
    termPat = getDocuments('Feedstock', term, 'PATENT', 2000, 2017)
    termPub = getDocuments('Feedstock', term, 'PUBLICATION', 2000, 2017)
    plt.subplot(2,4, plotCounter)
    plt.plot(termPat['Years'], termPat['NormQuantity'], label = 'Patents', color = palette(colorCounter))
    plt.plot(termPub['Years'], termPub['NormQuantity'], label = 'Publications', color = palette(colorCounter + 1))
    plt.grid()
    plt.xlim(2000,2017)
    plt.ylim(-0.05,0.5)
    plt.title(term.upper())
    plt.legend()
    plotCounter += 1
    colorCounter += 2
plt.show()
We create a function that given a certain process type (e.g. Output, ProcTech or Feedstock), returns the total assets in terms of patents and publications.
def get_asset_distribution(processType):
    """
    Return the per-term patent and publication counts for one process type.

    Parameters
    ----------
    processType : str
        Node label to analyse, e.g. 'Feedstock', 'Output' or 'ProcessingTech'.

    Returns
    -------
    dict with keys:
        'terms'              -- list of distinct terms of the process type
        'patents'            -- patent count per term (aligned with 'terms')
        'publications'       -- publication count per term
        'patents_norm'       -- patent counts as a fraction of all patents
                                containing a term of this type
        'publications_norm'  -- same fraction for publications
    """
    q = """ MATCH (a:Asset)-[:CONTAINS]->(fs:{})
    WHERE a.type="PATENT" OR a.type="PUBLICATION"
    RETURN fs.term,a.type, count(a)
    ORDER BY fs.term""".format(processType)
    q_total = """ MATCH (a:Asset)-[:CONTAINS]->(fs:{})
    WHERE a.type="PATENT" OR a.type="PUBLICATION"
    RETURN a.type, count(a)""".format(processType)
    # Columns come back alphabetically sorted: a.type, count(a), fs.term.
    data = DataFrame(connection_to_graph.data(q)).as_matrix()
    # Run the totals query once and match totals by type label instead of
    # assuming a fixed row order (Neo4j does not guarantee result ordering
    # without an ORDER BY clause).
    totals = DataFrame(connection_to_graph.data(q_total)).as_matrix()
    totals_by_type = {row[0]: row[1] for row in totals}
    patent_total = totals_by_type['PATENT']
    publication_total = totals_by_type['PUBLICATION']
    terms = list(set(data[:, 2]))
    # One slot per term, defaulting to 0 when a term has no assets of a type.
    patents = [0] * len(terms)
    publications = [0] * len(terms)
    for i, term in enumerate(terms):
        for data_row in data:
            if data_row[2] == term and data_row[0] == "PUBLICATION":
                publications[i] = data_row[1]
            if data_row[2] == term and data_row[0] == "PATENT":
                patents[i] = data_row[1]
    distribution = {}
    distribution['terms'] = terms
    distribution['publications'] = publications
    distribution['patents'] = patents
    distribution['publications_norm'] = [e / float(publication_total) for e in publications]
    distribution['patents_norm'] = [e / float(patent_total) for e in patents]
    return distribution
We start by analysing Feedstock terms.
# Per-term patent vs. publication shares for feedstock terms.
processType = 'Feedstock'
distribution = get_asset_distribution(processType)
def get_closest_in_line(i, dist=None):
    """Project term i's point onto the y = x diagonal.

    Takes the (publications_norm, patents_norm) coordinates of term *i* and
    returns [x, y], the orthogonal projection of that point onto the line
    y = m*x + k with m=1 and k=0 -- the 'equally patented and published'
    reference line used by the scatter plots.

    Parameters
    ----------
    i : int
        Index of the term inside the distribution dict.
    dist : dict, optional
        Mapping with 'publications_norm' and 'patents_norm' lists. Defaults
        to the module-level `distribution`, keeping existing callers working.
    """
    if dist is None:
        dist = distribution
    x0 = dist['publications_norm'][i]
    y0 = dist['patents_norm'][i]
    # Line parameters: slope m and intercept k of the reference line.
    m = 1
    k = 0
    x = (x0 + m * y0 - m * k) / ((m ** 2) + 1)
    y = m * x + k
    return [x, y]
fig, ax1 = plt.subplots(figsize=(9,9))
# One dot per feedstock term: x = share of publications, y = share of patents.
plt.scatter(np.asarray(distribution['publications_norm']), np.asarray(distribution['patents_norm']), marker=".", color='purple')
# Dashed y = x reference: terms on the line are equally patented and published.
plt.plot([-0.1, 0.11], [-0.1, 0.11], ls='--', color='black')
for index, term in enumerate(distribution['terms']):
    other_ = get_closest_in_line(index)
    # Red segment = patent-biased term, blue = publication-biased.
    if distribution['publications_norm'][index] < distribution['patents_norm'][index]:
        color = 'red'
    else:
        color = 'blue'
    plt.plot([distribution['publications_norm'][index], other_[0]], [distribution['patents_norm'][index], other_[1]], color=color, lw=0.7)
plt.title('{} asset distribution.'.format(processType))
plt.xlabel('{} Publications'.format(processType))
plt.ylabel('{} Patents'.format(processType))
plt.xlim([-0.01, 0.11])
plt.ylim([-0.01, 0.11])
plt.show()
There appears to be a high positive correlation, the more patented a term is, the more researched it is.
Every term has been normalized.
Outlier Detection
We create a function that returns the outliers of a given list.
def distance_to_mean(i):
    """Euclidean distance from term i's point to its projection on y = x.

    Larger distances mark terms whose patent/publication balance deviates
    most from the 'equally patented and published' diagonal.
    """
    point_x = distribution['publications_norm'][i]
    point_y = distribution['patents_norm'][i]
    projection = get_closest_in_line(i)
    dx = point_x - projection[0]
    dy = point_y - projection[1]
    return math.sqrt((dx ** 2) + (dy ** 2))
def winner(i):
    """Return which asset class dominates for term i.

    'Publications' when the publication share strictly exceeds the patent
    share, 'Patents' otherwise (ties count as patents).
    """
    pub_share = distribution['publications_norm'][i]
    pat_share = distribution['patents_norm'][i]
    return 'Publications' if pub_share > pat_share else 'Patents'
# create dataframe: one row per term with its shares, distance to the
# diagonal, and which asset class it leans toward.
term_Dataframe = pd.DataFrame(
    {'Name': distribution['terms'],
     'Patent Percentage': distribution['patents_norm'],
     'Publications Percentage': distribution['publications_norm'],
     'Distance to Mean': [distance_to_mean(i) for i in range(len(distribution['terms']))],
     'Bias': [winner(i) for i in range(len(distribution['terms']))]
    })
# prepare dataframe: fix the column order, keep the 10 biggest outliers.
term_Dataframe = term_Dataframe[['Name', 'Patent Percentage','Publications Percentage', 'Distance to Mean', 'Bias']]
term_Dataframe = term_Dataframe.sort_values('Distance to Mean', ascending=False).head(n=10)
display(HTML(term_Dataframe.to_html(index=False)))
# Tally how many terms lean toward each asset class.
counter=collections.Counter([winner(i) for i in range(len(distribution['terms']))])
print 'In {} terms, {} appear more in patents and {} appear more in publications.'.format(len(distribution['terms']), counter['Patents'], counter['Publications'])
Name | Patent Percentage | Publications Percentage | Distance to Mean | Bias |
---|---|---|---|---|
starch | 0.030417 | 0.008816 | 0.015274 | Patents |
grain | 0.029412 | 0.008946 | 0.014472 | Patents |
agriculture | 0.008798 | 0.024893 | 0.011381 | Publications |
sugar | 0.064605 | 0.050823 | 0.009745 | Patents |
waste water | 0.006662 | 0.020420 | 0.009729 | Publications |
algae | 0.077049 | 0.063788 | 0.009376 | Patents |
paper | 0.031674 | 0.044470 | 0.009048 | Publications |
blend | 0.023504 | 0.010956 | 0.008873 | Patents |
energy crops | 0.004022 | 0.016271 | 0.008661 | Publications |
sewage | 0.010935 | 0.022689 | 0.008311 | Publications |
In 170 terms, 80 appear more in patents and 90 appear more in publications.
We now repeat the analysis for processing-technology terms.
# Same scatter as for feedstocks, now for processing-technology terms.
processType = 'ProcessingTech'
distribution = get_asset_distribution(processType)
fig, ax1 = plt.subplots(figsize=(9,9))
plt.scatter(np.asarray(distribution['publications_norm']), np.asarray(distribution['patents_norm']), marker=".", color='green')
# Dashed y = x reference: equally patented and published.
plt.plot([-0.1, 0.15], [-0.1, 0.15], ls='--', color='black')
for index, term in enumerate(distribution['terms']):
    other_ = get_closest_in_line(index)
    # Red = patent-biased, blue = publication-biased.
    if distribution['publications_norm'][index] < distribution['patents_norm'][index]:
        color = 'red'
    else:
        color = 'blue'
    plt.plot([distribution['publications_norm'][index], other_[0]], [distribution['patents_norm'][index], other_[1]], color=color, lw=0.7)
plt.title('{} asset distribution.'.format(processType))
plt.xlabel('{} Publications'.format(processType))
plt.ylabel('{} Patents'.format(processType))
plt.xlim([-0.01, 0.15])
plt.ylim([-0.01, 0.15])
plt.show()
On average, processing technologies are more researched than patented. Which makes sense because they concern technologies and not processes.
Outliers
# create dataframe: outlier table for processing-technology terms
# (relies on `distribution` set in the previous cell).
term_Dataframe = pd.DataFrame(
    {'Name': distribution['terms'],
     'Patent Percentage': distribution['patents_norm'],
     'Publications Percentage': distribution['publications_norm'],
     'Distance to Mean': [distance_to_mean(i) for i in range(len(distribution['terms']))],
     'Bias': [winner(i) for i in range(len(distribution['terms']))]
    })
# prepare dataframe: fix the column order, keep the 10 biggest outliers.
term_Dataframe = term_Dataframe[['Name', 'Patent Percentage','Publications Percentage', 'Distance to Mean', 'Bias']]
term_Dataframe = term_Dataframe.sort_values('Distance to Mean', ascending=False).head(n=10)
display(HTML(term_Dataframe.to_html(index=False)))
# Tally how many terms lean toward each asset class.
counter=collections.Counter([winner(i) for i in range(len(distribution['terms']))])
print 'In {} terms, {} appear more in patents and {} appear more in publications.'.format(len(distribution['terms']), counter['Patents'], counter['Publications'])
Name | Patent Percentage | Publications Percentage | Distance to Mean | Bias |
---|---|---|---|---|
pressing | 0.134257 | 0.012747 | 0.085921 | Patents |
anaerobic digestion | 0.014385 | 0.130609 | 0.082183 | Publications |
fermentation | 0.274907 | 0.188580 | 0.061042 | Patents |
catalysis | 0.015450 | 0.077353 | 0.043772 | Publications |
hydrolysis | 0.116676 | 0.142483 | 0.018249 | Publications |
fast pyrolysis | 0.004795 | 0.029335 | 0.017352 | Publications |
pyrolysis | 0.072456 | 0.095512 | 0.016303 | Publications |
enzymatic hydrolysis | 0.027704 | 0.049939 | 0.015723 | Publications |
gasification | 0.052211 | 0.033525 | 0.013213 | Patents |
pyrolysis oil | 0.022909 | 0.009080 | 0.009779 | Patents |
In 48 terms, 25 appear more in patents and 23 appear more in publications.
We take a look at the outputs.
# Output terms: same scatter, segments labeled by bias direction.
processType = 'Output'
distribution = get_asset_distribution(processType)
fig, ax1 = plt.subplots(figsize=(9,9))
plt.scatter(np.asarray(distribution['publications_norm']), np.asarray(distribution['patents_norm']), marker=".", color='brown')
# Dashed y = x reference: equally patented and published.
plt.plot([-0.1, 0.32], [-0.1, 0.32], ls='--', color='black')
for index, term in enumerate(distribution['terms']):
    other_ = get_closest_in_line(index)
    # Red = patent-biased, blue = publication-biased.
    if distribution['publications_norm'][index] < distribution['patents_norm'][index]:
        color = 'red'
        label = 'Patent Bias'
    else:
        color = 'blue'
        label = 'Publication Bias'
    # NOTE(review): a label is set on every segment but no plt.legend() is
    # called in this cell, so the labels are unused -- confirm intent.
    plt.plot([distribution['publications_norm'][index], other_[0]], [distribution['patents_norm'][index], other_[1]], color=color, lw=0.7, label=label)
plt.title('{} asset distribution.'.format(processType))
plt.xlabel('{} Publications'.format(processType))
plt.ylabel('{} Patents'.format(processType))
plt.xlim([-0.01, 0.31])
plt.ylim([-0.01, 0.31])
plt.show()
Outputs appear to have the same behaviour as processing technologies, a higher tendency to be published rather than patented.
Outliers
# create dataframe: outlier table for output terms
# (relies on `distribution` set in the previous cell).
term_Dataframe = pd.DataFrame(
    {'Name': distribution['terms'],
     'Patent Percentage': distribution['patents_norm'],
     'Publications Percentage': distribution['publications_norm'],
     'Distance to Mean': [distance_to_mean(i) for i in range(len(distribution['terms']))],
     'Bias': [winner(i) for i in range(len(distribution['terms']))]
    })
# prepare dataframe: fix the column order, keep the 10 biggest outliers.
term_Dataframe = term_Dataframe[['Name', 'Patent Percentage','Publications Percentage', 'Distance to Mean', 'Bias']]
term_Dataframe = term_Dataframe.sort_values('Distance to Mean', ascending=False).head(n=10)
display(HTML(term_Dataframe.to_html(index=False)))
# Tally how many terms lean toward each asset class.
counter=collections.Counter([winner(i) for i in range(len(distribution['terms']))])
print 'In {} terms, {} appear more in patents and {} appear more in publications.'.format(len(distribution['terms']), counter['Patents'], counter['Publications'])
Name | Patent Percentage | Publications Percentage | Distance to Mean | Bias |
---|---|---|---|---|
biogas | 0.053657 | 0.140277 | 0.061249 | Publications |
bio-oil | 0.019483 | 0.087388 | 0.048016 | Publications |
butanol | 0.091025 | 0.032547 | 0.041351 | Patents |
ethanol | 0.303417 | 0.245403 | 0.041023 | Patents |
gasoline | 0.088790 | 0.037266 | 0.036433 | Patents |
bioethanol | 0.045992 | 0.077787 | 0.022483 | Publications |
biodiesel | 0.157458 | 0.179333 | 0.015468 | Publications |
cellulosic ethanol | 0.007027 | 0.023108 | 0.011371 | Publications |
cellulosic biofuel | 0.003833 | 0.019528 | 0.011098 | Publications |
pellets | 0.022996 | 0.009927 | 0.009241 | Patents |
In 45 terms, 25 appear more in patents and 20 appear more in publications.
# Overlay all three process types on one scatter for a global comparison.
processTypes = ['Feedstock', 'Output', 'ProcessingTech']
colors = ['purple', 'green', 'red']
fig, ax1 = plt.subplots(figsize=(9,9))
for master_idx, process in enumerate(processTypes):
    distribution = get_asset_distribution(process)
    plt.scatter(distribution['publications_norm'], distribution['patents_norm'], color=colors[master_idx], lw=0.7, marker='.', label=process)
# y = x reference: points above the line are more patented than published.
plt.plot([-0.1, 0.32], [-0.1, 0.32], ls='--', color='black')
plt.legend()
plt.xlabel('Publications')
plt.ylabel('Patents')
plt.show()