Predicting DWPC Query runtime ahead of time

In [3]:
import json

import matplotlib.pyplot
import pandas
import numpy
import seaborn
import mpld3

%matplotlib inline
In [4]:
# Load the metapath metadata from JSON. Later cells iterate over `metapaths`
# and read each entry's 'abbreviation' and 'join_complexities'.
path = '../all-features/data/metapaths.json'
with open(path) as json_file:
    metapaths = json.loads(json_file.read())
In [5]:
# Read the bz2-compressed, tab-separated table of DWPC query results
# (compression is inferred from the file extension), then preview two rows.
dwpc_df = pandas.read_csv('../all-features/data/dwpc.tsv.bz2', sep='\t')
dwpc_df.head(2)
/home/dhimmels/anaconda3/lib/python3.5/site-packages/IPython/core/formatters.py:92: DeprecationWarning: DisplayFormatter._ipython_display_formatter_default is deprecated: use @default decorator instead.
  def _ipython_display_formatter_default(self):
/home/dhimmels/anaconda3/lib/python3.5/site-packages/IPython/core/formatters.py:669: DeprecationWarning: PlainTextFormatter._singleton_printers_default is deprecated: use @default decorator instead.
  def _singleton_printers_default(self):
Out[5]:
hetnet compound_id disease_id metapath PC w DWPC seconds
0 rephetio-v2.0_perm-5 DB00014 DOID:0060073 CpDpCpD 0 0.4 0.0 0.04323
1 rephetio-v2.0 DB00014 DOID:10283 CpDpCpD 0 0.4 0.0 0.04684
In [6]:
# Total number of queries, i.e. the row count of dwpc_df
dwpc_df.shape[0]
Out[6]:
46867572
In [8]:
# Mean runtime in seconds for each metapath, averaged over all of its queries.
seconds_by_metapath = dwpc_df.groupby('metapath')['seconds'].mean()
# Turn the metapath index back into a column: columns are 'metapath', 'seconds'.
time_df = seconds_by_metapath.reset_index()
# Number of metapaths that have timing data
len(time_df)
Out[8]:
1206
In [9]:
# Build one record per metapath, pairing every complexity value with the name
# of the column it ends up in, so values and column names cannot drift apart.
# (The previously assigned `cols` list was never used and named columns that
# do not exist in the resulting frame, so it has been dropped.)
rows = [
    {
        'metapath': item['abbreviation'],
        'midpoint_complexity': item['join_complexities'][item['midpoint_index']],
        'optimal_complexity': item['join_complexities'][item['optimal_join_index']],
        # NOTE(review): "forward" is read from the last join index and
        # "backward" from the first -- confirm this matches how
        # join_complexities is ordered upstream.
        'forward_complexity': item['join_complexities'][-1],
        'backward_complexity': item['join_complexities'][0],
    }
    for item in metapaths
]

# Passing `columns` fixes the column order regardless of the pandas version.
complexity_columns = [
    'metapath',
    'midpoint_complexity',
    'optimal_complexity',
    'forward_complexity',
    'backward_complexity',
]
metapath_complexity_df = pandas.DataFrame(rows, columns=complexity_columns)

# Inner join on the metapath abbreviation: only metapaths present in both the
# timing table and the metadata are kept.
complexity_df = time_df.merge(metapath_complexity_df, on='metapath')

# Mean runtime on a log10 scale; this is the y-axis of the plots below.
complexity_df['log10_seconds_per_query'] = numpy.log10(complexity_df['seconds'])
In [10]:
complexity_df.head(2)
Out[10]:
metapath seconds midpoint_complexity optimal_complexity forward_complexity backward_complexity log10_seconds_per_query
0 CbG<rG<rGaD 0.035994 3.10150 2.859092 2.859092 3.913263 -1.443772
1 CbG<rG<rGdD 0.022836 2.90328 2.640056 2.640056 3.694227 -1.641372

sequential (forward) join complexity

In [11]:
# Log10 mean runtime versus forward join complexity, one point per metapath,
# with a LOWESS trend line drawn in black.
fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while keywords work on every seaborn version.
seaborn.regplot(
    x='forward_complexity', y='log10_seconds_per_query', data=complexity_df,
    lowess=True, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'},
    ci=None, ax=ax,
)
# Assumes the scatter is the first collection on these fresh axes; each point
# gets its metapath abbreviation as a hover tooltip.
points = ax.collections[0]
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)
Out[11]:

optimal join complexity

In [12]:
# Log10 mean runtime versus optimal join complexity, one point per metapath,
# with a LOWESS trend line drawn in black.
fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while keywords work on every seaborn version.
seaborn.regplot(
    x='optimal_complexity', y='log10_seconds_per_query', data=complexity_df,
    lowess=True, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'},
    ci=None, ax=ax,
)
# Assumes the scatter is the first collection on these fresh axes; each point
# gets its metapath abbreviation as a hover tooltip.
points = ax.collections[0]
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)
Out[12]:

midpoint join complexity

In [13]:
# Log10 mean runtime versus midpoint join complexity, one point per metapath,
# with a LOWESS trend line drawn in black.
fig, ax = matplotlib.pyplot.subplots(figsize=(10, 7))
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments, while keywords work on every seaborn version.
seaborn.regplot(
    x='midpoint_complexity', y='log10_seconds_per_query', data=complexity_df,
    lowess=True, scatter_kws={'alpha': 0.5}, line_kws={'color': 'black'},
    ci=None, ax=ax,
)
# Assumes the scatter is the first collection on these fresh axes; each point
# gets its metapath abbreviation as a hover tooltip.
points = ax.collections[0]
labels = complexity_df.metapath.tolist()
tooltip = mpld3.plugins.PointLabelTooltip(points, labels)
mpld3.plugins.connect(fig, tooltip)
mpld3.display(fig)
Out[13]: