from setup_corpus import build_corpora
# Build the corpora (indexed by category name further down) and dump them
# for a quick visual check.
corpora = build_corpora()
print(corpora)
Selected Category: description description has 280 samples; installation has 70 samples; invocation has 70 samples; citation has 70 samples; Selected Category: installation description has 200 samples; installation has 800 samples; invocation has 200 samples; citation has 200 samples; Selected Category: invocation description has 279 samples; installation has 279 samples; invocation has 1118 samples; citation has 279 samples; Selected Category: citation description has 77 samples; installation has 77 samples; invocation has 77 samples; citation has 309 samples; {'description': excerpt description 0 Puppeteer is a Node library which provides a h... True 1 The major contributors of this repository incl... True 2 Integral Regression is initially described in ... True 3 We build a 3D pose estimation system based mai... True 4 The Integral Regression is also known as soft-... True 5 This is an official implementation for Integra... True 6 The original implementation is based on our in... True 7 LibGEOS is a LGPL-licensed package for manipul... True 8 Among other things, it allows you to parse Wel... True 9 This repository contains the experiments in th... True 10 For the results presented in the paper, we did... True 11 Batch normalization is currently not supported... True 12 Open-source Ground Penetrating Radar processin... True 13 Pytorch implementation for high-resolution (e.... True 14 The PVGeo Python package contains VTK powered ... True 15 A PyVista (and VTK) interface for the Open Min... True 16 GeoNotebook is an application that provides cl... True 17 Fiona is OGR's neat and nimble API for Python ... True 18 Fiona is designed to be simple and dependable.... True 19 Shapely is a BSD-licensed Python package for m... True 20 Rain streaks can severely degrade the visibili... True 21 The pytorch branch contains: True 22 the pytorch implementation of Peak Response Ma... True 23 the PASCAL-VOC demo (training, inference, and ... 
True 24 Lithology and stratigraphic logs for wells and... True 25 This Python module allows you to: True 26 Interactively control an instance of ANSYS v14... True 27 Extract data directly from binary ANSYS v14.5+... True 28 Rapidly read in binary result (.rst), binary m... True 29 Official implementation of GANimation. In this... True .. ... ... 460 year={2018} False 461 } False 462 pages={262--277}, False 463 @InProceedings{Lim_2017_CVPR_Workshops, False 464 author = {Lars Mescheder and Sebastian Nowozin... False 465 year = {2018} False 466 Dieter Werthmüller, prisae False 467 Citation False 468 @inproceedings{DeepMVS, False 469 Matteo Ravasi, mrava87 False 470 year={2018} False 471 Key Laboratory of Machine Perception, Shenzhen... False 472 } False 473 } False 474 Author = {Xizhou Zhu, Yujie Wang, Jifeng Dai, ... False 475 @inproceedings{tang2018quantized, False 476 [Paper Link] (CVPR'18) False 477 Title = {Flow-Guided Feature Aggregation for V... False 478 booktitle = {IEEE Conferene on Computer Vision... False 479 Year = {2016} False 480 booktitle = "Conference on Computer Vision and... False 481 pages = {1450}, False 482 Yu, (2018). PyGeoPressure: Geopressure Predict... False 483 year = {2018}, False 484 year = {2018} False 485 Conference = {ICCV}, False 486 title={Scale-recurrent Network for Deep Image ... False 487 BibTex: False 488 Key Laboratory of Machine Perception (MOE), Sc... False 489 title={Image Generation from Scene Graphs}, False [490 rows x 2 columns], 'installation': excerpt installation 0 ocker is an operating-system-level-visualizati... False 1 TensorFlow is an open source software library ... False 2 Puppeteer is a Node library which provides a h... False 3 The Laplacian Pyramid Super-Resolution Network... False 4 Segyio is a small LGPL licensed C library for ... False 5 GeoNotebook is an application that provides cl... False 6 RetinaNet False 7 This repository only contains the core compone... 
False 8 Hankel transforms (wavenumber-frequency to spa... False 9 Below we show some example scene graphs along ... False 10 All planar measurements are expected to follow... False 11 Currently, segyio supports: False 12 construction and interactive editing of spatia... False 13 Declarative: React makes it painless to create... False 14 Luckily, many iterative methods (e.g. cg, lsqr... False 15 A highly efficient JavaScript library for slic... False 16 We proposed to evaluate the detection accuracy... False 17 If you use our codes or datasets in your work,... False 18 This code was made public to share our researc... False 19 It now takes a period of time closer to 24 hou... False 20 In addition to applications in teaching and re... False 21 dipole: infinitesimal small dipoles oriented a... False 22 Rapidly read in binary result (.rst), binary m... False 23 Theano allows the automated computation of gra... False 24 Intuitive plotting routines with matplotlib si... False 25 A Python package for pore pressure prediction ... False 26 The pytorch branch contains: False 27 Additional backbone architectures may be easil... False 28 Introduction False 29 This code is written in Chainer. For PyTorch u... False ... ... ... 1370 @article{sun2018integral, False 1371 booktitle={arXiv}, False 1372 title = {Enhanced Deep Residual Networks for S... False 1373 Booktitle = {European Conference on Compu... False 1374 author = {Xinlei Chen and Li-Jia Li and Li Fei... False 1375 } False 1376 year = {2018} False 1377 year={2018} False 1378 Xin Tao, Hongyun Gao, Xiaoyong Shen, Jue Wang,... False 1379 CVPR 2018 False 1380 False 1381 year = {2018} False 1382 booktitle = {CVPR}, False 1383 Author = {Xizhou Zhu, Yujie Wang, Jifeng Dai, ... False 1384 Dieter Werthmüller, prisae False 1385 This software is based on ideas published ther... False 1386 @INPROCEEDINGS{Mescheder2018ICML, False 1387 Title = {Flow-Guided Feature Aggregation for V... 
False 1388 @inproceedings{tao2018srndeblur, False 1389 If you find the code and datasets useful in yo... False 1390 de la Varga, M., Schaaf, A., and Wellmann, F.:... False 1391 @inproceedings{tang2018quantized, False 1392 @InProceedings{kato2018renderer False 1393 All releases have a Zenodo-DOI, provided on th... False 1394 journal = {Journal of Open Source Software} False 1395 journal={arXiv preprint arXiv:1809.06079}, False 1396 year={2018} False 1397 Yulun Zhang, Yapeng Tian, Yu Kong, Bineng Zhon... False 1398 } False 1399 Huikai Wu, Shuai Zheng, Junge Zhang, Kaiqi Huang False [1400 rows x 2 columns], 'invocation': excerpt invocation 0 This repository is implemented by Yuqing Zhu, ... False 1 Python/Cython wrapper of Marco Attene's wonder... False 2 This baseline is run on dbnet-2018 challenge d... False 3 All planar measurements are expected to follow... False 4 If you give it a collection of years of tweet ... False 5 Introduction False 6 In addition to applications in teaching and re... False 7 Quadrature with extrapolation QWE False 8 Introduction False 9 Basically, he wears a top hat, lives in your c... False 10 The input is assumed to represent a single clo... False 11 Pytorch implementation for high-resolution (e.... False 12 This project aims to automate the manual proce... False 13 Learn Once, Write Anywhere: We don't make assu... False 14 mplleaflet is a Python library that converts a... False 15 GeoNotebook is an application that provides cl... False 16 Faster R-CNN False 17 A Jupyter / Leaflet bridge enabling interactiv... False 18 This is the code for the paper False 19 FGFA is end-to-end trainable for the task of v... False 20 RPN False 21 Very lite but extendable mapping framework to ... False 22 Eaton's method and Parameter Optimization False 23 VGG16 False 24 We build a 3D pose estimation system based mai... False 25 PyVista is a helper module for the Visualizati... 
False 26 Introduction False 27 project loading False 28 Fiona is designed to be simple and dependable.... False 29 RetinaNet False ... ... ... 1925 @inproceedings{chen2018domain, False 1926 @article{yu2018pygeopressure, False 1927 title = {{PyGeoPressure}: {Geopressure} {Predi... False 1928 pages = {922} False 1929 Ting-Chun Wang1, Ming-Yu Liu1, Jun-Yan Zhu2, G... False 1930 url = {https://doi.org/10.21105/joss.01450}, False 1931 title={Domain Adaptive Faster R-CNN for Object... False 1932 pages={6546--6555}, False 1933 title={Recurrent Squeeze-and-Excitation Contex... False 1934 booktitle = {IEEE Conference on Computer Visio... False 1935 Matteo Ravasi, mrava87 False 1936 } False 1937 year={2018} False 1938 booktitle={ECCV}, False 1939 month = {may}, False 1940 All releases have a Zenodo-DOI, provided on th... False 1941 @INPROCEEDINGS{Mescheder2018ICML, False 1942 Video-to-Video Synthesis False 1943 title={An Integral Pose Regression System for ... False 1944 booktitle={Computer Vision and Pattern Regogni... False 1945 year = {2018} False 1946 title = {Spatial Memory for Context Reasoning ... False 1947 year={2017} False 1948 Proceedings of the IEEE Conference on Computer... False 1949 Tristan van Leeuwen, TristanvanLeeuwen False 1950 } False 1951 booktitle={CVPR}, False 1952 year={2018}, False 1953 title={CU-Net: Coupled U-Nets}, False 1954 } False [1955 rows x 2 columns], 'citation': excerpt citation 0 Below we show some example scene graphs along ... False 1 Faster, Better and Lighter for image processin... False 2 To reproduce the quantitative results shown in... False 3 Bowers' method and Parameter Optimization False 4 Tilematrix supports metatiling and tile buffer... False 5 Direct access to mesh analysis and transformat... False 6 Luckily, many iterative methods (e.g. cg, lsqr... False 7 the PASCAL-VOC demo (training, inference, and ... False 8 gpr: calculates the ground-penetrating radar r... False 9 A highly efficient JavaScript library for slic... 
False 10 Intuitive plotting routines with matplotlib si... False 11 Each branch in the git repository corresponds ... False 12 Fast R-CNN False 13 SEG-Y Revisions False 14 If you give it all of OpenStreetMap and zoom o... False 15 Lithology and stratigraphic logs for wells and... False 16 personal website + blog for every github user False 17 Note this is not a package for reading LiDAR d... False 18 This is a yeoman generator for ArcGIS API for ... False 19 For simplicity, each dot represents one U-Net.... False 20 The electromagnetic modeller empymod can model... False 21 Remote Geomod: From GoogleEarth to 3-D Geology False 22 The mapshaper command line program supports es... False 23 PySAL, the Python spatial analysis library, is... False 24 For the results presented in the paper, we did... False 25 Very lite but extendable mapping framework to ... False 26 A Python package for pore pressure prediction ... False 27 VGG16 False 28 This is a Python 2.7 and 3.3+ package to read ... False 29 We proposed to evaluate the detection accuracy... False .. ... ... 510 booktitle = {Computer Vision and Pattern Recog... True 511 year={2018} True 512 } True 513 Yulun Zhang, Yapeng Tian, Yu Kong, Bineng Zhon... True 514 Yulun Zhang, Yapeng Tian, Yu Kong, Bineng Zhon... True 515 @InProceedings{Lim_2017_CVPR_Workshops, True 516 author = {Lim, Bee and Son, Sanghyun and Kim, ... True 517 title = {Enhanced Deep Residual Networks for S... True 518 booktitle = {The IEEE Conference on Computer V... True 519 month = {July}, True 520 year = {2017} True 521 } True 522 @inproceedings{zhang2018residual, True 523 title={Residual Dense Network for Image Super-... True 524 author={Zhang, Yulun and Tian, Yapeng and Kong... True 525 booktitle={CVPR}, True 526 year={2018} True 527 @article{zhang2018rdnir, True 528 title={Residual Dense Network for Image Restor... True 529 booktitle={arXiv}, True 530 @inproceedings{tang2018quantized, True 531 title={Quantized densely connected U-Nets for ... 
True 532 author={Tang, Zhiqiang and Peng, Xi and Geng, ... True 533 booktitle={ECCV}, True 534 year={2018} True 535 } True 536 @inproceedings{tang2018cu, True 537 title={CU-Net: Coupled U-Nets}, True 538 author={Tang, Zhiqiang and Peng, Xi and Geng, ... True 539 booktitle={BMVC}, True [540 rows x 2 columns]}
Cross-validate a TF-IDF + logistic regression pipeline on each category and report its accuracy.
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, precision_recall_curve, average_precision_score
# One shared model: TF-IDF features feeding a liblinear logistic regression.
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))

# Stratified 5-fold cross-validation. The fixed random_state makes the shuffled
# folds reproducible: reruns give comparable scores and every cv.split(...)
# call on the same data yields the same partition.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for category in corpora:
    frame = corpora[category]
    # The text lives in `excerpt`; the target is the column named after the category.
    scores = cross_val_score(pipeline, frame.excerpt, frame[category], cv=cv)
    print(f"Category: {category}\nScores: {scores}\nAccuracy: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")
Category: description Scores: [0.74489796 0.89795918 0.85714286 0.87755102 0.80612245] Accuracy: 0.8367 (+/- 0.1103) Category: installation Scores: [0.925 0.90357143 0.88214286 0.925 0.87142857] Accuracy: 0.9014 (+/- 0.0437) Category: invocation Scores: [0.87244898 0.81632653 0.84910486 0.86923077 0.85384615] Accuracy: 0.8522 (+/- 0.0400) Category: citation Scores: [0.9266055 0.84259259 0.87037037 0.9537037 0.85981308] Accuracy: 0.8906 (+/- 0.0846)
import matplotlib.pyplot as plt
import numpy as np

try:
    from scipy import interp
except ImportError:
    # Recent SciPy releases no longer export this alias of numpy.interp.
    from numpy import interp
# Cross-validated ROC analysis for the 'description' classifier:
# one faint curve per fold, the mean curve, and a +/- 1 std. dev. band.
X = corpora['description'].excerpt
y = corpora['description'].description
tprs = []  # per-fold TPR curves resampled onto mean_fpr
aucs = []  # per-fold area under the ROC curve
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid so fold curves can be averaged
print('Description ROC')
for i, (train, test) in enumerate(cv.split(X, y)):
    # cv.split yields positional indices, so rows are selected with .iloc;
    # plain [] is label-based and only agrees when the index is exactly 0..n-1.
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    # np.interp is the function the old scipy `interp` alias pointed to.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0  # pin the resampled curve to the origin
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the mean curve to (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Shade one standard deviation around the mean, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Description Classification')
plt.legend(loc="lower right")
plt.show()
Description ROC
# Precision-recall curves for the description classifier, one step curve per CV fold.
for train, test in cv.split(X, y):
    # Fold indices are positional, so rows are selected with .iloc.
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    precision, recall, _ = precision_recall_curve(y.iloc[test], probas_[:, 1])
    # Average precision of this fold, shown in the legend entry.
    fold_ap = average_precision_score(y.iloc[test], probas_[:, 1])
    plt.step(recall, precision, alpha=0.2,
             where='post', label=f'average precision={fold_ap}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
# Plain title: the per-fold average precision is already reported in the legend.
plt.title('Description Precision-Recall curve')
plt.legend(loc="lower right")
plt.show()
# Cross-validated ROC analysis for the 'installation' classifier:
# one faint curve per fold, the mean curve, and a +/- 1 std. dev. band.
X = corpora['installation'].excerpt
y = corpora['installation'].installation
tprs = []  # per-fold TPR curves resampled onto mean_fpr
aucs = []  # per-fold area under the ROC curve
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid so fold curves can be averaged
print('Installation ROC')
for i, (train, test) in enumerate(cv.split(X, y)):
    # cv.split yields positional indices, so rows are selected with .iloc;
    # plain [] is label-based and only agrees when the index is exactly 0..n-1.
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    # np.interp is the function the old scipy `interp` alias pointed to.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0  # pin the resampled curve to the origin
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the mean curve to (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Shade one standard deviation around the mean, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Installation Classification')
plt.legend(loc="lower right")
plt.show()
Installation ROC
# Precision-recall curves for the installation classifier, one step curve per CV fold.
for train, test in cv.split(X, y):
    # Fold indices are positional, so rows are selected with .iloc.
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    precision, recall, _ = precision_recall_curve(y.iloc[test], probas_[:, 1])
    # Average precision of this fold, shown in the legend entry.
    fold_ap = average_precision_score(y.iloc[test], probas_[:, 1])
    plt.step(recall, precision, alpha=0.2,
             where='post', label=f'average precision={fold_ap}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
# Plain title: the per-fold average precision is already reported in the legend.
plt.title('Installation Precision-Recall curve')
plt.legend(loc="lower right")
plt.show()
# Cross-validated ROC analysis for the 'invocation' classifier:
# one faint curve per fold, the mean curve, and a +/- 1 std. dev. band.
X = corpora['invocation'].excerpt
y = corpora['invocation'].invocation
tprs = []  # per-fold TPR curves resampled onto mean_fpr
aucs = []  # per-fold area under the ROC curve
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid so fold curves can be averaged
# Banner names the category analysed here (it used to say 'Installation').
print('Invocation ROC')
for i, (train, test) in enumerate(cv.split(X, y)):
    # cv.split yields positional indices, so rows are selected with .iloc;
    # plain [] is label-based and only agrees when the index is exactly 0..n-1.
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    # np.interp is the function the old scipy `interp` alias pointed to.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0  # pin the resampled curve to the origin
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the mean curve to (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Shade one standard deviation around the mean, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Invocation Classification')
plt.legend(loc="lower right")
plt.show()
Installation ROC
# Precision-recall curves for the invocation classifier, one step curve per CV fold.
for train, test in cv.split(X, y):
    # Fold indices are positional, so rows are selected with .iloc.
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    precision, recall, _ = precision_recall_curve(y.iloc[test], probas_[:, 1])
    # Average precision of this fold, shown in the legend entry.
    fold_ap = average_precision_score(y.iloc[test], probas_[:, 1])
    plt.step(recall, precision, alpha=0.2,
             where='post', label=f'average precision={fold_ap}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
# Plain title: the per-fold average precision is already reported in the legend.
plt.title('Invocation Precision-Recall curve')
plt.legend(loc="lower right")
plt.show()
# Cross-validated ROC analysis for the 'citation' classifier:
# one faint curve per fold, the mean curve, and a +/- 1 std. dev. band.
X = corpora['citation'].excerpt
y = corpora['citation'].citation
tprs = []  # per-fold TPR curves resampled onto mean_fpr
aucs = []  # per-fold area under the ROC curve
mean_fpr = np.linspace(0, 1, 100)  # common FPR grid so fold curves can be averaged
print('Citation ROC')
for i, (train, test) in enumerate(cv.split(X, y)):
    # cv.split yields positional indices, so rows are selected with .iloc;
    # plain [] is label-based and only agrees when the index is exactly 0..n-1.
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas_[:, 1])
    # np.interp is the function the old scipy `interp` alias pointed to.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0  # pin the resampled curve to the origin
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the mean curve to (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Shade one standard deviation around the mean, clipped to [0, 1].
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Citation Classification')
plt.legend(loc="lower right")
plt.show()
Citation ROC
# Precision-recall curves for the citation classifier, one step curve per CV fold.
for train, test in cv.split(X, y):
    # Fold indices are positional, so rows are selected with .iloc.
    probas_ = pipeline.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    precision, recall, _ = precision_recall_curve(y.iloc[test], probas_[:, 1])
    # Average precision of this fold, shown in the legend entry.
    fold_ap = average_precision_score(y.iloc[test], probas_[:, 1])
    plt.step(recall, precision, alpha=0.2,
             where='post', label=f'average precision={fold_ap}')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
# Plain title: the per-fold average precision is already reported in the legend.
plt.title('Citation Precision-Recall curve')
plt.legend(loc="lower right")
plt.show()