Tf-idf Vectorizer, Logistic Regression¶

In [1]:

from setup_corpus import build_corpora
corpora = build_corpora()
print(corpora)

Selected Category: description
description has 280 samples;
installation has 70 samples;
invocation has 70 samples;
citation has 70 samples;
Selected Category: installation
description has 200 samples;
installation has 800 samples;
invocation has 200 samples;
citation has 200 samples;
Selected Category: invocation
description has 279 samples;
installation has 279 samples;
invocation has 1118 samples;
citation has 279 samples;
Selected Category: citation
description has 77 samples;
installation has 77 samples;
invocation has 77 samples;
citation has 309 samples;
{'description': excerpt description
0 Puppeteer is a Node library which provides a h... True
1 The major contributors of this repository incl... True
2 Integral Regression is initially described in ... True
3 We build a 3D pose estimation system based mai... True
4 The Integral Regression is also known as soft-... True
5 This is an official implementation for Integra... True
6 The original implementation is based on our in... True
7 LibGEOS is a LGPL-licensed package for manipul... True
8 Among other things, it allows you to parse Wel... True
9 This repository contains the experiments in th... True
10 For the results presented in the paper, we did... True
11 Batch normalization is currently not supported... True
12 Open-source Ground Penetrating Radar processin... True
13 Pytorch implementation for high-resolution (e.... True
14 The PVGeo Python package contains VTK powered ... True
15 A PyVista (and VTK) interface for the Open Min... True
16 GeoNotebook is an application that provides cl... True
17 Fiona is OGR's neat and nimble API for Python ... True
18 Fiona is designed to be simple and dependable.... True
19 Shapely is a BSD-licensed Python package for m... True
20 Rain streaks can severely degrade the visibili... True
21 The pytorch branch contains: True
22 the pytorch implementation of Peak Response Ma... True
23 the PASCAL-VOC demo (training, inference, and ... True
24 Lithology and stratigraphic logs for wells and... True
25 This Python module allows you to: True
26 Interactively control an instance of ANSYS v14... True
27 Extract data directly from binary ANSYS v14.5+... True
28 Rapidly read in binary result (.rst), binary m... True
29 Official implementation of GANimation. In this... True
.. ... ...
460 year={2018} False
461 } False
462 pages={262--277}, False
463 @InProceedings{Lim_2017_CVPR_Workshops, False
464 author = {Lars Mescheder and Sebastian Nowozin... False
465 year = {2018} False
466 Dieter Werthmüller, prisae False
467 Citation False
468 @inproceedings{DeepMVS, False
469 Matteo Ravasi, mrava87 False
470 year={2018} False
471 Key Laboratory of Machine Perception, Shenzhen... False
472 } False
473 } False
474 Author = {Xizhou Zhu, Yujie Wang, Jifeng Dai, ... False
475 @inproceedings{tang2018quantized, False
476 [Paper Link] (CVPR'18) False
477 Title = {Flow-Guided Feature Aggregation for V... False
478 booktitle = {IEEE Conferene on Computer Vision... False
479 Year = {2016} False
480 booktitle = "Conference on Computer Vision and... False
481 pages = {1450}, False
482 Yu, (2018). PyGeoPressure: Geopressure Predict... False
483 year = {2018}, False
484 year = {2018} False
485 Conference = {ICCV}, False
486 title={Scale-recurrent Network for Deep Image ... False
487 BibTex: False
488 Key Laboratory of Machine Perception (MOE), Sc... False
489 title={Image Generation from Scene Graphs}, False

[490 rows x 2 columns], 'installation': excerpt installation
0 ocker is an operating-system-level-visualizati... False
1 TensorFlow is an open source software library ... False
2 Puppeteer is a Node library which provides a h... False
3 The Laplacian Pyramid Super-Resolution Network... False
4 Segyio is a small LGPL licensed C library for ... False
5 GeoNotebook is an application that provides cl... False
6 RetinaNet False
7 This repository only contains the core compone... False
8 Hankel transforms (wavenumber-frequency to spa... False
9 Below we show some example scene graphs along ... False
10 All planar measurements are expected to follow... False
11 Currently, segyio supports: False
12 construction and interactive editing of spatia... False
13 Declarative: React makes it painless to create... False
14 Luckily, many iterative methods (e.g. cg, lsqr... False
15 A highly efficient JavaScript library for slic... False
16 We proposed to evaluate the detection accuracy... False
17 If you use our codes or datasets in your work,... False
18 This code was made public to share our researc... False
19 It now takes a period of time closer to 24 hou... False
20 In addition to applications in teaching and re... False
21 dipole: infinitesimal small dipoles oriented a... False
22 Rapidly read in binary result (.rst), binary m... False
23 Theano allows the automated computation of gra... False
24 Intuitive plotting routines with matplotlib si... False
25 A Python package for pore pressure prediction ... False
26 The pytorch branch contains: False
27 Additional backbone architectures may be easil... False
28 Introduction False
29 This code is written in Chainer. For PyTorch u... False
... ... ...
1370 @article{sun2018integral, False
1371 booktitle={arXiv}, False
1372 title = {Enhanced Deep Residual Networks for S... False
1373 Booktitle = {European Conference on Compu... False
1374 author = {Xinlei Chen and Li-Jia Li and Li Fei... False
1375 } False
1376 year = {2018} False
1377 year={2018} False
1378 Xin Tao, Hongyun Gao, Xiaoyong Shen, Jue Wang,... False
1379 CVPR 2018 False
1380 False
1381 year = {2018} False
1382 booktitle = {CVPR}, False
1383 Author = {Xizhou Zhu, Yujie Wang, Jifeng Dai, ... False
1384 Dieter Werthmüller, prisae False
1385 This software is based on ideas published ther... False
1386 @INPROCEEDINGS{Mescheder2018ICML, False
1387 Title = {Flow-Guided Feature Aggregation for V... False
1388 @inproceedings{tao2018srndeblur, False
1389 If you find the code and datasets useful in yo... False
1390 de la Varga, M., Schaaf, A., and Wellmann, F.:... False
1391 @inproceedings{tang2018quantized, False
1392 @InProceedings{kato2018renderer False
1393 All releases have a Zenodo-DOI, provided on th... False
1394 journal = {Journal of Open Source Software} False
1395 journal={arXiv preprint arXiv:1809.06079}, False
1396 year={2018} False
1397 Yulun Zhang, Yapeng Tian, Yu Kong, Bineng Zhon... False
1398 } False
1399 Huikai Wu, Shuai Zheng, Junge Zhang, Kaiqi Huang False

[1400 rows x 2 columns], 'invocation': excerpt invocation
0 This repository is implemented by Yuqing Zhu, ... False
1 Python/Cython wrapper of Marco Attene's wonder... False
2 This baseline is run on dbnet-2018 challenge d... False
3 All planar measurements are expected to follow... False
4 If you give it a collection of years of tweet ... False
5 Introduction False
6 In addition to applications in teaching and re... False
7 Quadrature with extrapolation QWE False
8 Introduction False
9 Basically, he wears a top hat, lives in your c... False
10 The input is assumed to represent a single clo... False
11 Pytorch implementation for high-resolution (e.... False
12 This project aims to automate the manual proce... False
13 Learn Once, Write Anywhere: We don't make assu... False
14 mplleaflet is a Python library that converts a... False
15 GeoNotebook is an application that provides cl... False
16 Faster R-CNN False
17 A Jupyter / Leaflet bridge enabling interactiv... False
18 This is the code for the paper False
19 FGFA is end-to-end trainable for the task of v... False
20 RPN False
21 Very lite but extendable mapping framework to ... False
22 Eaton's method and Parameter Optimization False
23 VGG16 False
24 We build a 3D pose estimation system based mai... False
25 PyVista is a helper module for the Visualizati... False
26 Introduction False
27 project loading False
28 Fiona is designed to be simple and dependable.... False
29 RetinaNet False
... ... ...
1925 @inproceedings{chen2018domain, False
1926 @article{yu2018pygeopressure, False
1927 title = {{PyGeoPressure}: {Geopressure} {Predi... False
1928 pages = {922} False
1929 Ting-Chun Wang1, Ming-Yu Liu1, Jun-Yan Zhu2, G... False
1930 url = {https://doi.org/10.21105/joss.01450}, False
1931 title={Domain Adaptive Faster R-CNN for Object... False
1932 pages={6546--6555}, False
1933 title={Recurrent Squeeze-and-Excitation Contex... False
1934 booktitle = {IEEE Conference on Computer Visio... False
1935 Matteo Ravasi, mrava87 False
1936 } False
1937 year={2018} False
1938 booktitle={ECCV}, False
1939 month = {may}, False
1940 All releases have a Zenodo-DOI, provided on th... False
1941 @INPROCEEDINGS{Mescheder2018ICML, False
1942 Video-to-Video Synthesis False
1943 title={An Integral Pose Regression System for ... False
1944 booktitle={Computer Vision and Pattern Regogni... False
1945 year = {2018} False
1946 title = {Spatial Memory for Context Reasoning ... False
1947 year={2017} False
1948 Proceedings of the IEEE Conference on Computer... False
1949 Tristan van Leeuwen, TristanvanLeeuwen False
1950 } False
1951 booktitle={CVPR}, False
1952 year={2018}, False
1953 title={CU-Net: Coupled U-Nets}, False
1954 } False

[1955 rows x 2 columns], 'citation': excerpt citation
0 Below we show some example scene graphs along ... False
1 Faster, Better and Lighter for image processin... False
2 To reproduce the quantitative results shown in... False
3 Bowers' method and Parameter Optimization False
4 Tilematrix supports metatiling and tile buffer... False
5 Direct access to mesh analysis and transformat... False
6 Luckily, many iterative methods (e.g. cg, lsqr... False
7 the PASCAL-VOC demo (training, inference, and ... False
8 gpr: calculates the ground-penetrating radar r... False
9 A highly efficient JavaScript library for slic... False
10 Intuitive plotting routines with matplotlib si... False
11 Each branch in the git repository corresponds ... False
12 Fast R-CNN False
13 SEG-Y Revisions False
14 If you give it all of OpenStreetMap and zoom o... False
15 Lithology and stratigraphic logs for wells and... False
16 personal website + blog for every github user False
17 Note this is not a package for reading LiDAR d... False
18 This is a yeoman generator for ArcGIS API for ... False
19 For simplicity, each dot represents one U-Net.... False
20 The electromagnetic modeller empymod can model... False
21 Remote Geomod: From GoogleEarth to 3-D Geology False
22 The mapshaper command line program supports es... False
23 PySAL, the Python spatial analysis library, is... False
24 For the results presented in the paper, we did... False
25 Very lite but extendable mapping framework to ... False
26 A Python package for pore pressure prediction ... False
27 VGG16 False
28 This is a Python 2.7 and 3.3+ package to read ... False
29 We proposed to evaluate the detection accuracy... False
.. ... ...
510 booktitle = {Computer Vision and Pattern Recog... True
511 year={2018} True
512 } True
513 Yulun Zhang, Yapeng Tian, Yu Kong, Bineng Zhon... True
514 Yulun Zhang, Yapeng Tian, Yu Kong, Bineng Zhon... True
515 @InProceedings{Lim_2017_CVPR_Workshops, True
516 author = {Lim, Bee and Son, Sanghyun and Kim, ... True
517 title = {Enhanced Deep Residual Networks for S... True
518 booktitle = {The IEEE Conference on Computer V... True
519 month = {July}, True
520 year = {2017} True
521 } True
522 @inproceedings{zhang2018residual, True
523 title={Residual Dense Network for Image Super-... True
524 author={Zhang, Yulun and Tian, Yapeng and Kong... True
525 booktitle={CVPR}, True
526 year={2018} True
527 @article{zhang2018rdnir, True
528 title={Residual Dense Network for Image Restor... True
529 booktitle={arXiv}, True
530 @inproceedings{tang2018quantized, True
531 title={Quantized densely connected U-Nets for ... True
532 author={Tang, Zhiqiang and Peng, Xi and Geng, ... True
533 booktitle={ECCV}, True
534 year={2018} True
535 } True
536 @inproceedings{tang2018cu, True
537 title={CU-Net: Coupled U-Nets}, True
538 author={Tang, Zhiqiang and Peng, Xi and Geng, ... True
539 booktitle={BMVC}, True

[540 rows x 2 columns]}

doing something haha

In [2]:

from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, precision_recall_curve, average_precision_score
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))

cv = StratifiedKFold(n_splits = 5, shuffle=True)

In [3]:

for category in corpora:
    scores = cross_val_score(pipeline, corpora[category].excerpt, corpora[category][category], cv=cv)
    print(f"Category: {category}\nScores: {scores}\nAccuracy: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

Category: description
Scores: [0.74489796 0.89795918 0.85714286 0.87755102 0.80612245]
Accuracy: 0.8367 (+/- 0.1103)
Category: installation
Scores: [0.925      0.90357143 0.88214286 0.925      0.87142857]
Accuracy: 0.9014 (+/- 0.0437)
Category: invocation
Scores: [0.87244898 0.81632653 0.84910486 0.86923077 0.85384615]
Accuracy: 0.8522 (+/- 0.0400)
Category: citation
Scores: [0.9266055  0.84259259 0.87037037 0.9537037  0.85981308]
Accuracy: 0.8906 (+/- 0.0846)

In [4]:

import numpy as np
from scipy import interp
import matplotlib.pyplot as plt

In [5]:

X = corpora['description'].excerpt
y = corpora['description'].description
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
    
i = 0
print('Description ROC')
for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i+=1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Description Classification')
plt.legend(loc="lower right")
plt.show()

Description ROC

In [6]:

for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    precision, recall, _ = precision_recall_curve(y[test], probas_[:,1])

    plt.step(recall, precision, alpha=0.2,
             where='post', label=f'average precision={average_precision_score(y[test], probas_[:,1])}')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
plt.title('Description Precision-Recall curve'.format(
          average_precision_score(y[test], probas_[:,1])))
plt.legend(loc="lower right")
plt.show()

In [7]:

X = corpora['installation'].excerpt
y = corpora['installation'].installation
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
    
i = 0
print('Installation ROC')
for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i+=1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Installation Classification')
plt.legend(loc="lower right")
plt.show()

Installation ROC

In [8]:

for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    precision, recall, _ = precision_recall_curve(y[test], probas_[:,1])

    plt.step(recall, precision, alpha=0.2,
             where='post', label=f'average precision={average_precision_score(y[test], probas_[:,1])}')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
plt.title('Installation Precision-Recall curve'.format(
          average_precision_score(y[test], probas_[:,1])))
plt.legend(loc="lower right")
plt.show()

In [9]:

X = corpora['invocation'].excerpt
y = corpora['invocation'].invocation
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
    
i = 0
print('Installation ROC')
for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i+=1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Invocation Classification')
plt.legend(loc="lower right")
plt.show()

Installation ROC

In [10]:

for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    precision, recall, _ = precision_recall_curve(y[test], probas_[:,1])

    plt.step(recall, precision, alpha=0.2,
             where='post', label=f'average precision={average_precision_score(y[test], probas_[:,1])}')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
plt.title('Invocation Precision-Recall curve'.format(
          average_precision_score(y[test], probas_[:,1])))
plt.legend(loc="lower right")
plt.show()

In [11]:

X = corpora['citation'].excerpt
y = corpora['citation'].citation
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
    
i = 0
print('Citation ROC')
for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i+=1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Citation Classification')
plt.legend(loc="lower right")
plt.show()

Citation ROC

In [12]:

for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    precision, recall, _ = precision_recall_curve(y[test], probas_[:,1])

    plt.step(recall, precision, alpha=0.2,
             where='post', label=f'average precision={average_precision_score(y[test], probas_[:,1])}')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
plt.title('Citation Precision-Recall curve'.format(
          average_precision_score(y[test], probas_[:,1])))
plt.legend(loc="lower right")
plt.show()