Tf-idf Vectorizer, Naive Bayes¶

In [1]:

from setup_corpus import build_corpora
corpora = build_corpora()
print(corpora)

Selected Category: description
description has 280 samples;
installation has 70 samples;
invocation has 70 samples;
citation has 70 samples;
Selected Category: installation
description has 200 samples;
installation has 800 samples;
invocation has 200 samples;
citation has 200 samples;
Selected Category: invocation
description has 279 samples;
installation has 279 samples;
invocation has 1118 samples;
citation has 279 samples;
Selected Category: citation
description has 77 samples;
installation has 77 samples;
invocation has 77 samples;
citation has 309 samples;
{'description': excerpt description
0 Puppeteer is a Node library which provides a h... True
1 The major contributors of this repository incl... True
2 Integral Regression is initially described in ... True
3 We build a 3D pose estimation system based mai... True
4 The Integral Regression is also known as soft-... True
5 This is an official implementation for Integra... True
6 The original implementation is based on our in... True
7 LibGEOS is a LGPL-licensed package for manipul... True
8 Among other things, it allows you to parse Wel... True
9 This repository contains the experiments in th... True
10 For the results presented in the paper, we did... True
11 Batch normalization is currently not supported... True
12 Open-source Ground Penetrating Radar processin... True
13 Pytorch implementation for high-resolution (e.... True
14 The PVGeo Python package contains VTK powered ... True
15 A PyVista (and VTK) interface for the Open Min... True
16 GeoNotebook is an application that provides cl... True
17 Fiona is OGR's neat and nimble API for Python ... True
18 Fiona is designed to be simple and dependable.... True
19 Shapely is a BSD-licensed Python package for m... True
20 Rain streaks can severely degrade the visibili... True
21 The pytorch branch contains: True
22 the pytorch implementation of Peak Response Ma... True
23 the PASCAL-VOC demo (training, inference, and ... True
24 Lithology and stratigraphic logs for wells and... True
25 This Python module allows you to: True
26 Interactively control an instance of ANSYS v14... True
27 Extract data directly from binary ANSYS v14.5+... True
28 Rapidly read in binary result (.rst), binary m... True
29 Official implementation of GANimation. In this... True
.. ... ...
460 author = {Xinlei Chen and Li-Jia Li and Li Fei... False
461 journal={arXiv preprint arXiv:1809.06079}, False
462 booktitle = {Proceedings of the IEEE Conferenc... False
463 booktitle = {IEEE Conference on Computer Visio... False
464 @article{yu2018pygeopressure, False
465 Tristan van Leeuwen, TristanvanLeeuwen False
466 year={2018} False
467 @inproceedings{chen18iterative, False
468 Dieter Werthmüller, prisae False
469 } False
470 title = {Two-Stream Convolutional Networks for... False
471 If you find our work useful in your research, ... False
472 booktitle = {International Conference on Machi... False
473 } False
474 volume = {3}, False
475 Citation False
476 author = {Lim, Bee and Son, Sanghyun and Kim, ... False
477 Citation False
478 Title = {{R-FCN}: Object Detection via Region-... False
479 M. Attene. A lightweight approach to repairing... False
480 year={2018} False
481 False
482 @InProceedings{kato2018renderer False
483 year = {2018} False
484 Learning Spatio-Temporal Features with 3D Resi... False
485 author={Chen, Yuhua and Li, Wen and Sakaridis,... False
486 } False
487 Calcagno, P., Chilès, J. P., Courrioux, G., & ... False
488 Year = {2017} False
489 HyVR can be attributed by citing the following... False

[490 rows x 2 columns], 'installation': excerpt installation
0 Neural Renderer (this repository) False
1 This repository only contains the core compone... False
2 Additionally, the aim is not to support the fu... False
3 Lithology and stratigraphic logs for wells and... False
4 Faster R-CNN False
5 Basically, he wears a top hat, lives in your c... False
6 A Jupyter / Leaflet bridge enabling interactiv... False
7 mplstereonet provides lower-hemisphere equal-a... False
8 By default, a modified Kamb method with expone... False
9 Detectron is Facebook AI Research's software s... False
10 This work is based on our research paper, whic... False
11 This Python module allows you to: False
12 spatial regression and statistical modeling on... False
13 Rain streaks can severely degrade the visibili... False
14 PySAL, the Python spatial analysis library, is... False
15 Shapely is a BSD-licensed Python package for m... False
16 At FAIR, Detectron has enabled numerous resear... False
17 Import meshes from many common formats (use py... False
18 This is a NodeJS port of pymasker. It provides... False
19 For simplicity, each dot represents one U-Net.... False
20 Export meshes as VTK, STL, OBJ, or PLY file types False
21 Sandbox False
22 This is the code for the paper False
23 Modelling routines: False
24 The Laplacian Pyramid Super-Resolution Network... False
25 fdesign: Design digital linear filters for the... False
26 The goal of Detectron is to provide a high-qua... False
27 Airwave (semi-analytical in the case of step r... False
28 Geographic information systems use GeoTIFF and... False
29 tiles server for live feedback when coding False
... ... ...
1370 Algorithm and Citation Policy False
1371 Title = {Multi{P}ose{N}et: Fast Multi... False
1372 volume = {4}, False
1373 @inproceedings{LapSRN, False
1374 @inproceedings{tesfaldet2018, False
1375 author = {Yiping Chen and Jingkang Wang and Jo... False
1376 If you use this code or pre-trained models, pl... False
1377 Lajaunie, C., Courrioux, G., & Manuel, L. (199... False
1378 } False
1379 and Michael J. Black False
1380 Title = {{R-FCN}: Object Detection via Region-... False
1381 For a more detailed elaboration of the theory ... False
1382 To better understand how the algorithm works, ... False
1383 author={Sun, Xiao and Xiao, Bin and Liang, Shu... False
1384 booktitle = {The IEEE Conference on Computer V... False
1385 } False
1386 @article{zhang2018rdnir, False
1387 title={Integral human pose regression}, False
1388 booktitle={The IEEE Conference on Computer Vis... False
1389 Citation False
1390 } False
1391 } False
1392 year = {2018} False
1393 Citing DaSiamRPN False
1394 title = {Detectron}, False
1395 booktitle = {The IEEE Conference on Computer V... False
1396 year = {2017} False
1397 Learning Spatio-Temporal Features with 3D Resi... False
1398 Xia Li, Jianlong Wu, Zhouchen Lin, Hong Liu, H... False
1399 @inproceedings{wang2018vid2vid, False

[1400 rows x 2 columns], 'invocation': excerpt invocation
0 Just so you get an idea, it took NYPL staff co... False
1 This repository contains the experiments in th... False
2 The code is built on EDSR (Torch) and tested o... False
3 Surface contact points: 3D coordinates of poin... False
4 Additionally, the aim is not to support the fu... False
5 Renderer backend for tilelive.js that uses nod... False
6 Resulting tiles conform to the JSON equivalent... False
7 construction of graphs from spatial data False
8 Single-image 3D mesh reconstruction False
9 model - model spatial relationships in data wi... False
10 The original motivation for HyVR was the lack ... False
11 SEG-Y Revisions False
12 gprMax is principally written in Python 3 with... False
13 Among other things, it allows you to parse Wel... False
14 Note this is not a package for reading LiDAR d... False
15 TetGen is a program to generate tetrahedral me... False
16 project loading False
17 PyVista is a helper module for the Visualizati... False
18 Segyio can handle a lot of files that are SEG-... False
19 In this repository, we release demo code and p... False
20 tiles server for live feedback when coding False
21 Complete full-space (electric and magnetic sou... False
22 analytical: interface to the analytical, space... False
23 Linear operators and inverse problems are at t... False
24 Nikos Kolotouros provides PyTorch re-implement... False
25 TetGen provides various features to generate g... False
26 For now, only Carto based projects are support... False
27 Tilematrix supports metatiling and tile buffer... False
28 This Python module is an interface to Hang Si'... False
29 A highly efficient JavaScript library for slic... False
... ... ...
1925 title={Scale-recurrent Network for Deep Image ... False
1926 Huikai Wu, Shuai Zheng, Junge Zhang, Kaiqi Huang False
1927 booktitle={CVPR}, False
1928 @inproceedings{li2018recurrent, False
1929 Presented at CVPR 2018 False
1930 {ethanlee, jlwu1992, zlin, hongliu}@pku.edu.cn... False
1931 booktitle = {IEEE Conferene on Computer Vision... False
1932 Citation False
1933 title = {Two-Stream Convolutional Networks for... False
1934 @inproceedings{tao2018srndeblur, False
1935 References False
1936 Key Laboratory of Machine Perception (MOE), Sc... False
1937 title = {{PyVista}: 3D plotting and mesh analy... False
1938 HyVR can be attributed by citing the following... False
1939 } False
1940 journal = {Journal of Open Source Software} False
1941 Year = {2018} False
1942 } False
1943 Tristan van Leeuwen, TristanvanLeeuwen False
1944 @inproceedings{zhang2018residual, False
1945 journal={arXiv preprint arXiv:1711.08229}, False
1946 title = {Detectron}, False
1947 } False
1948 If you use Detectron in your research or wish ... False
1949 booktitle={BMVC}, False
1950 booktitle={Proceedings of the European Confere... False
1951 author = {Xinlei Chen and Abhinav Gupta}, False
1952 @inproceedings{LapSRN, False
1953 url = {https://doi.org/10.21105/joss.01450}, False
1954 publisher = {The Open Journal}, False

[1955 rows x 2 columns], 'citation': excerpt citation
0 model - model spatial relationships in data wi... False
1 Features False
2 A scene graph is a structured representation o... False
3 Renderer backend for tilelive.js that uses nod... False
4 The input is assumed to represent a single clo... False
5 GemPy was designed from the beginning to suppo... False
6 Complete full-space (electric and magnetic sou... False
7 The mapshaper command line program supports es... False
8 Very lite but extendable mapping framework to ... False
9 Learn Once, Write Anywhere: We don't make assu... False
10 graph construction from polygonal lattices, li... False
11 exploratory spatio-temporal data analysis False
12 The file read parameters are based on GSSI's D... False
13 If you give it all of OpenStreetMap and zoom o... False
14 PySAL, the Python spatial analysis library, is... False
15 Resulting tiles conform to the JSON equivalent... False
16 The input scene graph is processed with a grap... False
17 A Jupyter / Leaflet bridge enabling interactiv... False
18 SEG-Y Revisions False
19 This is the implementation of our CVPR 2018 wo... False
20 mplstereonet also includes a number of utiliti... False
21 All traces in a file are assumed to be of the ... False
22 ResNet{50,101,152} False
23 This repository contains the experiments in th... False
24 This is the code for the paper False
25 We build a 3D pose estimation system based mai... False
26 Overview False
27 Flow-Guided Feature Aggregation (FGFA) is init... False
28 The major contributors of this repository incl... False
29 mplleaflet is a Python library that converts a... False
.. ... ...
510 booktitle = {Computer Vision and Pattern Recog... True
511 year={2018} True
512 } True
513 Yulun Zhang, Yapeng Tian, Yu Kong, Bineng Zhon... True
514 Yulun Zhang, Yapeng Tian, Yu Kong, Bineng Zhon... True
515 @InProceedings{Lim_2017_CVPR_Workshops, True
516 author = {Lim, Bee and Son, Sanghyun and Kim, ... True
517 title = {Enhanced Deep Residual Networks for S... True
518 booktitle = {The IEEE Conference on Computer V... True
519 month = {July}, True
520 year = {2017} True
521 } True
522 @inproceedings{zhang2018residual, True
523 title={Residual Dense Network for Image Super-... True
524 author={Zhang, Yulun and Tian, Yapeng and Kong... True
525 booktitle={CVPR}, True
526 year={2018} True
527 @article{zhang2018rdnir, True
528 title={Residual Dense Network for Image Restor... True
529 booktitle={arXiv}, True
530 @inproceedings{tang2018quantized, True
531 title={Quantized densely connected U-Nets for ... True
532 author={Tang, Zhiqiang and Peng, Xi and Geng, ... True
533 booktitle={ECCV}, True
534 year={2018} True
535 } True
536 @inproceedings{tang2018cu, True
537 title={CU-Net: Coupled U-Nets}, True
538 author={Tang, Zhiqiang and Peng, Xi and Geng, ... True
539 booktitle={BMVC}, True

[540 rows x 2 columns]}

doing something haha

In [2]:

from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())

cv = StratifiedKFold(n_splits = 5, shuffle=True)

In [3]:

for category in corpora:
    scores = cross_val_score(pipeline, corpora[category].excerpt, corpora[category][category], cv=cv)
    print(f"Category: {category}\nScores: {scores}\nAccuracy: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

Category: description
Scores: [0.79591837 0.75510204 0.78571429 0.75510204 0.74489796]
Accuracy: 0.7673 (+/- 0.0396)
Category: installation
Scores: [0.85714286 0.84285714 0.86785714 0.84285714 0.83928571]
Accuracy: 0.8500 (+/- 0.0217)
Category: invocation
Scores: [0.88010204 0.84693878 0.86189258 0.87179487 0.87692308]
Accuracy: 0.8675 (+/- 0.0240)
Category: citation
Scores: [0.88990826 0.91666667 0.93518519 0.85185185 0.92523364]
Accuracy: 0.9038 (+/- 0.0600)

In [4]:

import numpy as np
from scipy import interp
import matplotlib.pyplot as plt

In [5]:

X = corpora['description'].excerpt
y = corpora['description'].description
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
    
i = 0
print('Description ROC')
for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i+=1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Description Classification')
plt.legend(loc="lower right")
plt.show()

Description ROC

In [6]:

X = corpora['installation'].excerpt
y = corpora['installation'].installation
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
    
i = 0
print('Installation ROC')
for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i+=1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Installation Classification')
plt.legend(loc="lower right")
plt.show()

Installation ROC

In [7]:

X = corpora['invocation'].excerpt
y = corpora['invocation'].invocation
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
    
i = 0
print('Installation ROC')
for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i+=1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Invocation Classification')
plt.legend(loc="lower right")
plt.show()

Installation ROC

In [8]:

X = corpora['citation'].excerpt
y = corpora['citation'].citation
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
    
i = 0
print('Citation ROC')
for train, test in cv.split(X, y):
    probas_ = pipeline.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i+=1
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Chance', alpha=.8)

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Citation Classification')
plt.legend(loc="lower right")
plt.show()

Citation ROC