import random
import pandas as pd
import nltk
# nltk.download('treebank')
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split
# Load the four annotated excerpt categories exported by the labeling step.
description_df, installation_df, invocation_df, citation_df = (
    pd.read_csv(path)
    for path in (
        './data/description.csv',
        './data/installation.csv',
        './data/invocation.csv',
        './data/citation.csv',
    )
)
Make sure that the CSV data has been imported successfully.
# Sanity-check that every CSV produced a non-empty DataFrame.
for label, frame in (
    ('description', description_df),
    ('installation', installation_df),
    ('invocation', invocation_df),
    ('citation', citation_df),
):
    print("Number of {} entries: {}".format(label, len(frame)))
description_df.head()
Number of description entries: 336 Number of installation entries: 929 Number of invocation entries: 1134 Number of citation entries: 316
URL | contributor | excerpt | |
---|---|---|---|
0 | https://github.com/GoogleChrome/puppeteer | Allen Mao | Puppeteer is a Node library which provides a h... |
1 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | The major contributors of this repository incl... |
2 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | Integral Regression is initially described in ... |
3 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | We build a 3D pose estimation system based mai... |
4 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | The Integral Regression is also known as soft-... |
# Report the installation corpus size and preview its first rows.
print("Number of installation entries: {}".format(len(installation_df)))
installation_df.head()
Number of installation entries: 929
URL | contributor | excerpt | |
---|---|---|---|
0 | https://github.com/GoogleChrome/puppeteer | Allen Mao | Installation |
1 | https://github.com/GoogleChrome/puppeteer | Allen Mao | To use Puppeteer in your project, run: |
2 | https://github.com/GoogleChrome/puppeteer | Allen Mao | npm i puppeteer |
3 | https://github.com/GoogleChrome/puppeteer | Allen Mao | # or "yarn add puppeteer" |
4 | https://github.com/GoogleChrome/puppeteer | Allen Mao | puppeteer-core |
# Report the invocation corpus size and preview its first rows.
print("Number of invocation entries: {}".format(len(invocation_df)))
invocation_df.head()
Number of invocation entries: 1134
URL | contributor | excerpt | |
---|---|---|---|
0 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | Usage |
1 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | We have placed some example config files in ex... |
2 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | Train |
3 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | For Integral Human Pose Regression, cd to pyto... |
4 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | Integral Regression |
# Report the citation corpus size and preview its first rows.
print("Number of citation entries: {}".format(len(citation_df)))
citation_df.head()
Number of citation entries: 316
URL | contributor | excerpt | |
---|---|---|---|
0 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | If you find Integral Regression useful in your... |
1 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | @article{sun2017integral, |
2 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | title={Integral human pose regression}, |
3 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | author={Sun, Xiao and Xiao, Bin and Liang, Shu... |
4 | https://github.com/JimmySuen/integral-human-pose | Allen Mao | journal={arXiv preprint arXiv:1711.08229}, |
import numpy as np
import pickle
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split #can add stratified later
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from setup_corpus import build_corpora
# Build one binary corpus per category (project-local helper).
# NOTE(review): judging by the cell output, positives for each category are
# paired with negatives sampled from the other three -- confirm in setup_corpus.py.
corpora = build_corpora()
# print(corpora)
Selected Category: description description has 336 samples; installation has 84 samples; invocation has 84 samples; citation has 84 samples; Selected Category: installation description has 232 samples; installation has 929 samples; invocation has 232 samples; citation has 232 samples; Selected Category: invocation description has 283 samples; installation has 283 samples; invocation has 1134 samples; citation has 283 samples; Selected Category: citation description has 79 samples; installation has 79 samples; invocation has 79 samples; citation has 316 samples;
# Metric suite reported by cross_validate for every model/corpus pair.
_metrics = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1_score': f1_score,
}
scoring = {metric: make_scorer(fn) for metric, fn in _metrics.items()}
def evaluate(corpora, pipeline, name):
    """Fit, persist, and cross-validate `pipeline` on each binary corpus.

    For every category in `corpora` the pipeline is fitted on a stratified
    80/20 train split, the fitted model is pickled to "<cat[:3]><name>.p",
    and mean 5-fold stratified CV scores (accuracy / precision / recall /
    F-measure, per the module-level `scoring` dict) are printed.

    Parameters
    ----------
    corpora : dict
        Maps category name -> DataFrame with an `excerpt` text column and
        a binary label column named after the category.
    pipeline : sklearn estimator
        Vectorizer + classifier pipeline supporting fit/predict.
    name : str
        Short model tag used in the pickle file name.
    """
    dec = 3  # decimals shown for every reported score
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    for category in corpora:
        X = corpora[category].excerpt
        Y = corpora[category][category]
        print("\n", category, "X", len(X), "Y", len(Y))
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, stratify=Y, test_size=0.2)
        pipeline.fit(X_train, y_train)
        title = category[:3] + name + ".p"
        print(title)
        # Fix: `title` was built and printed but the fitted model was never
        # actually saved, despite the module-level `import pickle`.
        with open(title, "wb") as fh:
            pickle.dump(pipeline, fh)
        # cross_validate clones the pipeline per fold, so the fit above only
        # affects the pickled artifact, not these scores.
        scores = cross_validate(pipeline, X, Y, cv=cv, scoring=scoring)
        print("Mean test accuracy:",
              np.around(scores["test_accuracy"].mean(), decimals=dec),
              "\nPrecision",
              np.around(scores["test_precision"].mean(), decimals=dec),
              "\nRecall",
              np.around(scores["test_recall"].mean(), decimals=dec),
              "\nF-measure",
              np.around(scores["test_f1_score"].mean(), decimals=dec))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
# Baseline: bag-of-words counts + logistic regression.
pipeline = make_pipeline(CountVectorizer(), LogisticRegression(solver='liblinear'))
# Fix: dropped the unused `cv1` splitter -- evaluate() builds its own
# StratifiedKFold internally, so the local one was dead code.
name = "cvlr"
evaluate(corpora, pipeline, name)
description X 588 Y 588 descvlr.p Mean test accuracy: 0.821 Precision 0.871 Recall 0.81 F-measure 0.838 installation X 1625 Y 1625 inscvlr.p Mean test accuracy: 0.877 Precision 0.87 Recall 0.924 F-measure 0.896 invocation X 1983 Y 1983 invcvlr.p Mean test accuracy: 0.852 Precision 0.829 Recall 0.934 F-measure 0.878 citation X 553 Y 553 citcvlr.p Mean test accuracy: 0.877 Precision 0.84 Recall 0.971 F-measure 0.901
Description: 81 Installation: 84 Invocation: 83 Citation: 90 [81 86 83 85] [81 86 84 86] [82 89 86 90] [82 86 85 86] [75 90 86 86]
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# TF-IDF features + logistic regression.
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))
# Fix: dropped the unused local `cv` -- evaluate() builds its own splitter.
name = "tflr"
evaluate(corpora, pipeline, name)
description X 588 Y 588 destflr.p Mean test accuracy: 0.828 Precision 0.809 Recall 0.92 F-measure 0.86 installation X 1625 Y 1625 instflr.p Mean test accuracy: 0.884 Precision 0.897 Recall 0.901 F-measure 0.899 invocation X 1983 Y 1983 invtflr.p Mean test accuracy: 0.846 Precision 0.824 Recall 0.93 F-measure 0.874 citation X 553 Y 553 cittflr.p Mean test accuracy: 0.875 Precision 0.841 Recall 0.975 F-measure 0.901
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
# TF-IDF features + multinomial naive Bayes.
pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
# Fix: dropped the unused local `cv` -- evaluate() builds its own splitter.
name = "tfnb"
evaluate(corpora, pipeline, name)
description X 588 Y 588 destfnb.p Mean test accuracy: 0.784 Precision 0.73 Recall 0.991 F-measure 0.841 installation X 1625 Y 1625 instfnb.p Mean test accuracy: 0.838 Precision 0.786 Recall 0.984 F-measure 0.874 invocation X 1983 Y 1983 invtfnb.p Mean test accuracy: 0.875 Precision 0.853 Recall 0.944 F-measure 0.896 citation X 553 Y 553 cittfnb.p Mean test accuracy: 0.893 Precision 0.853 Recall 0.984 F-measure 0.914
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Bag-of-words counts + multinomial naive Bayes.
pipeline = make_pipeline(CountVectorizer(), MultinomialNB())
# Fix: dropped the unused local `cv` -- evaluate() builds its own splitter.
name = "cvnb"
evaluate(corpora, pipeline, name)
description X 588 Y 588 descvnb.p Mean test accuracy: 0.823 Precision 0.778 Recall 0.973 F-measure 0.864 installation X 1625 Y 1625 inscvnb.p Mean test accuracy: 0.876 Precision 0.841 Recall 0.967 F-measure 0.899 invocation X 1983 Y 1983 invcvnb.p Mean test accuracy: 0.875 Precision 0.884 Recall 0.899 F-measure 0.892 citation X 553 Y 553 citcvnb.p Mean test accuracy: 0.917 Precision 0.893 Recall 0.972 F-measure 0.93
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
# Bag-of-words counts + Bernoulli naive Bayes (binary term presence).
pipeline = make_pipeline(CountVectorizer(), BernoulliNB())
# Fix: dropped the unused local `cv` -- evaluate() builds its own splitter.
name = "cvbb"
evaluate(corpora, pipeline, name)
description X 588 Y 588 descvbb.p Mean test accuracy: 0.728 Precision 0.923 Recall 0.571 F-measure 0.703 installation X 1625 Y 1625 inscvbb.p Mean test accuracy: 0.753 Precision 0.704 Recall 0.982 F-measure 0.82 invocation X 1983 Y 1983 invcvbb.p Mean test accuracy: 0.76 Precision 0.722 Recall 0.944 F-measure 0.818 citation X 553 Y 553 citcvbb.p Mean test accuracy: 0.745 Precision 0.7 Recall 0.975 F-measure 0.814
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features + SGD-trained logistic regression.
# NOTE(review): loss='log' was renamed 'log_loss' in scikit-learn 1.1 and
# removed in 1.3 -- keep 'log' only while pinned to the 0.20-era sklearn
# this notebook was run with (see the FutureWarning in the RF cell output).
pipeline = make_pipeline(TfidfVectorizer(), SGDClassifier(loss='log'))
# Fix: dropped the unused local `cv` -- evaluate() builds its own splitter.
name = "tfsgd"
evaluate(corpora, pipeline, name)
description X 588 Y 588 destfsgd.p Mean test accuracy: 0.852 Precision 0.857 Recall 0.89 F-measure 0.873 installation X 1625 Y 1625 instfsgd.p Mean test accuracy: 0.895 Precision 0.911 Recall 0.905 F-measure 0.908 invocation X 1983 Y 1983 invtfsgd.p Mean test accuracy: 0.867 Precision 0.863 Recall 0.912 F-measure 0.887 citation X 553 Y 553 cittfsgd.p Mean test accuracy: 0.899 Precision 0.864 Recall 0.978 F-measure 0.917
from xgboost.sklearn import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features + gradient-boosted trees (XGBoost).
pipeline = make_pipeline(TfidfVectorizer(), XGBClassifier())
# Fix: dropped the unused local `cv` -- evaluate() builds its own splitter.
name = "tfxgb"
evaluate(corpora, pipeline, name)
description X 588 Y 588 destfxgb.p Mean test accuracy: 0.786 Precision 0.85 Recall 0.759 F-measure 0.801 installation X 1625 Y 1625 instfxgb.p Mean test accuracy: 0.782 Precision 0.893 Recall 0.704 F-measure 0.787 invocation X 1983 Y 1983 invtfxgb.p Mean test accuracy: 0.768 Precision 0.741 Recall 0.913 F-measure 0.818 citation X 553 Y 553 cittfxgb.p Mean test accuracy: 0.799 Precision 0.759 Recall 0.953 F-measure 0.844
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
# TF-IDF features + perceptron.
pipeline = make_pipeline(TfidfVectorizer(), Perceptron(tol=1e-3, random_state=0))
# Fix: dropped the unused local `cv` -- evaluate() builds its own splitter.
name = "tfper"
evaluate(corpora, pipeline, name)
description X 588 Y 588 destfper.p Mean test accuracy: 0.827 Precision 0.826 Recall 0.884 F-measure 0.853 installation X 1625 Y 1625 instfper.p Mean test accuracy: 0.873 Precision 0.879 Recall 0.902 F-measure 0.89 invocation X 1983 Y 1983 invtfper.p Mean test accuracy: 0.833 Precision 0.868 Recall 0.837 F-measure 0.851 citation X 553 Y 553 cittfper.p Mean test accuracy: 0.868 Precision 0.842 Recall 0.949 F-measure 0.892
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
# TF-IDF features + random forest.
# NOTE(review): the default n_estimators changed 10 -> 100 in sklearn 0.22
# (the cell output's FutureWarning); pin it explicitly for reproducibility.
pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
# Fix: dropped the unused local `cv` (evaluate() builds its own splitter)
# and removed the commented-out hyperparameter remnant.
name = "tfrfc"
evaluate(corpora, pipeline, name)
description X 588 Y 588 destfrfc.p
C:\Users\harip\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22. "10 in version 0.20 to 100 in 0.22.", FutureWarning)
Mean test accuracy: 0.752 Precision 0.842 Recall 0.7 F-measure 0.763 installation X 1625 Y 1625 instfrfc.p Mean test accuracy: 0.836 Precision 0.9 Recall 0.803 F-measure 0.848 invocation X 1983 Y 1983 invtfrfc.p Mean test accuracy: 0.794 Precision 0.856 Recall 0.769 F-measure 0.81 citation X 553 Y 553 cittfrfc.p Mean test accuracy: 0.799 Precision 0.767 Recall 0.937 F-measure 0.843
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
# TF-IDF features + decision tree.
# Fix: the original used CountVectorizer here even though the model tag is
# "tfdtc" and TfidfVectorizer is what this cell imports -- align the
# vectorizer with the stated TF-IDF intent. Also dropped the unused local
# `cv`; evaluate() builds its own splitter.
pipeline = make_pipeline(TfidfVectorizer(), DecisionTreeClassifier())
name = "tfdtc"
evaluate(corpora, pipeline, name)
description X 588 Y 588 destfdtc.p Mean test accuracy: 0.741 Precision 0.79 Recall 0.744 F-measure 0.765 installation X 1625 Y 1625 instfdtc.p Mean test accuracy: 0.821 Precision 0.884 Recall 0.792 F-measure 0.835 invocation X 1983 Y 1983 invtfdtc.p Mean test accuracy: 0.765 Precision 0.854 Recall 0.711 F-measure 0.776 citation X 553 Y 553 cittfdtc.p Mean test accuracy: 0.792 Precision 0.767 Recall 0.921 F-measure 0.835
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
# TF-IDF features + AdaBoost.
pipeline = make_pipeline(TfidfVectorizer(), AdaBoostClassifier())
# Fix: dropped the unused local `cv` (evaluate() builds its own splitter)
# and removed the commented-out hyperparameter remnant.
name = "tfada"
evaluate(corpora, pipeline, name)
description X 588 Y 588 destfada.p Mean test accuracy: 0.78 Precision 0.82 Recall 0.789 F-measure 0.803 installation X 1625 Y 1625 instfada.p Mean test accuracy: 0.793 Precision 0.905 Recall 0.714 F-measure 0.798 invocation X 1983 Y 1983 invtfada.p Mean test accuracy: 0.774 Precision 0.755 Recall 0.893 F-measure 0.819 citation X 553 Y 553 cittfada.p Mean test accuracy: 0.836 Precision 0.92 Recall 0.806 F-measure 0.847