%pylab inline
Populating the interactive namespace from numpy and matplotlib
import pandas
d = pandas.read_csv("data/movie_reviews.tsv", delimiter="\t")

# Holdout split: the first `split` fraction of the rows is used for training
# and the remaining rows for testing.  Both slices share a single boundary
# index so the two sets are disjoint; slicing the test set from (1 - split)
# onward would overlap the training rows and leak them into the evaluation.
split = 0.7
split_idx = int(split * len(d))
d_train = d[:split_idx]
d_test = d[split_idx:]
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: the vectorizer is fitted on the training reviews only.
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(d_train.review)
# Column offset (i) and row offset (j) of the matrix slice displayed below.
i = 45000
j = 10
# Vocabulary terms that label the 10 displayed columns.
words = vectorizer.get_feature_names()[i:i+10]
# Show a 7-row by 10-column window of the feature matrix as a labelled table.
pandas.DataFrame(features[j:j+7,i:i+10].todense(), columns=words)
producer | producer9and | producers | produces | producing | product | production | productions | productive | productively | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Percentage of entries in the feature matrix that are stored as non-zero.
100.0 * features.getnnz() / (features.shape[0] * features.shape[1])
0.21303978814816443
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier fitted on the current `features` matrix
# with the training sentiment labels as targets.
model1 = MultinomialNB()
model1.fit(features, d_train.sentiment)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
# Class probabilities for the test reviews; the test text is transformed with
# the already-fitted vectorizer (no refitting on test data).
pred1 = model1.predict_proba(vectorizer.transform(d_test.review))
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
def performance(y_true, pred, color="g", ann=True):
    """Plot a ROC curve for ``pred`` and optionally annotate accuracy and AUC.

    Parameters:
        y_true: true labels of the evaluated samples.
        pred: 2-D array whose column 1 holds the score of the positive class
            (for example the output of ``predict_proba``).
        color: matplotlib format/colour string used for the curve.
        ann: when true, write the accuracy and AUC onto the current axes.

    Accuracy is computed by thresholding the positive-class score at 0.5.
    The function draws on the current matplotlib axes and returns nothing.
    """
    scores = pred[:, 1]
    acc = accuracy_score(y_true, scores > 0.5)
    auc = roc_auc_score(y_true, scores)
    fpr, tpr, _ = roc_curve(y_true, scores)
    # Line width is numeric; the thresholds returned by roc_curve are unused.
    plot(fpr, tpr, color, linewidth=3)
    xlabel("False positive rate")
    ylabel("True positive rate")
    if ann:
        annotate("Acc: %0.2f" % acc, (0.1, 0.8), size=14)
        annotate("AUC: %0.2f" % auc, (0.1, 0.7), size=14)
# ROC curve, accuracy and AUC of the count-based naive Bayes model.
performance(d_test.sentiment, pred1)
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighted features, fitted on the training reviews only.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(d_train.review)
# Train a fresh classifier on the TF-IDF features.  Reusing `model1` here
# would score TF-IDF inputs with parameters learned from raw counts, so the
# comparison between the two feature types would be meaningless.
model1_tfidf = MultinomialNB()
model1_tfidf.fit(features, d_train.sentiment)
pred2 = model1_tfidf.predict_proba(vectorizer.transform(d_test.review))
# Overlay both ROC curves; only the second call annotates accuracy and AUC.
performance(d_test.sentiment, pred1, ann=False)
performance(d_test.sentiment, pred2, color="b")
# Restrict the axes to the upper-left region of the ROC plot.
xlim(0,0.5)
ylim(0.5,1)
(0.5, 1)
# Hyper-parameter grid; each key matches a keyword argument of build_model.
param_ranges = {
# Passed to TfidfVectorizer(max_features=...).
"max_features": [10000, 30000, 50000, None],
# Passed to TfidfVectorizer(min_df=...).
"min_df": [1,2,3],
# Passed to MultinomialNB(alpha=...).
"nb_alpha": [0.01, 0.1, 1.0]
}
def build_model(max_features=None, min_df=1, nb_alpha=1.0, return_preds=False):
    """Fit a TF-IDF + multinomial naive Bayes model and score it on d_test.

    Parameters:
        max_features: forwarded to ``TfidfVectorizer(max_features=...)``.
        min_df: forwarded to ``TfidfVectorizer(min_df=...)``.
        nb_alpha: forwarded to ``MultinomialNB(alpha=...)``.
        return_preds: when true, include the test probabilities in the result.

    Returns:
        A dict with the three parameter values, the test-set ``auc`` and,
        if requested, the ``preds`` array returned by ``predict_proba``.
    """
    tfidf = TfidfVectorizer(max_features=max_features, min_df=min_df)
    train_matrix = tfidf.fit_transform(d_train.review)

    classifier = MultinomialNB(alpha=nb_alpha)
    classifier.fit(train_matrix, d_train.sentiment)

    # The test text is only transformed, never used to fit the vectorizer.
    test_matrix = tfidf.transform(d_test.review)
    test_proba = classifier.predict_proba(test_matrix)
    test_auc = roc_auc_score(d_test.sentiment, test_proba[:, 1])

    summary = {
        "max_features": max_features,
        "min_df": min_df,
        "nb_alpha": nb_alpha,
        "auc": test_auc
    }
    if not return_preds:
        return summary
    summary['preds'] = test_proba
    return summary
from itertools import product
# Exhaustive grid search: evaluate build_model on every parameter combination.
results = []
# Fix the key order once so names and values are guaranteed to line up.
param_names = list(param_ranges)
for p in product(*(param_ranges[name] for name in param_names)):
    res = build_model(**dict(zip(param_names, p)))
    results.append(res)
    # Parenthesised form works as a statement (Python 2) and a call (Python 3).
    print(res)
{'max_features': 10000, 'nb_alpha': 0.01, 'auc': 0.9452288421633197, 'min_df': 1} {'max_features': 10000, 'nb_alpha': 0.01, 'auc': 0.94534598458416985, 'min_df': 2} {'max_features': 10000, 'nb_alpha': 0.01, 'auc': 0.94528327613097241, 'min_df': 3} {'max_features': 10000, 'nb_alpha': 0.1, 'auc': 0.94535170553865855, 'min_df': 1} {'max_features': 10000, 'nb_alpha': 0.1, 'auc': 0.94544454696447144, 'min_df': 2} {'max_features': 10000, 'nb_alpha': 0.1, 'auc': 0.9453788017032404, 'min_df': 3} {'max_features': 10000, 'nb_alpha': 1.0, 'auc': 0.9451069192188557, 'min_df': 1} {'max_features': 10000, 'nb_alpha': 1.0, 'auc': 0.94514844185428892, 'min_df': 2} {'max_features': 10000, 'nb_alpha': 1.0, 'auc': 0.94509984639498434, 'min_df': 3} {'max_features': 30000, 'nb_alpha': 0.01, 'auc': 0.96420321713304125, 'min_df': 1} {'max_features': 30000, 'nb_alpha': 0.01, 'auc': 0.96356040650699282, 'min_df': 2} {'max_features': 30000, 'nb_alpha': 0.01, 'auc': 0.96297866511088048, 'min_df': 3} {'max_features': 30000, 'nb_alpha': 0.1, 'auc': 0.96332309138802907, 'min_df': 1} {'max_features': 30000, 'nb_alpha': 0.1, 'auc': 0.96286801427537505, 'min_df': 2} {'max_features': 30000, 'nb_alpha': 0.1, 'auc': 0.96222698001875662, 'min_df': 3} {'max_features': 30000, 'nb_alpha': 1.0, 'auc': 0.95663438781365639, 'min_df': 1} {'max_features': 30000, 'nb_alpha': 1.0, 'auc': 0.9564270685211389, 'min_df': 2} {'max_features': 30000, 'nb_alpha': 1.0, 'auc': 0.9559847525329952, 'min_df': 3} {'max_features': 50000, 'nb_alpha': 0.01, 'auc': 0.97030439304911353, 'min_df': 1} {'max_features': 50000, 'nb_alpha': 0.01, 'auc': 0.96733456459245626, 'min_df': 2} {'max_features': 50000, 'nb_alpha': 0.01, 'auc': 0.96336889820855887, 'min_df': 3} {'max_features': 50000, 'nb_alpha': 0.1, 'auc': 0.96838833698005011, 'min_df': 1} {'max_features': 50000, 'nb_alpha': 0.1, 'auc': 0.96585363413380598, 'min_df': 2} {'max_features': 50000, 'nb_alpha': 0.1, 'auc': 0.96251569229733958, 'min_df': 3} {'max_features': 50000, 
'nb_alpha': 1.0, 'auc': 0.95877558564647003, 'min_df': 1} {'max_features': 50000, 'nb_alpha': 1.0, 'auc': 0.95769994171026562, 'min_df': 2} {'max_features': 50000, 'nb_alpha': 1.0, 'auc': 0.95611163233186192, 'min_df': 3} {'max_features': None, 'nb_alpha': 0.01, 'auc': 0.97338569521829166, 'min_df': 1} {'max_features': None, 'nb_alpha': 0.01, 'auc': 0.96733456459245626, 'min_df': 2} {'max_features': None, 'nb_alpha': 0.01, 'auc': 0.96336889820855887, 'min_df': 3} {'max_features': None, 'nb_alpha': 0.1, 'auc': 0.97083593241650989, 'min_df': 1} {'max_features': None, 'nb_alpha': 0.1, 'auc': 0.96585363413380598, 'min_df': 2} {'max_features': None, 'nb_alpha': 0.1, 'auc': 0.96251569229733958, 'min_df': 3} {'max_features': None, 'nb_alpha': 1.0, 'auc': 0.95977324565853028, 'min_df': 1} {'max_features': None, 'nb_alpha': 1.0, 'auc': 0.95769994171026562, 'min_df': 2} {'max_features': None, 'nb_alpha': 1.0, 'auc': 0.95611163233186192, 'min_df': 3}
opt = pandas.DataFrame(results)
# Vary max_features while holding min_df=1 and nb_alpha=0.01.  Rows are chosen
# by parameter value rather than by hard-coded position, because positions
# depend on the iteration order of the `param_ranges` dict.
mf_idx = opt.index[(opt.min_df == 1) & (opt.nb_alpha == 0.01)].tolist()
plot(opt.max_features[mf_idx], opt.auc[mf_idx], linewidth=2)
title("AUC vs max_features")
<matplotlib.text.Text at 0x108e2d150>
# Vary min_df while holding max_features=None (stored as a null value in the
# frame) and nb_alpha=0.01.  Rows are chosen by value, not by position, so
# the plot does not depend on the order in which the grid was enumerated.
mdf_idx = opt.index[opt.max_features.isnull() & (opt.nb_alpha == 0.01)].tolist()
plot(opt.min_df[mdf_idx], opt.auc[mdf_idx], linewidth=2)
title("AUC vs min_df")
<matplotlib.text.Text at 0x10982b650>
# Vary nb_alpha while holding max_features=None (stored as a null value in
# the frame) and min_df=1.  Rows are chosen by value, not by position, so the
# plot does not depend on the order in which the grid was enumerated.
nba_idx = opt.index[opt.max_features.isnull() & (opt.min_df == 1)].tolist()
plot(opt.nb_alpha[nba_idx], opt.auc[nba_idx], linewidth=2)
title("AUC vs alpha")
<matplotlib.text.Text at 0x10a0e80d0>
# Rebuild the TF-IDF + naive Bayes model with alpha=0.01 (other parameters at
# their defaults) and keep its test-set probabilities.
pred3 = build_model(nb_alpha=0.01, return_preds=True)['preds']
# Overlay the three ROC curves; only the last call annotates accuracy and AUC.
performance(d_test.sentiment, pred1, ann=False)
performance(d_test.sentiment, pred2, color="b", ann=False)
performance(d_test.sentiment, pred3, color="r")
# Restrict the axes to the upper-left region of the ROC plot.
xlim(0,0.5)
ylim(0.5,1)
(0.5, 1)
# TF-IDF with accent stripping, English stop-word removal, a minimum document
# frequency of 3 and a vocabulary capped at 30000 terms.
# NOTE: this rebinds `vectorizer`, `features` and `pred3` for the cells below.
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', min_df=3, max_features=30000, norm="l2")
features = vectorizer.fit_transform(d_train.review)
# Naive Bayes with default settings on the new features.
model3 = MultinomialNB()
model3.fit(features, d_train.sentiment)
pred3 = model3.predict_proba(vectorizer.transform(d_test.review))
performance(d_test.sentiment, pred3)
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees, trained on the current TF-IDF `features`.
model2 = RandomForestClassifier(n_estimators=100)
model2.fit(features, d_train.sentiment)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# NOTE: this rebinds `pred2`, which previously held naive Bayes predictions.
pred2 = model2.predict_proba(vectorizer.transform(d_test.review))
performance(d_test.sentiment, pred2)
import re, string
# Lower-case English stop words (including contractions) that tokenize() drops.
stop_words = set(['all', "she'll", "don't", 'being', 'over', 'through', 'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should', "he'd", 'to', 'only', "there's", 'those', 'under', 'ours', 'has', "haven't", 'do', 'them', 'his', "they'll", 'very', "who's", "they'd", 'cannot', "you've", 'they', 'not', 'during', 'yourself', 'him', 'nor', "we'll", 'did', "they've", 'this', 'she', 'each', "won't", 'where', "mustn't", "isn't", "i'll", "why's", 'because', "you'd", 'doing', 'some', 'up', 'are', 'further', 'ourselves', 'out', 'what', 'for', 'while', "wasn't", 'does', "shouldn't", 'above', 'between', 'be', 'we', 'who', "you're", 'were', 'here', 'hers', "aren't", 'by', 'both', 'about', 'would', 'of', 'could', 'against', "i'd", "weren't", "i'm", 'or', "can't", 'own', 'into', 'whom', 'down', "hadn't", "couldn't", 'your', "doesn't", 'from', "how's", 'her', 'their', "it's", 'there', 'been', 'why', 'few', 'too', 'themselves', 'was', 'until', 'more', 'himself', "where's", "i've", 'with', "didn't", "what's", 'but', 'herself', 'than', "here's", 'he', 'me', "they're", 'myself', 'these', "hasn't", 'below', 'ought', 'theirs', 'my', "wouldn't", "we'd", 'and', 'then', 'is', 'am', 'it', 'an', 'as', 'itself', 'at', 'have', 'in', 'any', 'if', 'again', 'no', 'that', 'when', 'same', 'how', 'other', 'which', 'you', "shan't", 'our', 'after', "let's", 'most', 'such', 'on', "he'll", 'a', 'off', 'i', "she'd", 'yours', "you'll", 'so', "we're", "she's", 'the', "that's", 'having', 'once'])
def tokenize(docs, stopwords=None):
    """Split documents into lower-cased word tokens with stop words removed.

    Parameters:
        docs: iterable of strings, one per document.
        stopwords: optional collection of lower-case words to drop; when
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        A list containing one list of tokens per document.  Tokens have all
        non-word characters and underscores removed; tokens that become empty
        (pure punctuation) are dropped.
    """
    if stopwords is None:
        stopwords = stop_words
    pattern = re.compile(r'[\W_]+', re.UNICODE)
    sentences = []
    for doc in docs:
        tokens = []
        # split() without arguments also handles tabs, newlines and runs of
        # spaces, none of which should produce tokens.
        for raw in doc.lower().split():
            word = pattern.sub('', raw)
            if not word:
                continue
            # Check the stop list against both the cleaned token and the raw
            # token with only surrounding punctuation removed: contractions
            # such as "don't" lose their apostrophe during cleaning and would
            # otherwise never match their stop-list entry.
            if word in stopwords or raw.strip(string.punctuation) in stopwords:
                continue
            tokens.append(word)
        sentences.append(tokens)
    return sentences
# Parenthesised form works as a statement (Python 2) and a call (Python 3).
print(list(stop_words))
['all', "she'll", "don't", 'being', 'over', 'through', 'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should', "he'd", 'to', 'only', "there's", "here's", 'under', 'ours', 'has', "haven't", 'do', 'them', 'his', 'above', 'very', "who's", "they'd", 'cannot', "you've", 'they', 'not', 'during', 'him', 'nor', "we'll", 'did', "they've", 'this', 'she', 'each', "won't", 'where', "mustn't", "isn't", "i'll", "why's", 'because', "you'd", 'doing', 'theirs', 'some', "hasn't", 'are', 'further', 'ourselves', 'out', 'what', 'for', 'herself', 'below', 'does', "shouldn't", "they'll", 'between', 'be', 'we', 'after', "doesn't", 'here', 'hers', "aren't", 'by', 'both', 'about', 'her', 'of', 'could', 'against', "i'd", "weren't", "i'm", 'or', "can't", 'own', 'into', 'yourself', 'down', "hadn't", "couldn't", 'your', "you're", 'from', "how's", 'would', 'their', "it's", 'there', 'been', "he'll", 'whom', 'too', 'themselves', 'was', 'until', 'more', 'himself', "i've", 'am', "what's", 'but', 'it', 'with', 'than', 'those', 'he', 'me', "they're", 'myself', "wasn't", 'up', 'while', 'ought', 'were', 'my', "wouldn't", "we'd", 'and', 'then', 'is', "didn't", 'few', 'an', 'as', 'itself', 'at', 'have', 'in', 'any', 'if', 'these', "let's", 'no', 'that', 'when', 'same', 'how', 'other', 'which', 'you', "shan't", 'again', 'our', 'who', "where's", 'most', 'such', 'on', 'why', 'a', 'off', 'i', "she'd", 'having', "you'll", 'so', "we're", "she's", 'the', 'once', 'yours', "that's"]
# Tokenised training reviews, used both to train the embeddings and, later,
# to build the training feature matrix.
sentences = tokenize(d_train.review)
from gensim.models.word2vec import Word2Vec
# 300-dimensional embeddings; min_count=1 keeps every token seen in training.
# NOTE: this rebinds `model` to the word2vec model.
model = Word2Vec(sentences, size=300, window=10, min_count=1, sample=1e-3, workers=2)
# NOTE(review): presumably normalises the vectors in place and discards the
# training state (gensim API) — confirm against the installed gensim version.
model.init_sims(replace=True)
# Display the embedding of a single word.
model['movie']
array([ 3.68031524e-02, -1.73313618e-02, 2.39172988e-02, 2.63126623e-02, -2.09560362e-03, 2.88584847e-02, 1.84983388e-02, 1.61847845e-02, 3.55088450e-02, -1.09551460e-01, 8.66681430e-03, 5.40465722e-03, 3.01059918e-04, -6.86682537e-02, 9.07223746e-02, -2.10372563e-02, 3.71289365e-02, 7.52746388e-02, 2.14488988e-04, -1.00362170e-02, -1.01014741e-01, 1.02308214e-01, 3.20761763e-02, -5.46756908e-02, -1.42806098e-02, 5.29203713e-02, 6.76437244e-02, 4.52545471e-02, -3.38600017e-02, -9.30002108e-02, 2.45192088e-02, 5.53836077e-02, -7.16699800e-03, 5.90483360e-02, -8.47217366e-02, 6.08438440e-02, -7.18232915e-02, 4.67804708e-02, 1.32195026e-01, 1.17428429e-01, -4.01911624e-02, 6.89821271e-03, 2.51749437e-02, 2.66396683e-02, 3.70586081e-03, 1.63978413e-02, 4.36227731e-02, -7.04589635e-02, -1.55292740e-02, 8.42350423e-02, 1.51802972e-01, 3.18416655e-02, -4.92057689e-02, -3.66481440e-03, -4.09203470e-02, -1.18032508e-02, 9.20815486e-03, 6.21036775e-02, -6.40419051e-02, -4.64000776e-02, 1.51039407e-01, -1.50426120e-01, -1.33195787e-03, -1.60630178e-02, -6.76299818e-03, -4.00880203e-02, -4.95725572e-02, 2.77464017e-02, -5.78683876e-02, -4.15379368e-03, -6.82369387e-03, -1.12992570e-01, 1.90758295e-02, -3.58685909e-04, -8.61261711e-02, 6.88697174e-02, -7.72073492e-02, 5.14152534e-02, 4.87889685e-02, -9.20623261e-03, 2.75032986e-02, -8.93953349e-03, -7.17302263e-02, 2.63757426e-02, 1.17861174e-01, -9.01978835e-02, -9.88838151e-02, -1.53771825e-02, -6.96591660e-02, -1.07649993e-03, -1.11715309e-01, -5.69908284e-02, -3.45526747e-02, -1.03636552e-02, -1.06374146e-02, -4.84549142e-02, -5.06430119e-02, -9.12421271e-02, -4.20696139e-02, 7.88904428e-02, -6.89790249e-02, 7.52062025e-03, -2.90804580e-02, 3.07238027e-02, 6.54164553e-02, 3.93598229e-02, 1.45371864e-02, 6.26129424e-03, -9.42131132e-02, 8.22537392e-03, -7.89110214e-02, 8.79941583e-02, 1.90700181e-02, -1.34457862e-02, -4.17764559e-02, 1.03315689e-01, 3.10423244e-02, -1.09508671e-02, 1.61643643e-02, 8.63924101e-02, 
9.44718905e-03, -4.84026298e-02, 3.16865556e-02, -3.33051052e-04, -8.11982155e-02, -4.91626225e-02, 3.83723192e-02, 9.36327651e-02, 1.75947472e-02, 7.99207482e-03, -8.33873153e-02, 7.32554644e-02, 8.55481252e-02, 3.38279977e-02, -6.30003661e-02, -1.06950305e-01, -2.57532764e-03, -8.69107619e-02, -8.38905945e-02, -2.52390001e-02, 4.47176024e-02, 2.98092533e-02, -3.36933024e-02, 5.62217310e-02, 4.98724449e-03, -1.04179740e-01, 4.71587963e-02, 8.17830581e-03, -1.81893595e-02, -6.29652515e-02, 2.12100241e-03, -1.26776434e-02, -3.99298556e-02, -7.92319179e-02, -3.71179506e-02, -4.98937219e-02, -4.40028273e-02, -6.31705299e-02, 2.12799329e-02, 1.09728582e-01, -3.11787007e-04, 8.88880417e-02, 4.32723388e-02, -9.17474031e-02, 7.68122748e-02, -9.60509703e-02, 1.79586820e-02, 1.54920649e-02, -5.79718761e-02, -9.94104985e-03, -6.22458830e-02, -5.81062352e-03, 3.41688655e-02, -3.06482129e-02, 8.93115476e-02, 1.56571844e-03, -3.13598663e-02, 2.58147363e-02, 8.44683573e-02, -7.09150508e-02, 3.52193490e-02, -8.44642222e-02, 1.15281150e-01, 4.13945317e-02, -7.53380284e-02, 1.06682628e-02, -2.90197339e-02, -1.93742830e-02, 2.80632749e-02, 5.98113611e-03, -4.81246673e-02, 5.81961051e-02, -1.21646203e-01, -7.56746012e-05, 1.37762520e-02, -1.07660465e-01, -5.49132526e-02, -2.08277833e-02, -1.07252866e-01, 7.81668052e-02, -1.25467703e-01, 1.28760673e-02, 3.40522267e-02, -4.53600958e-02, 3.00191492e-02, -1.05644763e-03, 2.28488427e-02, -1.36948330e-02, -4.83282395e-02, 3.76505554e-02, 2.14589084e-03, -6.56597763e-02, -2.17904039e-02, -1.30352750e-01, 1.00631500e-02, -2.68849730e-02, 3.97608057e-02, -7.62850717e-02, -1.71926096e-02, 9.73487180e-03, 1.00726984e-01, -3.03553939e-02, 3.96722928e-02, -3.65546495e-02, -4.54760306e-02, -1.08036343e-02, -2.10739505e-02, -1.74224488e-02, 2.45836675e-02, 9.26169008e-02, 1.69083904e-02, -1.93465408e-02, 4.19885060e-03, 6.44169822e-02, 2.11614668e-02, 7.29327044e-03, -1.25794038e-01, 2.51813382e-02, 7.73636624e-02, 8.36196244e-02, -4.76672724e-02, 
3.01581249e-03, -1.89450905e-02, -2.14905385e-02, 7.21007884e-02, -5.47636338e-02, 7.87780955e-02, 6.38814420e-02, -1.82743594e-02, 2.52366532e-03, 5.32709819e-04, -1.36896092e-02, 7.07334876e-02, -6.54596016e-02, -9.79023874e-02, 4.26013544e-02, 6.34806380e-02, 1.94339901e-02, -1.79679878e-02, 3.58978361e-02, -2.59635057e-02, -5.14540309e-03, -3.97503264e-02, 3.99754904e-02, 1.13015451e-01, 1.92152523e-02, 8.84376913e-02, -6.68382198e-02, 1.51651455e-02, 1.41883986e-02, 3.25143524e-02, 4.63566855e-02, -1.13264881e-01, -1.22436136e-02, -3.99175175e-02, -1.39550604e-02, -6.48894459e-02, -9.74341184e-02, -1.99261121e-02, 1.34053323e-02, -1.10854441e-02, -2.31818724e-02, 6.56687096e-02, 9.27144662e-02, 6.97587132e-02, 3.42045613e-02, 4.58224751e-02, -1.15564905e-01, -4.97712307e-02, -1.05411328e-01, -5.43938540e-02, -1.27959177e-02, -1.02667041e-01, -9.49689746e-03, 3.08882631e-02, 1.99520700e-02, -5.47346135e-04, 5.61500862e-02, -4.44469228e-02, -2.57499572e-02], dtype=float32)
def featurize_w2v(model, sentences):
    """Represent each sentence by the average of its word vectors.

    Parameters:
        model: object exposing ``vector_size`` and ``model[word]`` lookup
            that raises ``KeyError`` for unknown words.
        sentences: sequence of token lists.

    Returns:
        Array of shape ``(len(sentences), model.vector_size)``.  Each row is
        the sum of the vectors of the known words divided by the total number
        of tokens in the sentence (unknown words count towards the
        denominator).  A sentence with no tokens yields an all-zero row.
    """
    f = zeros((len(sentences), model.vector_size))
    for i, s in enumerate(sentences):
        if len(s) == 0:
            # Nothing to average: keep the zero row rather than dividing by
            # zero, which would fill the row with NaN and break downstream
            # estimators.
            continue
        for w in s:
            try:
                vec = model[w]
            except KeyError:
                # Word missing from the embedding vocabulary: contributes
                # nothing to the sum.
                continue
            f[i, :] = f[i, :] + vec
        f[i, :] = f[i, :] / len(s)
    return f
# Averaged word-vector features for the training reviews.
features_w2v = featurize_w2v(model, sentences)
# Random forest with 100 trees; n_jobs=-1 requests all available cores.
model4 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
model4.fit(features_w2v, d_train.sentiment)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# Build test features with the same tokenizer and the embeddings trained on
# the training reviews.
test_sentences = tokenize(d_test.review)
test_features_w2v = featurize_w2v(model, test_sentences)
pred4 = model4.predict_proba(test_features_w2v)
# Overlay the ROC curves of all four prediction sets; only the last call
# annotates accuracy and AUC.
performance(d_test.sentiment, pred1, ann=False)
performance(d_test.sentiment, pred2, color="b", ann=False)
performance(d_test.sentiment, pred3, color="r", ann=False)
performance(d_test.sentiment, pred4, color="c")
# Restrict the axes to the upper-left region of the ROC plot.
xlim(0,0.3)
ylim(0.6,1)
(0.6, 1)
# Hand-written sentences used to spot-check the word2vec-based classifier.
examples = [
"This movie is bad",
"This movie is great",
"I was going to say something awesome, but I simply can't because the movie is so bad.",
"I was going to say something awesome or great or good, but I simply can't because the movie is so bad.",
"It might have bad actors, but everything else is good."
]
# Same tokenise -> average-vectors pipeline as for the training data.
example_feat4 = featurize_w2v(model, tokenize(examples))
# Predicted class label for each example sentence.
model4.predict(example_feat4)
array([0, 1, 0, 0, 0])