Chapter 8 - Movie Review Example

In [1]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [2]:
import pandas
In [3]:
d = pandas.read_csv("data/movie_reviews.tsv", delimiter="\t")
In [6]:
# Holdout split
split = 0.7
d_train = d[:int(split*len(d))]
d_test = d[int((1-split)*len(d)):]
In [7]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(d_train.review)
In [8]:
i = 45000
j = 10
words = vectorizer.get_feature_names()[i:i+10]
pandas.DataFrame(features[j:j+7,i:i+10].todense(), columns=words)
Out[8]:
producer producer9and producers produces producing product production productions productive productively
0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 1 1 0 0 0
4 0 0 0 0 0 0 1 0 0 0
5 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0
In [9]:
float(features.getnnz())*100 / (features.shape[0]*features.shape[1])
Out[9]:
0.21303978814816443
In [10]:
from sklearn.naive_bayes import MultinomialNB
model1 = MultinomialNB()
model1.fit(features, d_train.sentiment)
Out[10]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [11]:
pred1 = model1.predict_proba(vectorizer.transform(d_test.review))
In [12]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, roc_curve
def performance(y_true, pred, color="g", ann=True):
    acc = accuracy_score(y_true, pred[:,1] > 0.5)
    auc = roc_auc_score(y_true, pred[:,1])
    fpr, tpr, thr = roc_curve(y_true, pred[:,1])
    plot(fpr, tpr, color, linewidth="3")
    xlabel("False positive rate")
    ylabel("True positive rate")
    if ann:
        annotate("Acc: %0.2f" % acc, (0.1,0.8), size=14)
        annotate("AUC: %0.2f" % auc, (0.1,0.7), size=14)
In [13]:
performance(d_test.sentiment, pred1)

tf-idf features

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(d_train.review)
In [16]:
pred2 = model1.predict_proba(vectorizer.transform(d_test.review))
performance(d_test.sentiment, pred1, ann=False)
performance(d_test.sentiment, pred2, color="b")
xlim(0,0.5)
ylim(0.5,1)
Out[16]:
(0.5, 1)

Parameter optimization

In [17]:
param_ranges = {
    "max_features": [10000, 30000, 50000, None],
    "min_df": [1,2,3],
    "nb_alpha": [0.01, 0.1, 1.0]
}
In [18]:
def build_model(max_features=None, min_df=1, nb_alpha=1.0, return_preds=False):
    vectorizer = TfidfVectorizer(max_features=max_features, min_df=min_df)
    features = vectorizer.fit_transform(d_train.review)
    model = MultinomialNB(alpha=nb_alpha)
    model.fit(features, d_train.sentiment)
    pred = model.predict_proba(vectorizer.transform(d_test.review))
    res = {
        "max_features": max_features,
        "min_df": min_df,
        "nb_alpha": nb_alpha,
        "auc": roc_auc_score(d_test.sentiment, pred[:,1])
    }
    if return_preds:
        res['preds'] = pred
    return res
In [19]:
from itertools import product
results = []
for p in product(*param_ranges.values()): 
    res = build_model(**dict(zip(param_ranges.keys(), p)))
    results.append( res )
    print res
{'max_features': 10000, 'nb_alpha': 0.01, 'auc': 0.9452288421633197, 'min_df': 1}
{'max_features': 10000, 'nb_alpha': 0.01, 'auc': 0.94534598458416985, 'min_df': 2}
{'max_features': 10000, 'nb_alpha': 0.01, 'auc': 0.94528327613097241, 'min_df': 3}
{'max_features': 10000, 'nb_alpha': 0.1, 'auc': 0.94535170553865855, 'min_df': 1}
{'max_features': 10000, 'nb_alpha': 0.1, 'auc': 0.94544454696447144, 'min_df': 2}
{'max_features': 10000, 'nb_alpha': 0.1, 'auc': 0.9453788017032404, 'min_df': 3}
{'max_features': 10000, 'nb_alpha': 1.0, 'auc': 0.9451069192188557, 'min_df': 1}
{'max_features': 10000, 'nb_alpha': 1.0, 'auc': 0.94514844185428892, 'min_df': 2}
{'max_features': 10000, 'nb_alpha': 1.0, 'auc': 0.94509984639498434, 'min_df': 3}
{'max_features': 30000, 'nb_alpha': 0.01, 'auc': 0.96420321713304125, 'min_df': 1}
{'max_features': 30000, 'nb_alpha': 0.01, 'auc': 0.96356040650699282, 'min_df': 2}
{'max_features': 30000, 'nb_alpha': 0.01, 'auc': 0.96297866511088048, 'min_df': 3}
{'max_features': 30000, 'nb_alpha': 0.1, 'auc': 0.96332309138802907, 'min_df': 1}
{'max_features': 30000, 'nb_alpha': 0.1, 'auc': 0.96286801427537505, 'min_df': 2}
{'max_features': 30000, 'nb_alpha': 0.1, 'auc': 0.96222698001875662, 'min_df': 3}
{'max_features': 30000, 'nb_alpha': 1.0, 'auc': 0.95663438781365639, 'min_df': 1}
{'max_features': 30000, 'nb_alpha': 1.0, 'auc': 0.9564270685211389, 'min_df': 2}
{'max_features': 30000, 'nb_alpha': 1.0, 'auc': 0.9559847525329952, 'min_df': 3}
{'max_features': 50000, 'nb_alpha': 0.01, 'auc': 0.97030439304911353, 'min_df': 1}
{'max_features': 50000, 'nb_alpha': 0.01, 'auc': 0.96733456459245626, 'min_df': 2}
{'max_features': 50000, 'nb_alpha': 0.01, 'auc': 0.96336889820855887, 'min_df': 3}
{'max_features': 50000, 'nb_alpha': 0.1, 'auc': 0.96838833698005011, 'min_df': 1}
{'max_features': 50000, 'nb_alpha': 0.1, 'auc': 0.96585363413380598, 'min_df': 2}
{'max_features': 50000, 'nb_alpha': 0.1, 'auc': 0.96251569229733958, 'min_df': 3}
{'max_features': 50000, 'nb_alpha': 1.0, 'auc': 0.95877558564647003, 'min_df': 1}
{'max_features': 50000, 'nb_alpha': 1.0, 'auc': 0.95769994171026562, 'min_df': 2}
{'max_features': 50000, 'nb_alpha': 1.0, 'auc': 0.95611163233186192, 'min_df': 3}
{'max_features': None, 'nb_alpha': 0.01, 'auc': 0.97338569521829166, 'min_df': 1}
{'max_features': None, 'nb_alpha': 0.01, 'auc': 0.96733456459245626, 'min_df': 2}
{'max_features': None, 'nb_alpha': 0.01, 'auc': 0.96336889820855887, 'min_df': 3}
{'max_features': None, 'nb_alpha': 0.1, 'auc': 0.97083593241650989, 'min_df': 1}
{'max_features': None, 'nb_alpha': 0.1, 'auc': 0.96585363413380598, 'min_df': 2}
{'max_features': None, 'nb_alpha': 0.1, 'auc': 0.96251569229733958, 'min_df': 3}
{'max_features': None, 'nb_alpha': 1.0, 'auc': 0.95977324565853028, 'min_df': 1}
{'max_features': None, 'nb_alpha': 1.0, 'auc': 0.95769994171026562, 'min_df': 2}
{'max_features': None, 'nb_alpha': 1.0, 'auc': 0.95611163233186192, 'min_df': 3}
In [20]:
opt = pandas.DataFrame(results)
In [21]:
mf_idx = [0,9,18,27]
plot(opt.max_features[mf_idx], opt.auc[mf_idx], linewidth=2)
title("AUC vs max_features")
Out[21]:
<matplotlib.text.Text at 0x108e2d150>
In [22]:
mdf_idx = [27,28,29]
plot(opt.min_df[mdf_idx], opt.auc[mdf_idx], linewidth=2)
title("AUC vs min_df")
Out[22]:
<matplotlib.text.Text at 0x10982b650>
In [23]:
nba_idx = [27,30,33]
plot(opt.nb_alpha[nba_idx], opt.auc[nba_idx], linewidth=2)
title("AUC vs alpha")
Out[23]:
<matplotlib.text.Text at 0x10a0e80d0>
In [24]:
pred3 = build_model(nb_alpha=0.01, return_preds=True)['preds']
performance(d_test.sentiment, pred1, ann=False)
performance(d_test.sentiment, pred2, color="b", ann=False)
performance(d_test.sentiment, pred3, color="r")
xlim(0,0.5)
ylim(0.5,1)
Out[24]:
(0.5, 1)

Random Forest

In [25]:
vectorizer = TfidfVectorizer(strip_accents='unicode', stop_words='english', min_df=3, max_features=30000, norm="l2")
features = vectorizer.fit_transform(d_train.review)
In [26]:
model3 = MultinomialNB()
model3.fit(features, d_train.sentiment)
pred3 = model3.predict_proba(vectorizer.transform(d_test.review))
performance(d_test.sentiment, pred3)
In [27]:
from sklearn.ensemble import RandomForestClassifier
model2 = RandomForestClassifier(n_estimators=100)
model2.fit(features, d_train.sentiment)
Out[27]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [28]:
pred2 = model2.predict_proba(vectorizer.transform(d_test.review))
performance(d_test.sentiment, pred2)

Word2Vec

In [29]:
import re, string
stop_words = set(['all', "she'll", "don't", 'being', 'over', 'through', 'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should', "he'd", 'to', 'only', "there's", 'those', 'under', 'ours', 'has', "haven't", 'do', 'them', 'his', "they'll", 'very', "who's", "they'd", 'cannot', "you've", 'they', 'not', 'during', 'yourself', 'him', 'nor', "we'll", 'did', "they've", 'this', 'she', 'each', "won't", 'where', "mustn't", "isn't", "i'll", "why's", 'because', "you'd", 'doing', 'some', 'up', 'are', 'further', 'ourselves', 'out', 'what', 'for', 'while', "wasn't", 'does', "shouldn't", 'above', 'between', 'be', 'we', 'who', "you're", 'were', 'here', 'hers', "aren't", 'by', 'both', 'about', 'would', 'of', 'could', 'against', "i'd", "weren't", "i'm", 'or', "can't", 'own', 'into', 'whom', 'down', "hadn't", "couldn't", 'your', "doesn't", 'from', "how's", 'her', 'their', "it's", 'there', 'been', 'why', 'few', 'too', 'themselves', 'was', 'until', 'more', 'himself', "where's", "i've", 'with', "didn't", "what's", 'but', 'herself', 'than', "here's", 'he', 'me', "they're", 'myself', 'these', "hasn't", 'below', 'ought', 'theirs', 'my', "wouldn't", "we'd", 'and', 'then', 'is', 'am', 'it', 'an', 'as', 'itself', 'at', 'have', 'in', 'any', 'if', 'again', 'no', 'that', 'when', 'same', 'how', 'other', 'which', 'you', "shan't", 'our', 'after', "let's", 'most', 'such', 'on', "he'll", 'a', 'off', 'i', "she'd", 'yours', "you'll", 'so', "we're", "she's", 'the', "that's", 'having', 'once'])

def tokenize(docs):
    pattern = re.compile('[\W_]+', re.UNICODE)
    sentences = []
    for d in docs:
        sentence = d.lower().split(" ") 
        sentence = [pattern.sub('', w) for w in sentence]
        sentences.append( [w for w in sentence if w not in stop_words] )
    return sentences
In [30]:
print list(stop_words)
['all', "she'll", "don't", 'being', 'over', 'through', 'yourselves', 'its', 'before', "he's", "when's", "we've", 'had', 'should', "he'd", 'to', 'only', "there's", "here's", 'under', 'ours', 'has', "haven't", 'do', 'them', 'his', 'above', 'very', "who's", "they'd", 'cannot', "you've", 'they', 'not', 'during', 'him', 'nor', "we'll", 'did', "they've", 'this', 'she', 'each', "won't", 'where', "mustn't", "isn't", "i'll", "why's", 'because', "you'd", 'doing', 'theirs', 'some', "hasn't", 'are', 'further', 'ourselves', 'out', 'what', 'for', 'herself', 'below', 'does', "shouldn't", "they'll", 'between', 'be', 'we', 'after', "doesn't", 'here', 'hers', "aren't", 'by', 'both', 'about', 'her', 'of', 'could', 'against', "i'd", "weren't", "i'm", 'or', "can't", 'own', 'into', 'yourself', 'down', "hadn't", "couldn't", 'your', "you're", 'from', "how's", 'would', 'their', "it's", 'there', 'been', "he'll", 'whom', 'too', 'themselves', 'was', 'until', 'more', 'himself', "i've", 'am', "what's", 'but', 'it', 'with', 'than', 'those', 'he', 'me', "they're", 'myself', "wasn't", 'up', 'while', 'ought', 'were', 'my', "wouldn't", "we'd", 'and', 'then', 'is', "didn't", 'few', 'an', 'as', 'itself', 'at', 'have', 'in', 'any', 'if', 'these', "let's", 'no', 'that', 'when', 'same', 'how', 'other', 'which', 'you', "shan't", 'again', 'our', 'who', "where's", 'most', 'such', 'on', 'why', 'a', 'off', 'i', "she'd", 'having', "you'll", 'so', "we're", "she's", 'the', 'once', 'yours', "that's"]
In [31]:
sentences = tokenize(d_train.review)
In [33]:
from gensim.models.word2vec import Word2Vec
model = Word2Vec(sentences, size=300, window=10, min_count=1, sample=1e-3, workers=2)
In [34]:
model.init_sims(replace=True)
In [35]:
model['movie']
Out[35]:
array([  3.68031524e-02,  -1.73313618e-02,   2.39172988e-02,
         2.63126623e-02,  -2.09560362e-03,   2.88584847e-02,
         1.84983388e-02,   1.61847845e-02,   3.55088450e-02,
        -1.09551460e-01,   8.66681430e-03,   5.40465722e-03,
         3.01059918e-04,  -6.86682537e-02,   9.07223746e-02,
        -2.10372563e-02,   3.71289365e-02,   7.52746388e-02,
         2.14488988e-04,  -1.00362170e-02,  -1.01014741e-01,
         1.02308214e-01,   3.20761763e-02,  -5.46756908e-02,
        -1.42806098e-02,   5.29203713e-02,   6.76437244e-02,
         4.52545471e-02,  -3.38600017e-02,  -9.30002108e-02,
         2.45192088e-02,   5.53836077e-02,  -7.16699800e-03,
         5.90483360e-02,  -8.47217366e-02,   6.08438440e-02,
        -7.18232915e-02,   4.67804708e-02,   1.32195026e-01,
         1.17428429e-01,  -4.01911624e-02,   6.89821271e-03,
         2.51749437e-02,   2.66396683e-02,   3.70586081e-03,
         1.63978413e-02,   4.36227731e-02,  -7.04589635e-02,
        -1.55292740e-02,   8.42350423e-02,   1.51802972e-01,
         3.18416655e-02,  -4.92057689e-02,  -3.66481440e-03,
        -4.09203470e-02,  -1.18032508e-02,   9.20815486e-03,
         6.21036775e-02,  -6.40419051e-02,  -4.64000776e-02,
         1.51039407e-01,  -1.50426120e-01,  -1.33195787e-03,
        -1.60630178e-02,  -6.76299818e-03,  -4.00880203e-02,
        -4.95725572e-02,   2.77464017e-02,  -5.78683876e-02,
        -4.15379368e-03,  -6.82369387e-03,  -1.12992570e-01,
         1.90758295e-02,  -3.58685909e-04,  -8.61261711e-02,
         6.88697174e-02,  -7.72073492e-02,   5.14152534e-02,
         4.87889685e-02,  -9.20623261e-03,   2.75032986e-02,
        -8.93953349e-03,  -7.17302263e-02,   2.63757426e-02,
         1.17861174e-01,  -9.01978835e-02,  -9.88838151e-02,
        -1.53771825e-02,  -6.96591660e-02,  -1.07649993e-03,
        -1.11715309e-01,  -5.69908284e-02,  -3.45526747e-02,
        -1.03636552e-02,  -1.06374146e-02,  -4.84549142e-02,
        -5.06430119e-02,  -9.12421271e-02,  -4.20696139e-02,
         7.88904428e-02,  -6.89790249e-02,   7.52062025e-03,
        -2.90804580e-02,   3.07238027e-02,   6.54164553e-02,
         3.93598229e-02,   1.45371864e-02,   6.26129424e-03,
        -9.42131132e-02,   8.22537392e-03,  -7.89110214e-02,
         8.79941583e-02,   1.90700181e-02,  -1.34457862e-02,
        -4.17764559e-02,   1.03315689e-01,   3.10423244e-02,
        -1.09508671e-02,   1.61643643e-02,   8.63924101e-02,
         9.44718905e-03,  -4.84026298e-02,   3.16865556e-02,
        -3.33051052e-04,  -8.11982155e-02,  -4.91626225e-02,
         3.83723192e-02,   9.36327651e-02,   1.75947472e-02,
         7.99207482e-03,  -8.33873153e-02,   7.32554644e-02,
         8.55481252e-02,   3.38279977e-02,  -6.30003661e-02,
        -1.06950305e-01,  -2.57532764e-03,  -8.69107619e-02,
        -8.38905945e-02,  -2.52390001e-02,   4.47176024e-02,
         2.98092533e-02,  -3.36933024e-02,   5.62217310e-02,
         4.98724449e-03,  -1.04179740e-01,   4.71587963e-02,
         8.17830581e-03,  -1.81893595e-02,  -6.29652515e-02,
         2.12100241e-03,  -1.26776434e-02,  -3.99298556e-02,
        -7.92319179e-02,  -3.71179506e-02,  -4.98937219e-02,
        -4.40028273e-02,  -6.31705299e-02,   2.12799329e-02,
         1.09728582e-01,  -3.11787007e-04,   8.88880417e-02,
         4.32723388e-02,  -9.17474031e-02,   7.68122748e-02,
        -9.60509703e-02,   1.79586820e-02,   1.54920649e-02,
        -5.79718761e-02,  -9.94104985e-03,  -6.22458830e-02,
        -5.81062352e-03,   3.41688655e-02,  -3.06482129e-02,
         8.93115476e-02,   1.56571844e-03,  -3.13598663e-02,
         2.58147363e-02,   8.44683573e-02,  -7.09150508e-02,
         3.52193490e-02,  -8.44642222e-02,   1.15281150e-01,
         4.13945317e-02,  -7.53380284e-02,   1.06682628e-02,
        -2.90197339e-02,  -1.93742830e-02,   2.80632749e-02,
         5.98113611e-03,  -4.81246673e-02,   5.81961051e-02,
        -1.21646203e-01,  -7.56746012e-05,   1.37762520e-02,
        -1.07660465e-01,  -5.49132526e-02,  -2.08277833e-02,
        -1.07252866e-01,   7.81668052e-02,  -1.25467703e-01,
         1.28760673e-02,   3.40522267e-02,  -4.53600958e-02,
         3.00191492e-02,  -1.05644763e-03,   2.28488427e-02,
        -1.36948330e-02,  -4.83282395e-02,   3.76505554e-02,
         2.14589084e-03,  -6.56597763e-02,  -2.17904039e-02,
        -1.30352750e-01,   1.00631500e-02,  -2.68849730e-02,
         3.97608057e-02,  -7.62850717e-02,  -1.71926096e-02,
         9.73487180e-03,   1.00726984e-01,  -3.03553939e-02,
         3.96722928e-02,  -3.65546495e-02,  -4.54760306e-02,
        -1.08036343e-02,  -2.10739505e-02,  -1.74224488e-02,
         2.45836675e-02,   9.26169008e-02,   1.69083904e-02,
        -1.93465408e-02,   4.19885060e-03,   6.44169822e-02,
         2.11614668e-02,   7.29327044e-03,  -1.25794038e-01,
         2.51813382e-02,   7.73636624e-02,   8.36196244e-02,
        -4.76672724e-02,   3.01581249e-03,  -1.89450905e-02,
        -2.14905385e-02,   7.21007884e-02,  -5.47636338e-02,
         7.87780955e-02,   6.38814420e-02,  -1.82743594e-02,
         2.52366532e-03,   5.32709819e-04,  -1.36896092e-02,
         7.07334876e-02,  -6.54596016e-02,  -9.79023874e-02,
         4.26013544e-02,   6.34806380e-02,   1.94339901e-02,
        -1.79679878e-02,   3.58978361e-02,  -2.59635057e-02,
        -5.14540309e-03,  -3.97503264e-02,   3.99754904e-02,
         1.13015451e-01,   1.92152523e-02,   8.84376913e-02,
        -6.68382198e-02,   1.51651455e-02,   1.41883986e-02,
         3.25143524e-02,   4.63566855e-02,  -1.13264881e-01,
        -1.22436136e-02,  -3.99175175e-02,  -1.39550604e-02,
        -6.48894459e-02,  -9.74341184e-02,  -1.99261121e-02,
         1.34053323e-02,  -1.10854441e-02,  -2.31818724e-02,
         6.56687096e-02,   9.27144662e-02,   6.97587132e-02,
         3.42045613e-02,   4.58224751e-02,  -1.15564905e-01,
        -4.97712307e-02,  -1.05411328e-01,  -5.43938540e-02,
        -1.27959177e-02,  -1.02667041e-01,  -9.49689746e-03,
         3.08882631e-02,   1.99520700e-02,  -5.47346135e-04,
         5.61500862e-02,  -4.44469228e-02,  -2.57499572e-02], dtype=float32)
In [36]:
def featurize_w2v(model, sentences):
    f = zeros((len(sentences), model.vector_size))
    for i,s in enumerate(sentences):
        for w in s:
            try:
                vec = model[w]
            except KeyError:
                continue
            f[i,:] = f[i,:] + vec
        f[i,:] = f[i,:] / len(s)
    return f
In [37]:
features_w2v = featurize_w2v(model, sentences)
In [38]:
model4 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
model4.fit(features_w2v, d_train.sentiment)
Out[38]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [39]:
test_sentences = tokenize(d_test.review)
In [40]:
test_features_w2v = featurize_w2v(model, test_sentences)
In [41]:
pred4 = model4.predict_proba(test_features_w2v)
In [42]:
performance(d_test.sentiment, pred1, ann=False)
performance(d_test.sentiment, pred2, color="b", ann=False)
performance(d_test.sentiment, pred3, color="r", ann=False)
performance(d_test.sentiment, pred4, color="c")
xlim(0,0.3)
ylim(0.6,1)
Out[42]:
(0.6, 1)
In [48]:
examples = [
        "This movie is bad",
        "This movie is great",
        "I was going to say something awesome, but I simply can't because the movie is so bad.",
        "I was going to say something awesome or great or good, but I simply can't because the movie is so bad.",
        "It might have bad actors, but everything else is good."
    ]
example_feat4 = featurize_w2v(model, tokenize(examples))
model4.predict(example_feat4)
Out[48]:
array([0, 1, 0, 0, 0])