import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In this notebook you will build an algorithm that classifies social media comments as normal or toxic. As in many real-world cases, you only have a small (10^3 examples) dataset of hand-labeled data to work with. We'll tackle this problem using both classical NLP methods and an embedding-based approach.
import pandas as pd
data = pd.read_csv("comments.tsv", sep='\t')
texts = data['comment_text'].values
target = data['should_ban'].values
data[50::200]
| | should_ban | comment_text |
|---|---|---|
| 50 | 0 | "Those who're in advantageous positions are th... |
| 250 | 1 | Fartsalot56 says f**k you motherclucker!! |
| 450 | 1 | Are you a fool? \n\nI am sorry, but you seem t... |
| 650 | 1 | I AM NOT A VANDAL!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! |
| 850 | 0 | Citing sources\n\nCheck out the Wikipedia:Citi... |
from sklearn.model_selection import train_test_split
texts_train, texts_test, y_train, y_test = train_test_split(texts, target, test_size=0.5, random_state=42)
Note: it is generally a good idea to split data into train/test before doing anything else to it.
This guards you against possible data leakage in the preprocessing stage. For example, should you decide to select words present in obscene comments as features, you should only count those words over the training set. Otherwise your algorithm can cheat during evaluation.
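For instance, a minimal sketch of the leakage-free pattern, using sklearn's CountVectorizer purely for illustration (the names demo_vectorizer and X_*_demo are hypothetical; the assignment below asks you to build such features by hand):
from sklearn.feature_extraction.text import CountVectorizer
# fit the vocabulary on the training texts only...
demo_vectorizer = CountVectorizer()
demo_vectorizer.fit(texts_train)
# ...then apply the same fitted vocabulary to both splits
X_train_demo = demo_vectorizer.transform(texts_train)
X_test_demo = demo_vectorizer.transform(texts_test)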
Comments contain raw text with punctuation, uppercase and lowercase letters, and even newline symbols.
To simplify all further steps, we'll split each text into space-separated tokens using one of the NLTK tokenizers.
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
preprocess = lambda text: ' '.join(tokenizer.tokenize(text.lower()))
text = 'How to be a grown-up at work: replace "fuck you" with "Ok, great!".'
print("before:", text)
print("after:", preprocess(text))
before: How to be a grown-up at work: replace "fuck you" with "Ok, great!".
after: how to be a grown-up at work : replace " fuck you " with " ok , great ! " .
# task: preprocess each comment in train and test
texts_train = [preprocess(text) for text in texts_train]
texts_test = [preprocess(text) for text in texts_test]
assert texts_train[5] == 'who cares anymore . they attack with impunity .'
assert texts_test[89] == 'hey todds ! quick q ? why are you so gay'
assert len(texts_test) == len(y_test)
One traditional approach to such a problem is to use bag-of-words features:
1. Build a vocabulary of the most frequent tokens in the training texts.
2. Represent each text as a vector of per-token counts over that vocabulary.
Note: in practice, you can compute such features using sklearn. Please don't do that in the current assignment, though.
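To make the representation concrete, here is a toy example with a hypothetical four-token vocabulary:
# toy bag-of-words: count each vocabulary token's occurrences in the text
toy_vocab = ["i", "like", "cats", "dogs"]
toy_tokens = "i like cats cats".split()
print([toy_tokens.count(tok) for tok in toy_vocab])  # [1, 1, 2, 0]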
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import itertools
from collections import Counter
k = 10000
# task: find up to k most frequent tokens in texts_train,
# sort them by number of occurrences (highest first)
all_tokens = itertools.chain.from_iterable(text.split() for text in texts_train)
vocab_counts = Counter(all_tokens)
bow_vocabulary = [token for token, count in sorted(vocab_counts.items(), key=lambda item: -item[1])][:k]
print("example features:", sorted(bow_vocabulary)[::100])
example features: ['!', '12:20', '300', '_', 'adorned', 'alternative', 'archive', 'average', 'benkner', 'bout', 'came', 'chest', 'combined', 'consumers', 'cricket', 'decisions', 'dickheads', 'domestic', 'eductaion', 'essentially', 'faggot', 'firms', 'frustrated', 'goal', 'hanibal', 'hip-hop', 'identified', 'infoboxes', 'issue', 'kindergarten', 'lets', 'lot', "mclaren's", 'moderator', 'naturally', 'noticeable', 'opposing', 'pdf', 'plant', 'pretoria', 'punctuation', 'rebels', 'repetative', 'riadh', 'schulz', 'shes', 'slit', 'spoof', 'stupid', 't', 'theoretical', 'topic', 'uglyness', 'userspace', 'wanted', 'wikieditor', 'year', '←']
After calculating frequencies for our vocabulary, we need to update k, since the corpus may contain fewer than k unique tokens:
k = min(k, len(bow_vocabulary))
k
5707
token_to_index = {token: i for i, token in enumerate(bow_vocabulary)}

def text_to_bow(text):
    """Convert a text string to an array of token counts over bow_vocabulary."""
    bow = np.zeros(k)
    for token in text.split():
        index = token_to_index.get(token, -1)
        if index != -1:
            bow[index] += 1
    return bow
X_train_bow = np.stack(list(map(text_to_bow, texts_train)))
X_test_bow = np.stack(list(map(text_to_bow, texts_test)))
k_max = len(set(" ".join(texts_train).split()))
assert X_train_bow.shape == (len(texts_train), min(k, k_max))
assert X_test_bow.shape == (len(texts_test), min(k, k_max))
assert np.all(
X_train_bow[5:10].sum(-1) == np.array([len(s.split()) for s in texts_train[5:10]])
)
assert len(bow_vocabulary) <= min(k, k_max)
assert X_train_bow[6, bow_vocabulary.index(".")] == texts_train[6].split().count(".")
Naive Bayes: perhaps the simplest model that can solve this problem is the so-called Naive Bayes classifier. It's a trivial linear model that assumes the independence of input features and computes the coefficients by, well, counting probabilities.
If you don't remember the math behind Naive Bayes, read this chunk to refresh your memory. Done? Good! Now let's implement it :)
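For reference, this is what the class below estimates: with Laplace smoothing constant $\delta$ (delta in the code) and vocabulary size $V$, the per-class word probabilities and prediction rule are

$$ p(x_i \mid y = k) = \frac{\delta + \mathrm{Count}(x_i, y{=}k)}{\delta V + \sum_j \mathrm{Count}(x_j, y{=}k)}, \qquad p(y = k \mid x) \propto p(y = k) \prod_i p(x_i \mid y = k)^{x_i} $$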
def get_p_x(delta, word_ind, counts, vocab_size, sum_counts):
    """Laplace-smoothed estimate of p(x = word_ind | y): (delta + count) / (delta * V + total count)."""
    return (delta + counts[word_ind]) / (delta * vocab_size + sum_counts)
class BinaryNaiveBayes:
    delta = 1.0  # added to all word counts to smooth probabilities
def fit(self, X, y):
"""
Fit a NaiveBayes classifier for two classes
:param X: [batch_size, vocab_size] of bag-of-words features
:param y: [batch_size] of binary targets {0, 1}
"""
# first, compute marginal probabilities of every class, p(y=k) for k = 0,1
p_y_0 = np.count_nonzero(y == 0) / len(y)
p_y_1 = np.count_nonzero(y == 1) / len(y)
self.p_y = np.array([p_y_0, p_y_1])
        # count occurrences of each word in texts with label 1 and label 0 separately
word_counts_positive = X[y == 1].sum(axis=0)
word_counts_negative = X[y == 0].sum(axis=0)
# ^-- both must be vectors of shape [vocab_size].
        # finally, let's use those counts to estimate p(x | y = k) for k = 0, 1;
        # get_p_x applies Laplace smoothing with self.delta
        vocab_size = X.shape[1]
        sum_counts_positive = word_counts_positive.sum()
        sum_counts_negative = word_counts_negative.sum()
        self.p_x_given_positive = np.array([
            get_p_x(self.delta, i, word_counts_positive, vocab_size, sum_counts_positive)
            for i in range(vocab_size)
        ])
        self.p_x_given_negative = np.array([
            get_p_x(self.delta, i, word_counts_negative, vocab_size, sum_counts_negative)
            for i in range(vocab_size)
        ])
        # both are vectors of shape [vocab_size] whose entries sum to 1
        return self
def predict_scores(self, X):
"""
:param X: [batch_size, vocab_size] of bag-of-words features
:returns: a matrix of scores [batch_size, k] of scores for k-th class
"""
        # compute scores for the negative and positive classes separately;
        # each score is proportional to the log-probability of the respective target {0, 1}:
        # log p(y=k) + sum_i X_i * log p(x_i | y=k), i.e. a dot product with X
        score_negative = np.log(self.p_y[0]) + np.dot(X, np.log(self.p_x_given_negative))
        score_positive = np.log(self.p_y[1]) + np.dot(X, np.log(self.p_x_given_positive))
        return np.stack([score_negative, score_positive], axis=-1)
def predict(self, X):
return self.predict_scores(X).argmax(axis=-1)
naive_model = BinaryNaiveBayes().fit(X_train_bow, y_train)
assert naive_model.p_y.shape == (2,) and naive_model.p_y.sum() == 1 and naive_model.p_y[0] > naive_model.p_y[1]
assert naive_model.p_x_given_positive.shape == naive_model.p_x_given_negative.shape == X_train_bow.shape[1:]
assert np.allclose(naive_model.p_x_given_positive.sum(), 1.0)
assert np.allclose(naive_model.p_x_given_negative.sum(), 1.0)
assert naive_model.p_x_given_negative.min() > 0, "did you forget to add delta?"
f_index = bow_vocabulary.index('fuck')  # toxic comments should contain more of this
assert naive_model.p_x_given_positive[f_index] > naive_model.p_x_given_negative[f_index]
g_index = bow_vocabulary.index('good')  # toxic comments should contain less of this
assert naive_model.p_x_given_positive[g_index] < naive_model.p_x_given_negative[g_index]
from sklearn.metrics import roc_auc_score, roc_curve
for name, X, y, model in [
("train", X_train_bow, y_train, naive_model),
("test ", X_test_bow, y_test, naive_model),
]:
    scores = model.predict_scores(X)
    proba = scores[:, 1] - scores[:, 0]
    auc = roc_auc_score(y, proba)
    plt.plot(*roc_curve(y, proba)[:2], label="%s AUC=%.4f" % (name, auc))
plt.plot(
[0, 1],
[0, 1],
"--",
color="black",
)
plt.legend(fontsize="large")
plt.grid()
test_accuracy = np.mean(naive_model.predict(X_test_bow) == y_test)
print(f"Model accuracy: {test_accuracy:.3f}")
assert test_accuracy > 0.75, "Accuracy too low. There's likely a mistake in the code."
print("Well done!")
Model accuracy: 0.756
Well done!
Okay, it definitely learned something. Now let's figure out what exactly it learned. The simplest way to do that is to highlight the words with the greatest ratio of positive to negative probability (or vice versa). We'll go with the positive one, because reasons.
Your task is to compute the top-25 words with the highest ratio $\frac{p(x_i \mid y=1)}{p(x_i \mid y=0)}$. Enjoy!
# hint: use naive_model.p_*
probability_ratio = naive_model.p_x_given_positive / naive_model.p_x_given_negative
top_negative_words = np.array(bow_vocabulary)[np.argsort(-probability_ratio)[:25]]
assert len(top_negative_words) == 25 and all(isinstance(w, str) for w in top_negative_words)
assert 'j.delanoy' in top_negative_words and 'college' in top_negative_words
for i, word in enumerate(top_negative_words):
print(f"#{i}\t{word.rjust(10, ' ')}\t(ratio={probability_ratio[bow_vocabulary.index(word)]})")
#0	    hitler	(ratio=475.47341740332655)
#1	      heil	(ratio=471.80652729481756)
#2	   offfuck	(ratio=441.24910972390967)
#3	      suck	(ratio=314.7414009803511)
#4	    nigger	(ratio=223.68029661904563)
#5	 j.delanoy	(ratio=220.0134065105367)
#6	      dick	(ratio=187.01139553395618)
#7	      fggt	(ratio=97.78373622690519)
#8	     bitch	(ratio=59.89253843897943)
#9	      fuck	(ratio=53.78105492479786)
#10	      slap	(ratio=44.00268130210734)
#11	      shit	(ratio=44.00268130210734)
#12	   fucking	(ratio=31.779714273744187)
#13	       ass	(ratio=26.89052746239893)
#14	    stupid	(ratio=18.334450542544726)
#15	         =	(ratio=17.53995768570112)
#16	   college	(ratio=17.11215383970841)
#17	         *	(ratio=17.11215383970841)
#18	   asshole	(ratio=15.889857136872093)
#19	         u	(ratio=15.278708785453937)
#20	   bastard	(ratio=14.66756043403578)
#21	       hit	(ratio=14.66756043403578)
#22	     idiot	(ratio=13.445263731199464)
#23	         @	(ratio=13.445263731199464)
#24	scientific	(ratio=12.222967028363149)
Now let's try something less prehistoric: logistic regression. Turns out, if you're using silicon instead of an abacus, you can find the model weights by optimizing the log-probability of the answer. And of course, you don't even need to write it by hand anymore. Let's sklearn it!
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
log_reg = LogisticRegression(solver='lbfgs', random_state=42)
parameters = {"C": np.logspace(-1, 1, 300)}
clf = GridSearchCV(log_reg, parameters, n_jobs=-1, cv=StratifiedKFold(2))
clf.fit(X_train_bow, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False), estimator=LogisticRegression(random_state=42), n_jobs=-1, param_grid={'C': array([ 0.1 , 0.10155211, 0.10312832, 0.10472898, 0.1063545 , 0.10800524, 0.1096816 , 0.11138398, 0.11311279, 0.11486843, 0.11665131, 0.11846187, 0.12030053, 0.12216773, 0.12406392, 0.12598953, 0.12794503, 0.12993088, 0.1319... 5.92345715, 6.01539588, 6.10876161, 6.20357648, 6.29986298, 6.39764396, 6.4969426 , 6.59778248, 6.7001875 , 6.80418197, 6.90979055, 7.01703829, 7.12595063, 7.23655342, 7.34887289, 7.46293569, 7.57876886, 7.6963999 , 7.81585671, 7.93716762, 8.06036141, 8.18546731, 8.31251499, 8.4415346 , 8.57255673, 8.70561248, 8.8407334 , 8.97795155, 9.11729948, 9.25881025, 9.40251743, 9.5484551 , 9.69665789, 9.84716096, 10. ])})
clf.best_params_
{'C': 0.19693113379074229}
bow_log_reg_model = clf.best_estimator_
from sklearn.metrics import roc_auc_score, roc_curve
for name, X, y, model in [
('train', X_train_bow, y_train, bow_log_reg_model),
('test ', X_test_bow, y_test, bow_log_reg_model)
]:
proba = model.predict_proba(X)[:, 1]
auc = roc_auc_score(y, proba)
plt.plot(*roc_curve(y, proba)[:2], label='%s AUC=%.4f' % (name, auc))
plt.plot([0, 1], [0, 1], '--', color='black',)
plt.title("Logistic regression + Bag-of-words")
plt.legend(fontsize='large')
plt.grid()
test_accuracy = np.mean(bow_log_reg_model.predict(X_test_bow) == y_test)
print(f"Model accuracy: {test_accuracy:.3f}")
assert test_accuracy > 0.77, "Hint: tune the parameter C to improve performance"
print("Well done!")
Model accuracy: 0.772
Well done!
Not all words are equally useful. One can prioritize rare words and downscale ubiquitous ones like "and"/"or" by using tf-idf features. The abbreviation stands for term frequency / inverse document frequency and means exactly that:

$$ feature_i = \mathrm{Count}(word_i \in x) \times \log{\frac{N}{\mathrm{Count}(word_i \in D) + \alpha}} $$

where:
- $x$ is the current text and $D$ is the corpus of all training texts;
- $N$ is the total number of documents in $D$, and $\mathrm{Count}(word_i \in D)$ is the number of documents containing $word_i$;
- $\alpha$ is a smoothing hyperparameter (typically 1) that prevents division by zero for words that never occur in $D$.
It may also be a good idea to normalize each data sample after computing tf-idf features.
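For intuition, a tiny worked example with made-up numbers: if the corpus has $N = 1000$ documents, $word_i$ appears in $99$ of them, $\alpha = 1$, and the word occurs twice in the current text, then $feature_i = 2 \times \ln{\frac{1000}{99 + 1}} = 2 \ln 10 \approx 4.6$ (using the natural log).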
Your task: implement tf-idf features, train a model, and evaluate the ROC curve. Compare it with the basic bag-of-words model from above.
Please don't use the sklearn/nltk built-in tf-idf vectorizers in your solution :) You can still use 'em for debugging, though; a sanity-check sketch follows the implementation below.
tokenized_texts_train = []
for text in texts_train:
tokenized_text = [token for token in text.split() if token in bow_vocabulary]
tokenized_texts_train.append(tokenized_text)
def get_tf(tokenized_text, term):
    """Term frequency: the share of tokens in the text equal to `term`."""
    return tokenized_text.count(term) / len(tokenized_text)

def get_idf(tokenized_corpus, term):
    """Smoothed inverse document frequency: log((1 + N) / (1 + doc_count)) + 1."""
    size = len(tokenized_corpus)
    doc_count = sum(1 for text in tokenized_corpus if term in text)
    return np.log((1 + size) / (1 + doc_count)) + 1

def get_tf_idf(tf, idf):
    return tf * idf

def get_l2_norm(vector):
    return np.sqrt(np.sum(vector ** 2))
def text_to_tf_idf(text):
    """Convert a text string to an L2-normalized tf-idf vector over bow_vocabulary."""
    vec = np.zeros(k)
    tokenized_text = text.split()
    unique_tokens = list(set(tokenized_text))
    # compute tf-idf for every unique token, including out-of-vocabulary ones,
    # so that they also contribute to the L2 norm below
    tf_idf_sentence = np.array([
        get_tf_idf(get_tf(tokenized_text, word), get_idf(tokenized_texts_train, word))
        for word in unique_tokens
    ])
    l2_norm = get_l2_norm(tf_idf_sentence)
    if l2_norm > 0:
        tf_idf_sentence = tf_idf_sentence / l2_norm
    tf_idf_values = dict(zip(unique_tokens, tf_idf_sentence))
    # keep only in-vocabulary tokens in the final feature vector
    for token in unique_tokens:
        index = token_to_index.get(token, -1)
        if index != -1:
            vec[index] = tf_idf_values[token]
    return vec
X_train_tf_idf = np.stack(list(map(text_to_tf_idf, texts_train)))
X_test_tf_idf = np.stack(list(map(text_to_tf_idf, texts_test)))
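As mentioned above, the built-in vectorizers are fair game for debugging. A hedged sketch of such a sanity check: with analyzer=str.split and our vocabulary, sklearn's TfidfVectorizer uses the same smoothed idf and L2 normalization, but small discrepancies are expected because the implementation above also counts out-of-vocabulary tokens toward the norm.
# debugging aid only, not part of the solution
from sklearn.feature_extraction.text import TfidfVectorizer
ref_vectorizer = TfidfVectorizer(analyzer=str.split, vocabulary=bow_vocabulary)
X_train_ref = ref_vectorizer.fit_transform(texts_train).toarray()
print("max abs difference vs sklearn:", np.abs(X_train_tf_idf - X_train_ref).max())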
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
log_reg_tf_idf = LogisticRegression(solver='lbfgs', random_state=42)
parameters = {"C": np.logspace(-1, 1, 600)}
clf_tf_idf = GridSearchCV(log_reg_tf_idf, parameters, n_jobs=-1, cv=StratifiedKFold(2))
clf_tf_idf.fit(X_train_tf_idf, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False), estimator=LogisticRegression(random_state=42), n_jobs=-1, param_grid={'C': array([ 0.1 , 0.10077177, 0.1015495 , 0.10233323, 0.10312301, 0.10391889, 0.10472091, 0.10552911, 0.10634356, 0.10716429, 0.10799135, 0.1088248 , 0.10966468, 0.11051104, 0.11136394, 0.11222341, 0.11308952, 0.11396232, 0.1148... 7.69976486, 7.75918954, 7.81907284, 7.8794183 , 7.9402295 , 8.00151002, 8.06326348, 8.12549354, 8.18820388, 8.2513982 , 8.31508023, 8.37925375, 8.44392253, 8.50909042, 8.57476125, 8.64093891, 8.70762732, 8.7748304 , 8.84255214, 8.91079654, 8.97956763, 9.04886948, 9.11870618, 9.18908186, 9.26000068, 9.33146683, 9.40348454, 9.47605806, 9.54919168, 9.62288973, 9.69715656, 9.77199656, 9.84741416, 9.92341381, 10. ])})
clf_tf_idf.best_params_
{'C': 9.771996562044086}
tf_idf_log_reg_model = clf_tf_idf.best_estimator_
from sklearn.metrics import roc_auc_score, roc_curve
for name, X, y, model in [
('train', X_train_tf_idf, y_train, tf_idf_log_reg_model),
('test ', X_test_tf_idf, y_test, tf_idf_log_reg_model)
]:
proba = model.predict_proba(X)[:, 1]
auc = roc_auc_score(y, proba)
plt.plot(*roc_curve(y, proba)[:2], label='%s AUC=%.4f' % (name, auc))
plt.plot([0, 1], [0, 1], '--', color='black',)
plt.title("Logistic regression + Tf-Idf features")
plt.legend(fontsize='large')
plt.grid()
test_accuracy = np.mean(tf_idf_log_reg_model.predict(X_test_tf_idf) == y_test)
print(f"Model accuracy: {test_accuracy:.3f}")
assert test_accuracy > 0.77, "Hint: tune the parameter C to improve performance"
print("Well done!")
Model accuracy: 0.790
Well done!
Let's try another approach: instead of counting per-word frequencies, we'll map all words to pre-trained word vectors and average over them to get text features.
This gives us two key advantages: (1) we now have 10^2 features instead of 10^4, and (2) our model can generalize to words that are not in the training dataset.
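In other words, for a text $x$ with tokens $w$, the feature vector is the mean of the word embeddings (out-of-vocabulary words contribute zero vectors in the implementation below):

$$ feature(x) = \frac{1}{|x|} \sum_{w \in x} emb(w) $$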
We begin with the standard approach: pre-trained word vectors. To see which models are available, you can run
gensim.downloader.info()['models'].keys()
Note: loading a pre-trained model may take a while. It's a perfect opportunity to refill your cup of tea/coffee and grab some extra cookies. Or binge-watch some TV series if your internet connection is slow.
# !pip install gensim
import gensim.downloader
embeddings = gensim.downloader.load("fasttext-wiki-news-subwords-300")
# If you're low on RAM or download speed, use "glove-wiki-gigaword-100" instead. Ignore all further asserts.
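Once the model is loaded, a quick optional sanity check (the exact neighbours depend on which model you picked):
# nearest neighbours in embedding space; a loose check that the vectors make sense
print(embeddings.most_similar("awesome", topn=5))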
def vectorize_sum(comment):
    """
    Convert a preprocessed comment to the average of its token vectors;
    out-of-vocabulary tokens contribute zero vectors.
    """
    embedding_dim = embeddings.vectors.shape[1]
    features = np.zeros([embedding_dim], dtype="float32")
    tokens = comment.split()
    for word in tokens:
        if word in embeddings.key_to_index:
            features += embeddings.get_vector(word)
    return features / max(len(tokens), 1)
assert np.allclose(
vectorize_sum("who cares anymore . they attack with impunity .")[::70],
np.array([ 0.00120684, 0.00290737, 0.01539459, -0.0205673 , -0.05153336])
)
X_train_wv = np.stack([vectorize_sum(text) for text in texts_train])
X_test_wv = np.stack([vectorize_sum(text) for text in texts_test])
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
parameters = {"C": np.logspace(-2, 2, 1000)}
clf = GridSearchCV(LogisticRegression(solver='lbfgs', random_state=42, max_iter=10000), parameters, n_jobs=-1, cv=StratifiedKFold(2))
clf.fit(X_train_wv, y_train)
GridSearchCV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False), estimator=LogisticRegression(max_iter=10000, random_state=42), n_jobs=-1, param_grid={'C': array([1.00000000e-02, 1.00926219e-02, 1.01861017e-02, 1.02804473e-02, 1.03756668e-02, 1.04717682e-02, 1.05687597e-02, 1.06666496e-02, 1.07654461e-02, 1.08651577e-02, 1.09657929e-02, 1.10673602e-02,... 8.08924349e+01, 8.16416760e+01, 8.23978568e+01, 8.31610415e+01, 8.39312950e+01, 8.47086827e+01, 8.54932707e+01, 8.62851257e+01, 8.70843150e+01, 8.78909065e+01, 8.87049689e+01, 8.95265713e+01, 9.03557835e+01, 9.11926760e+01, 9.20373200e+01, 9.28897872e+01, 9.37501502e+01, 9.46184819e+01, 9.54948564e+01, 9.63793480e+01, 9.72720319e+01, 9.81729841e+01, 9.90822810e+01, 1.00000000e+02])})
clf.best_params_
{'C': 40.889482262948604}
wv_model = clf.best_estimator_
for name, X, y, model in [
('bow train', X_train_bow, y_train, bow_log_reg_model),
('bow test ', X_test_bow, y_test, bow_log_reg_model),
('vec train', X_train_wv, y_train, wv_model),
('vec test ', X_test_wv, y_test, wv_model)
]:
proba = model.predict_proba(X)[:, 1]
auc = roc_auc_score(y, proba)
plt.plot(*roc_curve(y, proba)[:2], label='%s AUC=%.4f' % (name, auc))
plt.plot([0, 1], [0, 1], '--', color='black',)
plt.legend(fontsize='large')
plt.grid()
assert roc_auc_score(y_test, wv_model.predict_proba(X_test_wv)[:, 1]) > 0.92, "something's wrong with your features"
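For a number directly comparable with the earlier models, we can also report plain accuracy (the exact value depends on the embedding model and the tuned C):
test_accuracy = np.mean(wv_model.predict(X_test_wv) == y_test)
print(f"Model accuracy: {test_accuracy:.3f}")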
If everything went right, you've just managed to reduce the misclassification rate by a factor of two. This trick is very useful when you're dealing with small datasets. However, if you have hundreds of thousands of samples, there's a whole different range of methods for that. We'll get there in the second part.