#!/usr/bin/env python
# coding: utf-8
# [Sebastian Raschka](http://sebastianraschka.com), 2015
#
# https://github.com/rasbt/python-machine-learning-book
# # Python Machine Learning - Code Examples
# # Chapter 8 - Applying Machine Learning To Sentiment Analysis
# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can just skip the following line(s).
# In[1]:
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,scikit-learn,nltk")
# In[2]:
# to install watermark just uncomment the following line:
#%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
#
#
# ### Overview
# - [Obtaining the IMDb movie review dataset](#Obtaining-the-IMDb-movie-review-dataset)
# - [Introducing the bag-of-words model](#Introducing-the-bag-of-words-model)
# - [Transforming words into feature vectors](#Transforming-words-into-feature-vectors)
# - [Assessing word relevancy via term frequency-inverse document frequency](#Assessing-word-relevancy-via-term-frequency-inverse-document-frequency)
# - [Cleaning text data](#Cleaning-text-data)
# - [Processing documents into tokens](#Processing-documents-into-tokens)
# - [Training a logistic regression model for document classification](#Training-a-logistic-regression-model-for-document-classification)
# - [Working with bigger data – online algorithms and out-of-core learning](#Working-with-bigger-data-–-online-algorithms-and-out-of-core-learning)
# - [Summary](#Summary)
#
#
# # Obtaining the IMDb movie review dataset
# The IMDb movie review dataset can be downloaded from [http://ai.stanford.edu/~amaas/data/sentiment/](http://ai.stanford.edu/~amaas/data/sentiment/).
# After downloading the dataset, decompress the files.
#
# A) If you are working with Linux or MacOS X, open a new terminal window, `cd` into the download directory, and execute
#
# `tar -zxf aclImdb_v1.tar.gz`
#
# B) If you are working with Windows, download an archiver such as [7Zip](http://www.7-zip.org) to extract the files from the download archive.
# ### Compatibility Note:
#
# I received an email from a reader who was having trouble reading the movie review texts due to encoding issues. Typically, Python's default encoding is set to `'utf-8'`, which shouldn't cause trouble when running this IPython notebook. You can check the encoding on your machine by firing up a new Python interpreter from the command line terminal and executing
#
# >>> import sys
# >>> sys.getdefaultencoding()
#
# If the returned result is **not** `'utf-8'`, you probably need to change your Python's encoding to `'utf-8'`, for example by typing `export PYTHONIOENCODING=utf8` in your terminal shell prior to running this IPython notebook. (Note that this is a temporary change, and it needs to be executed in the same shell that you'll use to launch `ipython notebook`.)
#
# Alternatively, you can replace the lines
#
# with open(os.path.join(path, file), 'r') as infile:
# ...
# pd.read_csv('./movie_data.csv')
# ...
# df.to_csv('./movie_data.csv', index=False)
#
# by
#
# with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
# ...
# pd.read_csv('./movie_data.csv', encoding='utf-8')
# ...
# df.to_csv('./movie_data.csv', index=False, encoding='utf-8')
#
# in the following cells to achieve the desired effect.
# In[3]:
import pyprind
import pandas as pd
import os
# change the `basepath` to the directory of the
# unzipped movie dataset
#basepath = '/Users/Sebastian/Desktop/aclImdb/'
basepath = './aclImdb'
labels = {'pos':1, 'neg':0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']
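# A quick optional sanity check (not part of the original code): the assembled
# DataFrame should contain all 50,000 labeled reviews.
print(df.shape)  # expected: (50000, 2)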
# Shuffling the DataFrame:
# In[4]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
# Optional: Saving the assembled data as CSV file:
# In[5]:
df.to_csv('./movie_data.csv', index=False)
# In[6]:
import pandas as pd
df = pd.read_csv('./movie_data.csv')
df.head(3)
#
# ### Note
#
# If you have problems creating the `movie_data.csv` file with the previous code cells, you can download a zip archive from
# https://github.com/rasbt/python-machine-learning-book/tree/master/code/datasets/movie
#
#
#
# # Introducing the bag-of-words model
# ...
# ## Transforming documents into feature vectors
# In[7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)
# In[8]:
print(count.vocabulary_)
# In[9]:
print(bag.toarray())
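# As an optional check (not in the original code), the column order of the count
# matrix above can be recovered from the `vocabulary_` mapping shown earlier:
inv_vocab = {idx: term for term, idx in count.vocabulary_.items()}
print([inv_vocab[idx] for idx in range(len(inv_vocab))])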
#
# ## Assessing word relevancy via term frequency-inverse document frequency
# In[10]:
np.set_printoptions(precision=2)
# In[11]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())
# In[12]:
# term frequency of "is" in the 3rd document
tf_is = 2
# total number of documents in the corpus
n_docs = 3
# smoothed inverse document frequency: idf = log((1 + n_docs) / (1 + df)),
# where "is" occurs in all 3 documents, so df = 3
idf_is = np.log((n_docs+1) / (3+1))
# scikit-learn computes tf-idf as tf * (idf + 1)
tfidf_is = tf_is * (idf_is + 1)
print('tf-idf of term "is" = %.2f' % tfidf_is)
# In[13]:
tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
raw_tfidf = tfidf.fit_transform(count.fit_transform(docs)).toarray()[-1]
raw_tfidf
# In[14]:
l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
l2_tfidf
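# As a quick optional cross-check (not in the original code), the manually
# normalized vector should match the last row returned by the `norm='l2'`
# transformer used earlier:
tfidf_l2 = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(np.allclose(l2_tfidf, tfidf_l2.fit_transform(count.fit_transform(docs)).toarray()[-1]))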
#
# ## Cleaning text data
# In[15]:
df.loc[0, 'review'][-50:]
# In[16]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub(r'[\W]+', ' ', text.lower()) + \
        ' '.join(emoticons).replace('-', '')
    return text
# In[17]:
preprocessor(df.loc[0, 'review'][-50:])
# In[18]:
preprocessor("This :) is :( a test :-)!")
# In[19]:
df['review'] = df['review'].apply(preprocessor)
#
# ## Processing documents into tokens
# In[20]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
# In[21]:
tokenizer('runners like running and thus they run')
# In[22]:
tokenizer_porter('runners like running and thus they run')
# In[23]:
import nltk
nltk.download('stopwords')
# In[24]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]
#
#
# # Training a logistic regression model for document classification
# Strip HTML and punctuation to speed up the GridSearch later:
# In[25]:
# first 25,000 reviews for training, remaining 25,000 for testing
# (note that .loc slicing is label-based and inclusive of the end label)
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values
# In[28]:
from sklearn.grid_search import GridSearchCV  # moved to sklearn.model_selection in newer scikit-learn versions
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)
# In[29]:
gs_lr_tfidf.fit(X_train, y_train)
# In[30]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
# In[31]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))
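# Since the best estimator is a complete pipeline (vectorizer + classifier), it can
# be applied directly to raw text. A small optional illustration with a made-up
# review (not part of the original code):
example_review = ['I really loved this movie, the story was wonderful']
print('Prediction: %s' % ('positive' if clf.predict(example_review)[0] == 1 else 'negative'))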
#
#
# #### Start comment:
#
# Please note that `gs_lr_tfidf.best_score_` is the average k-fold cross-validation score. I.e., if we have a `GridSearchCV` object with 5-fold cross-validation (like the one above), the `best_score_` attribute returns the average score over the 5-folds of the best model. To illustrate this with an example:
# In[38]:
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
import numpy as np
np.random.seed(0)
np.set_printoptions(precision=6)
y = [np.random.randint(3) for i in range(25)]
X = (y + np.random.randn(25)).reshape(-1, 1)
cv5_idx = list(StratifiedKFold(y, n_folds=5, shuffle=False, random_state=0))
cross_val_score(LogisticRegression(random_state=123), X, y, cv=cv5_idx)
# By executing the code above, we created a simple dataset of random integers that represent our class labels. Next, we fed the indices of 5 cross-validation folds (`cv5_idx`) to the `cross_val_score` scorer, which returned 5 accuracy scores -- these are the 5 accuracy values for the 5 test folds.
#
# Next, let us use the `GridSearchCV` object and feed it the same 5 cross-validation sets (via the pre-generated `cv5_idx` indices):
# In[39]:
from sklearn.grid_search import GridSearchCV
gs = GridSearchCV(LogisticRegression(), {}, cv=cv5_idx, verbose=3).fit(X, y)
# As we can see, the scores for the 5 folds are exactly the same as the ones from `cross_val_score` earlier.
# Now, the `best_score_` attribute of the `GridSearchCV` object, which becomes available after fitting, returns the average accuracy score of the best model:
# In[40]:
gs.best_score_
# As we can see, the result above is consistent with the average score computed by `cross_val_score`.
# In[41]:
cross_val_score(LogisticRegression(), X, y, cv=cv5_idx).mean()
# #### End comment.
#
#
#
#
#
# # Working with bigger data - online algorithms and out-of-core learning
# In[32]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip the header line
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label
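# A quick optional check of the tokenizer defined above (not in the original code):
# HTML tags and punctuation are stripped, stop words are dropped, and emoticons
# are appended to the end of the token list.
print(tokenizer('This movie <br /> was great :)'))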
# In[33]:
next(stream_docs(path='./movie_data.csv'))
# In[34]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y
# In[35]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')
# In[36]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
# In[37]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))
# In[38]:
clf = clf.partial_fit(X_test, y_test)
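# As a final optional illustration (not part of the original code), the incrementally
# trained classifier can score new, unseen text via the same `HashingVectorizer`;
# the review below is made up:
new_review = ['I was bored for most of the film and would not recommend it']
X_new = vect.transform(new_review)
print('Prediction: %s' % ('positive' if clf.predict(X_new)[0] == 1 else 'negative'))
print('Probability: %.3f' % clf.predict_proba(X_new).max())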
#
#
# # Summary