#!/usr/bin/env python
# coding: utf-8

# [Sebastian Raschka](http://sebastianraschka.com), 2015
#
# https://github.com/rasbt/python-machine-learning-book

# # Python Machine Learning - Code Examples

# # Chapter 8 - Applying Machine Learning To Sentiment Analysis

# Note that the optional watermark extension is a small IPython notebook plugin that I developed to make the code reproducible. You can simply skip the following line(s).

# In[1]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -u -d -v -p numpy,pandas,matplotlib,scikit-learn,nltk")


# In[2]:

# to install watermark, just uncomment the following line:
#%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py

#
#
# ### Overview
# 
# - [Obtaining the IMDb movie review dataset](#Obtaining-the-IMDb-movie-review-dataset)
# - [Introducing the bag-of-words model](#Introducing-the-bag-of-words-model)
# - [Transforming words into feature vectors](#Transforming-words-into-feature-vectors)
# - [Assessing word relevancy via term frequency-inverse document frequency](#Assessing-word-relevancy-via-term-frequency-inverse-document-frequency)
# - [Cleaning text data](#Cleaning-text-data)
# - [Processing documents into tokens](#Processing-documents-into-tokens)
# - [Training a logistic regression model for document classification](#Training-a-logistic-regression-model-for-document-classification)
# - [Working with bigger data – online algorithms and out-of-core learning](#Working-with-bigger-data-–-online-algorithms-and-out-of-core-learning)
# - [Summary](#Summary)

#
#
# # Obtaining the IMDb movie review dataset

# The IMDb movie review dataset can be downloaded from [http://ai.stanford.edu/~amaas/data/sentiment/](http://ai.stanford.edu/~amaas/data/sentiment/).
# After downloading the dataset, decompress the files.
# 
# A) If you are working with Linux or Mac OS X, open a new terminal window, `cd` into the download directory, and execute
# 
# `tar -zxf aclImdb_v1.tar.gz`
# 
# B) If you are working with Windows, download an archiver such as [7Zip](http://www.7-zip.org) to extract the files from the download archive.

# ### Compatibility Note:
# 
# I received an email from a reader who was having trouble reading the movie review texts due to encoding issues. Typically, Python's default encoding is set to `'utf-8'`, which shouldn't cause trouble when running this IPython notebook. You can simply check the encoding on your machine by firing up a new Python interpreter from the command line terminal and executing
# 
#     >>> import sys
#     >>> sys.getdefaultencoding()
# 
# If the returned result is **not** `'utf-8'`, you probably need to change your Python's encoding to `'utf-8'`, for example by typing `export PYTHONIOENCODING=utf8` in your terminal shell prior to running this IPython notebook. (Note that this is a temporary change, and it needs to be executed in the same shell that you'll use to launch `ipython notebook`.)
# 
# Alternatively, you can replace the lines
# 
#     with open(os.path.join(path, file), 'r') as infile:
#     ...
#     pd.read_csv('./movie_data.csv')
#     ...
#     df.to_csv('./movie_data.csv', index=False)
# 
# by
# 
#     with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
#     ...
#     pd.read_csv('./movie_data.csv', encoding='utf-8')
#     ...
#     df.to_csv('./movie_data.csv', index=False, encoding='utf-8')
# 
# in the following cells to achieve the desired effect.

# In[3]:

import pyprind
import pandas as pd
import os

# change the `basepath` to the directory of the
# unzipped movie dataset

#basepath = '/Users/Sebastian/Desktop/aclImdb/'
basepath = './aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            # note: `DataFrame.append` was removed in pandas 2.0;
            # see the alternative sketch after this section
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']


# Shuffling the DataFrame:

# In[4]:

import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))


# Optional: Saving the assembled data as CSV file:

# In[5]:

df.to_csv('./movie_data.csv', index=False)


# In[6]:

import pandas as pd

df = pd.read_csv('./movie_data.csv')
df.head(3)

#
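# Side note: `DataFrame.append` (used in the assembly loop above) was removed in pandas 2.0. If the loop fails on a recent pandas version, a minimal sketch of an equivalent approach is to collect the rows in a plain Python list and build the DataFrame once at the end; variable names below simply mirror the cell above:

import os
import pandas as pd
import pyprind

# same directory layout and label mapping as in the cell above
basepath = './aclImdb'
labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)

rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                rows.append([infile.read(), labels[l]])
            pbar.update()

# build the DataFrame in one step instead of appending row by row
df = pd.DataFrame(rows, columns=['review', 'sentiment'])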
# ### Note
# 
# If you have problems with creating the `movie_data.csv` file in the previous section, you can find a downloadable zip archive at
# https://github.com/rasbt/python-machine-learning-book/tree/master/code/datasets/movie

#
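# If you go that route, pandas can read the zipped CSV directly. A minimal sketch, assuming the downloaded archive is saved as `./movie_data.csv.zip` next to this notebook (the file name is an assumption):

import pandas as pd

# read the compressed CSV without unzipping it manually
df = pd.read_csv('./movie_data.csv.zip', compression='zip', encoding='utf-8')
df.head(3)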
#
#
# # Introducing the bag-of-words model

# ...

# ## Transforming documents into feature vectors

# In[7]:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)


# In[8]:

print(count.vocabulary_)


# In[9]:

print(bag.toarray())

#
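# To make the column ordering of the count matrix explicit, the small illustrative snippet below (not part of the original notebook) prints each vocabulary term next to its column index, so the terms line up with the columns of `bag.toarray()` printed above:

# sort the vocabulary by column index and print "index -> term" pairs
for term, idx in sorted(count.vocabulary_.items(), key=lambda item: item[1]):
    print('%d -> %s' % (idx, term))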
# ## Assessing word relevancy via term frequency-inverse document frequency

# In[10]:

np.set_printoptions(precision=2)


# In[11]:

from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())


# In[12]:

tf_is = 2   # term frequency of "is" in the third document
n_docs = 3  # total number of documents
# smoothed idf: log((n_docs + 1) / (df + 1)), where df("is") = 3
idf_is = np.log((n_docs+1) / (3+1))
tfidf_is = tf_is * (idf_is + 1)
print('tf-idf of term "is" = %.2f' % tfidf_is)


# In[13]:

tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
raw_tfidf = tfidf.fit_transform(count.fit_transform(docs)).toarray()[-1]
raw_tfidf


# In[14]:

l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
l2_tfidf

#
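# As a quick sanity check (an illustrative addition, not part of the original notebook), the smoothed idf values can also be computed directly from the document-frequency counts and compared against the values the fitted `TfidfTransformer` exposes via its `idf_` attribute:

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

# document frequency: in how many documents each vocabulary term occurs
count_matrix = count.fit_transform(docs).toarray()
df_counts = np.sum(count_matrix > 0, axis=0)

# smoothed idf as used by scikit-learn: log((1 + n_docs) / (1 + df)) + 1
manual_idf = np.log((1 + len(docs)) / (1 + df_counts)) + 1

tfidf_check = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
tfidf_check.fit(count.fit_transform(docs))

print(manual_idf)
print(tfidf_check.idf_)  # should match the manually computed values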
# ## Cleaning text data

# In[15]:

df.loc[0, 'review'][-50:]


# In[16]:

import re

def preprocessor(text):
    # remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # keep emoticons such as :) or ;-( so they survive the cleanup below
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # remove non-word characters, lowercase the text, and re-append the
    # emoticons with the "nose" character (-) stripped
    text = re.sub('[\W]+', ' ', text.lower()) + \
        ' '.join(emoticons).replace('-', '')
    return text


# In[17]:

preprocessor(df.loc[0, 'review'][-50:])


# In[18]:

preprocessor("This :) is :( a test :-)!")


# In[19]:

df['review'] = df['review'].apply(preprocessor)

#
# ## Processing documents into tokens

# In[20]:

from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]


# In[21]:

tokenizer('runners like running and thus they run')


# In[22]:

tokenizer_porter('runners like running and thus they run')


# In[23]:

import nltk
nltk.download('stopwords')


# In[24]:

from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

#
#
# # Training a logistic regression model for document classification

# Strip HTML and punctuation to speed up the GridSearch later:

# In[25]:

X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values
# note: `.loc` slicing is inclusive of the end label, so row 25000 ends up in
# both sets; use `df.iloc[:25000]` / `df.iloc[25000:]` for a strict 25,000/25,000 split


# In[28]:

from sklearn.grid_search import GridSearchCV
# note: in scikit-learn >= 0.18, GridSearchCV lives in sklearn.model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)


# In[29]:

gs_lr_tfidf.fit(X_train, y_train)


# In[30]:

print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)


# In[31]:

clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

#
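# Since the grid search above can take a long time to run, it may be worth persisting the best estimator so it can be reloaded later without refitting. A minimal sketch using Python's `pickle` module; the file name `lr_tfidf.pkl` is just an illustrative choice:

import pickle

# serialize the fitted pipeline to disk ...
with open('lr_tfidf.pkl', 'wb') as f:
    pickle.dump(gs_lr_tfidf.best_estimator_, f)

# ... and load it back later without re-running the grid search
with open('lr_tfidf.pkl', 'rb') as f:
    clf_reloaded = pickle.load(f)

print('Test Accuracy (reloaded): %.3f' % clf_reloaded.score(X_test, y_test))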
#
# #### Start comment:
# 
# Please note that `gs_lr_tfidf.best_score_` is the average k-fold cross-validation score. I.e., if we have a `GridSearchCV` object with 5-fold cross-validation (like the one above), the `best_score_` attribute returns the average score over the 5 folds of the best model. To illustrate this with an example:

# In[38]:

from sklearn.cross_validation import StratifiedKFold, cross_val_score
# note: in scikit-learn >= 0.18, these live in sklearn.model_selection,
# and StratifiedKFold uses a different calling convention (n_splits plus .split(X, y))
from sklearn.linear_model import LogisticRegression
import numpy as np

np.random.seed(0)
np.set_printoptions(precision=6)
y = [np.random.randint(3) for i in range(25)]
X = (y + np.random.randn(25)).reshape(-1, 1)

cv5_idx = list(StratifiedKFold(y, n_folds=5, shuffle=False, random_state=0))
cross_val_score(LogisticRegression(random_state=123), X, y, cv=cv5_idx)


# By executing the code above, we created a simple data set of random integers that shall represent our class labels. Next, we fed the indices of 5 cross-validation folds (`cv5_idx`) to the `cross_val_score` scorer, which returned 5 accuracy scores -- these are the 5 accuracy values for the 5 test folds.
# 
# Next, let us use the `GridSearchCV` object and feed it the same 5 cross-validation folds (via the pre-generated `cv5_idx` indices):

# In[39]:

from sklearn.grid_search import GridSearchCV

gs = GridSearchCV(LogisticRegression(), {}, cv=cv5_idx, verbose=3).fit(X, y)


# As we can see, the scores for the 5 folds are exactly the same as the ones from `cross_val_score` earlier.

# Now, the `best_score_` attribute of the `GridSearchCV` object, which becomes available after `fit`ting, returns the average accuracy score of the best model:

# In[40]:

gs.best_score_


# As we can see, the result above is consistent with the average score computed by `cross_val_score`:

# In[41]:

cross_val_score(LogisticRegression(), X, y, cv=cv5_idx).mean()


# #### End comment.
# 
#
#
#
#
# # Working with bigger data - online algorithms and out-of-core learning

# In[32]:

import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            # each CSV row ends with ',<label>\n', so the label is the
            # second-to-last character of the line
            text, label = line[:-3], int(line[-2])
            yield text, label


# In[33]:

next(stream_docs(path='./movie_data.csv'))


# In[34]:

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y


# In[35]:

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

# note: in newer scikit-learn versions, `n_iter` has been replaced by `max_iter`,
# and `loss='log'` has been renamed to `loss='log_loss'`
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')


# In[36]:

import pyprind

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()


# In[37]:

X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))


# In[38]:

clf = clf.partial_fit(X_test, y_test)

#
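# To see the out-of-core model in action on new text, the stateless `HashingVectorizer` can transform arbitrary documents on the fly. A small illustrative sketch (the example sentences below are made up):

# classify two hypothetical reviews with the incrementally trained model;
# label 1 corresponds to a positive and 0 to a negative review
example_docs = ['I loved this movie, the acting was wonderful',
                'What a boring and disappointing film']
example_X = vect.transform(example_docs)
print('Predicted labels: %s' % clf.predict(example_X))
print('Class probabilities:\n%s' % clf.predict_proba(example_X))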
#
# # Summary