#!/usr/bin/env python
# coding: utf-8

# # News Categorization using Multinomial Naive Bayes

# The objective of this notebook is to show how to use the Multinomial Naive Bayes method to classify news articles into a set of predefined categories.

# The News Aggregator Data Set comes from the UCI Machine Learning Repository:

# * Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

# This specific dataset can be found in the UCI ML Repository at this URL: http://archive.ics.uci.edu/ml/datasets/News+Aggregator

# The dataset contains headlines, URLs, and categories for 422,937 news stories collected by a web aggregator between March 10th, 2014 and August 10th, 2014. News categories in this dataset are labelled:

# * b: business;
# * t: science and technology;
# * e: entertainment; and
# * m: health.

# Using the Multinomial Naive Bayes method, we will try to predict the category (business, entertainment, etc.) of a news article given only its headline.

# Let's begin by importing the Pandas (Python Data Analysis Library) module. The import statement is the most common way to gain access to the code in another module.

# In[3]:

import pandas as pd

# This way we can refer to pandas by its alias 'pd'. Let's load the news aggregator data via Pandas.

# In[4]:

news = pd.read_csv("uci-news-aggregator.csv")

# The head method gives us the first 5 rows of the DataFrame (or the first 5 items of a column).

# In[5]:

print(news.head())

# We want to predict the category of a news article based only on its title. The LabelEncoder class allows us to encode labels with values between 0 and n_classes-1.

# In[6]:

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(news['CATEGORY'])
print(y[:5])

# In[7]:

categories = news['CATEGORY']
titles = news['TITLE']
N = len(titles)
print('Number of news stories:', N)

# In[8]:

labels = list(set(categories))
print('Possible categories:', labels)

# In[9]:

for l in labels:
    print('Number of', l, 'news:', len(news.loc[news['CATEGORY'] == l]))

# Categories are literal labels, but machine learning algorithms work better with numbers, so we encode them using the LabelEncoder shown above.

# In[10]:

encoder = LabelEncoder()
ncategories = encoder.fit_transform(categories)

# Now we should split our data into two sets:
#
# 1. a training set (70%) used to discover potentially predictive relationships, and
# 2. a test set (30%) used to evaluate whether the discovered relationships hold and to assess the strength and utility of a predictive relationship.
#
# Samples should first be shuffled and then split into a pair of train and test sets. Make sure you permute (shuffle) your data before fitting the model.

# In[11]:

Ntrain = int(N * 0.7)
from sklearn.utils import shuffle
titles, ncategories = shuffle(titles, ncategories, random_state=0)

# In[12]:

X_train = titles[:Ntrain]
print('X_train.shape', X_train.shape)
y_train = ncategories[:Ntrain]
print('y_train.shape', y_train.shape)
X_test = titles[Ntrain:]
print('X_test.shape', X_test.shape)
y_test = ncategories[Ntrain:]
print('y_test.shape', y_test.shape)
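# As an aside, scikit-learn can shuffle and split in a single call. Below is a minimal sketch (not part of the original flow) using train_test_split from sklearn.model_selection; the stratify argument additionally keeps the category proportions roughly equal in both sets. The variable names X_tr, X_te, y_tr, y_te are illustrative, equivalent to X_train, X_test, y_train, y_test above.

# In[ ]:

from sklearn.model_selection import train_test_split

# Alternative to the manual shuffle/slice above (a sketch, not the
# notebook's method): shuffle, split 70/30, and stratify in one call.
X_tr, X_te, y_tr, y_te = train_test_split(
    titles, ncategories,
    test_size=0.3,          # same 70/30 split as above
    random_state=0,         # reproducible shuffle
    stratify=ncategories)   # preserve category proportions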
# In order to make the training process easier, scikit-learn provides a Pipeline class that behaves like a compound classifier. The first step is to tokenize the titles and count the occurrences of each word; for that, we will use the CountVectorizer class. Then we will transform the counts into a tf-idf representation using the TfidfTransformer class. The last step creates the Naive Bayes classifier.

# In[13]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# In[14]:

print('Training...')
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])

# Now we proceed to fit the Naive Bayes classifier to the training set.

# In[15]:

text_clf = text_clf.fit(X_train, y_train)

# Now we can apply the classifier to the test set and calculate the predicted values.

# In[16]:

print('Predicting...')
predicted = text_clf.predict(X_test)

# The sklearn.metrics module includes score functions, performance metrics, and pairwise metrics and distance computations.
#
# accuracy_score computes the subset accuracy: it compares the predicted label for each sample to the corresponding true label.

# In[17]:

from sklearn import metrics
print('accuracy_score', metrics.accuracy_score(y_test, predicted))
print('Reporting...')

# Let's build a text report showing the main classification metrics, with the precision/recall/F1-score measures for each class in the test data. Note that we pass encoder.classes_ as the class names rather than the unordered labels list, so that the names line up with the integer codes produced by LabelEncoder.

# In[18]:

print(metrics.classification_report(y_test, predicted, target_names=encoder.classes_))

# Have you heard about [cross-validation][1]? What about k-fold cross-validation? You can try it now just by repeating the previous steps (don't forget the shuffle part) and averaging the results; a minimal sketch is included at the end of this notebook. Let's try it!

# Have a nice day!

# [1]: https://en.wikipedia.org/wiki/Cross-validation_(statistics) "cross-validation"
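# As promised, here is a minimal k-fold cross-validation sketch. This is an illustrative addition, not from the original notebook: cross_val_score clones the pipeline and re-fits it on each fold, and KFold with shuffle=True takes care of the shuffling step mentioned above.

# In[ ]:

from sklearn.model_selection import KFold, cross_val_score

# Shuffled 5-fold split: each headline lands in the test fold exactly once.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(text_clf, titles, ncategories, cv=cv)
print('Accuracy per fold:', scores)
print('Mean accuracy: %.4f (+/- %.4f)' % (scores.mean(), scores.std()))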