#!/usr/bin/env python
# coding: utf-8

# # Sentiment Symposium - Build a Sentiment Predictor in 5 Minutes in Python
# ## Łukasz Augustyniak
# ### Piotr Bródka
# e-Mail: lukasz.augustyniak@pwr.edu.pl
# Twitter: @luk_augustyniak
# LinkedIn: Łukasz Augustyniak
# GitHub: laugustyniak
# IPython Notebook view: SAS2015 Notebook
# Wroclaw University of Technology
#
# ## European research centre of Network intelliGence for INnovation Enhancement
#
# ENGINE Centre
# ## Purpose of the presentation:
# - Learning by practice
# - Real example implementation
# - Using a trained model in production

# # Why Python?
# - code readability
# - its syntax allows programmers to express concepts in fewer lines of code than C++ or Java
# - really strong open source community
# - ideal for fast prototyping and building models (research)
# # Why IPython Notebook?
# The IPython Notebook is an interactive computational environment in which you can combine code execution, rich text, mathematics, plots and rich media. Just like you see it now :)

# In[28]:

sas2015 = 'Welcome to Sentiment Symposium'

# In[29]:

print sas2015

# In[30]:

sas2015 + ' 2015'

# # How can I install Python and IPython Notebook?
# ## Python's distribution - Anaconda
# Anaconda - a Python interpreter with pre-installed libraries - is a completely free Python distribution (including for commercial use and redistribution). It includes over 195 of the most popular Python packages for science, math, engineering and data analysis.

# # Python's libraries
# ## Scikit-Learn & Pandas
# scikit-learn - Machine Learning in Python
# - Simple and efficient tools for data mining and data analysis
# - Accessible to everybody, and reusable in various contexts
# - Built on NumPy, SciPy, and matplotlib
# - Open source, commercially usable - BSD license
# # pandas - Python Data Analysis Library
# - pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
# - great for loading and transforming data

# In[31]:

import pandas as pd

# ## Where is NLTK?
# All of the preprocessing and model building here can be done with the scikit-learn library alone.

# # Load dataset with Pandas
# ## Path for dataset

# In[32]:

from os import path
notebook_path = 'C:/Users/Dell/Documents/GitHub/Presentations/sas2015/'

# ### SemEval 2014 dataset - http://alt.qcri.org/semeval2014/
# approximately 6,000 tweets annotated as negative/neutral/positive

# ## Load data into Data Frame structure
# A DataFrame is a tabular data structure with labeled axes (rows and columns). Arithmetic operations align on both row and column labels. It is the primary pandas data structure.
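# A tiny illustration of that label alignment: when two Series with different indices are added, pandas matches entries by label and fills positions present in only one of them with NaN (the Series below are made up for the example).

# In[ ]:

# Illustrative only: label-based alignment in pandas
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([10, 20, 30], index=['b', 'c', 'd'])
print s1 + s2  # 'b' and 'c' are summed by label; 'a' and 'd' become NaN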

# ### Lovely one liner for data loading :)

# In[33]:

data = pd.read_csv(path.join(notebook_path, 'data', 'SemEval-2014.csv'), index_col=0)

# In[34]:

data

# In[35]:

get_ipython().run_line_magic('matplotlib', 'inline')
data.sentiment.hist()

# ### Get documents and labels into more intuitive names

# In[36]:

docs = data['document']
y = data['sentiment']  # standard name for the labels/classes variable

# In[37]:

docs[0]

# In[38]:

y[0]

# # Build Bag of Words model with Scikit-Learn
# ## Convert a collection of text documents to a matrix of token counts
# 'I like new Note IV.' -> [0, 1, 1, 1, 1, 0, 0]
# 'I was disappointed by new Samsung phone.' -> [1, 0, 0, 1, 0, 1, 1]
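# To see where such vectors come from, here is a minimal sketch that fits a CountVectorizer on just the two example sentences and inspects the learned vocabulary and count matrix (variable names and default settings below are only for the illustration; the exact columns depend on the vectorizer configuration).

# In[ ]:

from sklearn.feature_extraction.text import CountVectorizer

# Illustrative toy example: vocabulary and count vectors for two short sentences
toy_docs = ['I like new Note IV.', 'I was disappointed by new Samsung phone.']
toy_vect = CountVectorizer()              # default settings, just for the illustration
toy_X = toy_vect.fit_transform(toy_docs)  # sparse matrix of token counts
print toy_vect.get_feature_names()        # the learned vocabulary (columns)
print toy_X.toarray()                     # one row of counts per document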
# In[39]:

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english')
X = count_vect.fit_transform(docs)
print '#features=%s for #documents=%s' % (X.shape[1], X.shape[0])

# # Cross-Validation
# ### Good practice for research

# In[40]:

from sklearn import metrics, cross_validation
from sklearn.linear_model import LogisticRegression


def sentiment_classification(X, y, n_folds=10, classifier=None):
    """
    Sentiment classification with cross-validation - supervised method.
    :type X: ndarray, feature matrix for classification
    :type y: list or ndarray of classes
    :type n_folds: int, number of folds for CV
    :type classifier: classifier which we train and use to predict sentiment
    :return: measures: accuracy, precision, recall, f1
    """
    results = {'acc': [], 'prec': [], 'rec': [], 'f1': [], 'cm': []}
    kf = cross_validation.StratifiedKFold(y, n_folds=n_folds, shuffle=True)
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        ######################## Most important part ##########################
        clf = classifier.fit(X_train, y_train)  # train the classifier
        predicted = clf.predict(X_test)         # predict on the held-out fold
        ########################################################################
        results['acc'].append(metrics.accuracy_score(y_test, predicted))
        results['prec'].append(metrics.precision_score(y_test, predicted, average='weighted'))
        results['rec'].append(metrics.recall_score(y_test, predicted, average='weighted'))
        results['f1'].append(metrics.f1_score(y_test, predicted, average='weighted'))
        results['cm'].append(metrics.confusion_matrix(y_test, predicted))
    return results

# # Run sentiment classification

# In[41]:

results = sentiment_classification(X, y, n_folds=4, classifier=LogisticRegression())

# In[42]:

import numpy as np

print 'Accuracy: %s' % np.mean(results['acc'])
print 'F1-measure: %s' % np.mean(results['f1'])

# # Saving the trained classifier
# ## Great for production purposes!
# In Python, pickle is the standard mechanism for object serialization; pickling is the common term among Python programmers for serialization (and unpickling for deserialization). scikit-learn bundles joblib, a pickle replacement that handles objects containing large NumPy arrays efficiently.

# In[43]:

classifier = LogisticRegression()
clf = classifier.fit(X, y)  # trained classifier

# In[44]:

from sklearn.externals import joblib

fn_clf = 'sentiment-classifier.pkl'
joblib.dump(clf, fn_clf)

# In[45]:

clf_loaded = joblib.load(fn_clf)
print 'predictions => %s' % clf_loaded.predict(X)
print 'classifier: %s' % clf_loaded

# # Whole code

# In[46]:

# load data
data = pd.read_csv('C:/Users/Dell/Documents/GitHub/Presentations/sas2015/data/SemEval-2014.csv', index_col=0)
y = data['sentiment']
count_vect = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english')
X = count_vect.fit_transform(data.document)
results = sentiment_classification(X, y, n_folds=4, classifier=LogisticRegression())
clf = LogisticRegression().fit(X, y)  # train on all data
joblib.dump(clf, 'sentiment-classifier.pkl')  # save classifier

# In[47]:

print 'Accuracy: %s' % np.mean(results['acc'])
print 'F1-measure: %s' % np.mean(results['f1'])
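# To score a tweet that was not in the training data, the text has to be transformed with the same fitted CountVectorizer that produced X, so in production it is worth persisting the vectorizer next to the classifier. A minimal sketch (the vectorizer file name and the example tweet are illustrative assumptions, not part of the original code):

# In[ ]:

# Illustrative production sketch: persist the fitted vectorizer too,
# because new texts must be transformed with exactly the same vocabulary.
joblib.dump(count_vect, 'sentiment-vectorizer.pkl')

vect_loaded = joblib.load('sentiment-vectorizer.pkl')
clf_loaded = joblib.load('sentiment-classifier.pkl')

new_docs = ['I love my new phone!']      # hypothetical unseen tweet
X_new = vect_loaded.transform(new_docs)  # transform only, do NOT re-fit
print 'prediction => %s' % clf_loaded.predict(X_new)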
# # What are we doing now?
# ## - Hybrid method: lexicon-based + supervised learning
# ## - Sentiment lexicon generation (English and Polish) for various product domains
# ## - Sentiment analysis for Polish
# ### API for Polish text analysis (especially sentiment) - coming soon (~2-3 months), check http://streamlytics.io/

# # Additional clues
# ## Vectorizer parameters
# ### min_df : float in range [0.0, 1.0] or int, default=1
# When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. This value is also called the cut-off in the literature. If float, the parameter represents a proportion of documents; if integer, an absolute count.
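# For instance, a float threshold keeps only the terms that appear in at least that fraction of the documents; a brief sketch (the 1% value and variable names below are just an illustrative choice):

# In[ ]:

# Illustrative: min_df as a proportion - keep terms occurring in at least 1% of documents
count_vect_float = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english', min_df=0.01)
X_float = count_vect_float.fit_transform(docs)
print '#features=%s for #documents=%s (min_df=%s)' % (X_float.shape[1], X_float.shape[0], 0.01)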

# In[48]:

min_df = 2

# In[49]:

count_vect = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english', min_df=min_df)
X = count_vect.fit_transform(docs)
print '#features=%s for #documents=%s' % (X.shape[1], X.shape[0])

# ### max_features : int or None, default=None
# Build a vocabulary that only considers the top max_features terms ordered by term frequency across the corpus.

# In[50]:

max_features = 1000

# In[51]:

count_vect = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english', max_features=max_features)
X = count_vect.fit_transform(docs)
print '#features=%s for #documents=%s' % (X.shape[1], X.shape[0])

# ## Check different minimum thresholds (minimum number of times a word appears in the dataset)

# In[52]:

min_words = [1, 2, 5, 10, 100, 1000]

# In[53]:

features_counts = []
for m in min_words:
    docs_fitted = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english', min_df=m).fit_transform(docs)
    print '#features=%s for #documents=%s (min_df=%s)' % (docs_fitted.shape[1], docs_fitted.shape[0], m)
    features_counts.append((m, docs_fitted.shape[1]))

# In[54]:

import matplotlib.pyplot as plt

plt.bar(range(len(features_counts)), [x[1] for x in features_counts], align='center')
plt.xticks(range(len(features_counts)), [x[0] for x in features_counts])
plt.xlabel('min_df')
plt.ylabel('#features')
plt.show()

# ## Use sparse matrices! Why? Time and memory complexity...

# In[55]:

X

# ### The whole dense matrix will be stored in memory - do not do that!

# In[56]:

get_ipython().run_line_magic('timeit', 'sentiment_classification(X, y, n_folds=4, classifier=LogisticRegression())')

# In[57]:

X_array = X.toarray()
get_ipython().run_line_magic('timeit', 'sentiment_classification(X_array, y, n_folds=4, classifier=LogisticRegression())')
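# The memory side of the argument can be made concrete by comparing the footprint of the sparse matrix with its dense copy. A rough sketch; the exact numbers depend on the vocabulary size and dataset.

# In[ ]:

# Rough memory comparison: sparse CSR storage vs. a dense ndarray of the same counts
sparse_bytes = X.data.nbytes + X.indices.nbytes + X.indptr.nbytes
dense_bytes = X_array.nbytes
print 'sparse: %.2f MB, dense: %.2f MB' % (sparse_bytes / 1e6, dense_bytes / 1e6)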