#!/usr/bin/env python
# coding: utf-8
# # Sentiment Symposium - Build a Sentiment Predictor in 5 Minutes in Python
# ## Łukasz Augustyniak
# ### Piotr Bródka
# e-Mail: lukasz.augustyniak@pwr.edu.pl
# Twitter: @luk_augustyniak
# LinkedIn: Łukasz Augustyniak
# GitHub: laugustyniak
# IPython Notebook view: SAS2015 Notebook
# 
#
# ## European research centre of Network intelliGence for INnovation Enhancement
#
# 
#
#
# ## Purpose of the presentation:
# - Learning by practice
# - Real example implementation
# - Using trained model for production
# # Why Python?
# - code readability
# - its syntax allows programmers to express concepts in fewer lines of code than C++ or Java
# - really strong open source community
# - ideal for fast prototyping and building models (research)
# # Why IPython Notebook?
# The IPython Notebook is an interactive computational environment in which you can combine code execution, rich text, mathematics, plots and rich media. Just like you see it now :)
# In[28]:
sas2015 = 'Welcome at Sentiment Symposium'
# In[29]:
print sas2015
# In[30]:
sas2015 + ' 2015'
# # How can I install Python and IPython Notebook?
# ## Python's distribution - Anaconda
# Anaconda is a Python interpreter with pre-installed libraries - a completely free Python distribution (including for commercial use and redistribution). It includes over 195 of the most popular Python packages for science, math, engineering and data analysis.
# # Python's libraries
# ## Scikit-Learn & Pandas
# scikit-learn - Machine Learning in Python
# - Simple and efficient tools for data mining and data analysis
# - Accessible to everybody, and reusable in various contexts
# - Built on NumPy, SciPy, and matplotlib
# - Open source, commercially usable - BSD license
#
#
# pandas - Python Data Analysis Library
# - pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
# - great for loading and transforming data
# In[31]:
import pandas as pd
# ## Where is NLTK?
# The whole preprocessing and model-creation workflow is possible with the scikit-learn library alone.
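# As a minimal sketch of that claim, the feature extraction and the classifier can even be chained into a single object with scikit-learn's Pipeline. The parameters mirror the ones used later in this notebook; `docs` and `y` are only defined further down, so the fit/predict calls are shown as comments.
# In[ ]:
# Hedged sketch: chain bag-of-words counting and classification in one object.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

sentiment_pipeline = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english')),
    ('clf', LogisticRegression()),
])
# sentiment_pipeline.fit(docs, y) would train preprocessing and model in one call,
# and sentiment_pipeline.predict(['I like this phone']) would score raw text directly.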
# # Load dataset with Pandas
# ## Path for dataset
# In[32]:
from os import path
notebook_path = 'C:/Users/Dell/Documents/GitHub/Presentations/sas2015/'
# ### SemEval 2014 dataset - http://alt.qcri.org/semeval2014/
# approximately 6,000 tweets annotated as negative/neutral/positive
# ## Load data into Data Frame structure
# Tabular data structure with labeled axes (rows and columns).
#
# Arithmetic operations align on both row and column labels. The DataFrame is the primary pandas data structure.
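# A tiny sketch of what label alignment means, on made-up numbers (not the SemEval data):
# In[ ]:
# Toy example: arithmetic aligns on index labels, so a missing label
# produces NaN instead of silently misaligning rows.
toy_a = pd.Series([1, 2, 3], index=['neg', 'neu', 'pos'])
toy_b = pd.Series([10, 20], index=['neg', 'pos'])
toy_a + toy_b  # -> neg=11, neu=NaN, pos=23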
# ### Lovely one liner for data loading :)
# In[33]:
data = pd.read_csv(path.join(notebook_path, 'data', 'SemEval-2014.csv'), index_col=0)
# In[34]:
data
# In[35]:
get_ipython().run_line_magic('matplotlib', 'inline')
data.sentiment.hist()
# ### Get documents and labels into more intuitive names
# In[36]:
docs = data['document']
y = data['sentiment']  # standard name for the labels/classes variable
# In[37]:
docs[0]
# In[38]:
y[0]
# # Build Bag of Word model with Scikit-Learn
# ## Convert a collection of text documents to a matrix of token counts
# 'I like new Note IV.' -> [0, 1, 1, 1, 1, 0, 0]
# 'I was disappointed by new Samsung phone.' -> [1, 0, 0, 1, 0, 1, 1]
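# A minimal sketch of that idea on just the two example sentences (the learned column order depends on the fitted vocabulary, so the exact vectors differ from the illustration above); the next cell then builds the real model on the full dataset.
# In[ ]:
# Hedged sketch: vectorize only the two toy sentences.
from sklearn.feature_extraction.text import CountVectorizer
toy_docs = ['I like new Note IV.', 'I was disappointed by new Samsung phone.']
toy_vect = CountVectorizer()
toy_counts = toy_vect.fit_transform(toy_docs)
print(toy_vect.get_feature_names())  # learned vocabulary (get_feature_names_out in newer scikit-learn)
print(toy_counts.toarray())          # one row of token counts per document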
# In[39]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english')
X = count_vect.fit_transform(docs)
print '#features=%s for #documents=%s' % (X.shape[1], X.shape[0])
# # Cross-Validation
# ### Good practice for research
#
# In[40]:
from sklearn import metrics, cross_validation
from sklearn.linear_model import LogisticRegression
def sentiment_classification(X, y, n_folds=10, classifier=None):
    """
    Sentiment classification with cross-validation - supervised method
    :type X: ndarray feature matrix for classification
    :type y: list or ndarray of classes
    :type n_folds: int # of folds for CV
    :type classifier: classifier which we train and use to predict sentiment
    :return: measures: accuracy, precision, recall, f1
    """
    results = {'acc': [], 'prec': [], 'rec': [], 'f1': [], 'cm': []}
    kf = cross_validation.StratifiedKFold(y, n_folds=n_folds, shuffle=True)
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        ######################## Most important part ##########################
        clf = classifier.fit(X_train, y_train)  # train the classifier
        predicted = clf.predict(X_test)  # predict on the test set
        #######################################################################
        results['acc'].append(metrics.accuracy_score(y_test, predicted))
        results['prec'].append(metrics.precision_score(y_test, predicted, average='weighted'))
        results['rec'].append(metrics.recall_score(y_test, predicted, average='weighted'))
        results['f1'].append(metrics.f1_score(y_test, predicted, average='weighted'))
        results['cm'].append(metrics.confusion_matrix(y_test, predicted))
    return results
# # Run sentiment classification
# In[41]:
results = sentiment_classification(X, y, n_folds=4, classifier=LogisticRegression())
# In[42]:
import numpy as np
print 'Accuracy: %s' % np.mean(results['acc'])
print 'F1-measure: %s' % np.mean(results['f1'])
# # Saving the trained classifier
# ## Great for production purposes!
# In Python, pickle is the standard mechanism for object serialization; pickling is the common term among Python programmers for serialization (and unpickling for deserialization).
# In[43]:
classifier = LogisticRegression()
clf = classifier.fit(X, y) # trained
clf
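# For reference, a minimal sketch with the standard-library pickle module mentioned above (the file name is just an example); joblib, used in the next cells, is usually preferred for objects holding large NumPy arrays.
# In[ ]:
# Hedged sketch: plain pickle serialization of the fitted classifier.
import pickle
with open('sentiment-classifier.pickle', 'wb') as f:
    pickle.dump(clf, f)
with open('sentiment-classifier.pickle', 'rb') as f:
    clf_from_pickle = pickle.load(f)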
# In[44]:
from sklearn.externals import joblib
fn_clf = 'sentiment-classifier.pkl'
joblib.dump(clf, fn_clf)
# In[45]:
clf_loaded = joblib.load(fn_clf)
print 'predictions => %s' % clf_loaded.predict(X)
print 'classifier: %s' % clf_loaded
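# One practical note for production use: new documents must be transformed with the same fitted CountVectorizer, so it is worth persisting it next to the classifier. A minimal sketch under that assumption (file names and the example tweet are illustrative):
# In[ ]:
# Hedged sketch: persist the fitted vectorizer alongside the classifier,
# so raw text can be scored later.
joblib.dump(count_vect, 'sentiment-vectorizer.pkl')

vect_loaded = joblib.load('sentiment-vectorizer.pkl')
clf_loaded = joblib.load(fn_clf)
new_docs = ['I like the new Note IV.']
X_new = vect_loaded.transform(new_docs)  # same vocabulary as during training
print('prediction => %s' % clf_loaded.predict(X_new))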
# # Whole code
# In[46]:
# load data
data = pd.read_csv('C:/Users/Dell/Documents/GitHub/Presentations/sas2015/data/SemEval-2014.csv', index_col=0)
# build the bag-of-words features and labels
count_vect = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english')
X = count_vect.fit_transform(data.document)
y = data.sentiment
# evaluate with cross-validation
results = sentiment_classification(X, y, n_folds=4, classifier=LogisticRegression())
# train on all data and save the classifier
clf = LogisticRegression().fit(X, y)
joblib.dump(clf, 'sentiment-classifier.pkl')
# In[47]:
print 'Accuracy: %s' % np.mean(results['acc'])
print 'F1-measure: %s' % np.mean(results['f1'])
# # What are we doing now?
# ## - Hybrid method lexicon-based + supervised learning
#
# ## - Sentiment lexicons generation (English and Polish) for various product domains
#
# ## - Sentiment analysis for Polish
#
# ### API for Polish text analysis (especially sentiment) - coming soon in ~2-3 months; check http://streamlytics.io/
# # Additional clues
# ## Vectorizer parameters
# ### min_df : float in range [0.0, 1.0] or int, default=1
# When building the vocabulary, ignore terms that have a document frequency (DF) strictly lower than the given threshold. This value is also called cut-off in the literature.
#
# If float, the parameter represents a proportion of documents; if int, absolute counts.
# In[48]:
min_df=2
# In[49]:
count_vect = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english', min_df=min_df)
X = count_vect.fit_transform(docs)
print '#features=%s for #documents=%s' % (X.shape[1], X.shape[0])
# ### max_features : int or None, default=None
# Build a vocabulary that only considers the top max_features terms ordered by term frequency across the corpus.
# In[50]:
max_features=1000
# In[51]:
count_vect = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english', max_features=max_features)
X = count_vect.fit_transform(docs)
print '#features=%s for #documents=%s' % (X.shape[1], X.shape[0])
# ## Check different minimum thresholds (the minimum number of documents a term must appear in)
# In[52]:
min_words = [1, 2, 5, 10, 100, 1000]
# In[53]:
features_counts = []
for m in min_words:
    docs_fitted = CountVectorizer(ngram_range=(1, 2), lowercase=True, stop_words='english', min_df=m).fit_transform(docs)
    print '#features=%s for #documents=%s (min_df=%s)' % (docs_fitted.shape[1], docs_fitted.shape[0], m)
    features_counts.append((m, docs_fitted.shape[1]))
# In[54]:
import matplotlib.pyplot as plt
plt.bar(range(len(features_counts)), [x[1] for x in features_counts], align='center')
plt.xticks(range(len(features_counts)), [x[0] for x in features_counts])
plt.xlabel('min_df')
plt.ylabel('#features')
plt.show()
# ## Use sparse matrices! Why? Time and memory complexity...
# In[55]:
X
# ### The whole dense matrix will be stored in memory - do not do that!
# In[56]:
get_ipython().run_line_magic('timeit', 'sentiment_classification(X, y, n_folds=4, classifier=LogisticRegression())')
# In[57]:
X_array = X.toarray()
get_ipython().run_line_magic('timeit', 'sentiment_classification(X_array, y, n_folds=4, classifier=LogisticRegression())')
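# The memory side of the argument can be checked directly. Rough sizes only - the exact numbers depend on the vectorizer settings used above:
# In[ ]:
# A CSR sparse matrix stores only the non-zero entries plus their indices,
# while the dense ndarray stores every cell.
sparse_bytes = X.data.nbytes + X.indices.nbytes + X.indptr.nbytes
dense_bytes = X_array.nbytes
print('sparse: %.2f MB, dense: %.2f MB' % (sparse_bytes / 1e6, dense_bytes / 1e6))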
# In[ ]: