#!/usr/bin/env python
# coding: utf-8

# This notebook is an experiment to see if a pure scikit-learn implementation of the fastText model can work better than a linear model on a small text classification problem: 20 newsgroups.
#
# http://arxiv.org/abs/1607.01759
#
# Those models are very similar to a Deep Averaging Network (with only one hidden layer with a linear activation function):
#
# https://www.cs.umd.edu/~miyyer/pubs/2015_acl_dan.pdf
#
# Note that scikit-learn does not provide a hierarchical softmax implementation (but we don't need it on 20 newsgroups anyway).

# In[1]:

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split

# In[3]:

twentyng_train = fetch_20newsgroups(
    subset='train',
    #remove=('headers', 'footers'),
)
docs_train, target_train = twentyng_train.data, twentyng_train.target

twentyng_test = fetch_20newsgroups(
    subset='test',
    #remove=('headers', 'footers'),
)
docs_test, target_test = twentyng_test.data, twentyng_test.target

# In[18]:

2 ** 18

# The following uses the hashing trick on unigrams and bigrams. `binary=True` makes us ignore repeated words in a document. The `l1` normalization ensures that we "average" the embeddings of the tokens in the document instead of summing them.

# In[17]:

get_ipython().run_cell_magic('time', '', "vec = HashingVectorizer(\n    encoding='latin-1', binary=True, ngram_range=(1, 2),\n    norm='l1', n_features=2 ** 18)\n\nX_train = vec.transform(docs_train)\nX_test = vec.transform(docs_test)\n")

# In[19]:

first_doc_vectors = X_train[:3].toarray()
first_doc_vectors

# In[20]:

first_doc_vectors.min(axis=1)

# In[21]:

first_doc_vectors.max(axis=1)

# In[22]:

first_doc_vectors.sum(axis=1)

# Baseline: OvR logistic regression (the multinomial logistic regression loss is currently not implemented in scikit-learn's SGDClassifier). In practice, the OvR reduction seems to work well enough.

# In[86]:

get_ipython().run_cell_magic('time', '', "from sklearn.linear_model import SGDClassifier\n\nlr = SGDClassifier(loss='log', alpha=1e-10, n_iter=50, n_jobs=-1)\nlr.fit(X_train, target_train)\n")

# In[87]:

get_ipython().run_cell_magic('time', '', 'print("train score: %0.3f" % lr.score(X_train, target_train))\nprint("test score: %0.3f" % lr.score(X_test, target_test))\n')

# Let's now use the MLPClassifier of scikit-learn to add a single hidden layer with a small number of hidden units.
#
# Note: instead of tanh or relu we would rather use a linear / identity activation function for the hidden layer, but this is not (yet) implemented in scikit-learn.
#
# In that respect the following model is closer to a Deep Averaging Network (without dropout) than to fastText.

# In[90]:

get_ipython().run_cell_magic('time', '', "from sklearn.neural_network import MLPClassifier\n\nmlp = MLPClassifier(algorithm='adam', learning_rate_init=0.01,\n    hidden_layer_sizes=10, max_iter=100, activation='tanh', verbose=100,\n    early_stopping=True, validation_fraction=0.05, alpha=1e-10)\nmlp.fit(X_train, target_train)\n")

# In[92]:

get_ipython().run_cell_magic('time', '', 'print("train score: %0.3f" % mlp.score(X_train, target_train))\nprint("test score: %0.3f" % mlp.score(X_test, target_test))\n')

# In[ ]:
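# Side note, not part of the original experiment: the OvR baseline above could also be compared against a true multinomial (softmax) logistic regression, which `LogisticRegression` supports with the `lbfgs` solver. The sketch below makes a few assumptions: recent scikit-learn releases fit the multinomial loss by default on multiclass data (older ones may require `multi_class='multinomial'` explicitly), and `C=1e4` is an untuned guess that loosely mirrors the near-zero `alpha` of the SGD baseline.

# In[ ]:

from sklearn.linear_model import LogisticRegression

# Softmax baseline sketch on the same hashed n-gram features.
# C=1e4 ~ very weak regularization (assumption, not a tuned value).
softmax_lr = LogisticRegression(solver='lbfgs', C=1e4, max_iter=200)
softmax_lr.fit(X_train, target_train)
print("train score: %0.3f" % softmax_lr.score(X_train, target_train))
print("test score: %0.3f" % softmax_lr.score(X_test, target_test))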
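# The identity activation mentioned in the note above has since been added to `MLPClassifier` (`activation='identity'`), and released versions name the solver argument `solver` rather than `algorithm`. Assuming such a version is installed, a model closer to fastText (a purely linear hidden layer that averages the hashed n-gram embeddings) can be sketched as follows; the hyper-parameters simply mirror the tanh model above and are not tuned.

# In[ ]:

from sklearn.neural_network import MLPClassifier

# fastText-like sketch: a 10-dimensional linear "embedding" layer on top of the
# l1-normalized hashed n-grams, trained with adam. Assumes a scikit-learn
# release where MLPClassifier accepts solver= and activation='identity'.
fasttext_like = MLPClassifier(solver='adam', learning_rate_init=0.01,
                              hidden_layer_sizes=(10,), max_iter=100,
                              activation='identity', verbose=True,
                              early_stopping=True, validation_fraction=0.05,
                              alpha=1e-10)
fasttext_like.fit(X_train, target_train)
print("train score: %0.3f" % fasttext_like.score(X_train, target_train))
print("test score: %0.3f" % fasttext_like.score(X_test, target_test))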