#!/usr/bin/env python
# coding: utf-8

# 
# # Dirty categories: learning with non normalized strings
# 
# Including strings that represent categories often calls for much data
# preparation. In particular categories may appear with many morphological
# variants, when they have been manually input, or assembled from diverse
# sources.
# 
# Including such a column in a learning pipeline as a standard categorical
# colum leads to categories with very high cardinalities and can lose
# information on which categories are similar.
# 
# Here we look at a dataset on wages [#]_ where the column *Employee
# Position Title* contains dirty categories.
# 
# .. [#] https://catalog.data.gov/dataset/employee-salaries-2016
# 
# We investigate encodings to include such compare different categorical
# encodings for the dirty column to predict the *Current Annual Salary*,
# using gradient boosted trees. For this purpose, we use the skrub
# library ( https://skrub-data.org ).
# 

# .. |SV| replace::
#     :class:`~skrub.TableVectorizer`
# 
# .. |tabular_learner| replace::
#     :func:`~skrub.tabular_learner`
# 
# .. |OneHotEncoder| replace::
#     :class:`~sklearn.preprocessing.OneHotEncoder`
# 
# .. |RandomForestRegressor| replace::
#     :class:`~sklearn.ensemble.RandomForestRegressor`
# 
# .. |SE| replace:: :class:`~skrub.SimilarityEncoder`
# 
# .. |GapEncoder| replace:: :class:`~skrub.GapEncoder`
# 
# .. |permutation importances| replace::
#     :func:`~sklearn.inspection.permutation_importance`
# 
# 
# ## The data
# 
# ### Data Importing and preprocessing
# 
# We first download the dataset:
# 
# 

# In[ ]:


from skrub.datasets import fetch_employee_salaries
employee_salaries = fetch_employee_salaries()
print(employee_salaries.description)


# Then we load it:
# 
# 

# In[ ]:


import pandas as pd
df = employee_salaries.X.copy()
df


# Recover the target
# 
# 

# In[ ]:


y = employee_salaries.y


# ## A simple default as a learner
# 
# The function |tabular_learner| is a simple way of creating a default
# learner for tabular_learner data:
# 
# 

# In[ ]:


from skrub import tabular_learner
model = tabular_learner("regressor")


# We can quickly compute its cross-validation score using the
# corresponding scikit-learn utility
# 
# 

# In[ ]:


from sklearn.model_selection import cross_validate
import numpy as np

results = cross_validate(model, df, y)
print(f"Prediction score: {np.mean(results['test_score'])}")
print(f"Training time: {np.mean(results['fit_time'])}")


# Below the hood, `model` is a pipeline:
# 
# 

# In[ ]:


model


# We can see that it is made of first a |SV|, and an
# HistGradientBoostingRegressor
# 
# 

# ## Understanding the vectorizer + learner pipeline
# 
# The number one difficulty is that our input is a complex and
# heterogeneous dataframe:
# 
# 

# In[ ]:


df


# The |SV| is a transformer that turns this dataframe into a
# form suited for machine learning.
# 
# Feeding it output to a powerful learner,
# such as gradient boosted trees, gives **a machine-learning method that
# can be readily applied to the dataframe**.
# 
# 

# In[ ]:


from skrub import TableVectorizer


# ### Assembling the pipeline
# 
# 
# 

# We use the |SV| with a HistGradientBoostingRegressor, which is a good
# predictor for data with heterogeneous columns
# 
# 

# In[ ]:


from sklearn.ensemble import HistGradientBoostingRegressor


# We then create a pipeline chaining our encoders to a learner
# 
# 

# In[ ]:


from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    TableVectorizer(),
    HistGradientBoostingRegressor()
)
pipeline


# Note that it is almost the same model as above (can you spot the
# differences)
# 
# Let's perform a cross-validation to see how well this model predicts
# 
# 

# In[ ]:


results = cross_validate(pipeline, df, y)
print(f"Prediction score: {np.mean(results['test_score'])}")
print(f"Training time: {np.mean(results['fit_time'])}")


# The prediction perform here is pretty much as good as above
# but the code here is much simpler as it does not involve specifying
# columns manually.
# 
# 

# ### Analyzing the features created
# 
# Let us perform the same workflow, but without the `Pipeline`, so we can
# analyze its mechanisms along the way.
# 
# 

# In[ ]:


tab_vec = TableVectorizer()


# We split the data between train and test, and transform them:
# 
# 

# In[ ]:


from sklearn.model_selection import train_test_split
df_train, df_test, y_train, y_test = train_test_split(
    df, y, test_size=0.15, random_state=42
)

X_train_enc = tab_vec.fit_transform(df_train, y_train)
X_test_enc = tab_vec.transform(df_test)


# The encoded data, X_train_enc and X_test_enc are numerical arrays:
# 
# 

# In[ ]:


X_train_enc


# They have more columns than the original dataframe, but not much more:
# 
# 

# In[ ]:


X_train_enc.shape


# #### Inspecting the features created
# 
# The |SV| assigns a transformer for each column. We can inspect this
# choice:
# 
# 

# In[ ]:


tab_vec.transformers_


# This is what is being passed to transform the different columns under the hood.
# We can notice it classified the columns "gender" and "assignment_category"
# as low cardinality string variables.
# A |OneHotEncoder| will be applied to these columns.
# 
# The vectorizer actually makes the difference between string variables
# (data type ``object`` and ``string``) and categorical variables
# (data type ``category``).
# 
# Next, we can have a look at the encoded feature names.
# 
# Before encoding:
# 
# 

# In[ ]:


df.columns.to_list()


# After encoding (we only plot the first 8 feature names):
# 
# 

# In[ ]:


feature_names = tab_vec.get_feature_names_out()
feature_names[:8]


# As we can see, it created a new column for each unique value.
# This is because we used |SE| on the column "division",
# which was classified as a high cardinality string variable.
# (default values, see |SV|'s docstring).
# 
# In total, we have reasonnable number of encoded columns.
# 
# 

# In[ ]:


len(feature_names)


# ### Feature importance in the statistical model
# 
# Here we consider interpretability, plot the feature importances of a
# classifier. We can do this because the |GapEncoder| leads to
# interpretable features even with messy categories
# 
# .. topic:: Note:
# 
#    To minimize compute time, use the feature importances computed by the
#    |RandomForestRegressor|, but you should prefer |permutation importances|
#    instead (which are less subject to biases)
# 
# First, let's train the |RandomForestRegressor|,
# 
# 

# In[ ]:


from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor()
regressor.fit(X_train_enc, y_train)


# Retrieving the feature importances
# 
# 

# In[ ]:


importances = regressor.feature_importances_
std = np.std(
    [
        tree.feature_importances_
        for tree in regressor.estimators_
    ],
    axis=0
)
indices = np.argsort(importances)[::-1]


# Plotting the results:
# 
# 

# In[ ]:


import matplotlib.pyplot as plt
plt.figure(figsize=(12, 9))
plt.title("Feature importances")
n = 20
n_indices = indices[:n]
labels = np.array(feature_names)[n_indices]
plt.barh(range(n), importances[n_indices], color="b", yerr=std[n_indices])
plt.yticks(range(n), labels, size=15)
plt.tight_layout(pad=1)
plt.show()


# We can deduce from this data that the three factors that define the
# most the salary are: being hired for a long time, being a manager, and
# having a permanent, full-time job :).
# 
# 

# ## Exploring different machine-learning pipeline to encode the data
# 
# ### The learning pipeline
# 
# To build a learning pipeline, we need to assemble encoders for each
# column, and apply a supervised learning model on top.
# 
# 

# #### Encoding the table
# 
# The TableVectorizer applies different transformations to the different
# columns to turn them into numerical values suitable for learning
# 
# 

# In[ ]:


from skrub import TableVectorizer
encoder = TableVectorizer()


# #### Pipelining an encoder with a learner
# 
# Here again we use a pipeline with HistGradientBoostingRegressor
# 
# 

# In[ ]:


from sklearn.ensemble import HistGradientBoostingRegressor
pipeline = make_pipeline(encoder, HistGradientBoostingRegressor())


# The pipeline can be readily applied to the dataframe for prediction
# 
# 

# In[ ]:


pipeline.fit(df, y)

# The categorical encoders
# ........................
#
# A encoder is needed to turn a categorical column into a numerical
# representation
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)


# ### Dirty-category encoding
# 
# The one-hot encoder is actually not well suited to the 'Employee
# Position Title' column, as this columns contains 400 different entries.
# 
# We will now experiments with different encoders for dirty columns
# 
# 

# In[ ]:


from skrub import SimilarityEncoder, MinHashEncoder,\
    GapEncoder
from sklearn.preprocessing import TargetEncoder

similarity = SimilarityEncoder()
target = TargetEncoder()
minhash = MinHashEncoder(n_components=100)
gap = GapEncoder(n_components=100)

encoders = {
    'one-hot': one_hot,
    'similarity': similarity,
    'target': target,
    'minhash': minhash,
    'gap': gap}


# We now loop over the different encoding methods,
# instantiate each time a new pipeline, fit it
# and store the returned cross-validation score:
# 
# 

# In[ ]:


all_scores = dict()

for name, method in encoders.items():
    encoder = TableVectorizer(high_cardinality=method)

    pipeline = make_pipeline(encoder, HistGradientBoostingRegressor())
    scores = cross_validate(pipeline, df, y)
    print('{} encoding'.format(name))
    print('r2 score:  mean: {:.3f}; std: {:.3f}'.format(
        np.mean(scores['test_score']), np.std(scores['test_score'])))
    print('time:  {:.3f}\n'.format(
        np.mean(scores['fit_time'])))
    all_scores[name] = scores['test_score']


# Note that the time it takes to fit varies also a lot, and not only the
# prediction score
# 
# 

# #### Plotting the results
# Finally, we plot the scores on a boxplot:
# 
# 

# In[ ]:


import seaborn
import matplotlib.pyplot as plt
plt.figure(figsize=(4, 3))
ax = seaborn.boxplot(data=pd.DataFrame(all_scores), orient='h')
plt.ylabel('Encoding', size=20)
plt.xlabel('Prediction accuracy     ', size=20)
plt.yticks(size=20)
plt.tight_layout()


# The clear trend is that encoders that use the string form
# of the category (similarity, minhash, and gap) perform better than
# those that discard it.
# 
# SimilarityEncoder is the best performer, but it is less scalable on big
# data than MinHashEncoder and GapEncoder. The most scalable encoder is
# the MinHashEncoder. GapEncoder, on the other hand, has the benefit that
# it provides interpretable features, as shown above
# 
# |
# 
# 
# .. topic:: The TableVectorizer automates preprocessing
# 
#   As this notebook demonstrates, many preprocessing steps can be
#   automated by the |SV|, and the resulting pipeline can still be
#   inspected, even with non-normalized entries.
# 
# 
#