#!/usr/bin/env python
# coding: utf-8

# Notebook export: demo of feature_engine's StringSimilarityEncoder on the
# Titanic dataset, plus a note on compressing the encoded columns with PCA.

# # Imports

# In[1]:

import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.encoding import StringSimilarityEncoder


# # Load and preprocess data

# In[2]:

# Translation table that deletes all ASCII punctuation in one C-level pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _clean_text(column: pd.Series) -> pd.Series:
    """Normalise a free-text column for string-similarity matching.

    Trims surrounding whitespace, strips punctuation, collapses runs of
    whitespace to a single space and lowercases the text.
    """
    return (
        column
        .str.strip()
        .str.translate(_PUNCT_TABLE)
        # BUGFIX: the original called .str.replace(' ', ' '), a no-op
        # (single space replaced by single space). Removing punctuation
        # (e.g. "a - b" -> "a  b") leaves double spaces, so the intent is
        # to collapse any whitespace run down to one space.
        .str.replace(r'\s+', ' ', regex=True)
        .str.lower()
    )


# Helper function for loading and preprocessing data
def load_titanic() -> pd.DataFrame:
    """Download the Titanic dataset from OpenML and clean its text columns.

    Returns
    -------
    pd.DataFrame
        Titanic data with '?' placeholders replaced by NaN and the
        'home.dest', 'name' and 'ticket' columns normalised via
        ``_clean_text`` (the columns the similarity encoder will use).
    """
    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    data = data.replace('?', np.nan)
    # Identical cleaning pipeline for each free-text column (previously
    # copy-pasted three times).
    for col in ('home.dest', 'name', 'ticket'):
        data[col] = _clean_text(data[col])
    return data


# In[3]:

# Load dataset
data = load_titanic()


# In[4]:

# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['survived', 'sex', 'cabin', 'embarked'], axis=1),
    data['survived'],
    test_size=0.3,
    random_state=0,
)


# # StringSimilarityEncoder

# In[5]:

# set up the encoder
encoder = StringSimilarityEncoder(
    top_categories=2,
    variables=['name', 'home.dest', 'ticket'],
)


# In[6]:

# fit the encoder
encoder.fit(X_train)


# In[7]:

# lets see what categories we will be comparing to others
encoder.encoder_dict_


# In[8]:

# transform the data
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)


# In[9]:

# check output
train_t.head(5)


# In[10]:

# check output
test_t.head(5)


# In[11]:

# plot encoded column - ticket
# OHE could produce only 0, but SSE produces values in [0,1] range
fig, ax = plt.subplots(2, 1)
train_t.plot(
    kind='scatter',
    x='ticket_ca 2343',
    y='ticket_ca 2144',
    sharex=True,
    title='Ticket encoding in train',
    ax=ax[0],
)
test_t.plot(
    kind='scatter',
    x='ticket_ca 2343',
    y='ticket_ca 2144',
    sharex=True,
    title='Ticket encoding in test',
    ax=ax[1],
)


# In[12]:

# defining encoder that ignores NaNs
encoder = StringSimilarityEncoder(
    top_categories=2,
    handle_missing='ignore',
    variables=['name', 'home.dest', 'ticket'],
)


# In[13]:

# refiting the encoder
encoder.fit(X_train)


# In[14]:

# lets see what categories we will be comparing to others
# note - no empty strings with handle_missing='ignore'
encoder.encoder_dict_


# In[15]:

# transform the data
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)


# In[16]:

# check output
train_t.head(5)


# In[17]:

# check output
test_t.head(5)


# In[18]:

# plot encoded column - home.dest
fig, ax = plt.subplots(2, 1)
train_t.plot(
    kind='scatter',
    x='home.dest_new york ny',
    y='home.dest_london',
    sharex=True,
    title='Home destination encoding in train',
    ax=ax[0],
)
test_t.plot(
    kind='scatter',
    x='home.dest_new york ny',
    y='home.dest_london',
    sharex=True,
    title='Home destination encoding in test',
    ax=ax[1],
)


# # Note on dimensionality reduction

# In[19]:

# These encoded columns could also be compressed further to reduce dimensions
# since they are not boolean, but real numbers
from sklearn.decomposition import PCA


# In[20]:

# defining encoder for home destination
encoder = StringSimilarityEncoder(
    top_categories=None,
    handle_missing='impute',
    variables=['home.dest'],
)


# In[21]:

# refiting the encoder
encoder.fit(X_train)


# In[22]:

# transform the data
train_t = encoder.transform(X_train)


# In[23]:

# check the shape (should be pretty big)
train_t.shape


# In[24]:

# take home.dest encoded columns
home_encoded = train_t.filter(like='home.dest')


# In[25]:

# defining PCA for compression; n_components=0.9 keeps enough components
# to explain 90% of the variance
pca = PCA(n_components=0.9)


# In[26]:

# train PCA
pca.fit(home_encoded)


# In[27]:

# transform train and test datasets
train_compressed = pca.transform(home_encoded)


# In[28]:

# check compressed shape (should be way smaller)
train_compressed.shape


# In[ ]: