#!/usr/bin/env python
# coding: utf-8

# Notebook export: demo of feature_engine's StringSimilarityEncoder on the
# Titanic dataset, plus a note on compressing the encoded columns with PCA.

# # Imports

# In[1]:

import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from feature_engine.encoding import StringSimilarityEncoder


# # Load and preprocess data

# In[2]:

# Translation table that deletes all ASCII punctuation in one C-level pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _clean_text(column: pd.Series) -> pd.Series:
    """Normalise a free-text column for string-similarity matching.

    Trims surrounding whitespace, strips punctuation, collapses runs of
    whitespace to a single space and lowercases the text.
    """
    return (
        column
        .str.strip()
        .str.translate(_PUNCT_TABLE)
        # BUGFIX: the original called .str.replace(' ', ' '), a no-op
        # (single space replaced by single space). Removing punctuation
        # (e.g. "a - b" -> "a  b") leaves double spaces, so the intent is
        # to collapse any whitespace run down to one space.
        .str.replace(r'\s+', ' ', regex=True)
        .str.lower()
    )


# Helper function for loading and preprocessing data
def load_titanic() -> pd.DataFrame:
    """Download the Titanic dataset from OpenML and clean its text columns.

    Returns
    -------
    pd.DataFrame
        Titanic data with '?' placeholders replaced by NaN and the
        'home.dest', 'name' and 'ticket' columns normalised via
        ``_clean_text`` (the columns the similarity encoder will use).
    """
    data = pd.read_csv('https://www.openml.org/data/get_csv/16826755/phpMYEkMl')
    data = data.replace('?', np.nan)
    # Identical cleaning pipeline for each free-text column (previously
    # copy-pasted three times).
    for col in ('home.dest', 'name', 'ticket'):
        data[col] = _clean_text(data[col])
    return data


# In[3]:

# Load dataset
data = load_titanic()


# In[4]:

# Separate into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['survived', 'sex', 'cabin', 'embarked'], axis=1),
    data['survived'],
    test_size=0.3,
    random_state=0,
)


# # StringSimilarityEncoder

# In[5]:

# set up the encoder
encoder = StringSimilarityEncoder(
    top_categories=2,
    variables=['name', 'home.dest', 'ticket'],
)


# In[6]:

# fit the encoder
encoder.fit(X_train)


# In[7]:

# lets see what categories we will be comparing to others
encoder.encoder_dict_


# In[8]:

# transform the data
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)


# In[9]:

# check output
train_t.head(5)


# In[10]:

# check output
test_t.head(5)


# In[11]:

# plot encoded column - ticket
# OHE could produce only 0, but SSE produces values in [0,1] range
fig, ax = plt.subplots(2, 1)
train_t.plot(
    kind='scatter',
    x='ticket_ca 2343',
    y='ticket_ca 2144',
    sharex=True,
    title='Ticket encoding in train',
    ax=ax[0],
)
test_t.plot(
    kind='scatter',
    x='ticket_ca 2343',
    y='ticket_ca 2144',
    sharex=True,
    title='Ticket encoding in test',
    ax=ax[1],
)


# In[12]:

# defining encoder that ignores NaNs
encoder = StringSimilarityEncoder(
    top_categories=2,
    handle_missing='ignore',
    variables=['name', 'home.dest', 'ticket'],
)


# In[13]:

# refiting the encoder
encoder.fit(X_train)


# In[14]:

# lets see what categories we will be comparing to others
# note - no empty strings with handle_missing='ignore'
encoder.encoder_dict_


# In[15]:

# transform the data
train_t = encoder.transform(X_train)
test_t = encoder.transform(X_test)


# In[16]:

# check output
train_t.head(5)


# In[17]:

# check output
test_t.head(5)


# In[18]:

# plot encoded column - home.dest
fig, ax = plt.subplots(2, 1)
train_t.plot(
    kind='scatter',
    x='home.dest_new york ny',
    y='home.dest_london',
    sharex=True,
    title='Home destination encoding in train',
    ax=ax[0],
)
test_t.plot(
    kind='scatter',
    x='home.dest_new york ny',
    y='home.dest_london',
    sharex=True,
    title='Home destination encoding in test',
    ax=ax[1],
)


# # Note on dimensionality reduction

# In[19]:

# These encoded columns could also be compressed further to reduce dimensions
# since they are not boolean, but real numbers
from sklearn.decomposition import PCA


# In[20]:

# defining encoder for home destination
encoder = StringSimilarityEncoder(
    top_categories=None,
    handle_missing='impute',
    variables=['home.dest'],
)


# In[21]:

# refiting the encoder
encoder.fit(X_train)


# In[22]:

# transform the data
train_t = encoder.transform(X_train)


# In[23]:

# check the shape (should be pretty big)
train_t.shape


# In[24]:

# take home.dest encoded columns
home_encoded = train_t.filter(like='home.dest')


# In[25]:

# defining PCA for compression; n_components=0.9 keeps enough components
# to explain 90% of the variance
pca = PCA(n_components=0.9)


# In[26]:

# train PCA
pca.fit(home_encoded)


# In[27]:

# transform train and test datasets
train_compressed = pca.transform(home_encoded)


# In[28]:

# check compressed shape (should be way smaller)
train_compressed.shape


# In[ ]: