#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Notebook magics: this script is an export of a Jupyter notebook and must be
# run under IPython, where `get_ipython()` is available.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
# Set before TensorFlow is imported below so that CUDA enumerates devices in
# PCI bus order and only exposes device 0.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


import numpy as np
import random
import tensorflow as tf
import pandas as pd
pd.set_option('display.max_columns', None)

# Seed every random number generator used in this notebook for reproducibility.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably only affects child processes -- confirm if hash ordering matters.
seed_value = 0
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)


# In[2]:


import ktrain
from ktrain import tabular


# # Classification and Regression on Tabular Data in `ktrain`
#
# As of v0.19.x, *ktrain* supports classification and regression on "traditional" tabular datasets. We will cover two examples in this notebook:
# - **Part I: Classification**: predicting which [Titanic passengers survived](https://www.kaggle.com/c/titanic)
# - **Part II: Regression**: predicting the age of people from [census data](http://archive.ics.uci.edu/ml/datasets/Census+Income)
#
# Let's begin with a demonstration of tabular classification using the well-studied Titanic dataset from Kaggle.
#
# ## Part I: Classification for Tabular Data
#
#
# ### Solving the Titanic Kaggle Challenge in `ktrain`
#
# This notebook demonstrates using *ktrain* for predicting which passengers survived the Titanic shipwreck.
#
# The dataset can be [downloaded from Kaggle here](https://www.kaggle.com/c/titanic/overview). There is a `train.csv` with labels (i.e., `Survived`) and a `test.csv` with no labels. We will only use `train.csv` in this notebook.
#
# Let's begin by loading the data as a pandas DataFrame and inspecting it.

# In[3]:


# The first CSV column is used as the DataFrame index.
train_df = pd.read_csv('data/titanic/train.csv', index_col=0)


# In[4]:


train_df.head()


# We'll drop the `Name`, `Ticket`, `Cabin` columns, as they seem like they'll be less predictive. These columns are largely unique or near-unique to passengers.
# In[5]:


# Drop all three columns in one call using the `columns=` keyword. The earlier
# form, `drop('Name', 1)`, passed `axis` positionally, which newer pandas
# releases no longer accept.
train_df = train_df.drop(columns=['Name', 'Ticket', 'Cabin'])


# *ktrain* will automatically split out a validation set if given only a training set. But, let's also manually split out a test set that we can evaluate later.

# In[6]:


# Re-seed so the random train/test mask below is the same on every run.
np.random.seed(42)
p = 0.1  # 10% for test set
prop = 1 - p

df = train_df.copy()
# Boolean mask: rows whose uniform random draw falls below `prop` go to the
# training set; the remaining rows form the held-out test set.
msk = np.random.rand(len(df)) < prop
train_df = df[msk]
test_df = df[~msk]


# In[7]:


train_df.shape


# In[8]:


test_df.shape


# ### STEP 1: Load and Preprocess the Data

# In[9]:


trn, val, preproc = tabular.tabular_from_df(train_df, label_columns=['Survived'], random_state=42)


# ##### Automated Preprocessing
# *ktrain* automatically preprocesses the dataset appropriately. Numerical columns are automatically normalized, missing values are handled, and categorical variables will be vectorized as [entity embeddings](https://arxiv.org/abs/1604.06737) for input to a neural network.
#
# ##### Auto-generated Features
# *ktrain* will auto-generate some new features. For instance, if `Age` is missing for a particular individual, an `Age_na=True` feature will be automatically added.
#
# New date features are also automatically added. This dataset does not have any **date** fields. If it did, we could populate the `date_columns` parameter to `tabular_from_df` in which case they would be used to auto-generate new features (e.g., `Day`, `Week`, `Is_month_start`, `Is_quarter_end`, etc.) using methods adapted from the **fastai** library.
#
# ##### Manually-Engineered Features
#
# In addition to these auto-generated features, one can also optionally add manually-generated, dataset-specific features to `train_df` **prior** to invoking `tabular_from_df`. For instance, the `Cabin` feature we discarded earlier might be used to extract the **deck** associated with each passenger (e.g., **B22** --> **Deck B**).
# ### STEP 2: Create a Model and Wrap in `Learner`
#
# *ktrain* uses multilayer perceptrons as the model for tabular datasets. The model can be configured with arguments to `tabular_classifier` (e.g., number and size of hidden layers, dropout values, etc.), but we will leave the defaults here.

# In[10]:


tabular.print_tabular_classifiers()


# In[11]:


model = tabular.tabular_classifier('mlp', trn)


# In[12]:


learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)


# ### STEP 3: Estimate the Learning Rate
#
# Based on the plot below, we will choose a learning rate of `5e-3`.

# In[13]:


learner.lr_find(show_plot=True, max_epochs=5)


# ### STEP 4: Train the Model

# In[14]:


learner.fit_onecycle(5e-3, 10)


# Since we don't appear to be quite overfitting yet, we could try to train further. But, we will stop here.
#
#
# **Let's evaluate the validation set:**

# In[15]:


learner.evaluate(val, class_names=preproc.get_classes())


# ### Make Predictions
#
# The `Predictor` for tabular datasets accepts input as a dataframe in the same format as the original training dataframe.
#
# We will use `test_df` that we created earlier.

# In[16]:


predictor = ktrain.get_predictor(learner.model, preproc)


# In[17]:


preds = predictor.predict(test_df, return_proba=True)


# In[18]:


preds.shape


# In[19]:


print('test accuracy:')
# Fraction of test rows whose highest-scoring class equals the true label.
# The value is a bare expression, so it is only displayed when run as a
# notebook cell.
(np.argmax(preds, axis=1) == test_df['Survived'].values).sum()/test_df.shape[0]


# **Our final results as a DataFrame:**

# In[20]:


# Re-append `Survived` after the feature columns so that the true label and
# the predicted label sit side by side at the end of the frame.
df = test_df.copy()[[c for c in test_df.columns.values if c != 'Survived']]
df['Survived'] = test_df['Survived']
df['predicted_Survived'] = np.argmax(preds, axis=1)
df.head()


# ### Explaining Predictions
#
# We can use the `explain` method to better understand **why** a prediction was made for a particular example. Consider the passenger in the fourth row above (`PassengerID=35`) that did not survive. Although we classified this passenger correctly here, this row tends to get classified differently across different training runs.
# It is sometimes classified correctly (as in this run), but is also often misclassified.
#
# Let's better understand why.
#
# The `explain` method accepts at minimum the following three inputs:
# 1. **df**: a pandas DataFrame in the same format as the original training DataFrame
# 2. **row_index**: the DataFrame index of the example (here, we choose PassengerID=35)
# 3. **class_id**: the id of the class of interest (we choose the **Survived** class in this case)
#
# One can also replace the `row_index=35` with `row_num=3`, as both denote the fourth row.

# In[21]:


predictor.explain(test_df, row_index=35, class_id=1)


# The plot above is generated using the [shap](https://github.com/slundberg/shap) library. You can install it with either `pip install shap` or, for *conda* users, `conda install -c conda-forge shap`. The features in red are causing our model to increase the prediction for the **Survived** class, while features in blue cause our model to *decrease* the prediction for **Survived** (or *increase* the prediction for **Not_Survived**).
#
# From the plot, we see that the predicted softmax probability for `Survived` is **50%**, which is a comparatively much less confident classification than other classifications. Why is this?
#
# We see that `Sex=male` is an influential feature that is pushing the prediction lower towards **Not_Survived**, as it was women and children given priority when allocating lifeboats on the Titanic.
#
# On the other hand, we also see that this is a First Class passenger (`Pclass=1`) with a higher-than-average `Fare` price of *82.17*. In the cell below, you'll see that the average `Fare` price is only *32*. (Moreover, this passenger embarked from Cherbourg, which has been shown to be correlated with survival.) Such features suggest that this is an upper-class, wealthier passenger and, therefore, more likely to make it onto a lifeboat and survive.
# We know from history that crew members were ordered to close gates that led to the upper decks so the first and second class passengers could be evacuated first. As a result, these "upper class" features are pushing our model to increase the classification to **Survived**.
#
# **Thus, there are two opposing forces at play working against each other in this prediction,** which explains why the prediction probability is comparatively nearer to the border than other examples.
#
#

# In[22]:


train_df['Fare'].mean()


# **NOTE**: We choose `class_id=1` in the example above because the **Survived** class of interest has an index position of 1 in the `class_names` list:

# In[23]:


preproc.get_classes()


# Let us now look at the examples for which we were the most wrong (highest loss).

# In[24]:


# The raw test DataFrame is run through `preproc.preprocess_test` before being
# passed as `val_data`.
learner.view_top_losses(val_data=preproc.preprocess_test(test_df), preproc=preproc, n=3)


# The examples with the highest losses are `row_num={27, 53, 19}`. Why did we get these so wrong? Let's examine `row_num=53`. Note that these IDs shown in the `view_top_losses` output are the raw row numbers, not DataFrame indices (or PassengerIDs). So, we need to use `row_num`, not `row_index` here.
#

# In[25]:


predictor.explain(test_df, row_num=53, class_id=1)


# This is a wealthy First Class (`Pclass=1`) female passenger with a very high `Fare` price of 151.55. As mentioned above, such a passenger had a high chance for survival, which explains our model's high prediction for **Survival**. Yet, she did not survive. Upon further investigation, we can understand why. This particular passenger is **Bess Allison**, a wealthy married 25-year-old mother to two toddlers. When the collision occurred, she and her husband could not locate their nanny (Alice Cleaver) and son (Trevor). So, Bess, her husband, and her 3-year-old daughter Loraine stayed behind to wait for them instead of evacuating with other First and Second Class passengers with children.
# They were last seen standing together smiling on the promenade deck. All three died with her daughter Loraine being the only child in 1st class and 2nd class who died on the Titanic. Their son and nanny successfully evacuated and survived.
#
# REFERENCE: [https://rt.cto.mil/stpe/](https://rt.cto.mil/stpe/)

# ### Saving and Reloading the Tabular Predictor
#
# It is easy to save and reload the predictor for deployment scenarios.

# In[26]:


predictor.save('/tmp/titanic_predictor')


# In[27]:


reloaded_predictor = ktrain.load_predictor('/tmp/titanic_predictor/')


# In[28]:


# Predictions for the first five test rows from the reloaded predictor.
reloaded_predictor.predict(test_df)[:5]


# ### Evaluating Test Sets Automatically
#
# When we evaluated the test set above, we did so manually. To evaluate a test set automatically,
# one can invoke the `learner.evaluate` method and supply a preprocessed test set as an argument:

# In[29]:


learner.evaluate(preproc.preprocess_test(test_df), class_names=preproc.get_classes())


# The `learner.evaluate` method is simply an alias to `learner.validate`, which can also accept a dataset as an argument. If no argument is supplied, metrics will be computed for `learner.val_data`, which was supplied to `get_learner` above.

# ## Part II: Regression for Tabular Data
#
# We will briefly demonstrate tabular regression in *ktrain* by simply predicting the `age` attribute in the Census dataset available from the UCI Machine Learning repository. This is the same example used in the [AutoGluon regression example](https://autogluon.mxnet.io/tutorials/tabular_prediction/tabular-quickstart.html#regression-predicting-numeric-table-columns). Let's begin by downloading the dataset from the AutoGluon website.
# In[30]:


import urllib.request
# Download the training CSV to /tmp, then list it via a shell command.
urllib.request.urlretrieve('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv',
                           '/tmp/train.csv')
get_ipython().system('ls /tmp/train.csv')


# ### STEP 1: Load and Preprocess Data
#
# Make sure you specify `is_regression=True` here as we are predicting a numerical dependent variable (i.e., `age`).

# In[31]:


trn, val, preproc = tabular.tabular_from_csv('/tmp/train.csv', label_columns='age',
                                             is_regression=True, random_state=42)


# We used `tabular_from_csv` to load the dataset, but let's also quickly load as DataFrame to see it:

# In[32]:


pd.read_csv('/tmp/train.csv').head()


# ### STEP 2: Create a Model and Wrap in `Learner`
#
# We'll use `tabular_regression_model` to create a regression model.

# In[33]:


tabular.print_tabular_regression_models()


# In[34]:


model = tabular.tabular_regression_model('mlp', trn)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)


# ### STEP 3: Estimate Learning Rate

# In[35]:


learner.lr_find(show_plot=True)


# ### STEP 4: Train the Model
#
# According to our final validation MAE (see below), our age predictions are only off about **~7 years**.

# In[36]:


learner.autofit(1e-3)


# In[37]:


learner.validate()


# See the [House Price Prediction notebook](https://github.com/amaiya/ktrain/blob/master/examples/tabular/HousePricePrediction-MLP.ipynb) for another regression example.

# In[ ]: