#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# ## Text Regression with Extra Regressors: An Example of Using Custom Data Formats and Models in *ktrain*
#
# This notebook illustrates how one can construct custom data formats and models for use in *ktrain*. In this example, we will build a model that predicts the price of a wine from **both** its textual description and the winery that produced it. The example is inspired by [FloydHub's regression template](https://github.com/floydhub/regression-template) for wine price prediction. However, instead of using the wine variety as the extra regressor, we will use the winery.
#
# Text classification (or text regression) with extra predictors arises in many scenarios. For instance, when predicting the trustworthiness of a news story, one may want to consider both the text of the news article and extra metadata such as the publication and the authors. This notebook shows how such models can be built.
#
# The dataset in CSV format can be obtained from FloydHub at [this URL](https://www.floydhub.com/floydhub/datasets/wine-reviews/1/wine_data.csv). We will begin by importing some necessary modules and reading in the dataset.

# In[2]:


# import some modules and read in the dataset
import pandas as pd
from tensorflow import keras
import numpy as np
import math

path = 'data/wine/wine_data.csv'  # ADD path/to/dataset
data = pd.read_csv(path)
data = data.sample(frac=1., random_state=42)
data.head()


# ## Cleaning the Data
#
# We use the exact same data-cleaning steps employed in [FloydHub's regression example](https://github.com/floydhub/regression-template) for this dataset.

# In[3]:


# Remove rows with null values
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0], axis=1)

variety_threshold = 500  # varieties occurring this many times or fewer will be removed
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(to_remove, np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]
data = data[pd.notnull(data['winery'])]

# Split data into train and test
train_size = int(len(data) * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

# Train features
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

# Train labels
labels_train = data['price'][:train_size]

# Test features
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

# Test labels
labels_test = data['price'][train_size:]

x_train = description_train.values
y_train = labels_train.values
x_test = description_test.values
y_test = labels_test.values

# winery metadata to be used later as the extra regressor
winery_train = data['winery'][:train_size]
winery_test = data['winery'][train_size:]


# ## Building a Vanilla Text Regression Model in *ktrain*
#
# We will preprocess the data and select a `linreg` model for our initial "vanilla" text regression model.
# In[4]:


import ktrain
from ktrain import text


# In[5]:


trn, val, preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                          x_test=x_test, y_test=y_test,
                                          ngram_range=3,
                                          maxlen=200,
                                          max_features=35000)


# In[6]:


text.print_text_regression_models()


# In[7]:


model = text.text_regression_model('linreg', train_data=trn, preproc=preproc)


# ## Adding an Extra Regressor to Our Model
#
# Next, we will add an extra regressor to our model, thereby creating a new, augmented model. We choose the winery, a categorical variable, as the extra regressor. Instead of representing the winery as a typical one-hot-encoded vector, we will learn an embedding for it during training. The embedding module is then concatenated with our `linreg` text regression model to form a new model. The new model expects two distinct inputs: an integer representing the winery, and a sequence of word IDs (the standard input to neural text classifiers and regressors).

# In[8]:


extra_train_data = winery_train
extra_test_data = winery_test

# encode winery as integers
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(data['winery'])
extra_train = encoder.transform(extra_train_data)
extra_test = encoder.transform(extra_test_data)

# embedding size heuristic: half the number of categories, capped at 50
no_of_unique_cat = np.max(extra_train) + 1
embedding_size = min(np.ceil(no_of_unique_cat / 2), 50)
embedding_size = int(embedding_size)
vocab = no_of_unique_cat + 1
print(embedding_size)

extra_train = np.expand_dims(extra_train, -1)
extra_test = np.expand_dims(extra_test, -1)

# winery module: learns an embedding for each winery
extra_input = keras.layers.Input(shape=(1,))
extra_output = keras.layers.Embedding(vocab, embedding_size, input_length=1)(extra_input)
extra_output = keras.layers.Flatten()(extra_output)
extra_model = keras.Model(inputs=extra_input, outputs=extra_output)
extra_model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# Combine winery module with linreg model
merged_out = keras.layers.concatenate([extra_model.output, model.output])
merged_out = keras.layers.Dropout(0.25)(merged_out)
merged_out = keras.layers.Dense(1000, activation='relu')(merged_out)
merged_out = keras.layers.Dropout(0.25)(merged_out)
merged_out = keras.layers.Dense(500, activation='relu')(merged_out)
merged_out = keras.layers.Dropout(0.5)(merged_out)
merged_out = keras.layers.Dense(1)(merged_out)
combined_model = keras.Model([extra_model.input] + [model.input], merged_out)
combined_model.compile(loss='mae', optimizer='adam', metrics=['mae'])


# ## Wrapping our Data in an Instance of `ktrain.Dataset`
#
# To use this custom data format of two inputs in *ktrain*, we will wrap it in a `ktrain.Dataset` instance. There are two ways to do this.
#
# The first is to represent our datasets as `tf.data.Dataset` instances and then wrap each in a `ktrain.TFDataset` instance, which is a wrapper around a `tf.data.Dataset`. Use of `tf.data.Dataset` instances can potentially [yield certain performance improvements](https://www.tensorflow.org/guide/data_performance). See [this example notebook](https://github.com/amaiya/ktrain/blob/master/examples/vision/mnist-tf_workflow.ipynb) for a demonstration of the `ktrain.TFDataset` class.
# For this example, one can make use of `ktrain.TFDataset` instances as follows:
#
# ```python
# import tensorflow as tf
# from ktrain.data import TFDataset
# BATCH_SIZE = 256
#
# trn_combined = [extra_train] + [trn[0]] + [trn[1]]
# val_combined = [extra_test] + [val[0]] + [val[1]]
#
# def features_to_tfdataset(examples):
#
#     def gen():
#         for idx, ex0 in enumerate(examples[0]):
#             ex1 = examples[1][idx]
#             label = examples[2][idx]
#             x = (ex0, ex1)
#             y = label
#             yield (x, y)
#
#     tfdataset = tf.data.Dataset.from_generator(gen,
#                     ((tf.int32, tf.int32), tf.int64),
#                     ((tf.TensorShape([None]), tf.TensorShape([None])), tf.TensorShape([])))
#     return tfdataset
#
# train_tfdataset = features_to_tfdataset(trn_combined)
# val_tfdataset = features_to_tfdataset(val_combined)
# train_tfdataset = train_tfdataset.shuffle(trn_combined[0].shape[0]).batch(BATCH_SIZE).repeat(-1)
# val_tfdataset = val_tfdataset.batch(BATCH_SIZE)
#
# train_data = ktrain.TFDataset(train_tfdataset, n=trn_combined[0].shape[0], y=trn_combined[2])
# val_data = ktrain.TFDataset(val_tfdataset, n=val_combined[0].shape[0], y=val_combined[2])
# learner = ktrain.get_learner(combined_model, train_data=train_data, val_data=val_data)
# ```
#
# The second approach is to wrap our datasets in a subclass of `ktrain.SequenceDataset`. We must be sure to override and implement the required methods (e.g., `nsamples` and `get_y`). The `ktrain.SequenceDataset` class is simply a subclass of `tf.keras.utils.Sequence`. See the TensorFlow documentation on the [Sequence class](https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence) for more information on how Sequence wrappers work.
#
# We employ the second approach in this tutorial. Note that, in the implementation below, we have made `MyCustomDataset` more general than strictly necessary: it can wrap lists containing an arbitrary number of inputs instead of just the two needed in our example.

# In[9]:


class MyCustomDataset(ktrain.SequenceDataset):
    def __init__(self, x, y, batch_size=32, shuffle=True):
        # error checks: x must be a 2D array or a list of 2D arrays; y must be an array
        err = False
        if type(x) == np.ndarray and len(x.shape) != 2:
            err = True
        elif type(x) == list:
            for d in x:
                if type(d) != np.ndarray or len(d.shape) != 2:
                    err = True
                    break
        else:
            err = True
        if err:
            raise ValueError('x must be a 2d numpy array or a list of 2d numpy arrays')
        if type(y) != np.ndarray:
            raise ValueError('y must be a numpy array')
        if type(x) == np.ndarray:
            x = [x]

        # set variables
        super().__init__(batch_size=batch_size)
        self.x, self.y = x, y
        self.indices = np.arange(self.x[0].shape[0])
        self.n_inputs = len(x)
        self.shuffle = shuffle

    # required for instances of tf.keras.utils.Sequence
    def __len__(self):
        return math.ceil(self.x[0].shape[0] / self.batch_size)

    # required for instances of tf.keras.utils.Sequence
    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = []
        for i in range(self.n_inputs):
            batch_x.append(self.x[i][inds])
        batch_y = self.y[inds]
        return tuple(batch_x), batch_y

    # required for instances of ktrain.Dataset
    def nsamples(self):
        return self.x[0].shape[0]

    # required for instances of ktrain.Dataset
    def get_y(self):
        return self.y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)


# Note that you can also add a `to_tfdataset` method to your `ktrain.SequenceDataset` subclass. The `to_tfdataset` method is responsible for converting your dataset to a `tf.data.Dataset` and, if it exists, will be called by *ktrain* just prior to training. We have not done that here.
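#
# For illustration only, a rough sketch of what such a `to_tfdataset` method might look like is shown below. This is an untested sketch that makes a few assumptions: the dataset wraps exactly two integer-valued inputs (as in our winery/word-ID example), the labels are floats, and *ktrain* will accept the batched `tf.data.Dataset` returned here (check the exact contract expected by your version of *ktrain* before relying on this):
#
# ```python
# import tensorflow as tf
#
# # hypothetical method that could be added to MyCustomDataset
# def to_tfdataset(self, train=True):
#     def gen():
#         # yield one ((input1, input2), label) example at a time
#         for i in range(self.x[0].shape[0]):
#             yield ((self.x[0][i], self.x[1][i]), self.y[i])
#
#     tfdataset = tf.data.Dataset.from_generator(gen,
#                     ((tf.int32, tf.int32), tf.float32),
#                     ((tf.TensorShape([None]), tf.TensorShape([None])), tf.TensorShape([])))
#     if train and self.shuffle:
#         tfdataset = tfdataset.shuffle(self.x[0].shape[0])
#     return tfdataset.batch(self.batch_size)
# ```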
#
# ## Using the Custom Model and Data Format
#
# Once we wrap our data in `ktrain.SequenceDataset` instances, we can wrap the model and datasets in a `Learner` object and use *ktrain* normally.

# In[10]:


train_data = MyCustomDataset([extra_train] + [trn[0]], trn[1], shuffle=True)
val_data = MyCustomDataset([extra_test] + [val[0]], val[1], shuffle=False)
learner = ktrain.get_learner(combined_model, train_data=train_data, val_data=val_data, batch_size=256)


# ### Estimate Learning Rate
#
# We'll choose a learning rate where the loss is still falling. As shown in the plot, *1e-3* appears to be a good choice in this case.

# In[11]:


learner.lr_find(show_plot=True, restore_weights_only=True)


# ### Train the Model
#
# We will now train the model for 12 epochs using the learning rate estimated above and the [1cycle learning rate policy](https://arxiv.org/pdf/1803.09820.pdf).

# In[12]:


learner.fit_onecycle(1e-3, 12)


# Our final validation MAE is **7.82**, which means our predictions are, on average, about $8 off the mark. This is not bad considering our model only looks at the textual description of the wine and the winery.
#
# ### Plot Some Training History
#
# The validation loss is still decreasing, which suggests we could train further if desired. The second and third plots show the learning rate and momentum schedules employed by `fit_onecycle`.

# In[13]:


learner.plot('loss')


# In[14]:


learner.plot('lr')


# In[15]:


learner.plot('momentum')


# ### View Top Losses
#
# Let's examine the validation examples we got the most wrong. It looks like our model has trouble with expensive wines.

# In[16]:


learner.view_top_losses(n=3)


# In[17]:


print(x_test[21790])


# In[18]:


print(x_test[13745])


# In[19]:


preds = learner.predict(val_data)


# In[20]:


preds[13745]


# ### Making Predictions
#
# Lastly, we will use our model to make predictions on 5 randomly selected wines in the validation set.

# In[22]:


# 5 random predictions
val_data.batch_size = 1
for i in range(5):
    idx = np.random.choice(len(x_test))
    print("TEXT:\n%s" % (x_test[idx]))
    print()
    print("\tpredicted: %s" % (np.squeeze(learner.predict(val_data[idx]))))
    print("\tactual: %s" % (y_test[idx]))
    print('----------------------------------------')


# Let's look at our most expensive prediction. The highest predicted price (`$404`) is associated with a wine actually priced at `$800`, which is encouraging, but we are still roughly `$400` off. Again, our model has trouble with expensive wines. This is somewhat understandable, since our model only looks at short textual descriptions and the winery, neither of which contains a clear indicator of such an exorbitant price.

# In[43]:


max_pred_id = np.argmax(preds)
print("highest-priced prediction: %s" % (np.squeeze(preds[max_pred_id])))
print("actual price for this wine: %s" % (y_test[max_pred_id]))
print('TEXT:\n%s' % (x_test[max_pred_id]))


# ## Making Predictions on Unseen Examples
#
# In the example above, we made predictions for examples in the validation set. To make predictions for an arbitrary set of wine data, the steps are as follows (a sketch of these steps is shown in the final cell below):
# 1. Encode the winery using the same label encoder used above for the validation data.
# 2. Preprocess the wine description using the `preprocess_test` method. In this example, you will use `preproc.preprocess_test`.
# 3. Combine both into a `ktrain.Dataset` instance, as we did above, and call `learner.predict`.

# In[ ]:
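
# A minimal sketch of the three steps above for new, unseen wines. The descriptions and
# wineries here are made-up illustrations (the winery strings must already be known to the
# LabelEncoder, i.e., present in data['winery']), and we assume preproc.preprocess_test
# returns an (x, y) tuple like trn and val above; verify this against the version of
# ktrain you are using.

new_descriptions = np.array(['Aromas of black cherry and plum with firm tannins and a long finish.',
                             'A crisp, citrusy white with notes of green apple and a mineral edge.'])
new_wineries = ['Ponzi', 'Trimbach']  # hypothetical; must appear in data['winery']

# Step 1: encode the wineries with the same LabelEncoder fit earlier
new_extra = np.expand_dims(encoder.transform(new_wineries), -1)

# Step 2: preprocess the descriptions with the same text preprocessor (dummy labels are discarded)
new_text = preproc.preprocess_test(new_descriptions, np.zeros(len(new_descriptions)))[0]

# Step 3: wrap both inputs in our custom ktrain.Dataset and predict
new_data = MyCustomDataset([new_extra] + [new_text], np.zeros(len(new_descriptions)), shuffle=False)
predicted_prices = np.squeeze(learner.predict(new_data))
print(predicted_prices)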