#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
# Order GPUs by PCI bus ID and expose only device "0" to CUDA.
# These are set before ktrain is imported below.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import ktrain
from ktrain import text


# # Predicting Wine Prices from Textual Descriptions
#
# This notebook shows an example of **text regression** in *ktrain*. Given a textual description of a wine, we will attempt to predict its price. The data is available from FloydHub [here](https://www.floydhub.com/floydhub/datasets/wine-reviews/1/wine_data.csv).
#
# ## Clean and Prepare the Data
#
# We will simply perform the same data preparation as performed by the [original FloydHub example notebook](https://github.com/floydhub/regression-template) that inspired this example.

# In[3]:


import pandas as pd
import numpy as np
path = 'data/wine/wine_data.csv'  # ADD path/to/dataset
data = pd.read_csv(path)
# Shuffle all rows; the fixed seed keeps the train/test split below reproducible.
data = data.sample(frac=1., random_state=0)
data.head()


# In[4]:


# this code was adapted from FloydHub's regression template for
# wine price prediction: https://github.com/floydhub/regression-template

# Clean it from null values
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
# Drop the first column of the CSV.
data = data.drop(data.columns[0], axis=1)

variety_threshold = 500  # Varieties occurring this many times or fewer are removed.
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
# Filter on the 'variety' column only. A frame-wide replace() would also blank
# cells in unrelated columns whose text happens to equal a rare variety name.
data = data[~data['variety'].isin(to_remove)]
data = data[pd.notnull(data['variety'])]

# Split data into train and test
train_size = int(len(data) * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

# The index is shuffled, so slice by position explicitly with .iloc.

# Train features
description_train = data['description'].iloc[:train_size]
variety_train = data['variety'].iloc[:train_size]

# Train labels
labels_train = data['price'].iloc[:train_size]

# Test features
description_test = data['description'].iloc[train_size:]
variety_test = data['variety'].iloc[train_size:]

# Test labels
labels_test = data['price'].iloc[train_size:]

x_train = description_train.values
y_train = labels_train.values
x_test = description_test.values
y_test = labels_test.values


# ## STEP 1: Preprocess the Data

# In[5]:


trn, val, preproc = text.texts_from_array(x_train=x_train, y_train=y_train,
                                          x_test=x_test, y_test=y_test,
                                          ngram_range=3,
                                          maxlen=200,
                                          max_features=35000)


# ## STEP 2: Create a Text Regression Model and Wrap in Learner

# In[6]:


text.print_text_regression_models()


# In[7]:


model = text.text_regression_model('linreg', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=256)


# Lower the `batch size` above if you run out of GPU memory.

# ## STEP 3: Estimate the LR

# In[8]:


learner.lr_find()


# In[9]:


learner.lr_plot()


# ## STEP 4: Train and Inspect the Model

# In[10]:


learner.fit_onecycle(0.03, 10)


# Our MAE is roughly 10, which means our model's predictions are about $10 off on average. This isn't bad considering there is a wide range of wine prices and predictions are being made purely from text descriptions.
#
# Let's examine the wines we got the most wrong.

# In[11]:


learner.view_top_losses(n=3, preproc=preproc)


# It looks like our model has trouble with expensive wines, which is understandable given the descriptions of them, which may not differ much from less expensive wines.

# ## STEP 5: Making Predictions

# In[12]:


predictor = ktrain.get_predictor(learner.model, preproc)


# Let's make a prediction for a random wine in the validation set.

# In[13]:


# Unseeded on purpose: a different wine is picked on every run.
idx = np.random.randint(len(x_test))
print('Description: %s' % (x_test[idx]))
print('Actual Price: %s' % (y_test[idx]))


# Our prediction for this wine:

# In[14]:


predictor.predict(x_test[idx])


# ## Using the Transformer API for Text Regression
#
# *ktrain* includes a simplified interface to the Hugging Face transformers library. This interface can also be used for text regression. Here is a short example of training a [DistilBERT model](https://arxiv.org/abs/1910.01108) for a single epoch to predict wine prices.

# In[9]:


MODEL_NAME = 'distilbert-base-uncased'
t = text.Transformer(MODEL_NAME, maxlen=75)
# NOTE: trn, val, model and learner are rebound here, replacing the 'linreg' objects above.
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)
model = t.get_regression_model()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=128)
learner.fit_onecycle(1e-4, 1)


# ### For the Prediction part as discussed in the [issue](https://github.com/amaiya/ktrain/issues/417) ###

# In[ ]:


p = ktrain.get_predictor(learner.model, t)
p.predict(['This is first document.', 'This is second document.', 'This is third document.'])