#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os

# IPython magics: (re)load the autoreload extension in mode 2 and render
# matplotlib figures inline in the notebook.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

# Set the CUDA environment variables before any GPU library is imported:
# device ordering follows the PCI bus ID and only device "0" is made visible.
os.environ.update(CUDA_DEVICE_ORDER="PCI_BUS_ID", CUDA_VISIBLE_DEVICES="0")


# In[2]:


import ktrain
from ktrain import text


# # Predicting Wine Prices from Textual Descriptions
# 
# This notebook shows an example of **text regression** in *ktrain*.  Given a textual description of a wine, we will attempt to predict its price.  The data is available from FloydHub [here](https://www.floydhub.com/floydhub/datasets/wine-reviews/1/wine_data.csv).
# 
# ## Clean and Prepare the Data
# 
# We will simply perform the same data preparation as the [original FloydHub example notebook](https://github.com/floydhub/regression-template) that inspired this example.

# In[3]:


import numpy as np
import pandas as pd

path = 'data/wine/wine_data.csv'  # ADD path/to/dataset

# Read the raw CSV, then shuffle every row (frac=1.) with a fixed seed so the
# positional train/test split performed later is reproducible.
data = pd.read_csv(path).sample(frac=1., random_state=0)

# Preview the first rows of the shuffled frame.
data.head()


# In[4]:


# Data preparation adapted from FloydHub's regression template for
# wine price prediction: https://github.com/floydhub/regression-template

# Keep only rows that have both a country and a price.
data = data.dropna(subset=['country', 'price'])

# Drop the first column by position.
# NOTE(review): presumably the index column saved into the CSV -- confirm.
# This step is not idempotent: re-running this cell without re-running the
# load cell drops one more column each time.
data = data.drop(data.columns[0], axis=1)

# Varieties with this many reviews or fewer are removed (the comparison below is <=).
variety_threshold = 500
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index

# Filter on the 'variety' column only.  A frame-wide replace() of the rare
# variety names with NaN would also blank matching values in unrelated columns
# and has to scan every column; a boolean mask does neither and does not
# mutate the frame in place.  Rows with a missing variety are dropped as well.
data = data[data['variety'].notnull() & ~data['variety'].isin(to_remove)]

# Split data into train and test: the first 80% of the (already shuffled)
# rows are used for training, the remainder for testing.
train_size = int(len(data) * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

# Train features
description_train = data['description'].iloc[:train_size]
variety_train = data['variety'].iloc[:train_size]

# Train labels
labels_train = data['price'].iloc[:train_size]

# Test features
description_test = data['description'].iloc[train_size:]
variety_test = data['variety'].iloc[train_size:]

# Test labels
labels_test = data['price'].iloc[train_size:]

# Plain NumPy arrays of description texts (x) and prices (y) for ktrain.
x_train = description_train.values
y_train = labels_train.values
x_test = description_test.values
y_test = labels_test.values


# ## STEP 1: Preprocess the Data

# In[5]:


# Turn the raw description strings and prices into ktrain training and
# validation datasets.  `preproc` is kept because it is needed later when
# building the model, inspecting losses and creating the predictor.
trn, val, preproc = text.texts_from_array(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    ngram_range=3,
    maxlen=200,
    max_features=35000,
)


# ## STEP 2: Create a Text Regression Model and Wrap in Learner

# In[6]:


# Print the text-regression models ktrain offers; the next cell selects 'linreg'.
text.print_text_regression_models()


# In[7]:


# Build the 'linreg' text-regression model for the preprocessed training data.
model = text.text_regression_model(
    'linreg',
    train_data=trn,
    preproc=preproc,
)

# Wrap the model together with the training and validation sets in a Learner.
learner = ktrain.get_learner(
    model,
    train_data=trn,
    val_data=val,
    batch_size=256,
)


# Lower the `batch_size` above if you run out of GPU memory.

# ## STEP 3: Estimate the LR

# In[8]:


# Run the learner's learning-rate finder (STEP 3: estimate the LR); the
# result is plotted in the next cell.
learner.lr_find()


# In[9]:


# Plot the outcome of the learning-rate search run in the previous cell.
learner.lr_plot()


# ## STEP 4: Train and Inspect the Model

# In[10]:


# Train with the one-cycle policy.
# NOTE(review): arguments are presumably (learning rate, epochs), i.e. a rate
# of 0.03 for 10 epochs, with 0.03 read off the LR plot above -- confirm.
learner.fit_onecycle(0.03, 10)


# Our MAE is roughly 10, which means our model's predictions are about $10 off on average.  This isn't bad considering there is a wide range of wine prices and predictions are being made purely from text descriptions. 
# 
# Let's examine the wines we got the most wrong.

# In[11]:


# Show the n=3 examples the model got most wrong (highest loss).
learner.view_top_losses(n=3, preproc=preproc)


# It looks like our model has trouble with expensive wines, which is understandable because their descriptions may not differ much from those of less expensive wines.

# ## STEP 5: Making Predictions

# In[12]:


# Bundle the trained model with its preprocessor into a predictor that is
# called on raw description text in the cells below.
predictor = ktrain.get_predictor(learner.model, preproc)


# Let's make a prediction for a random wine in the validation set.

# In[13]:


# Pick one example from the held-out set to inspect.  A fixed-seed generator
# is used instead of the global, unseeded NumPy RNG so that the same wine is
# shown here (and predicted in the next cell) on every Restart & Run All.
idx = int(np.random.default_rng(0).integers(len(x_test)))
print('Description: %s' % (x_test[idx]))
print('Actual Price: %s' % (y_test[idx]))


# Our prediction for this wine:

# In[14]:


# Predict the price for the wine description selected in the previous cell.
predictor.predict(x_test[idx])


# ## Using the Transformer API for Text Regression
# 
# *ktrain* includes a simplified interface to the Hugging Face transformers library.  This interface can also be used for text regression. Here is a short example of training a [DistilBERT model](https://arxiv.org/abs/1910.01108) for a single epoch to predict wine prices.

# In[9]:


# NOTE: this cell rebinds `trn`, `val`, `model` and `learner`, replacing the
# 'linreg' objects created earlier in the notebook.
MODEL_NAME = 'distilbert-base-uncased'

# Preprocessor for the chosen Hugging Face model, configured with maxlen=75.
t = text.Transformer(MODEL_NAME, maxlen=75)

# Preprocess the same raw texts and prices used above for this model.
trn = t.preprocess_train(x_train, y_train)
val = t.preprocess_test(x_test, y_test)

# Build the regression model and wrap it in a Learner.
model = t.get_regression_model()
learner = ktrain.get_learner(
    model,
    train_data=trn,
    val_data=val,
    batch_size=128,
)

# Train for a single epoch with the one-cycle policy.
learner.fit_onecycle(1e-4, 1)


# ### Making predictions with the Transformer model, as discussed in this [issue](https://github.com/amaiya/ktrain/issues/417)

# In[ ]:


# Create a predictor from the Transformer-based model; the Transformer
# preprocessor `t` is passed as the preprocessing object.
p = ktrain.get_predictor(learner.model, t)

# Predict on a small batch of raw documents passed as a list.
p.predict([
    'This is first document.',
    'This is second document.',
    'This is third document.',
])