#!/usr/bin/env python
# coding: utf-8

# In[26]:


# Load IPython's "autoreload" extension so that edits to the project's source
# modules are picked up without restarting the kernel.
# NOTE: get_ipython() only exists when this file runs under IPython/Jupyter.
get_ipython().run_line_magic('load_ext', 'autoreload')

# Mode 1: before executing a cell, reload only the modules that were
# registered with "%aimport" (not every imported module).
get_ipython().run_line_magic('autoreload', '1')

import os
import sys
from sklearn.metrics import roc_curve

# Add the 'src' directory that sits next to the current working directory
# (<cwd>/../src) to the module search path so its packages can be imported.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append: notebook cells get re-executed, and an unconditional
# append would add one more duplicate sys.path entry on every run.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Register the project's modules with autoreload ("%aimport") so they are
# reloaded when their source changes, then import the helpers used below.
# NOTE(review): get_fasttext, get_vec and plot_scatter are imported but not
# used in the cells visible here.
get_ipython().run_line_magic('aimport', 'data.read_data')
get_ipython().run_line_magic('aimport', 'models.train_model')
get_ipython().run_line_magic('aimport', 'features.build_features')
get_ipython().run_line_magic('aimport', 'visualization.visualize')
from data.read_data import read_data, get_stopwords
from models.train_model import split_train, score_function, get_fasttext, model_ridge, model_xgb, model_lightgbm
from features.build_features import get_vec, to_categorical, replace_na, to_tfidf, stack_sparse, to_sparse_int
from visualization.visualize import plot_roc, plot_scatter


# In[2]:


# Load the dataset through the project helper (test=False -- presumably the
# labelled training set; confirm in data.read_data).
train = read_data(test=False)
# The 'Target' column is the label every model below is fitted and scored on.
y = train['Target']
# Stopword list passed to the TF-IDF steps further down.
stopwords = get_stopwords()


# In[3]:


# Preview the first rows of the loaded data. The result is only displayed when
# this runs as a notebook cell; as a plain script the value is discarded.
train.head()


# # Feature engineering

# In[4]:


# Handle missing values in the two free-text columns via the project helper
# replace_na (the replacement value is defined there) before vectorising them.
train = replace_na(train, ['review_content', 'review_title'])


# In[5]:


# Encode the 'review_stars' column with the project helper to_categorical
# (presumably dummy/one-hot columns, judging by the variable name -- confirm).
X_dummies = to_categorical(train, 'review_stars')


# In[6]:


# Vectorise the review body and the review title separately with the project
# helper to_tfidf, passing the stopword list loaded earlier.
# NOTE(review): presumably each call fits its own vocabulary -- confirm in
# features.build_features.
X_content = to_tfidf(train, 'review_content', stopwords)
X_title = to_tfidf(train, 'review_title', stopwords)


# In[9]:


# Integer feature derived from 'review_content' by to_sparse_int (presumably
# the text length, judging by the variable name -- confirm in the helper).
X_length = to_sparse_int(train, 'review_content')


# In[10]:


# Combine all feature blocks into a single matrix with stack_sparse.
# NOTE(review): presumably a column-wise stack in the order given, which
# requires every block to have one row per review -- confirm in the helper.
sparse_merge = stack_sparse([X_dummies, X_content, X_title, X_length])


# In[11]:


# Split features and labels into a training part and a held-out part used to
# evaluate every model below.
# NOTE(review): 0.2 is presumably the held-out fraction; check split_train for
# shuffling / random seed / stratification.
X_train, X_test, y_train, y_test = split_train(sparse_merge, y, 0.2)


# # LightGBM

# In[20]:


# Fit the LightGBM model (project helper model_lightgbm) on the training split.
model_lgb = model_lightgbm(X_train, y_train)
# Keep column 1 of predict_proba's output as the score to evaluate
# (presumably the positive-class probability -- confirm the class order).
# NOTE: preds / preds1 are reused by the other model sections, so running
# cells out of order mixes up which model they belong to.
preds = model_lgb.predict_proba(X_test)
preds1 = preds[:, 1]
# Metric is defined by the project's score_function; its value is only shown
# when this runs as a notebook cell.
score_function(y_test, preds1)


# In[23]:


# ROC curve for LightGBM on the held-out split; the thresholds returned by
# roc_curve are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, preds1)
plot_roc(fpr, tpr)


# # Ridge

# In[21]:


# Fit the ridge model (project helper model_ridge) on the training split.
model_rdg = model_ridge(X_train, y_train)
# predict() output is used directly as the ranking score for evaluation.
# NOTE: this overwrites the `preds` produced by the previous model section.
preds = model_rdg.predict(X=X_test)
# Metric is defined by the project's score_function; its value is only shown
# when this runs as a notebook cell.
score_function(y_test, preds)


# In[33]:


# ROC curve for the ridge model on the held-out split; the thresholds returned
# by roc_curve are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, preds)


# In[34]:


plot_roc(fpr, tpr)


# # XGBoost

# In[27]:


# Fit the XGBoost model (project helper model_xgb) on the training split.
model_xgboost = model_xgb(X_train, y_train)
# Keep column 1 of predict_proba's output as the score to evaluate
# (presumably the positive-class probability -- confirm the class order).
# NOTE: this overwrites the preds / preds1 of the earlier model sections.
preds = model_xgboost.predict_proba(X_test)
preds1 = preds[:, 1]
# Metric is defined by the project's score_function; its value is only shown
# when this runs as a notebook cell.
score_function(y_test, preds1)


# In[28]:


# ROC curve for XGBoost on the held-out split; the thresholds returned by
# roc_curve are not needed for the plot.
fpr, tpr, _ = roc_curve(y_test, preds1)
plot_roc(fpr, tpr)