#!/usr/bin/env python
# coding: utf-8
"""Train and compare LightGBM, Ridge and XGBoost models on review data.

This is a notebook export.  It reads the training set, builds a sparse
feature matrix from the ``review_stars``, ``review_content`` and
``review_title`` columns, splits it into train and hold-out parts, then fits
each model, scores it with ``score_function`` and plots its ROC curve.

The project helpers are imported from the sibling ``src`` directory, which is
resolved relative to the current working directory, so run this from the
directory that sits next to ``src``.
"""

# In[26]:

import os
import sys

from sklearn.metrics import roc_curve

# ``get_ipython`` only exists inside an IPython/Jupyter session.  Look it up
# defensively so the export can also be executed with a plain ``python``
# interpreter; in that case the autoreload setup is simply skipped.
try:
    _ipython = get_ipython()  # noqa: F821 - injected by IPython
except NameError:
    _ipython = None

if _ipython is not None:
    # Load the "autoreload" extension
    _ipython.run_line_magic('load_ext', 'autoreload')
    # always reload modules marked with "%aimport"
    _ipython.run_line_magic('autoreload', '1')

# add the 'src' directory as one where we can import modules
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

# import my method from the source code
if _ipython is not None:
    # Mark the project modules for autoreload; this must happen after
    # ``src_dir`` is on ``sys.path`` because ``%aimport`` imports them.
    for _module in (
        'data.read_data',
        'models.train_model',
        'features.build_features',
        'visualization.visualize',
    ):
        _ipython.run_line_magic('aimport', _module)

from data.read_data import read_data, get_stopwords
from models.train_model import (
    split_train,
    score_function,
    get_fasttext,
    model_ridge,
    model_xgb,
    model_lightgbm,
)
from features.build_features import (
    get_vec,
    to_categorical,
    replace_na,
    to_tfidf,
    stack_sparse,
    to_sparse_int,
)
from visualization.visualize import plot_roc, plot_scatter


# In[2]:

train = read_data(test=False)
y = train['Target']
stopwords = get_stopwords()


# In[3]:

# Notebook preview of the data; has no visible effect when run as a script.
train.head()


# # Feature engineering

# In[4]:

train = replace_na(train, ['review_content', 'review_title'])


# In[5]:

X_dummies = to_categorical(train, 'review_stars')


# In[6]:

X_content = to_tfidf(train, 'review_content', stopwords)
X_title = to_tfidf(train, 'review_title', stopwords)


# In[9]:

# NOTE(review): the name suggests a review-length feature - confirm against
# features.build_features.to_sparse_int.
X_length = to_sparse_int(train, 'review_content')


# In[10]:

sparse_merge = stack_sparse([X_dummies, X_content, X_title, X_length])


# In[11]:

# NOTE(review): 0.2 is presumably the hold-out fraction - confirm against
# models.train_model.split_train.
X_train, X_test, y_train, y_test = split_train(sparse_merge, y, 0.2)


# # LightGBM

# In[20]:

model_lgb = model_lightgbm(X_train, y_train)
preds = model_lgb.predict_proba(X_test)
# Keep only the second column of the predict_proba output (presumably the
# positive-class probability).
preds1 = preds[:, 1]
score_function(y_test, preds1)


# In[23]:

fpr, tpr, _ = roc_curve(y_test, preds1)
plot_roc(fpr, tpr)


# # Ridge

# In[21]:

model_rdg = model_ridge(X_train, y_train)
# Unlike the other two models, the output of ``predict`` is scored directly
# (no ``predict_proba`` column selection).
preds = model_rdg.predict(X=X_test)
score_function(y_test, preds)


# In[33]:

fpr, tpr, _ = roc_curve(y_test, preds)


# In[34]:

plot_roc(fpr, tpr)


# # Xgboost

# In[27]:

model_xgboost = model_xgb(X_train, y_train)
preds = model_xgboost.predict_proba(X_test)
# Second column of the predict_proba output, as for LightGBM above.
preds1 = preds[:, 1]
score_function(y_test, preds1)


# In[28]:

fpr, tpr, _ = roc_curve(y_test, preds1)
plot_roc(fpr, tpr)