#!/usr/bin/env python
# coding: utf-8

# Catch Joe
# =========

# In[2]:


# Python libs
import json
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from dython.nominal import cramers_v, theils_u, correlation_ratio
from scipy.stats import randint

# Date/time/timezone
import datetime as dt
import pytz
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder

# scikit-learn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Module settings
mpl.rc("figure", facecolor="white", dpi=144)
pd.set_option('expand_frame_repr', False)  # display dataframe without wrapping


# # Data Preparation

# ### Check data structure
#
# - Two data files are given: the training data "dataset.json" and the test data "verify.json".
# - The training data file is a big JSON file, about 77 MB, so we'll first peek into the file and check its structure.

# In[3]:


get_ipython().system('head -n 50 dataset.json')


# In[4]:


get_ipython().system('head -n 50 test.json')


# - It looks like the training data has 9 fields, and the sites field is a nested list containing all the sites the user visited in the session.
# - To turn the nested JSON data into useful features, we'll vectorize the sites column.
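# - For illustration, here is a minimal sketch of that idea on a single made-up session record
#   (the field values below are hypothetical, not taken from the real dataset): the nested
#   sites list can be flattened into a space-joined "corpus" string for count-based
#   vectorization, or into per-site visit lengths for length-based vectorization.

# In[ ]:


example_session = {
    'user_id': 0,
    'browser': 'Chrome',
    'os': 'Windows 10',
    'locale': 'ru-RU',
    'gender': 'm',
    'location': 'Russia/Moscow',
    'date': '2017-01-01',
    'time': '09:00:00',
    'sites': [{'site': 'mail.google.com', 'length': 120},
              {'site': 'youtube.com', 'length': 35}],
}

# Space-joined site names -> input for a count-based vectorizer (e.g. TfidfVectorizer)
sites_corpus_example = ' '.join(entry['site'] for entry in example_session['sites'])

# Per-site visit lengths -> input for a length-based vectorizer
site_lengths_example = {entry['site']: entry['length'] for entry in example_session['sites']}

print(sites_corpus_example)
print(site_lengths_example)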
# ### Load data

# In[5]:


with open('dataset.json', 'r') as f:
    data_json_struct = json.loads(f.read())

user_sessions = pd.DataFrame(data_json_struct)
print(data_json_struct[0])


# ### Data Inspection

# In[6]:


user_sessions.head(20)
print('\n')
user_sessions.info()


# In[7]:


user_sessions.query("sites.str.len() == 0")


# ### Prepare Data: Impute Empty Sites and Add Custom Features

# #### Impute empty sites

# In[8]:


empty_sites_index = user_sessions.query("sites.str.len() == 0").index
user_sessions.loc[empty_sites_index, 'sites'] = user_sessions.loc[empty_sites_index]['sites'].apply(
    lambda sites: sites + [{'site': 'NONE.NONE', 'length': 0}])
user_sessions.loc[empty_sites_index]


# #### Combine date/time columns and convert from string to datetime type

# In[9]:


user_sessions['start_dt'] = pd.to_datetime(user_sessions['date'] + ' ' + user_sessions['time'], utc=True)
user_sessions.drop(['time', 'date'], axis=1, inplace=True)
user_sessions


# #### Convert start time to local time
#
# - A class that converts a city name to its timezone

# In[10]:


class TimezoneByCity:
    def __init__(self):
        self.geolocator = Nominatim(user_agent="geoapiExercises")
        self.tzfinder = TimezoneFinder()

    def tz_name(self, city: str):
        loc = self.geolocator.geocode(city)
        tz_name = self.tzfinder.timezone_at(lng=loc.longitude, lat=loc.latitude)
        return tz_name

    def tz(self, city: str):
        tz_name = self.tz_name(city)
        return pytz.timezone(tz_name)


# - Build a timezone table that maps country/city to timezone

# In[11]:


tz_by_city = TimezoneByCity()
timezone_tbl = {loc: tz_by_city.tz_name(loc.split('/')[1]) for loc in user_sessions.location.unique()}
print(timezone_tbl)


# - Add a local_time column to the data

# In[12]:


user_sessions['local_time'] = user_sessions.apply(
    lambda row: row['start_dt'].tz_convert(timezone_tbl[row['location']]).tz_localize(None), axis=1)
user_sessions


# #### Split start date and time into year / month / day / weekday and start_hour

# In[13]:


user_sessions["year"] = user_sessions.local_time.dt.year
user_sessions["month"] = user_sessions.local_time.dt.month
user_sessions["day"] = user_sessions.local_time.dt.day
user_sessions["weekday"] = user_sessions.local_time.dt.weekday
user_sessions["start_hour"] = user_sessions.local_time.dt.hour
user_sessions


# #### Sine/Cosine transform of local start time

# In[14]:


start_dt_normalized = (user_sessions['local_time'] - user_sessions['local_time'].dt.normalize()) / pd.Timedelta('1 second') / 86400
user_sessions['start_sin'] = np.sin(2 * np.pi * start_dt_normalized)
user_sessions['start_cos'] = np.cos(2 * np.pi * start_dt_normalized)
user_sessions


# #### Split location into country and city

# In[15]:


user_sessions[['country', 'city']] = user_sessions['location'].str.split('/', expand=True)
user_sessions


# #### Get total length of each user session

# In[16]:


user_sessions['length_session'] = user_sessions['sites'].apply(
    lambda session_sites: sum(site_entry['length'] for site_entry in session_sites))
user_sessions


# ### Vectorize top sites using TF-IDF

# In[17]:


n_top = 100
joe_cnt = Counter()
for sites_session in user_sessions.query('user_id == 0')['sites']:
    for site_entry in sites_session:
        joe_cnt.update({site_entry['site']: site_entry['length']})

joe_top_sites, _ = zip(*joe_cnt.most_common(n_top))
print("Total sites joe visited: ", len(joe_cnt))
print(f"Top {n_top} sites joe visited: \n", joe_top_sites[:100])


# In[18]:


n_top = 100
all_cnt = Counter()
for sites_session in user_sessions['sites']:
    for site_entry in sites_session:
        all_cnt.update({site_entry['site']: site_entry['length']})

all_top_sites, _ = zip(*all_cnt.most_common(n_top))
print("Total sites all users visited: ", len(all_cnt))
print(f"Top {n_top} sites all users visited: \n", all_top_sites[:100])


# In[20]:


def get_topsites_length(session_sites: list, top_sites=all_top_sites):
    topsites_len_dict = dict.fromkeys(top_sites, 0)
    for site_entry in session_sites:
        site = site_entry['site']
        if site in topsites_len_dict:
            topsites_len_dict[site] += site_entry['length']
    return list(topsites_len_dict.values())

topsites_length = user_sessions['sites'].apply(get_topsites_length)
topsites_length


# In[21]:


tfidf = TfidfTransformer()
topsites_tfidf = tfidf.fit_transform(topsites_length.values.tolist())
topsites_tfidf.toarray()[:2]


# ### Add binary class label: Joe=0, Other users=1

# In[22]:


user_sessions['target'] = (user_sessions['user_id'] != 0).astype(int)
user_sessions


# # Visual inspection

# ### Histogram / Count plot

# In[1]:


def set_xlabel_rotation(ax, deg=90):
    for label in ax.get_xticklabels():
        label.set_rotation(deg)


# In[23]:


fig, ax = plt.subplots(1, 2, figsize=(16, 4))
p = sns.histplot(user_sessions[['user_id']], ax=ax.flatten()[0], discrete=True)
p = sns.histplot(user_sessions[['length_session']], ax=ax.flatten()[1], bins=200)

fig, ax = plt.subplots(1, 3, figsize=(21, 4))
p = sns.countplot(data=user_sessions, x='browser', ax=ax.flatten()[0])
p = sns.countplot(data=user_sessions, x='os', ax=ax.flatten()[1])
p = sns.countplot(data=user_sessions, x='locale', ax=ax.flatten()[2])
set_xlabel_rotation(ax.flatten()[2], 90)

fig, ax = plt.subplots(1, 3, figsize=(21, 4))
p = sns.countplot(data=user_sessions, x='gender', ax=ax.flatten()[0])
p = sns.countplot(data=user_sessions, x='city', ax=ax.flatten()[1])
set_xlabel_rotation(ax.flatten()[1], 90)
p = sns.countplot(data=user_sessions, x='country', ax=ax.flatten()[2])
set_xlabel_rotation(ax.flatten()[2], 90)


# In[1]:


fig, ax = plt.subplots(1, 3, figsize=(16, 4))
p = sns.countplot(data=user_sessions, x='year', ax=ax.flatten()[0])
p = sns.countplot(data=user_sessions, x='month', ax=ax.flatten()[1])
p = sns.countplot(data=user_sessions, x='day', ax=ax.flatten()[2])
set_xlabel_rotation(ax.flatten()[2], 90)

fig, ax = plt.subplots(1, 2, figsize=(16, 4))
p = sns.countplot(data=user_sessions, x='weekday', ax=ax.flatten()[0])
p = sns.countplot(data=user_sessions, x='start_hour', ax=ax.flatten()[1])


# ### Joe's Characteristics

# In[1]:


fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for i, feat in enumerate(['browser', 'os', 'locale']):
    sub_ax = ax.flatten()[i]
    p = sns.boxplot(x=feat, y='length_session', hue='target', data=user_sessions, palette='rainbow', ax=sub_ax)
set_xlabel_rotation(ax.flatten()[2], 90)


# In[1]:


fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for i, feat in enumerate(['gender', 'city', 'country']):
    sub_ax = ax.flatten()[i]
    p = sns.boxplot(x=feat, y='length_session', hue='target', data=user_sessions, palette='rainbow', ax=sub_ax)
    if feat != 'gender':
        set_xlabel_rotation(sub_ax, 90)


# In[1]:


fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for i, feat in enumerate(['year', 'month', 'day']):
    sub_ax = ax.flatten()[i]
    p = sns.boxplot(x=feat, y='length_session', hue='target', data=user_sessions, palette='rainbow', ax=sub_ax)
    if feat == 'day':
        set_xlabel_rotation(sub_ax, 90)


# In[1]:


fig, ax = plt.subplots(1, 2, figsize=(16, 4))
for i, feat in enumerate(['weekday', 'start_hour']):
    sub_ax = ax.flatten()[i]
    p = sns.boxplot(x=feat, y='length_session', hue='target', data=user_sessions, palette='rainbow', ax=sub_ax)


# # Features / target correlation

# ### Categorical features vs. target correlation with contingency analysis / Cramer's V

# #### Cramer's V

# In[24]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city',
            'year', 'month', 'day', 'weekday', 'start_hour']
cat_feat_target_crv = pd.Series(
    [cramers_v(user_sessions[cat_feat], user_sessions['user_id']) for cat_feat in cat_cols],
    index=cat_cols, name='CramersV').sort_values(ascending=False)
cat_feat_target_crv


# #### Theil's U

# In[25]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city',
            'year', 'month', 'day', 'weekday', 'start_hour']
cat_feat_target_thu = pd.Series(
    [theils_u(user_sessions[feat], user_sessions['user_id']) for feat in cat_cols],
    index=cat_cols, name='TheilsU').sort_values(ascending=False)
cat_feat_target_thu


# #### Plot

# In[1]:


cat_target_corr = pd.DataFrame({'Correlation': cat_feat_target_crv})
cat_target_corr['Stats'] = "Cramer's V"
df = pd.DataFrame({'Correlation': cat_feat_target_thu})
df['Stats'] = "Theil's U"
# pd.concat replaces the deprecated DataFrame.append
cat_target_corr = pd.concat([cat_target_corr, df]).reset_index().rename(columns={'index': 'Features'})

fig, ax = plt.subplots(1, 1, figsize=(8.5, 4))
p = sns.barplot(data=cat_target_corr, x='Features', y='Correlation', hue='Stats',
                palette=sns.color_palette('rainbow', 3))
plt.title('Correlation between Categorical Features and Target user_id');


# ### Numerical features vs. target correlation with correlation ratio (η)

# In[26]:


num_cols = ['length_session', 'start_sin', 'start_cos']
num_target_corr = pd.Series(
    [correlation_ratio(user_sessions['user_id'], user_sessions[feat]) for feat in num_cols],
    index=num_cols, name='CorrRatio').sort_values(ascending=False)
num_target_corr


# In[1]:


fig, ax = plt.subplots(1, 1, figsize=(9, 3))
p = sns.barplot(x=num_target_corr.index, y=num_target_corr.values, palette=sns.color_palette('rainbow', 3))
plt.title('Correlation between Session Lengths, Start Time Sin/Cos with Target user_id');


# In[1]:


fig, ax = plt.subplots(1, 2, figsize=(24, 4))
for i, col in enumerate(['length_session', 'start_sin']):
    sub_ax = ax.flatten()[i]
    p = sns.scatterplot(data=user_sessions, x='user_id', y=col, ax=sub_ax)
    t = sub_ax.set_title(f'{col} vs. user_id')
# # Data pipeline for modeling

# ### Definitions summarized from exploratory analysis above

# In[27]:


def load_data(data_file_path: Path) -> pd.DataFrame:
    """Load data from json file."""
    with open(data_file_path, "r") as f:
        data_json_struct = json.loads(f.read())
    user_sessions = pd.DataFrame(data_json_struct)
    return user_sessions


class TimezoneByCity:
    def __init__(self):
        self.geolocator = Nominatim(user_agent="geoapiExercises")
        self.tzfinder = TimezoneFinder()

    def tz_name(self, city: str):
        loc = self.geolocator.geocode(city)
        tz_name = self.tzfinder.timezone_at(lng=loc.longitude, lat=loc.latitude)
        return tz_name

    def tz(self, city: str):
        tz_name = self.tz_name(city)
        return pytz.timezone(tz_name)


def prepare_data(user_sessions: pd.DataFrame, has_labels=False) -> pd.DataFrame:
    # Impute empty sites
    empty_sites_index = user_sessions.query("sites.str.len() == 0").index
    user_sessions.loc[empty_sites_index, "sites"] = user_sessions.loc[
        empty_sites_index
    ]["sites"].apply(lambda sites: sites + [{"site": "NONE.NONE", "length": 0}])

    # Combine date/time columns and convert from string to datetime type
    user_sessions["start_dt"] = pd.to_datetime(
        user_sessions["date"] + " " + user_sessions["time"], utc=True
    )

    # Convert to local date time
    tz_by_city = TimezoneByCity()
    timezone_tbl = {
        loc: tz_by_city.tz_name(loc.split("/")[1])
        for loc in user_sessions.location.unique()
    }
    user_sessions["local_time"] = user_sessions.apply(
        lambda row: row["start_dt"]
        .tz_convert(timezone_tbl[row["location"]])
        .tz_localize(None),
        axis=1,
    )

    # Split start date/time to year / month / day / weekday and start_hour
    user_sessions["year"] = user_sessions.local_time.dt.year
    user_sessions["month"] = user_sessions.local_time.dt.month
    user_sessions["day"] = user_sessions.local_time.dt.day
    user_sessions["weekday"] = user_sessions.local_time.dt.weekday
    user_sessions["start_hour"] = user_sessions.local_time.dt.hour

    # Sine/Cosine transform of local start time
    start_dt_normalized = (
        (user_sessions["local_time"] - user_sessions["local_time"].dt.normalize())
        / pd.Timedelta("1 second")
        / 86400
    )
    user_sessions["start_sin"] = np.sin(2 * np.pi * start_dt_normalized)
    user_sessions["start_cos"] = np.cos(2 * np.pi * start_dt_normalized)

    # Split location to country and city
    user_sessions[["country", "city"]] = user_sessions["location"].str.split(
        "/", expand=True
    )

    # Get total length of each user session
    user_sessions["length_session"] = user_sessions["sites"].apply(
        lambda session_sites: sum(site_entry["length"] for site_entry in session_sites)
    )

    # Space-joined site names per session, used by the count-based TfidfVectorizer pipelines below
    user_sessions['sites_corpus'] = user_sessions['sites'].apply(
        lambda session_sites: ' '.join(site_entry['site'] for site_entry in session_sites))

    # Drop off original date/time columns
    user_sessions.drop(
        ["time", "date", "start_dt", "local_time", "location"], axis=1, inplace=True
    )

    return user_sessions


class SiteLengthTfIdfTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_top=100, top_sites=None):
        self.n_top = n_top
        self.top_sites = top_sites
        self.top_sites_ = []
        self.tfidf = TfidfTransformer()

    def _get_topsites(self, sites):
        cnt = Counter()
        for session_sites in sites:
            for site_entry in session_sites:
                cnt.update({site_entry['site']: site_entry['length']})
        top_sites, _ = zip(*cnt.most_common(self.n_top))
        return top_sites

    def _vectorize_topsites_by_length(self, session_sites):
        topsites_len_dict = dict.fromkeys(self.top_sites_, 0)
        for site_entry in session_sites:
            site = site_entry['site']
            if site in topsites_len_dict:
                topsites_len_dict[site] += site_entry['length']
        return list(topsites_len_dict.values())

    def fit(self, X, y=None):
        if self.top_sites:
            self.top_sites_ = self.top_sites
        else:
            self.top_sites_ = self._get_topsites(X)
        topsites_length = [self._vectorize_topsites_by_length(session_sites) for session_sites in X]
        self.tfidf = self.tfidf.fit(topsites_length)
        return self

    def transform(self, X, y=None):
        topsites_length = [self._vectorize_topsites_by_length(session_sites) for session_sites in X]
        return self.tfidf.transform(topsites_length)


class SiteLengthTfIdfNGramTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_top=100, top_sites=None, ngram_min=1, ngram_max=1):
        self.n_top = n_top
        self.top_sites = top_sites
        self.ngram_min, self.ngram_max = ngram_min, ngram_max
        self.top_sites_ = []
        self.tfidf = TfidfTransformer()

    def _get_topsites(self, sites):
        cnt = Counter()
        for session_sites in sites:
            n_sites = len(session_sites)
            for n_gram in range(self.ngram_min, self.ngram_max + 1):
                for i in range(0, n_sites - n_gram + 1):
                    token = ' '.join(entry['site'] for entry in session_sites[i:i + n_gram])
                    token_len = sum(entry['length'] for entry in session_sites[i:i + n_gram])
                    cnt.update({token: token_len})
        top_sites, _ = zip(*cnt.most_common(self.n_top))
        return top_sites

    def _vectorize_topsites_by_length(self, session_sites):
        topsites_len_dict = dict.fromkeys(self.top_sites_, 0)
        n_sites = len(session_sites)
        for n_gram in range(self.ngram_min, self.ngram_max + 1):
            for i in range(0, n_sites - n_gram + 1):
                token = ' '.join(entry['site'] for entry in session_sites[i:i + n_gram])
                token_len = sum(entry['length'] for entry in session_sites[i:i + n_gram])
                if token in topsites_len_dict:
                    topsites_len_dict[token] += token_len
        return list(topsites_len_dict.values())

    def fit(self, X, y=None):
        if self.top_sites:
            self.top_sites_ = self.top_sites
        else:
            self.top_sites_ = self._get_topsites(X)
        topsites_length = [self._vectorize_topsites_by_length(session_sites) for session_sites in X]
        self.tfidf = self.tfidf.fit(topsites_length)
        return self

    def transform(self, X, y=None):
        topsites_length = [self._vectorize_topsites_by_length(session_sites) for session_sites in X]
        return self.tfidf.transform(topsites_length)


# ### Load data

# In[28]:


user_sessions = load_data('dataset.json')


# ### Stratified Train / Test Split

# In[29]:


ss_spliter = StratifiedShuffleSplit(n_splits=1, train_size=0.9, random_state=42)
train_idx, test_idx = next(ss_spliter.split(user_sessions, (user_sessions['user_id'] != 0).astype(int)))
user_sessions_train, user_sessions_test = user_sessions.loc[train_idx], user_sessions.loc[test_idx]


# ### Prepare data

# In[30]:


def get_topsites(sites, ngram_min=1, ngram_max=1):
    cnt = Counter()
    for session_sites in sites:
        n_sites = len(session_sites)
        # include ngram_max itself, matching SiteLengthTfIdfNGramTransformer
        for n_gram in range(ngram_min, ngram_max + 1):
            for i in range(0, n_sites - n_gram + 1):
                token = ' '.join(entry['site'] for entry in session_sites[i:i + n_gram])
                token_len = sum(entry['length'] for entry in session_sites[i:i + n_gram])
                cnt.update({token: token_len})
    top_sites, _ = zip(*cnt.most_common())
    return top_sites

joe_top_sites = get_topsites(user_sessions_train.query('user_id == 0')['sites'], ngram_min=1, ngram_max=5)
print(joe_top_sites[:10])


# In[31]:


user_sessions_train = prepare_data(user_sessions_train, has_labels=True)
user_sessions_test = prepare_data(user_sessions_test, has_labels=True)
user_sessions_train.head()


# # Modeling Experiments

# In[32]:


def test_rf_model(X_train, y_train, X_test, y_test, param_distribs=None, n_iter=20):
    if param_distribs is None:
        param_distribs = {
            'n_estimators': randint(low=1, high=200),
            'max_features': randint(low=1, high=X_train.shape[1]),
        }
    rf_clf = RandomForestClassifier(random_state=42)
    rnd_search = RandomizedSearchCV(rf_clf, param_distributions=param_distribs, n_iter=n_iter,
                                    scoring='f1', n_jobs=-1, refit=True, cv=5, random_state=42)
    rnd_search = rnd_search.fit(X_train, y_train)
    train_f1 = rnd_search.best_score_

    rf_clf = rnd_search.best_estimator_
    y_test_pred = rf_clf.predict(X_test)
    test_rocauc = roc_auc_score(y_test, y_test_pred, average='weighted')
    test_report = classification_report(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    return train_f1, test_f1, test_report, test_rocauc


# ## Baseline Performance with Random Forest Classifier

# ### Sites vectorized by counts and normalized with TF-IDF. All users' sites are used.

# In[33]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         max_features=2000), 'sites_corpus')
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# ### Sites vectorized by visit length and normalized with TF-IDF. All users' sites are used.

# In[34]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_length_tfidf', SiteLengthTfIdfNGramTransformer(n_top=2000, ngram_min=1, ngram_max=1), 'sites'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# ## Site Lengths TF-IDF - Joe's Visited Sites Only
#
# - The "sites" entries are vectorized by visit length and encoded with TF-IDF, but only the sites Joe visited in the training split are used as the vocabulary (see the toy sketch below).
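# - A minimal toy sketch of what the length-based vectorization with a fixed vocabulary does
#   (the sessions and two-site vocabulary below are made up, not real data): each session becomes
#   a vector of visit lengths over the fixed top-sites list, which is then TF-IDF weighted.

# In[ ]:


toy_sessions = [
    [{'site': 'a.com', 'length': 30}, {'site': 'b.com', 'length': 10}],
    [{'site': 'b.com', 'length': 50}, {'site': 'c.com', 'length': 5}],
]
toy_vocab = ['a.com', 'b.com']  # pretend these are "Joe's top sites"; 'c.com' is ignored

toy_vectorizer = SiteLengthTfIdfNGramTransformer(top_sites=toy_vocab, ngram_min=1, ngram_max=1)
toy_matrix = toy_vectorizer.fit(toy_sessions).transform(toy_sessions)
print(toy_matrix.toarray())  # one TF-IDF-weighted length vector per toy session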
# In[35]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_length_tfidf', SiteLengthTfIdfNGramTransformer(n_top=2000, top_sites=joe_top_sites, ngram_min=1, ngram_max=1), 'sites'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# - Overfits, because only the sites Joe visited in the training split are selected as the vocabulary.

# ## Site Lengths TF-IDF - All Users' Sites and N-Gram = (2,3,4,5)

# In[36]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_length_tfidf', SiteLengthTfIdfNGramTransformer(n_top=2000, ngram_min=2, ngram_max=5), 'sites'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# - Test F1 improved from 0.52 to 0.89, but the gap between train and test scores indicates slight overfitting. The toy sketch below shows how consecutive site visits become order-preserving n-gram tokens.
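# - A minimal toy sketch of the n-gram tokenization used by SiteLengthTfIdfNGramTransformer
#   (the session below is made up for illustration): consecutive site visits are joined into
#   n-gram tokens, so the order of visits is preserved in the vocabulary.

# In[ ]:


toy_session = [{'site': 'a.com', 'length': 10},
               {'site': 'b.com', 'length': 20},
               {'site': 'c.com', 'length': 5}]

for n_gram in range(2, 4):  # 2-grams and 3-grams, analogous to ngram_min=2, ngram_max=3
    for i in range(0, len(toy_session) - n_gram + 1):
        token = ' '.join(entry['site'] for entry in toy_session[i:i + n_gram])
        token_len = sum(entry['length'] for entry in toy_session[i:i + n_gram])
        print(f"{n_gram}-gram token: {token!r}, total length: {token_len}")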
# ## Site Counts TF-IDF - All Users' Sites and N-Gram = (2, 3, 4, 5)

# In[37]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(2, 5), max_features=2000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# - Test F1 and train F1 both reach 0.89.

# ## N-Gram=(3,4,5,6,7)

# In[38]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(3, 7), max_features=5000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

param_distribs = {
    'n_estimators': randint(low=50, high=1000),
    'max_features': randint(low=50, high=X_train.shape[1]),
    'max_depth': randint(low=1, high=100),
    'min_samples_leaf': randint(low=1, high=100),
    'min_samples_split': randint(low=2, high=100),
}
train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, param_distribs, n_iter=30)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# ## Gradient Boosting

# In[39]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(3, 7), max_features=5000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

param_distribs = {
    'n_estimators': randint(low=50, high=1000),
    'max_features': randint(low=50, high=X_train.shape[1]),
    'max_depth': randint(low=1, high=100),
    'min_samples_leaf': randint(low=1, high=100),
    'min_samples_split': randint(low=2, high=100),
    'n_iter_no_change': [5],
}
gb_clf = GradientBoostingClassifier(random_state=42)
gb_rnd_search = RandomizedSearchCV(gb_clf, param_distributions=param_distribs, n_iter=30,
                                   scoring='f1', n_jobs=-1, refit=True, cv=5, random_state=42)
gb_rnd_search = gb_rnd_search.fit(X_train, y_train)
pd.DataFrame(gb_rnd_search.cv_results_)[['mean_test_score', 'params']].sort_values('mean_test_score', ascending=False)


# In[40]:


y_test_pred = gb_rnd_search.best_estimator_.predict(X_test)
print('balanced_accuracy:', balanced_accuracy_score(y_test, y_test_pred))
print('roc_auc:', roc_auc_score(y_test, y_test_pred, average='weighted'))
print(classification_report(y_test, y_test_pred))


# ## xgboost.XGBClassifier

# In[54]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(3, 7), max_features=5000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

from scipy.stats import uniform
from xgboost import XGBClassifier

param_distribs = {
    'n_estimators': randint(low=50, high=1000),
    'max_depth': randint(low=1, high=100),
    'reg_lambda': uniform(0, 1),
    'min_child_weight': randint(0, 10),
}
gb_clf = XGBClassifier(random_state=42, use_label_encoder=False, verbosity=0, n_iter_no_change=5)
rnd_search = RandomizedSearchCV(gb_clf, param_distributions=param_distribs, n_iter=30,
                                scoring='f1', n_jobs=-1, refit=True, cv=5, random_state=42)
rnd_search = rnd_search.fit(X_train, y_train)
pd.DataFrame(rnd_search.cv_results_)[['mean_test_score', 'params']].sort_values('mean_test_score', ascending=False)


# In[55]:


y_test_pred = rnd_search.best_estimator_.predict(X_test)
print('balanced_accuracy:', balanced_accuracy_score(y_test, y_test_pred))
print('roc_auc:', roc_auc_score(y_test, y_test_pred, average='weighted'))
print(classification_report(y_test, y_test_pred))


# ## DNN

# In[56]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(3, 7), max_features=5000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values


# In[58]:


from tensorflow import keras

dnn = keras.models.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),  # one flat feature vector per session
    keras.layers.Dense(1000, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(600, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(10, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(1, activation='sigmoid'),
])
dnn.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])
history = dnn.fit(X_train, y_train, epochs=30)


# In[59]:


y_train_pred = (dnn.predict(X_train) > 0.5).astype(int)
print(classification_report(y_train, y_train_pred))
print('roc_auc:', roc_auc_score(y_train, y_train_pred, average='macro'))


# In[60]:


y_test_pred = (dnn.predict(X_test) > 0.5).astype(int)
print(classification_report(y_test, y_test_pred))
print('roc_auc:', roc_auc_score(y_test, y_test_pred, average='macro'))


# # Summary
#
# - The dataset is small and skewed/imbalanced for binary classification.
# - Feature extraction and selection
#     - Users' visited web sites are the key features in this problem.
#     - Two vectorization methods are used to extract the visited-websites feature:
#         - Vectorize by visit length
#         - Vectorize by visit count
#     - N-gram sequences up to 7-grams are used to preserve visit order, which matters a lot for model performance.
#     - TF-IDF normalization follows each vectorization method.
# - Models
#     - Random forest, gradient boosting and a DNN are tested without much tuning.
#     - All models reach an F1 score of about 0.89-0.9 on the test dataset, indicating a performance bound with these features.
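# - Finally, a minimal sketch of how the fitted feature pipeline and one of the tuned models
#   (here the XGBoost search result rnd_search.best_estimator_) could be applied to the held-out
#   "verify.json" sessions to flag the ones predicted to be Joe. This assumes verify.json has the
#   same schema as dataset.json (it may not include user_id) and is illustrative rather than a
#   final submission step.

# In[ ]:


verify_sessions_raw = load_data('verify.json')
verify_sessions = prepare_data(verify_sessions_raw.copy(), has_labels=False)

X_verify = feature_encode_pipeline.transform(verify_sessions)
joe_pred = rnd_search.best_estimator_.predict(X_verify)  # 1 = predicted to be Joe

print("Sessions predicted as Joe:", int(joe_pred.sum()), "out of", len(joe_pred))
verify_sessions[joe_pred == 1].head()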