#!/usr/bin/env python
# coding: utf-8

# Catch Joe
# =========

# In[2]:


# Python libs
import json
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
from dython.nominal import cramers_v, theils_u, correlation_ratio
from scipy.stats import randint

# Date/time/timezone
import datetime as dt
import pytz
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder

# scikit-learn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Module settings
mpl.rc("figure", facecolor="white", dpi=144)
pd.set_option('expand_frame_repr', False)  # display dataframe without wrapping


# # Data Preparation

# ### Check data structure
#
# - Two data files are given: the training data "dataset.json" and the test data "verify.json".
# - The training data file is a big JSON file, about 77 MB, so we'll first peek into the file and check its structure.

# In[3]:


get_ipython().system('head -n 50 dataset.json')


# In[4]:


get_ipython().system('head -n 50 test.json')


# - It looks like the training data has 9 fields, and the sites field is a nested list containing all the sites the user visited in the session.
# - To turn the nested JSON data into useful features, we'll vectorize the sites column.
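# - For illustration, here is a minimal sketch of that idea on a single made-up session record
#   (the field values below are hypothetical, not taken from the real dataset): the nested
#   sites list can be flattened into a space-joined "corpus" string for count-based
#   vectorization, or into per-site visit lengths for length-based vectorization.

# In[ ]:


example_session = {
    'user_id': 0,
    'browser': 'Chrome',
    'os': 'Windows 10',
    'locale': 'ru-RU',
    'gender': 'm',
    'location': 'Russia/Moscow',
    'date': '2017-01-01',
    'time': '09:00:00',
    'sites': [{'site': 'mail.google.com', 'length': 120},
              {'site': 'youtube.com', 'length': 35}],
}

# Space-joined site names -> input for a count-based vectorizer (e.g. TfidfVectorizer)
sites_corpus_example = ' '.join(entry['site'] for entry in example_session['sites'])

# Per-site visit lengths -> input for a length-based vectorizer
site_lengths_example = {entry['site']: entry['length'] for entry in example_session['sites']}

print(sites_corpus_example)
print(site_lengths_example)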
# ### Load data

# In[5]:


with open('dataset.json', 'r') as f:
    data_json_struct = json.loads(f.read())

user_sessions = pd.DataFrame(data_json_struct)
print(data_json_struct[0])


# ### Data Inspection

# In[6]:


user_sessions.head(20)
print('\n')
user_sessions.info()


# In[7]:


user_sessions.query("sites.str.len() == 0")


# ### Prepare Data: Impute Empty Sites and Add Custom Features

# #### Impute empty sites

# In[8]:


empty_sites_index = user_sessions.query("sites.str.len() == 0").index
user_sessions.loc[empty_sites_index, 'sites'] = user_sessions.loc[empty_sites_index]['sites'].apply(
    lambda sites: sites + [{'site': 'NONE.NONE', 'length': 0}])
user_sessions.loc[empty_sites_index]


# #### Combine date/time columns and convert from string to datetime type

# In[9]:


user_sessions['start_dt'] = pd.to_datetime(user_sessions['date'] + ' ' + user_sessions['time'], utc=True)
user_sessions.drop(['time', 'date'], axis=1, inplace=True)
user_sessions


# #### Convert start time to local time
#
# - A class that converts a city name to its timezone

# In[10]:


class TimezoneByCity:
    def __init__(self):
        self.geolocator = Nominatim(user_agent="geoapiExercises")
        self.tzfinder = TimezoneFinder()

    def tz_name(self, city: str):
        loc = self.geolocator.geocode(city)
        tz_name = self.tzfinder.timezone_at(lng=loc.longitude, lat=loc.latitude)
        return tz_name

    def tz(self, city: str):
        tz_name = self.tz_name(city)
        return pytz.timezone(tz_name)


# - Build a timezone table that maps country/city to timezone

# In[11]:


tz_by_city = TimezoneByCity()
timezone_tbl = {loc: tz_by_city.tz_name(loc.split('/')[1]) for loc in user_sessions.location.unique()}
print(timezone_tbl)


# - Add a local_time column to the data

# In[12]:


user_sessions['local_time'] = user_sessions.apply(
    lambda row: row['start_dt'].tz_convert(timezone_tbl[row['location']]).tz_localize(None), axis=1)
user_sessions


# #### Split start date and time into year / month / day / weekday and start_hour

# In[13]:


user_sessions["year"] = user_sessions.local_time.dt.year
user_sessions["month"] = user_sessions.local_time.dt.month
user_sessions["day"] = user_sessions.local_time.dt.day
user_sessions["weekday"] = user_sessions.local_time.dt.weekday
user_sessions["start_hour"] = user_sessions.local_time.dt.hour
user_sessions


# #### Sine/Cosine transform of local start time

# In[14]:


start_dt_normalized = (user_sessions['local_time'] - user_sessions['local_time'].dt.normalize()) / pd.Timedelta('1 second') / 86400
user_sessions['start_sin'] = np.sin(2 * np.pi * start_dt_normalized)
user_sessions['start_cos'] = np.cos(2 * np.pi * start_dt_normalized)
user_sessions


# #### Split location into country and city

# In[15]:


user_sessions[['country', 'city']] = user_sessions['location'].str.split('/', expand=True)
user_sessions


# #### Get total length of each user session

# In[16]:


user_sessions['length_session'] = user_sessions['sites'].apply(
    lambda session_sites: sum(site_entry['length'] for site_entry in session_sites))
user_sessions


# ### Vectorize top sites using TF-IDF

# In[17]:


n_top = 100
joe_cnt = Counter()
for sites_session in user_sessions.query('user_id == 0')['sites']:
    for site_entry in sites_session:
        joe_cnt.update({site_entry['site']: site_entry['length']})

joe_top_sites, _ = zip(*joe_cnt.most_common(n_top))
print("Total sites joe visited: ", len(joe_cnt))
print(f"Top {n_top} sites joe visited: \n", joe_top_sites[:100])


# In[18]:


n_top = 100
all_cnt = Counter()
for sites_session in user_sessions['sites']:
    for site_entry in sites_session:
        all_cnt.update({site_entry['site']: site_entry['length']})

all_top_sites, _ = zip(*all_cnt.most_common(n_top))
print("Total sites all users visited: ", len(all_cnt))
print(f"Top {n_top} sites all users visited: \n", all_top_sites[:100])


# In[20]:


def get_topsites_length(session_sites: list, top_sites=all_top_sites):
    topsites_len_dict = dict.fromkeys(top_sites, 0)
    for site_entry in session_sites:
        site = site_entry['site']
        if site in topsites_len_dict:
            topsites_len_dict[site] += site_entry['length']
    return list(topsites_len_dict.values())

topsites_length = user_sessions['sites'].apply(get_topsites_length)
topsites_length


# In[21]:


tfidf = TfidfTransformer()
topsites_tfidf = tfidf.fit_transform(topsites_length.values.tolist())
topsites_tfidf.toarray()[:2]


# ### Add binary class label: Joe=0, Other users=1

# In[22]:


user_sessions['target'] = (user_sessions['user_id'] != 0).astype(int)
user_sessions


# # Visual inspection

# ### Histogram / Count plot

# In[1]:


def set_xlabel_rotation(ax, deg=90):
    for label in ax.get_xticklabels():
        label.set_rotation(deg)


# In[23]:


fig, ax = plt.subplots(1, 2, figsize=(16, 4))
p = sns.histplot(user_sessions[['user_id']], ax=ax.flatten()[0], discrete=True)
p = sns.histplot(user_sessions[['length_session']], ax=ax.flatten()[1], bins=200)

fig, ax = plt.subplots(1, 3, figsize=(21, 4))
p = sns.countplot(data=user_sessions, x='browser', ax=ax.flatten()[0])
p = sns.countplot(data=user_sessions, x='os', ax=ax.flatten()[1])
p = sns.countplot(data=user_sessions, x='locale', ax=ax.flatten()[2])
set_xlabel_rotation(ax.flatten()[2], 90)

fig, ax = plt.subplots(1, 3, figsize=(21, 4))
p = sns.countplot(data=user_sessions, x='gender', ax=ax.flatten()[0])
p = sns.countplot(data=user_sessions, x='city', ax=ax.flatten()[1])
set_xlabel_rotation(ax.flatten()[1], 90)
p = sns.countplot(data=user_sessions, x='country', ax=ax.flatten()[2])
set_xlabel_rotation(ax.flatten()[2], 90)


# In[1]:


fig, ax = plt.subplots(1, 3, figsize=(16, 4))
p = sns.countplot(data=user_sessions, x='year', ax=ax.flatten()[0])
p = sns.countplot(data=user_sessions, x='month', ax=ax.flatten()[1])
p = sns.countplot(data=user_sessions, x='day', ax=ax.flatten()[2])
set_xlabel_rotation(ax.flatten()[2], 90)

fig, ax = plt.subplots(1, 2, figsize=(16, 4))
p = sns.countplot(data=user_sessions, x='weekday', ax=ax.flatten()[0])
p = sns.countplot(data=user_sessions, x='start_hour', ax=ax.flatten()[1])


# ### Joe's Characteristics

# In[1]:


fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for i, feat in enumerate(['browser', 'os', 'locale']):
    sub_ax = ax.flatten()[i]
    p = sns.boxplot(x=feat, y='length_session', hue='target', data=user_sessions, palette='rainbow', ax=sub_ax)
set_xlabel_rotation(ax.flatten()[2], 90)


# In[1]:


fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for i, feat in enumerate(['gender', 'city', 'country']):
    sub_ax = ax.flatten()[i]
    p = sns.boxplot(x=feat, y='length_session', hue='target', data=user_sessions, palette='rainbow', ax=sub_ax)
    if feat != 'gender':
        set_xlabel_rotation(sub_ax, 90)


# In[1]:


fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for i, feat in enumerate(['year', 'month', 'day']):
    sub_ax = ax.flatten()[i]
    p = sns.boxplot(x=feat, y='length_session', hue='target', data=user_sessions, palette='rainbow', ax=sub_ax)
    if feat == 'day':
        set_xlabel_rotation(sub_ax, 90)


# In[1]:


fig, ax = plt.subplots(1, 2, figsize=(16, 4))
for i, feat in enumerate(['weekday', 'start_hour']):
    sub_ax = ax.flatten()[i]
    p = sns.boxplot(x=feat, y='length_session', hue='target', data=user_sessions, palette='rainbow', ax=sub_ax)


# # Features / target correlation

# ### Categorical features vs. target correlation with contingency analysis / Cramer's V

# #### Cramer's V

# In[24]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city',
            'year', 'month', 'day', 'weekday', 'start_hour']
cat_feat_target_crv = pd.Series(
    [cramers_v(user_sessions[cat_feat], user_sessions['user_id']) for cat_feat in cat_cols],
    index=cat_cols, name='CramersV').sort_values(ascending=False)
cat_feat_target_crv


# #### Theil's U

# In[25]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city',
            'year', 'month', 'day', 'weekday', 'start_hour']
cat_feat_target_thu = pd.Series(
    [theils_u(user_sessions[feat], user_sessions['user_id']) for feat in cat_cols],
    index=cat_cols, name='TheilsU').sort_values(ascending=False)
cat_feat_target_thu


# #### Plot

# In[1]:


cat_target_corr = pd.DataFrame({'Correlation': cat_feat_target_crv})
cat_target_corr['Stats'] = "Cramer's V"
df = pd.DataFrame({'Correlation': cat_feat_target_thu})
df['Stats'] = "Theil's U"
# pd.concat replaces the deprecated DataFrame.append
cat_target_corr = pd.concat([cat_target_corr, df]).reset_index().rename(columns={'index': 'Features'})

fig, ax = plt.subplots(1, 1, figsize=(8.5, 4))
p = sns.barplot(data=cat_target_corr, x='Features', y='Correlation', hue='Stats',
                palette=sns.color_palette('rainbow', 3))
plt.title('Correlation between Categorical Features and Target user_id');


# ### Numerical features vs. target correlation with correlation ratio (η)

# In[26]:


num_cols = ['length_session', 'start_sin', 'start_cos']
num_target_corr = pd.Series(
    [correlation_ratio(user_sessions['user_id'], user_sessions[feat]) for feat in num_cols],
    index=num_cols, name='CorrRatio').sort_values(ascending=False)
num_target_corr


# In[1]:


fig, ax = plt.subplots(1, 1, figsize=(9, 3))
p = sns.barplot(x=num_target_corr.index, y=num_target_corr.values, palette=sns.color_palette('rainbow', 3))
plt.title('Correlation between Session Lengths, Start Time Sin/Cos with Target user_id');


# In[1]:


fig, ax = plt.subplots(1, 2, figsize=(24, 4))
for i, col in enumerate(['length_session', 'start_sin']):
    sub_ax = ax.flatten()[i]
    p = sns.scatterplot(data=user_sessions, x='user_id', y=col, ax=sub_ax)
    t = sub_ax.set_title(f'{col} vs. user_id')
# # Data pipeline for modeling

# ### Definitions summarized from exploratory analysis above

# In[27]:


def load_data(data_file_path: Path) -> pd.DataFrame:
    """Load data from json file."""
    with open(data_file_path, "r") as f:
        data_json_struct = json.loads(f.read())
    user_sessions = pd.DataFrame(data_json_struct)
    return user_sessions


class TimezoneByCity:
    def __init__(self):
        self.geolocator = Nominatim(user_agent="geoapiExercises")
        self.tzfinder = TimezoneFinder()

    def tz_name(self, city: str):
        loc = self.geolocator.geocode(city)
        tz_name = self.tzfinder.timezone_at(lng=loc.longitude, lat=loc.latitude)
        return tz_name

    def tz(self, city: str):
        tz_name = self.tz_name(city)
        return pytz.timezone(tz_name)


def prepare_data(user_sessions: pd.DataFrame, has_labels=False) -> pd.DataFrame:
    # Impute empty sites
    empty_sites_index = user_sessions.query("sites.str.len() == 0").index
    user_sessions.loc[empty_sites_index, "sites"] = user_sessions.loc[
        empty_sites_index
    ]["sites"].apply(lambda sites: sites + [{"site": "NONE.NONE", "length": 0}])

    # Combine date/time columns and convert from string to datetime type
    user_sessions["start_dt"] = pd.to_datetime(
        user_sessions["date"] + " " + user_sessions["time"], utc=True
    )

    # Convert to local date time
    tz_by_city = TimezoneByCity()
    timezone_tbl = {
        loc: tz_by_city.tz_name(loc.split("/")[1])
        for loc in user_sessions.location.unique()
    }
    user_sessions["local_time"] = user_sessions.apply(
        lambda row: row["start_dt"]
        .tz_convert(timezone_tbl[row["location"]])
        .tz_localize(None),
        axis=1,
    )

    # Split start date/time to year / month / day / weekday and start_hour
    user_sessions["year"] = user_sessions.local_time.dt.year
    user_sessions["month"] = user_sessions.local_time.dt.month
    user_sessions["day"] = user_sessions.local_time.dt.day
    user_sessions["weekday"] = user_sessions.local_time.dt.weekday
    user_sessions["start_hour"] = user_sessions.local_time.dt.hour

    # Sine/Cosine transform of local start time
    start_dt_normalized = (
        (user_sessions["local_time"] - user_sessions["local_time"].dt.normalize())
        / pd.Timedelta("1 second")
        / 86400
    )
    user_sessions["start_sin"] = np.sin(2 * np.pi * start_dt_normalized)
    user_sessions["start_cos"] = np.cos(2 * np.pi * start_dt_normalized)

    # Split location to country and city
    user_sessions[["country", "city"]] = user_sessions["location"].str.split(
        "/", expand=True
    )

    # Get total length of each user session
    user_sessions["length_session"] = user_sessions["sites"].apply(
        lambda session_sites: sum(site_entry["length"] for site_entry in session_sites)
    )

    # Space-joined site names per session, used by the count-based TfidfVectorizer pipelines below
    user_sessions['sites_corpus'] = user_sessions['sites'].apply(
        lambda session_sites: ' '.join(site_entry['site'] for site_entry in session_sites))

    # Drop off original date/time columns
    user_sessions.drop(
        ["time", "date", "start_dt", "local_time", "location"], axis=1, inplace=True
    )

    return user_sessions


class SiteLengthTfIdfTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_top=100, top_sites=None):
        self.n_top = n_top
        self.top_sites = top_sites
        self.top_sites_ = []
        self.tfidf = TfidfTransformer()

    def _get_topsites(self, sites):
        cnt = Counter()
        for session_sites in sites:
            for site_entry in session_sites:
                cnt.update({site_entry['site']: site_entry['length']})
        top_sites, _ = zip(*cnt.most_common(self.n_top))
        return top_sites

    def _vectorize_topsites_by_length(self, session_sites):
        topsites_len_dict = dict.fromkeys(self.top_sites_, 0)
        for site_entry in session_sites:
            site = site_entry['site']
            if site in topsites_len_dict:
                topsites_len_dict[site] += site_entry['length']
        return list(topsites_len_dict.values())

    def fit(self, X, y=None):
        if self.top_sites:
            self.top_sites_ = self.top_sites
        else:
            self.top_sites_ = self._get_topsites(X)
        topsites_length = [self._vectorize_topsites_by_length(session_sites) for session_sites in X]
        self.tfidf = self.tfidf.fit(topsites_length)
        return self

    def transform(self, X, y=None):
        topsites_length = [self._vectorize_topsites_by_length(session_sites) for session_sites in X]
        return self.tfidf.transform(topsites_length)


class SiteLengthTfIdfNGramTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_top=100, top_sites=None, ngram_min=1, ngram_max=1):
        self.n_top = n_top
        self.top_sites = top_sites
        self.ngram_min, self.ngram_max = ngram_min, ngram_max
        self.top_sites_ = []
        self.tfidf = TfidfTransformer()

    def _get_topsites(self, sites):
        cnt = Counter()
        for session_sites in sites:
            n_sites = len(session_sites)
            for n_gram in range(self.ngram_min, self.ngram_max + 1):
                for i in range(0, n_sites - n_gram + 1):
                    token = ' '.join(entry['site'] for entry in session_sites[i:i + n_gram])
                    token_len = sum(entry['length'] for entry in session_sites[i:i + n_gram])
                    cnt.update({token: token_len})
        top_sites, _ = zip(*cnt.most_common(self.n_top))
        return top_sites

    def _vectorize_topsites_by_length(self, session_sites):
        topsites_len_dict = dict.fromkeys(self.top_sites_, 0)
        n_sites = len(session_sites)
        for n_gram in range(self.ngram_min, self.ngram_max + 1):
            for i in range(0, n_sites - n_gram + 1):
                token = ' '.join(entry['site'] for entry in session_sites[i:i + n_gram])
                token_len = sum(entry['length'] for entry in session_sites[i:i + n_gram])
                if token in topsites_len_dict:
                    topsites_len_dict[token] += token_len
        return list(topsites_len_dict.values())

    def fit(self, X, y=None):
        if self.top_sites:
            self.top_sites_ = self.top_sites
        else:
            self.top_sites_ = self._get_topsites(X)
        topsites_length = [self._vectorize_topsites_by_length(session_sites) for session_sites in X]
        self.tfidf = self.tfidf.fit(topsites_length)
        return self

    def transform(self, X, y=None):
        topsites_length = [self._vectorize_topsites_by_length(session_sites) for session_sites in X]
        return self.tfidf.transform(topsites_length)


# ### Load data

# In[28]:


user_sessions = load_data('dataset.json')


# ### Stratified Train / Test Split

# In[29]:


ss_spliter = StratifiedShuffleSplit(n_splits=1, train_size=0.9, random_state=42)
train_idx, test_idx = next(ss_spliter.split(user_sessions, (user_sessions['user_id'] != 0).astype(int)))
user_sessions_train, user_sessions_test = user_sessions.loc[train_idx], user_sessions.loc[test_idx]


# ### Prepare data

# In[30]:


def get_topsites(sites, ngram_min=1, ngram_max=1):
    cnt = Counter()
    for session_sites in sites:
        n_sites = len(session_sites)
        # include ngram_max itself, matching SiteLengthTfIdfNGramTransformer
        for n_gram in range(ngram_min, ngram_max + 1):
            for i in range(0, n_sites - n_gram + 1):
                token = ' '.join(entry['site'] for entry in session_sites[i:i + n_gram])
                token_len = sum(entry['length'] for entry in session_sites[i:i + n_gram])
                cnt.update({token: token_len})
    top_sites, _ = zip(*cnt.most_common())
    return top_sites

joe_top_sites = get_topsites(user_sessions_train.query('user_id == 0')['sites'], ngram_min=1, ngram_max=5)
print(joe_top_sites[:10])


# In[31]:


user_sessions_train = prepare_data(user_sessions_train, has_labels=True)
user_sessions_test = prepare_data(user_sessions_test, has_labels=True)
user_sessions_train.head()


# # Modeling Experiments

# In[32]:


def test_rf_model(X_train, y_train, X_test, y_test, param_distribs=None, n_iter=20):
    if param_distribs is None:
        param_distribs = {
            'n_estimators': randint(low=1, high=200),
            'max_features': randint(low=1, high=X_train.shape[1]),
        }
    rf_clf = RandomForestClassifier(random_state=42)
    rnd_search = RandomizedSearchCV(rf_clf, param_distributions=param_distribs, n_iter=n_iter,
                                    scoring='f1', n_jobs=-1, refit=True, cv=5, random_state=42)
    rnd_search = rnd_search.fit(X_train, y_train)
    train_f1 = rnd_search.best_score_

    rf_clf = rnd_search.best_estimator_
    y_test_pred = rf_clf.predict(X_test)
    test_rocauc = roc_auc_score(y_test, y_test_pred, average='weighted')
    test_report = classification_report(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    return train_f1, test_f1, test_report, test_rocauc


# ## Baseline Performance with Random Forest Classifier

# ### Sites vectorized by counts and normalized with TF-IDF. All users' sites are used.

# In[33]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         max_features=2000), 'sites_corpus')
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# ### Sites vectorized by visit length and normalized with TF-IDF. All users' sites are used.

# In[34]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_length_tfidf', SiteLengthTfIdfNGramTransformer(n_top=2000, ngram_min=1, ngram_max=1), 'sites'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# ## Site Lengths TF-IDF - Joe's Visited Sites Only
#
# - The "sites" entries are vectorized by visit length and encoded with TF-IDF, but only the sites Joe visited in the training split are used as the vocabulary (see the toy sketch below).
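# - A minimal toy sketch of what the length-based vectorization with a fixed vocabulary does
#   (the sessions and two-site vocabulary below are made up, not real data): each session becomes
#   a vector of visit lengths over the fixed top-sites list, which is then TF-IDF weighted.

# In[ ]:


toy_sessions = [
    [{'site': 'a.com', 'length': 30}, {'site': 'b.com', 'length': 10}],
    [{'site': 'b.com', 'length': 50}, {'site': 'c.com', 'length': 5}],
]
toy_vocab = ['a.com', 'b.com']  # pretend these are "Joe's top sites"; 'c.com' is ignored

toy_vectorizer = SiteLengthTfIdfNGramTransformer(top_sites=toy_vocab, ngram_min=1, ngram_max=1)
toy_matrix = toy_vectorizer.fit(toy_sessions).transform(toy_sessions)
print(toy_matrix.toarray())  # one TF-IDF-weighted length vector per toy session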
# In[35]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_length_tfidf', SiteLengthTfIdfNGramTransformer(n_top=2000, top_sites=joe_top_sites, ngram_min=1, ngram_max=1), 'sites'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# - Overfits, because only the sites Joe visited in the training split are selected as the vocabulary.

# ## Site Lengths TF-IDF - All Users' Sites and N-Gram = (2,3,4,5)

# In[36]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_length_tfidf', SiteLengthTfIdfNGramTransformer(n_top=2000, ngram_min=2, ngram_max=5), 'sites'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# - Test F1 improved from 0.52 to 0.89, but the gap between train and test scores indicates slight overfitting. The toy sketch below shows how consecutive site visits become order-preserving n-gram tokens.
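# - A minimal toy sketch of the n-gram tokenization used by SiteLengthTfIdfNGramTransformer
#   (the session below is made up for illustration): consecutive site visits are joined into
#   n-gram tokens, so the order of visits is preserved in the vocabulary.

# In[ ]:


toy_session = [{'site': 'a.com', 'length': 10},
               {'site': 'b.com', 'length': 20},
               {'site': 'c.com', 'length': 5}]

for n_gram in range(2, 4):  # 2-grams and 3-grams, analogous to ngram_min=2, ngram_max=3
    for i in range(0, len(toy_session) - n_gram + 1):
        token = ' '.join(entry['site'] for entry in toy_session[i:i + n_gram])
        token_len = sum(entry['length'] for entry in toy_session[i:i + n_gram])
        print(f"{n_gram}-gram token: {token!r}, total length: {token_len}")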
# ## Site Counts TF-IDF - All Users' Sites and N-Gram = (2, 3, 4, 5)

# In[37]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(2, 5), max_features=2000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, n_iter=20)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# - Test F1 and train F1 both reach 0.89.

# ## N-Gram=(3,4,5,6,7)

# In[38]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(3, 7), max_features=5000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

param_distribs = {
    'n_estimators': randint(low=50, high=1000),
    'max_features': randint(low=50, high=X_train.shape[1]),
    'max_depth': randint(low=1, high=100),
    'min_samples_leaf': randint(low=1, high=100),
    'min_samples_split': randint(low=2, high=100),
}
train_f1, test_f1, test_report, test_rocauc = test_rf_model(X_train, y_train, X_test, y_test, param_distribs, n_iter=30)
print("Train F1 score: ", train_f1)
print("Test F1 score: ", test_f1)
print(test_report)
print('Test weighted ROC AUC score: ', test_rocauc)


# ## Gradient Boosting

# In[39]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(3, 7), max_features=5000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

param_distribs = {
    'n_estimators': randint(low=50, high=1000),
    'max_features': randint(low=50, high=X_train.shape[1]),
    'max_depth': randint(low=1, high=100),
    'min_samples_leaf': randint(low=1, high=100),
    'min_samples_split': randint(low=2, high=100),
    'n_iter_no_change': [5],
}
gb_clf = GradientBoostingClassifier(random_state=42)
gb_rnd_search = RandomizedSearchCV(gb_clf, param_distributions=param_distribs, n_iter=30,
                                   scoring='f1', n_jobs=-1, refit=True, cv=5, random_state=42)
gb_rnd_search = gb_rnd_search.fit(X_train, y_train)
pd.DataFrame(gb_rnd_search.cv_results_)[['mean_test_score', 'params']].sort_values('mean_test_score', ascending=False)


# In[40]:


y_test_pred = gb_rnd_search.best_estimator_.predict(X_test)
print('balanced_accuracy:', balanced_accuracy_score(y_test, y_test_pred))
print('roc_auc:', roc_auc_score(y_test, y_test_pred, average='weighted'))
print(classification_report(y_test, y_test_pred))


# ## xgboost.XGBClassifier

# In[54]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(3, 7), max_features=5000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values

from scipy.stats import uniform
from xgboost import XGBClassifier

param_distribs = {
    'n_estimators': randint(low=50, high=1000),
    'max_depth': randint(low=1, high=100),
    'reg_lambda': uniform(0, 1),
    'min_child_weight': randint(0, 10),
}
gb_clf = XGBClassifier(random_state=42, use_label_encoder=False, verbosity=0, n_iter_no_change=5)
rnd_search = RandomizedSearchCV(gb_clf, param_distributions=param_distribs, n_iter=30,
                                scoring='f1', n_jobs=-1, refit=True, cv=5, random_state=42)
rnd_search = rnd_search.fit(X_train, y_train)
pd.DataFrame(rnd_search.cv_results_)[['mean_test_score', 'params']].sort_values('mean_test_score', ascending=False)


# In[55]:


y_test_pred = rnd_search.best_estimator_.predict(X_test)
print('balanced_accuracy:', balanced_accuracy_score(y_test, y_test_pred))
print('roc_auc:', roc_auc_score(y_test, y_test_pred, average='weighted'))
print(classification_report(y_test, y_test_pred))


# ## DNN

# In[56]:


cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'weekday']  # discard start_hour and uncorrelated year/month/day
num_cols = ['start_sin', 'start_cos', 'length_session']
feature_encode_pipeline = ColumnTransformer([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num_scaler', StandardScaler(), num_cols),
    ('site_count_tfidf', TfidfVectorizer(token_pattern=r"(?u)\b[-\w@:%.\+~#=][-\w@:%.\+~#=]+\b",
                                         ngram_range=(3, 7), max_features=5000), 'sites_corpus'),
])

feature_encode_pipeline = feature_encode_pipeline.fit(user_sessions_train)
X_train = feature_encode_pipeline.transform(user_sessions_train)
X_test = feature_encode_pipeline.transform(user_sessions_test)
y_train = (user_sessions_train['user_id'] == 0).astype(int).values  # set Joe as positive label so we can use the binary F1 metric
y_test = (user_sessions_test['user_id'] == 0).astype(int).values


# In[58]:


from tensorflow import keras

dnn = keras.models.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),  # one flat feature vector per session
    keras.layers.Dense(1000, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(600, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(300, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(100, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(10, activation='relu', kernel_initializer='he_normal'),
    keras.layers.Dense(1, activation='sigmoid'),
])
dnn.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['acc'])
history = dnn.fit(X_train, y_train, epochs=30)


# In[59]:


y_train_pred = (dnn.predict(X_train) > 0.5).astype(int)
print(classification_report(y_train, y_train_pred))
print('roc_auc:', roc_auc_score(y_train, y_train_pred, average='macro'))


# In[60]:


y_test_pred = (dnn.predict(X_test) > 0.5).astype(int)
print(classification_report(y_test, y_test_pred))
print('roc_auc:', roc_auc_score(y_test, y_test_pred, average='macro'))


# # Summary
#
# - The dataset is small and skewed/imbalanced for binary classification.
# - Feature extraction and selection
#     - Users' visited web sites are the key features in this problem.
#     - Two vectorization methods are used to extract the visited-websites feature:
#         - Vectorize by visit length
#         - Vectorize by visit count
#     - N-gram sequences up to 7-grams are used to preserve visit order, which matters a lot for model performance.
#     - TF-IDF normalization follows each vectorization method.
# - Models
#     - Random forest, gradient boosting and a DNN are tested without much tuning.
#     - All models reach an F1 score of about 0.89-0.9 on the test dataset, indicating a performance bound with these features.
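# - Finally, a minimal sketch of how the fitted feature pipeline and one of the tuned models
#   (here the XGBoost search result rnd_search.best_estimator_) could be applied to the held-out
#   "verify.json" sessions to flag the ones predicted to be Joe. This assumes verify.json has the
#   same schema as dataset.json (it may not include user_id) and is illustrative rather than a
#   final submission step.

# In[ ]:


verify_sessions_raw = load_data('verify.json')
verify_sessions = prepare_data(verify_sessions_raw.copy(), has_labels=False)

X_verify = feature_encode_pipeline.transform(verify_sessions)
joe_pred = rnd_search.best_estimator_.predict(X_verify)  # 1 = predicted to be Joe

print("Sessions predicted as Joe:", int(joe_pred.sum()), "out of", len(joe_pred))
verify_sessions[joe_pred == 1].head()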