#!/usr/bin/env python
# coding: utf-8

# In[32]:

import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html
import re

# In[33]:

# Local path to the competition data ("Catch Me If You Can" / Alice).
PATH_TO_DATA = '/Users/user/Dropbox/ods/alice/'
# Fixed random state for reproducibility of the logistic regression runs.
SEED = 17

# In[34]:

def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict, vectorizer_params):
    """Load train/test session CSVs and build TF-IDF sparse matrices over site names.

    Parameters:
        path_to_train: path to train_sessions.csv (indexed by session_id,
            with time1..time10 parsed as datetimes).
        path_to_test: path to test_sessions.csv, same layout.
        path_to_site_dict: pickled dict mapping site name -> integer id,
            as provided by the competition organizers.
        vectorizer_params: kwargs passed straight to TfidfVectorizer.

    Returns:
        (X_train, X_test, y_train, vectorizer, train_times, test_times) where
        X_* are sparse TF-IDF matrices, y_train is an int array of targets,
        and *_times are the raw time1..time10 DataFrames kept for later
        feature engineering.
    """
    times = ['time%s' % i for i in range(1, 11)]
    train_df = pd.read_csv(path_to_train, index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test, index_col='session_id', parse_dates=times)
    # Sort the data by time
    train_df = train_df.sort_values(by='time1')
    # Keep only sessions inside this date window (string comparison against
    # datetime works in pandas); NOTE(review): window bounds look hand-tuned
    # to the competition's train period — confirm before reuse.
    train_df = train_df.loc[(train_df.time1 < '2014-04-16') & (train_df.time1 > '2013-02-12')]
    # Reading site -> id mapping provided by competition organizers
    with open(path_to_site_dict, 'rb') as f:
        site2id = pickle.load(f)
    # Create an inverse id -> site mapping; strip 'www.' so that e.g.
    # 'www.google.com' and 'google.com' collapse to one token.
    id2site = {v:k.replace('www.', '') for (k, v) in site2id.items()}
    # We treat site with id 0 as "unknown" (0 is the fillna value for short sessions)
    id2site[0] = 'unknown'
    # Transform data into format which can be fed into TfidfVectorizer.
    # This time we prefer to represent sessions with site names, not site ids.
    # It's less efficient but thus it'll be more convenient to interpret model weights.
    sites = ['site%s' % i for i in range(1, 11)]
    train_sessions = train_df[sites].fillna(0).astype('int').apply(lambda row: ' '.join([id2site[i] for i in row]), axis=1).tolist()
    test_sessions = test_df[sites].fillna(0).astype('int').apply(lambda row: ' '.join([id2site[i] for i in row]), axis=1).tolist()
    # We'll tell TfidfVectorizer that we'd like to split data by whitespaces only
    # So that it doesn't split by dots (we wouldn't like to have 'mail.google.com'
    # To be split into 'mail', 'google' and 'com')
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values
    # We'll need site visit times for further feature engineering
    train_times, test_times = train_df[times], test_df[times]
    return X_train, X_test, y_train, vectorizer, train_times, test_times

# In[35]:

# Build the TF-IDF features (whitespace tokenizer keeps full domain names intact).
get_ipython().run_cell_magic('time', '', "X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times = prepare_sparse_features(\n path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),\n path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),\n path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),\n vectorizer_params={'ngram_range': (1, 5), \n 'max_features': 25000,\n 'sublinear_tf': True,\n 'tokenizer': lambda s: s.split()}\n)\n")

# In[36]:

# Lets draw the distribution of all session start hours
session_start_hour = train_times['time1'].apply(lambda ts: ts.hour).values
sns.countplot(session_start_hour)

# Now the same separately for Alice and everybody else.
# In[37]:

# Session start hour distributions, Alice vs. everybody else.
plt.subplots(1, 2, figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.countplot(session_start_hour[y_train == 1])
plt.title("Alice")
plt.xlabel('Session start hour')
plt.subplot(1, 2, 2)
sns.countplot(session_start_hour[y_train == 0])
plt.title('Others')
plt.xlabel('Session start hour');

# Now we definitely see that Alice mostly prefers 4-5 pm for browsing

# In[38]:

# Time-aware CV: every fold trains on the past and validates on the future,
# which matches how the test period follows the train period.
time_split = TimeSeriesSplit(n_splits=10)

# In[39]:

logit = LogisticRegression(C=1, random_state=SEED, solver='liblinear')

# In[40]:

# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    """Write predicted probabilities/labels to a submission CSV.

    Rows are indexed 1..n to match the session_id numbering the
    competition expects.
    """
    predicted_df = pd.DataFrame(predicted_labels,
                                index=np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

# In[41]:

def train_and_predict(model, X_train, y_train, X_test, site_feature_names=vectorizer.get_feature_names(),
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    """Cross-validate `model`, fit it on all of X_train, show eli5 weights,
    and write test-set predictions to `submission_file_name`.

    NOTE: `site_feature_names` default is evaluated once at definition time
    against the global `vectorizer` — fine for this notebook's flow, but the
    cell must run after the vectorizer is fitted.

    Returns the array of per-fold CV scores.
    """
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))

    model.fit(X_train, y_train)

    # Names of the engineered columns are appended after the site n-gram names,
    # mirroring the hstack order used when the matrices were built.
    if new_feature_names:
        all_feature_names = site_feature_names + new_feature_names
    else:
        all_feature_names = site_feature_names
    display_html(eli5.show_weights(estimator=model,
                                   feature_names=all_feature_names,
                                   top=top_n_features_to_show))

    if new_feature_names:
        # The engineered features occupy the trailing coefficients.
        print('New feature weights:')
        print(pd.DataFrame({'feature': new_feature_names,
                            'coef': model.coef_.flatten()[-len(new_feature_names):]}))

    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name)
    return cv_scores

# In[42]:

# Adding new features
def add_time_features(times, X_sparse, add_hour=True):
    """Append hand-crafted time-of-day/session features to the sparse matrix.

    Parameters:
        times: DataFrame with the time1..time10 datetime columns.
        X_sparse: sparse matrix the new columns are hstacked onto.
        add_hour: if True, also append the raw start hour scaled by 24.

    Returns:
        (X, feature_names) — feature_names lists the appended columns in the
        exact order they were stacked, so downstream weight reports line up.
    """
    hour = times['time1'].apply(lambda ts: ts.hour)
    # Binary daypart indicators based on session start hour.
    morning = ((hour >= 7) & (hour <= 11)).astype('int').values.reshape(-1, 1)
    day = ((hour >= 12) & (hour <= 18)).astype('int').values.reshape(-1, 1)
    evening = ((hour >= 19) & (hour <= 23)).astype('int').values.reshape(-1, 1)

    month = times['time1'].apply(lambda ts: ts.month)
    # .astype('int') added for consistency with the other indicator columns
    # (values are unchanged: True/False -> 1/0).
    summer = ((month >= 6) & (month <= 8)).astype('int').values.reshape(-1, 1)

    # Hours in which Alice is most active (seen in the countplots above).
    alice_hour = [12, 13, 16, 17, 18]
    alice_hours = hour.apply(lambda x: 1 if x in alice_hour else 0).values.reshape(-1, 1)

    # Session length in ms, dampened with a 0.2 power to tame the long tail.
    session_duration = ((times.max(axis=1) - times.min(axis=1))
                        .astype('timedelta64[ms]').astype(int) ** 0.2).values.reshape(-1, 1)
    number_of_sites = times.isnull().sum(axis=1).apply(lambda x: 10 - x).astype('int').values.reshape(-1, 1)
    # NOTE(review): session_duration already carries a ** 0.2, so this applies
    # the dampening twice — confirm the double power is intentional.
    time_per_site = ((session_duration / number_of_sites) ** 0.2).astype('int')

    objects_to_hstack = [X_sparse, morning, day, evening, summer,
                         number_of_sites, time_per_site, alice_hours]
    # BUG FIX: feature_names previously began with 'summer' while the columns
    # were stacked morning-first, so the first four labels were shifted by one
    # and eli5 / the "New feature weights" table mislabeled those features.
    # Names now follow the stacking order exactly.
    feature_names = ['morning', 'day', 'evening', 'summer',
                     'number_of_sites', 'time_per_site', 'alice_hours']
    if add_hour:
        # We'll do it right and scale hour dividing by 24
        objects_to_hstack.append(hour.values.reshape(-1, 1) / 24)
        feature_names.append('hour')

    X = hstack(objects_to_hstack)
    return X, feature_names

# In[43]:

X_train_with_times2, new_feat_names = add_time_features(train_times, X_train_sites, add_hour=False)
X_test_with_times2, _ = add_time_features(test_times, X_test_sites, add_hour=False)

# In[44]:

# Standardize raw session duration (ms); the scaler is fit on train only to
# avoid leaking test statistics.
train_durations = (train_times.max(axis=1) - train_times.min(axis=1)).astype('timedelta64[ms]').astype(int)
test_durations = (test_times.max(axis=1) - test_times.min(axis=1)).astype('timedelta64[ms]').astype(int)

scaler = StandardScaler()
train_dur_scaled = scaler.fit_transform(train_durations.values.reshape(-1, 1))
test_dur_scaled = scaler.transform(test_durations.values.reshape(-1, 1))

# In[45]:

X_train_with_time_correct = hstack([X_train_with_times2, train_dur_scaled])
X_test_with_time_correct = hstack([X_test_with_times2, test_dur_scaled])

# In[46]:

def add_day_month(times, X_sparse):
    """Append day-of-week and a linear year-month trend to the sparse matrix.

    Returns (X, feature_names) with names in stacking order.
    """
    day_of_week = times['time1'].apply(lambda t: t.weekday()).values.reshape(-1, 1)
    # (An unused standalone `month` column was computed here before; removed.)
    # Linear trend: time in a form YYYYMM, we'll divide by 1e5 to scale this feature
    year_month = times['time1'].apply(lambda t: 100 * t.year + t.month).values.reshape(-1, 1) / 1e5

    objects_to_hstack = [X_sparse, day_of_week, year_month]
    feature_names = ['day_of_week', 'year_month']
    X = hstack(objects_to_hstack)
    return X, feature_names

# In[47]:

X_train_final, more_feat_names = add_day_month(train_times, X_train_with_time_correct)
X_test_final, _ = add_day_month(test_times, X_test_with_time_correct)

# In[48]:

# 'sess_duration' labels the scaled-duration column stacked in In[45],
# which sits between the time features and the day/month features.
cv_scores6 = train_and_predict(model=logit, X_train=X_train_final, y_train=y_train,
                               X_test=X_test_final,
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names + ['sess_duration'] + more_feat_names,
                               cv=time_split, submission_file_name='alice_subm.csv')

# In[49]:

# Here we've already narrowed down c_values to such a range
c_values = np.logspace(-3, 1, 20)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                   scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

# In[50]:

get_ipython().run_cell_magic('time', '', 'logit_grid_searcher.fit(X_train_final, y_train); \n')

# In[51]:

logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

# In[52]:

final_model = logit_grid_searcher.best_estimator_

# In[53]:

cv_scores7 = train_and_predict(model=final_model, X_train=X_train_final, y_train=y_train,
                               X_test=X_test_final,
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names + ['sess_duration'] + more_feat_names,
                               cv=time_split, submission_file_name='alice_subm_final.csv')