# Python libs
import json
from collections import Counter
import numpy as np
import pandas as pd
from dython.nominal import cramers_v, theils_u, correlation_ratio
from scipy.stats import randint
# Date/time/timezone
import datetime as dt
import pytz
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder
# scikit-learn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Module settings
# Default matplotlib figure style: white background, 144 dpi.
mpl.rc("figure", facecolor="white", dpi=144)
pd.set_option('expand_frame_repr', False) # display dataframe without wrapping
# NOTE: the two `!` lines below are IPython/Jupyter shell escapes, not Python
# statements, so they only work when this code runs inside a notebook.
# Each one prints the first 50 lines of the named JSON file.
!head -n 50 dataset.json
!head -n 50 test.json
# Load the raw session records and wrap them in a DataFrame.
# The encoding is pinned to UTF-8 (the encoding JSON interchange uses) so that
# decoding does not depend on the platform's locale default.
with open('dataset.json', 'r', encoding='utf-8') as f:
    data_json_struct = json.load(f)
user_sessions = pd.DataFrame(data_json_struct)

# Quick look at one raw record, the first rows and the column summary.
print(data_json_struct[0])
user_sessions.head(20)
print('\n')
user_sessions.info()
# Sessions whose `sites` entry has length zero get a single zero-length
# placeholder entry, so later per-site code always sees at least one item.
empty_sessions = user_sessions.query("sites.str.len() == 0")
empty_sessions
empty_sites_index = empty_sessions.index


def _with_placeholder_site(sites):
    """Return *sites* extended by one placeholder entry of length 0."""
    return sites + [{'site': 'NONE.NONE', 'length': 0}]


user_sessions.loc[empty_sites_index, 'sites'] = (
    user_sessions.loc[empty_sites_index, 'sites'].apply(_with_placeholder_site)
)
user_sessions.loc[empty_sites_index]
# Combine the separate date and time strings into one UTC-aware timestamp,
# then discard the two source columns.
date_and_time = user_sessions['date'] + ' ' + user_sessions['time']
user_sessions['start_dt'] = pd.to_datetime(date_and_time, utc=True)
user_sessions.drop(columns=['time', 'date'], inplace=True)
user_sessions
class TimezoneByCity:
    """Look up the timezone of a city.

    The city name is geocoded with Nominatim and the resulting coordinates
    are handed to TimezoneFinder.
    """

    def __init__(self):
        self.geolocator = Nominatim(user_agent="geoapiExercises")
        self.tzfinder = TimezoneFinder()

    def tz_name(self, city: str):
        """Return the timezone name for *city*.

        Raises:
            ValueError: if the geocoder finds no match for *city*, or if no
                timezone is found for the geocoded coordinates.
        """
        loc = self.geolocator.geocode(city)
        # The geocoder yields None when it finds no match; fail with a clear
        # message instead of an AttributeError on `loc.longitude`.
        if loc is None:
            raise ValueError(f"Could not geocode city {city!r}")
        tz_name = self.tzfinder.timezone_at(lng=loc.longitude, lat=loc.latitude)
        if tz_name is None:
            raise ValueError(
                f"No timezone found for city {city!r} "
                f"(lat={loc.latitude}, lng={loc.longitude})"
            )
        return tz_name

    def tz(self, city: str):
        """Return the pytz timezone object for *city* (see `tz_name`)."""
        return pytz.timezone(self.tz_name(city))
tz_by_city = TimezoneByCity()

# Map every distinct location string to a timezone name. The part after the
# first '/' is what gets geocoded.
timezone_tbl = {}
for location in user_sessions.location.unique():
    city_name = location.split('/')[1]
    timezone_tbl[location] = tz_by_city.tz_name(city_name)
print(timezone_tbl)


def _to_local_naive(row):
    """Convert the row's UTC start time to its location's wall-clock time."""
    local = row['start_dt'].tz_convert(timezone_tbl[row['location']])
    # Drop the tz info so every row holds a naive local timestamp.
    return local.tz_localize(None)


user_sessions['local_time'] = user_sessions.apply(_to_local_naive, axis=1)
user_sessions
# Calendar parts of the local start time: (new column, datetime attribute).
local_dt = user_sessions.local_time.dt
for column, attribute in (
    ("year", "year"),
    ("month", "month"),
    ("day", "day"),
    ("weekday", "weekday"),
    ("start_hour", "hour"),
):
    user_sessions[column] = getattr(local_dt, attribute)
user_sessions
# Fraction of the day elapsed at session start (seconds since local midnight
# divided by 86400), mapped onto a circle via sine and cosine.
midnight = user_sessions['local_time'].dt.normalize()
elapsed_seconds = (user_sessions['local_time'] - midnight) / pd.Timedelta('1 second')
start_dt_normalized = elapsed_seconds / 86400
day_angle = 2 * np.pi * start_dt_normalized
user_sessions['start_sin'] = np.sin(day_angle)
user_sessions['start_cos'] = np.cos(day_angle)
user_sessions
# Split the location string on '/' into country and city columns.
location_parts = user_sessions['location'].str.split('/', expand=True)
user_sessions[['country', 'city']] = location_parts
user_sessions


def _session_length(session_sites):
    """Sum the 'length' of every site entry in one session."""
    total = 0
    for site_entry in session_sites:
        total += site_entry['length']
    return total


user_sessions['length_session'] = user_sessions['sites'].apply(_session_length)
user_sessions
def _top_sites_by_length(sessions, n_top):
    """Total the 'length' per site over *sessions* and rank the sites.

    Args:
        sessions: iterable of sessions, each an iterable of entries with
            'site' and 'length' keys.
        n_top: maximum number of sites to return.

    Returns:
        (totals, top_sites): a Counter mapping site -> summed length, and a
        tuple of at most *n_top* site names ordered by descending total.
        The tuple is empty when no site was seen.
    """
    totals = Counter()
    for session in sessions:
        for entry in session:
            totals[entry['site']] += entry['length']
    # Build the tuple directly instead of unpacking zip(*...), which raises
    # when the counter is empty.
    top_sites = tuple(site for site, _ in totals.most_common(n_top))
    return totals, top_sites


n_top = 100

# Sites of user 0 (called "joe" in the printed output).
joe_cnt, joe_top_sites = _top_sites_by_length(
    user_sessions.query('user_id == 0')['sites'], n_top
)
print("Total sites joe visited: ", len(joe_cnt))
print(f"Top {n_top} sites joe visited: \n", joe_top_sites[:n_top])

# Sites across every user.
all_cnt, all_top_sites = _top_sites_by_length(user_sessions['sites'], n_top)
print("Total sites all users visited: ", len(all_cnt))
print(f"Top {n_top} sites all users visited: \n", all_top_sites[:n_top])
def get_topsites_length(session_sites: list, top_sites=all_top_sites):
    """Return the time spent on each of *top_sites* within one session.

    The result is a list with one total per site in *top_sites*, in the same
    order; entries for sites outside *top_sites* are ignored.
    """
    totals = {name: 0 for name in top_sites}
    for entry in session_sites:
        name = entry['site']
        if name not in totals:
            continue
        totals[name] += entry['length']
    return [totals[name] for name in totals]
# One vector of per-top-site totals for every session.
topsites_length = user_sessions['sites'].apply(get_topsites_length)
topsites_length

# TF-IDF weighting of those per-session vectors.
tfidf = TfidfTransformer()
topsites_matrix = topsites_length.values.tolist()
topsites_tfidf = tfidf.fit_transform(topsites_matrix)
topsites_tfidf.toarray()[:2]

# Binary target: 0 for user_id 0, 1 for every other user.
is_other_user = user_sessions['user_id'] != 0
user_sessions['target'] = is_other_user.astype(int)
user_sessions
def set_xlabel_rotation(ax, deg=90):
    """Rotate every x tick label of *ax* by *deg* degrees.

    Args:
        ax: object exposing `get_xticklabels()`, e.g. a matplotlib Axes.
        deg: rotation passed to each label's `set_rotation`.
    """
    for label in ax.get_xticklabels():
        label.set_rotation(deg)
# Histograms of user_id (one bin per value) and of session length.
fig, ax = plt.subplots(1, 2, figsize=(16, 4))
user_axis, length_axis = ax.flatten()
p = sns.histplot(user_sessions[['user_id']], ax=user_axis, discrete=True)
p = sns.histplot(user_sessions[['length_session']], ax=length_axis, bins=200)
# Count plots for browser / os / locale; only the locale labels are rotated.
fig, ax = plt.subplots(1, 3, figsize=(21, 4))
for column, panel in zip(('browser', 'os', 'locale'), ax.flatten()):
    p = sns.countplot(data=user_sessions, x=column, ax=panel)
locale_axis = ax.flatten()[2]
for label in locale_axis.get_xticklabels():
    label.set_rotation(90)

# Count plots for gender / city / country; city and country labels are rotated.
fig, ax = plt.subplots(1, 3, figsize=(21, 4))
for column, panel in zip(('gender', 'city', 'country'), ax.flatten()):
    p = sns.countplot(data=user_sessions, x=column, ax=panel)
    if column == 'gender':
        continue
    for label in panel.get_xticklabels():
        label.set_rotation(90)
# Count plots for year / month / day; only the day labels are rotated.
fig, ax = plt.subplots(1, 3, figsize=(16, 4))
for column, panel in zip(('year', 'month', 'day'), ax.flatten()):
    p = sns.countplot(data=user_sessions, x=column, ax=panel)
day_axis = ax.flatten()[2]
for label in day_axis.get_xticklabels():
    label.set_rotation(90)

# Count plots for weekday and start hour.
fig, ax = plt.subplots(1, 2, figsize=(16, 4))
for column, panel in zip(('weekday', 'start_hour'), ax.flatten()):
    p = sns.countplot(data=user_sessions, x=column, ax=panel)
# Session length per browser / os / locale, split by target.
fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for sub_ax, feat in zip(ax.flatten(), ('browser', 'os', 'locale')):
    p = sns.boxplot(
        data=user_sessions, x=feat, y='length_session', hue='target',
        palette='rainbow', ax=sub_ax,
    )
set_xlabel_rotation(ax.flatten()[2], 90)

# Session length per gender / city / country, split by target.
fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for sub_ax, feat in zip(ax.flatten(), ('gender', 'city', 'country')):
    p = sns.boxplot(
        data=user_sessions, x=feat, y='length_session', hue='target',
        palette='rainbow', ax=sub_ax,
    )
    if feat in ('city', 'country'):
        set_xlabel_rotation(sub_ax, 90)
# Session length per year / month / day, split by target.
fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for sub_ax, feat in zip(ax.flatten(), ('year', 'month', 'day')):
    p = sns.boxplot(
        data=user_sessions, x=feat, y='length_session', hue='target',
        palette='rainbow', ax=sub_ax,
    )
    if feat == 'day':
        set_xlabel_rotation(sub_ax, 90)

# Session length per weekday and start hour, split by target.
fig, ax = plt.subplots(1, 2, figsize=(16, 4))
for sub_ax, feat in zip(ax.flatten(), ('weekday', 'start_hour')):
    p = sns.boxplot(
        data=user_sessions, x=feat, y='length_session', hue='target',
        palette='rainbow', ax=sub_ax,
    )
# Association of each categorical feature with user_id, strongest first.
cat_cols = ['browser', 'os', 'locale', 'gender', 'country', 'city', 'year', 'month', 'day', 'weekday', 'start_hour']

# Cramér's V for every feature.
crv_scores = [cramers_v(user_sessions[cat_feat], user_sessions['user_id']) for cat_feat in cat_cols]
cat_feat_target_crv = pd.Series(crv_scores, index=cat_cols, name='CramersV')
cat_feat_target_crv = cat_feat_target_crv.sort_values(ascending=False)
cat_feat_target_crv

# Theil's U for every feature.
thu_scores = [theils_u(user_sessions[feat], user_sessions['user_id']) for feat in cat_cols]
cat_feat_target_thu = pd.Series(thu_scores, index=cat_cols, name='TheilsU')
cat_feat_target_thu = cat_feat_target_thu.sort_values(ascending=False)
cat_feat_target_thu
# Stack both statistics into one long table with columns
# Features / Correlation / Stats for a grouped bar plot.
cat_target_corr = pd.DataFrame({'Correlation': cat_feat_target_crv})
cat_target_corr['Stats'] = "Cramer's V"
df = pd.DataFrame({'Correlation': cat_feat_target_thu})
df['Stats'] = "Theil's U"
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and keeps each frame's feature-name index.
cat_target_corr = (
    pd.concat([cat_target_corr, df])
    .reset_index()
    .rename(columns={'index': 'Features'})
)

fig, ax = plt.subplots(1, 1, figsize=(8.5, 4))
p = sns.barplot(data=cat_target_corr, x='Features', y='Correlation', hue='Stats', palette=sns.color_palette('rainbow', 3))
plt.title('Correlation between Categorical Features and Target user_id')