Catch Joe

In [2]:
# Python libs
import json
from collections import Counter
import numpy as np
import pandas as pd
from dython.nominal import cramers_v, theils_u, correlation_ratio
from scipy.stats import randint

# Date/time/timezone
import datetime as dt
import pytz
from geopy.geocoders import Nominatim
from timezonefinder import TimezoneFinder

# scikit-learn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display settings
# White figure background at a higher DPI so plots stay readable on any theme.
mpl.rcParams.update({"figure.facecolor": "white", "figure.dpi": 144})
# Print wide dataframes on a single row instead of wrapping the columns.
pd.set_option('display.expand_frame_repr', False)

Data Preparation

Check data structure

  • Two data files are given: the training data “dataset.json” and test data “verify.json”.
  • The training data file is a large JSON file (about 77 MB), so we'll first peek at the beginning of the file to check its structure.
In [3]:
# Show the first 50 lines of the training file via the shell instead of loading it whole.
!head -n 50 dataset.json
[
    {
        "browser": "Firefox",
        "os": "Ubuntu",
        "locale": "ru_RU",
        "user_id": 105,
        "gender": "m",
        "location": "USA/Chicago",
        "sites": [
            {
                "site": "mail.google.com",
                "length": 50
            },
            {
                "site": "toptal.com",
                "length": 132
            },
            {
                "site": "slack.com",
                "length": 65
            },
            {
                "site": "lenta.ru",
                "length": 59
            },
            {
                "site": "youtube.com",
                "length": 67
            },
            {
                "site": "chitay-knigi.ru",
                "length": 108
            }
        ],
        "time": "09:03:00",
        "date": "2017-01-08"
    },
    {
        "browser": "Firefox",
        "os": "Windows 8",
        "locale": "pl-PL",
        "user_id": 11,
        "gender": "m",
        "location": "USA/Chicago",
        "sites": [
            {
                "site": "meduza.org",
                "length": 40
            },
            {
In [4]:
# Show the first 50 lines of the test file via the shell.
# NOTE(review): the introduction names the test file "verify.json" — confirm which filename is correct.
!head -n 50 test.json
[
    {
        "browser": "Chrome",
        "os": "Ubuntu",
        "locale": "ru_RU",
        "gender": "m",
        "location": "Canada/Toronto",
        "sites": [
            {
                "site": "mail.google.com",
                "length": 383
            },
            {
                "site": "vk.com",
                "length": 108
            },
            {
                "site": "toptal.com",
                "length": 138
            },
            {
                "site": "lenta.ru",
                "length": 47
            },
            {
                "site": "slack.com",
                "length": 93
            },
            {
                "site": "gazzettaobjects.it",
                "length": 92
            },
            {
                "site": "youtube.com",
                "length": 48
            }
        ],
        "time": "16:59:00",
        "date": "2017-09-05"
    },
    {
        "browser": "Chrome",
        "os": "Ubuntu",
        "locale": "ru_RU",
        "gender": "m",
        "location": "Canada/Toronto",
        "sites": [
            {
                "site": "mail.google.com",
                "length": 59
  • It looks like the training data has 9 fields, and the sites field is a nested list that contains all the sites the user visited in the session.
  • To transform the nested JSON data into useful features, we'll vectorize the sites column.

Load data

In [5]:
# Parse the whole training file into a list of session dicts, one per row.
with open('dataset.json', 'r') as f:
    data_json_struct = json.load(f)

# One dataframe row per session; the nested "sites" list stays a Python object.
user_sessions = pd.DataFrame(data_json_struct)
print(data_json_struct[0])
{'browser': 'Firefox', 'os': 'Ubuntu', 'locale': 'ru_RU', 'user_id': 105, 'gender': 'm', 'location': 'USA/Chicago', 'sites': [{'site': 'mail.google.com', 'length': 50}, {'site': 'toptal.com', 'length': 132}, {'site': 'slack.com', 'length': 65}, {'site': 'lenta.ru', 'length': 59}, {'site': 'youtube.com', 'length': 67}, {'site': 'chitay-knigi.ru', 'length': 108}], 'time': '09:03:00', 'date': '2017-01-08'}

Data Inspection

In [6]:
# A bare expression in the middle of a cell is only rendered when IPython's
# ast_node_interactivity is set to "all"; display() shows the table regardless
# of that setting.
display(user_sessions.head(20))
print('\n')
# Column dtypes and non-null counts.
user_sessions.info()
Out[6]:
browser os locale user_id gender location sites time date
0 Firefox Ubuntu ru_RU 105 m USA/Chicago [{'site': 'mail.google.com', 'length': 50}, {'... 09:03:00 2017-01-08
1 Firefox Windows 8 pl-PL 11 m USA/Chicago [{'site': 'meduza.org', 'length': 40}, {'site'... 13:57:00 2016-10-05
2 Chrome Ubuntu zh-CN 17 m Singapore/Singapore [{'site': 'facebook.net', 'length': 74}, {'sit... 02:06:00 2017-03-28
3 Chrome Windows 10 pt-BR 134 f Australia/Sydney [{'site': 'verisign.com', 'length': 111}, {'si... 21:49:00 2017-06-25
4 Firefox Windows 10 en-SG 92 f USA/Chicago [{'site': 'live.com', 'length': 79}, {'site': ... 00:05:00 2016-02-10
5 Internet Explorer Windows 8 xh-ZA 120 m France/Paris [{'site': 'cnn.com', 'length': 65}, {'site': '... 14:55:00 2017-03-28
6 Chrome Ubuntu ja-JP 158 f Germany/Berlin [{'site': 'toptal.com', 'length': 59}, {'site'... 21:26:00 2017-08-14
7 Chrome Windows 10 en-NZ 34 m USA/Chicago [{'site': 'google.com', 'length': 86}, {'site'... 23:00:00 2016-02-19
8 Internet Explorer Windows 7 en-CA 173 m USA/San Francisco [{'site': 'booking.com', 'length': 93}, {'site... 17:08:00 2017-01-04
9 Safari MacOS zh-CN 51 m Australia/Sydney [{'site': 'bing.com', 'length': 166}, {'site':... 22:31:00 2016-01-20
10 Chrome Windows 8 pt-BR 46 f China/Shanghai [{'site': 'slack.com', 'length': 64}, {'site':... 11:06:00 2016-11-26
11 Safari MacOS xh-ZA 182 f New Zealand/Auckland [{'site': 'googleapis.com', 'length': 60}, {'s... 21:35:00 2016-07-15
12 Chrome Windows 10 en-CA 130 m New Zealand/Auckland [{'site': 'booking.com', 'length': 51}, {'site... 17:44:00 2017-05-27
13 Safari MacOS ur-PK 77 f Malaysia/Kuala Lumpur [{'site': 'verisign.com', 'length': 190}, {'si... 10:33:00 2017-01-22
14 Chrome Windows 10 nl-NL 91 m USA/New York [{'site': 'googlevideo.com', 'length': 98}, {'... 11:32:00 2016-04-28
15 Chrome Windows 10 bg-BG 121 m Malaysia/Kuala Lumpur [{'site': 'youtube.com', 'length': 173}, {'sit... 01:48:00 2017-04-17
16 Firefox Windows 7 en-SG 26 m Russia/Moscow [{'site': 'googleapis.com', 'length': 96}, {'s... 16:17:00 2017-01-22
17 Chrome Windows 10 ro-RO 188 f Russia/Moscow [{'site': 'facebook.com', 'length': 392}, {'si... 08:27:00 2017-08-12
18 Firefox Windows 7 uk-UA 176 f New Zealand/Auckland [{'site': 'youtube.com', 'length': 82}, {'site... 00:00:00 2016-06-16
19 Safari MacOS fr-FR 136 f Brazil/Rio de Janeiro [{'site': 'baidu.com', 'length': 240}, {'site'... 13:13:00 2017-06-10
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   browser   80000 non-null  object
 1   os        80000 non-null  object
 2   locale    80000 non-null  object
 3   user_id   80000 non-null  int64 
 4   gender    80000 non-null  object
 5   location  80000 non-null  object
 6   sites     80000 non-null  object
 7   time      80000 non-null  object
 8   date      80000 non-null  object
dtypes: int64(1), object(8)
memory usage: 5.5+ MB
In [7]:
# Sessions whose "sites" list is empty.
user_sessions[user_sessions['sites'].str.len() == 0]
Out[7]:
browser os locale user_id gender location sites time date
312 Chrome Windows 7 uk-UA 12 m Canada/Vancouver [] 17:45:00 2017-08-12
604 Firefox Windows 10 en-GB 78 f Japan/Tokyo [] 08:36:00 2016-11-01
821 Firefox Windows 8 pt-BR 172 m China/Shanghai [] 05:48:00 2016-10-13
945 Chrome Windows 8 en-AU 108 m France/Paris [] 11:24:00 2016-10-14
1073 Internet Explorer Windows 10 nl-NL 75 m Germany/Berlin [] 01:57:00 2016-06-12
... ... ... ... ... ... ... ... ... ...
78883 Chrome Windows 10 vi-VN 93 m Singapore/Singapore [] 12:10:00 2016-03-20
79149 Safari MacOS it-IT 32 m Netherlands/Amsterdam [] 13:03:00 2016-11-23
79603 Firefox Windows 8 pt-BR 172 m China/Shanghai [] 05:28:00 2016-08-12
79903 Safari MacOS nl-NL 197 m Canada/Toronto [] 02:37:00 2016-12-09
79904 Chrome Windows 8 en-AU 108 m France/Paris [] 14:45:00 2017-01-27

393 rows × 9 columns

Prepare Data: Impute Empty Sites and Add Custom Features

Impute empty sites

In [8]:
# Locate the sessions that recorded no site at all.
has_no_sites = user_sessions['sites'].str.len() == 0
empty_sites_index = user_sessions.index[has_no_sites]

# Give each of them a single zero-length placeholder entry so that later
# per-site processing always has at least one element to work with.
imputed_sites = user_sessions.loc[empty_sites_index, 'sites'].apply(
    lambda sites: sites + [{'site': 'NONE.NONE', 'length': 0}]
)
user_sessions.loc[empty_sites_index, 'sites'] = imputed_sites
user_sessions.loc[empty_sites_index]
Out[8]:
browser os locale user_id gender location sites time date
312 Chrome Windows 7 uk-UA 12 m Canada/Vancouver [{'site': 'NONE.NONE', 'length': 0}] 17:45:00 2017-08-12
604 Firefox Windows 10 en-GB 78 f Japan/Tokyo [{'site': 'NONE.NONE', 'length': 0}] 08:36:00 2016-11-01
821 Firefox Windows 8 pt-BR 172 m China/Shanghai [{'site': 'NONE.NONE', 'length': 0}] 05:48:00 2016-10-13
945 Chrome Windows 8 en-AU 108 m France/Paris [{'site': 'NONE.NONE', 'length': 0}] 11:24:00 2016-10-14
1073 Internet Explorer Windows 10 nl-NL 75 m Germany/Berlin [{'site': 'NONE.NONE', 'length': 0}] 01:57:00 2016-06-12
... ... ... ... ... ... ... ... ... ...
78883 Chrome Windows 10 vi-VN 93 m Singapore/Singapore [{'site': 'NONE.NONE', 'length': 0}] 12:10:00 2016-03-20
79149 Safari MacOS it-IT 32 m Netherlands/Amsterdam [{'site': 'NONE.NONE', 'length': 0}] 13:03:00 2016-11-23
79603 Firefox Windows 8 pt-BR 172 m China/Shanghai [{'site': 'NONE.NONE', 'length': 0}] 05:28:00 2016-08-12
79903 Safari MacOS nl-NL 197 m Canada/Toronto [{'site': 'NONE.NONE', 'length': 0}] 02:37:00 2016-12-09
79904 Chrome Windows 8 en-AU 108 m France/Paris [{'site': 'NONE.NONE', 'length': 0}] 14:45:00 2017-01-27

393 rows × 9 columns

Combine date/time columns and convert from string to datetime type

In [9]:
# Join "date" and "time" into one string per row and parse it as a UTC timestamp.
timestamp_text = user_sessions['date'].str.cat(user_sessions['time'], sep=' ')
user_sessions['start_dt'] = pd.to_datetime(timestamp_text, utc=True)

# The two source columns are now redundant.
user_sessions.drop(columns=['time', 'date'], inplace=True)
user_sessions
Out[9]:
browser os locale user_id gender location sites start_dt
0 Firefox Ubuntu ru_RU 105 m USA/Chicago [{'site': 'mail.google.com', 'length': 50}, {'... 2017-01-08 09:03:00+00:00
1 Firefox Windows 8 pl-PL 11 m USA/Chicago [{'site': 'meduza.org', 'length': 40}, {'site'... 2016-10-05 13:57:00+00:00
2 Chrome Ubuntu zh-CN 17 m Singapore/Singapore [{'site': 'facebook.net', 'length': 74}, {'sit... 2017-03-28 02:06:00+00:00
3 Chrome Windows 10 pt-BR 134 f Australia/Sydney [{'site': 'verisign.com', 'length': 111}, {'si... 2017-06-25 21:49:00+00:00
4 Firefox Windows 10 en-SG 92 f USA/Chicago [{'site': 'live.com', 'length': 79}, {'site': ... 2016-02-10 00:05:00+00:00
... ... ... ... ... ... ... ... ...
79995 Chrome Windows 10 pt-PT 178 m New Zealand/Auckland [{'site': 'vk.com', 'length': 126}, {'site': '... 2016-12-30 06:02:00+00:00
79996 Safari MacOS it-IT 32 m Netherlands/Amsterdam [{'site': 'slack.com', 'length': 74}, {'site':... 2017-01-27 10:27:00+00:00
79997 Firefox Ubuntu ru_RU 0 m USA/Chicago [{'site': 'vk.com', 'length': 44}, {'site': 's... 2017-03-13 17:55:00+00:00
79998 Firefox Windows 10 ru_RU 56 m France/Paris [{'site': 'lenta.ru', 'length': 82}, {'site': ... 2016-12-06 14:17:00+00:00
79999 Firefox Windows 10 pt-BR 113 f New Zealand/Auckland [{'site': 'baidu.com', 'length': 60}, {'site':... 2016-02-18 06:57:00+00:00

80000 rows × 8 columns

Convert start time to local time

  • A class that converts city name to timezone
In [10]:
class TimezoneByCity:
    """Resolve a place name to a timezone.

    The name is geocoded with ``self.geolocator`` and the resulting
    coordinates are looked up with ``self.tzfinder``.
    """

    def __init__(self):
        self.geolocator = Nominatim(user_agent="geoapiExercises")
        self.tzfinder = TimezoneFinder()

    def tz_name(self, city: str):
        """Return the timezone name for ``city``.

        Raises:
            ValueError: if the geocoder returns no result for ``city`` or if
                no timezone is found at the geocoded coordinates.
        """
        loc = self.geolocator.geocode(city)
        if loc is None:
            # Without this check the attribute access below fails with an
            # opaque AttributeError on None.
            raise ValueError(f"Could not geocode location: {city!r}")
        tz_name = self.tzfinder.timezone_at(lng=loc.longitude, lat=loc.latitude)
        if tz_name is None:
            # A None timezone would silently be treated as UTC by pandas'
            # tz_convert, so fail loudly instead.
            raise ValueError(f"No timezone found for location: {city!r}")
        return tz_name

    def tz(self, city: str):
        """Return the ``pytz`` timezone object for ``city``.

        Raises:
            ValueError: propagated from :meth:`tz_name` when the city cannot
                be resolved.
        """
        tz_name = self.tz_name(city)
        return pytz.timezone(tz_name)
  • Build timezone table that maps country/city to timezone
In [11]:
tz_by_city = TimezoneByCity()

# Resolve each distinct "country/city" location once, looking up by the city part only.
timezone_tbl = {}
for loc in user_sessions.location.unique():
    city_name = loc.split('/')[1]
    timezone_tbl[loc] = tz_by_city.tz_name(city_name)

print(timezone_tbl)
{'USA/Chicago': 'America/Chicago', 'Singapore/Singapore': 'Asia/Singapore', 'Australia/Sydney': 'Australia/Sydney', 'France/Paris': 'Europe/Paris', 'Germany/Berlin': 'Europe/Berlin', 'USA/San Francisco': 'America/Los_Angeles', 'China/Shanghai': 'Asia/Shanghai', 'New Zealand/Auckland': 'Pacific/Auckland', 'Malaysia/Kuala Lumpur': 'Asia/Kuala_Lumpur', 'USA/New York': 'America/New_York', 'Russia/Moscow': 'Europe/Moscow', 'Brazil/Rio de Janeiro': 'America/Sao_Paulo', 'Canada/Toronto': 'America/Toronto', 'Spain/Madrid': 'Europe/Madrid', 'USA/Miami': 'America/New_York', 'India/Delhi': 'Asia/Kolkata', 'Netherlands/Amsterdam': 'Europe/Amsterdam', 'UK/London': 'Europe/London', 'Japan/Tokyo': 'Asia/Tokyo', 'Italy/Rome': 'Europe/Rome', 'Canada/Vancouver': 'America/Vancouver'}
  • Add local_time column to data
In [12]:
# Shift each UTC start time into the session's own timezone, then drop the
# tz info so the column holds naive local wall-clock timestamps.
local_times = [
    start.tz_convert(timezone_tbl[location]).tz_localize(None)
    for start, location in zip(user_sessions['start_dt'], user_sessions['location'])
]
user_sessions['local_time'] = pd.Series(local_times, index=user_sessions.index)
user_sessions
Out[12]:
browser os locale user_id gender location sites start_dt local_time
0 Firefox Ubuntu ru_RU 105 m USA/Chicago [{'site': 'mail.google.com', 'length': 50}, {'... 2017-01-08 09:03:00+00:00 2017-01-08 03:03:00
1 Firefox Windows 8 pl-PL 11 m USA/Chicago [{'site': 'meduza.org', 'length': 40}, {'site'... 2016-10-05 13:57:00+00:00 2016-10-05 08:57:00
2 Chrome Ubuntu zh-CN 17 m Singapore/Singapore [{'site': 'facebook.net', 'length': 74}, {'sit... 2017-03-28 02:06:00+00:00 2017-03-28 10:06:00
3 Chrome Windows 10 pt-BR 134 f Australia/Sydney [{'site': 'verisign.com', 'length': 111}, {'si... 2017-06-25 21:49:00+00:00 2017-06-26 07:49:00
4 Firefox Windows 10 en-SG 92 f USA/Chicago [{'site': 'live.com', 'length': 79}, {'site': ... 2016-02-10 00:05:00+00:00 2016-02-09 18:05:00
... ... ... ... ... ... ... ... ... ...
79995 Chrome Windows 10 pt-PT 178 m New Zealand/Auckland [{'site': 'vk.com', 'length': 126}, {'site': '... 2016-12-30 06:02:00+00:00 2016-12-30 19:02:00
79996 Safari MacOS it-IT 32 m Netherlands/Amsterdam [{'site': 'slack.com', 'length': 74}, {'site':... 2017-01-27 10:27:00+00:00 2017-01-27 11:27:00
79997 Firefox Ubuntu ru_RU 0 m USA/Chicago [{'site': 'vk.com', 'length': 44}, {'site': 's... 2017-03-13 17:55:00+00:00 2017-03-13 12:55:00
79998 Firefox Windows 10 ru_RU 56 m France/Paris [{'site': 'lenta.ru', 'length': 82}, {'site': ... 2016-12-06 14:17:00+00:00 2016-12-06 15:17:00
79999 Firefox Windows 10 pt-BR 113 f New Zealand/Auckland [{'site': 'baidu.com', 'length': 60}, {'site':... 2016-02-18 06:57:00+00:00 2016-02-18 19:57:00

80000 rows × 9 columns

Split starting date and time to year / month / day / weekday and start_hour

In [13]:
# Break the local start timestamp into calendar / clock components.
local_dt = user_sessions['local_time'].dt
date_parts = [
    ("year", local_dt.year),
    ("month", local_dt.month),
    ("day", local_dt.day),
    ("weekday", local_dt.weekday),
    ("start_hour", local_dt.hour),
]
for column, values in date_parts:
    user_sessions[column] = values
user_sessions
Out[13]:
browser os locale user_id gender location sites start_dt local_time year month day weekday start_hour
0 Firefox Ubuntu ru_RU 105 m USA/Chicago [{'site': 'mail.google.com', 'length': 50}, {'... 2017-01-08 09:03:00+00:00 2017-01-08 03:03:00 2017 1 8 6 3
1 Firefox Windows 8 pl-PL 11 m USA/Chicago [{'site': 'meduza.org', 'length': 40}, {'site'... 2016-10-05 13:57:00+00:00 2016-10-05 08:57:00 2016 10 5 2 8
2 Chrome Ubuntu zh-CN 17 m Singapore/Singapore [{'site': 'facebook.net', 'length': 74}, {'sit... 2017-03-28 02:06:00+00:00 2017-03-28 10:06:00 2017 3 28 1 10
3 Chrome Windows 10 pt-BR 134 f Australia/Sydney [{'site': 'verisign.com', 'length': 111}, {'si... 2017-06-25 21:49:00+00:00 2017-06-26 07:49:00 2017 6 26 0 7
4 Firefox Windows 10 en-SG 92 f USA/Chicago [{'site': 'live.com', 'length': 79}, {'site': ... 2016-02-10 00:05:00+00:00 2016-02-09 18:05:00 2016 2 9 1 18
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
79995 Chrome Windows 10 pt-PT 178 m New Zealand/Auckland [{'site': 'vk.com', 'length': 126}, {'site': '... 2016-12-30 06:02:00+00:00 2016-12-30 19:02:00 2016 12 30 4 19
79996 Safari MacOS it-IT 32 m Netherlands/Amsterdam [{'site': 'slack.com', 'length': 74}, {'site':... 2017-01-27 10:27:00+00:00 2017-01-27 11:27:00 2017 1 27 4 11
79997 Firefox Ubuntu ru_RU 0 m USA/Chicago [{'site': 'vk.com', 'length': 44}, {'site': 's... 2017-03-13 17:55:00+00:00 2017-03-13 12:55:00 2017 3 13 0 12
79998 Firefox Windows 10 ru_RU 56 m France/Paris [{'site': 'lenta.ru', 'length': 82}, {'site': ... 2016-12-06 14:17:00+00:00 2016-12-06 15:17:00 2016 12 6 1 15
79999 Firefox Windows 10 pt-BR 113 f New Zealand/Auckland [{'site': 'baidu.com', 'length': 60}, {'site':... 2016-02-18 06:57:00+00:00 2016-02-18 19:57:00 2016 2 18 3 19

80000 rows × 14 columns

Sine/Cosine transform of local start time

In [14]:
# Fraction of the local day elapsed at session start (0.0 = midnight).
midnight = user_sessions['local_time'].dt.normalize()
start_dt_normalized = (user_sessions['local_time'] - midnight) / pd.Timedelta(seconds=1) / 86400

# Map that fraction onto the unit circle so 23:59 and 00:00 end up adjacent.
angle = 2 * np.pi * start_dt_normalized
user_sessions['start_sin'] = np.sin(angle)
user_sessions['start_cos'] = np.cos(angle)
user_sessions
Out[14]:
browser os locale user_id gender location sites start_dt local_time year month day weekday start_hour start_sin start_cos
0 Firefox Ubuntu ru_RU 105 m USA/Chicago [{'site': 'mail.google.com', 'length': 50}, {'... 2017-01-08 09:03:00+00:00 2017-01-08 03:03:00 2017 1 8 6 3 0.716302 0.697790
1 Firefox Windows 8 pl-PL 11 m USA/Chicago [{'site': 'meduza.org', 'length': 40}, {'site'... 2016-10-05 13:57:00+00:00 2016-10-05 08:57:00 2016 10 5 2 8 0.716302 -0.697790
2 Chrome Ubuntu zh-CN 17 m Singapore/Singapore [{'site': 'facebook.net', 'length': 74}, {'sit... 2017-03-28 02:06:00+00:00 2017-03-28 10:06:00 2017 3 28 1 10 0.477159 -0.878817
3 Chrome Windows 10 pt-BR 134 f Australia/Sydney [{'site': 'verisign.com', 'length': 111}, {'si... 2017-06-25 21:49:00+00:00 2017-06-26 07:49:00 2017 6 26 0 7 0.889017 -0.457874
4 Firefox Windows 10 en-SG 92 f USA/Chicago [{'site': 'live.com', 'length': 79}, {'site': ... 2016-02-10 00:05:00+00:00 2016-02-09 18:05:00 2016 2 9 1 18 -0.999762 0.021815
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
79995 Chrome Windows 10 pt-PT 178 m New Zealand/Auckland [{'site': 'vk.com', 'length': 126}, {'site': '... 2016-12-30 06:02:00+00:00 2016-12-30 19:02:00 2016 12 30 4 19 -0.963630 0.267238
79996 Safari MacOS it-IT 32 m Netherlands/Amsterdam [{'site': 'slack.com', 'length': 74}, {'site':... 2017-01-27 10:27:00+00:00 2017-01-27 11:27:00 2017 1 27 4 11 0.143493 -0.989651
79997 Firefox Ubuntu ru_RU 0 m USA/Chicago [{'site': 'vk.com', 'length': 44}, {'site': 's... 2017-03-13 17:55:00+00:00 2017-03-13 12:55:00 2017 3 13 0 12 -0.237686 -0.971342
79998 Firefox Windows 10 ru_RU 56 m France/Paris [{'site': 'lenta.ru', 'length': 82}, {'site': ... 2016-12-06 14:17:00+00:00 2016-12-06 15:17:00 2016 12 6 1 15 -0.757565 -0.652760
79999 Firefox Windows 10 pt-BR 113 f New Zealand/Auckland [{'site': 'baidu.com', 'length': 60}, {'site':... 2016-02-18 06:57:00+00:00 2016-02-18 19:57:00 2016 2 18 3 19 -0.872496 0.488621

80000 rows × 16 columns

Split location to country and city

In [15]:
# "location" is formatted as "<country>/<city>"; split it into two columns.
location_parts = user_sessions['location'].str.split('/', expand=True)
user_sessions['country'] = location_parts[0]
user_sessions['city'] = location_parts[1]
user_sessions
Out[15]:
browser os locale user_id gender location sites start_dt local_time year month day weekday start_hour start_sin start_cos country city
0 Firefox Ubuntu ru_RU 105 m USA/Chicago [{'site': 'mail.google.com', 'length': 50}, {'... 2017-01-08 09:03:00+00:00 2017-01-08 03:03:00 2017 1 8 6 3 0.716302 0.697790 USA Chicago
1 Firefox Windows 8 pl-PL 11 m USA/Chicago [{'site': 'meduza.org', 'length': 40}, {'site'... 2016-10-05 13:57:00+00:00 2016-10-05 08:57:00 2016 10 5 2 8 0.716302 -0.697790 USA Chicago
2 Chrome Ubuntu zh-CN 17 m Singapore/Singapore [{'site': 'facebook.net', 'length': 74}, {'sit... 2017-03-28 02:06:00+00:00 2017-03-28 10:06:00 2017 3 28 1 10 0.477159 -0.878817 Singapore Singapore
3 Chrome Windows 10 pt-BR 134 f Australia/Sydney [{'site': 'verisign.com', 'length': 111}, {'si... 2017-06-25 21:49:00+00:00 2017-06-26 07:49:00 2017 6 26 0 7 0.889017 -0.457874 Australia Sydney
4 Firefox Windows 10 en-SG 92 f USA/Chicago [{'site': 'live.com', 'length': 79}, {'site': ... 2016-02-10 00:05:00+00:00 2016-02-09 18:05:00 2016 2 9 1 18 -0.999762 0.021815 USA Chicago
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
79995 Chrome Windows 10 pt-PT 178 m New Zealand/Auckland [{'site': 'vk.com', 'length': 126}, {'site': '... 2016-12-30 06:02:00+00:00 2016-12-30 19:02:00 2016 12 30 4 19 -0.963630 0.267238 New Zealand Auckland
79996 Safari MacOS it-IT 32 m Netherlands/Amsterdam [{'site': 'slack.com', 'length': 74}, {'site':... 2017-01-27 10:27:00+00:00 2017-01-27 11:27:00 2017 1 27 4 11 0.143493 -0.989651 Netherlands Amsterdam
79997 Firefox Ubuntu ru_RU 0 m USA/Chicago [{'site': 'vk.com', 'length': 44}, {'site': 's... 2017-03-13 17:55:00+00:00 2017-03-13 12:55:00 2017 3 13 0 12 -0.237686 -0.971342 USA Chicago
79998 Firefox Windows 10 ru_RU 56 m France/Paris [{'site': 'lenta.ru', 'length': 82}, {'site': ... 2016-12-06 14:17:00+00:00 2016-12-06 15:17:00 2016 12 6 1 15 -0.757565 -0.652760 France Paris
79999 Firefox Windows 10 pt-BR 113 f New Zealand/Auckland [{'site': 'baidu.com', 'length': 60}, {'site':... 2016-02-18 06:57:00+00:00 2016-02-18 19:57:00 2016 2 18 3 19 -0.872496 0.488621 New Zealand Auckland

80000 rows × 18 columns

Get total length of each user session

In [16]:
# Total session length = sum of the "length" of every site entry in the session.
user_sessions['length_session'] = [
    sum(site_entry['length'] for site_entry in session_sites)
    for session_sites in user_sessions['sites']
]
user_sessions
Out[16]:
browser os locale user_id gender location sites start_dt local_time year month day weekday start_hour start_sin start_cos country city length_session
0 Firefox Ubuntu ru_RU 105 m USA/Chicago [{'site': 'mail.google.com', 'length': 50}, {'... 2017-01-08 09:03:00+00:00 2017-01-08 03:03:00 2017 1 8 6 3 0.716302 0.697790 USA Chicago 481
1 Firefox Windows 8 pl-PL 11 m USA/Chicago [{'site': 'meduza.org', 'length': 40}, {'site'... 2016-10-05 13:57:00+00:00 2016-10-05 08:57:00 2016 10 5 2 8 0.716302 -0.697790 USA Chicago 1076
2 Chrome Ubuntu zh-CN 17 m Singapore/Singapore [{'site': 'facebook.net', 'length': 74}, {'sit... 2017-03-28 02:06:00+00:00 2017-03-28 10:06:00 2017 3 28 1 10 0.477159 -0.878817 Singapore Singapore 1280
3 Chrome Windows 10 pt-BR 134 f Australia/Sydney [{'site': 'verisign.com', 'length': 111}, {'si... 2017-06-25 21:49:00+00:00 2017-06-26 07:49:00 2017 6 26 0 7 0.889017 -0.457874 Australia Sydney 1323
4 Firefox Windows 10 en-SG 92 f USA/Chicago [{'site': 'live.com', 'length': 79}, {'site': ... 2016-02-10 00:05:00+00:00 2016-02-09 18:05:00 2016 2 9 1 18 -0.999762 0.021815 USA Chicago 224
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
79995 Chrome Windows 10 pt-PT 178 m New Zealand/Auckland [{'site': 'vk.com', 'length': 126}, {'site': '... 2016-12-30 06:02:00+00:00 2016-12-30 19:02:00 2016 12 30 4 19 -0.963630 0.267238 New Zealand Auckland 509
79996 Safari MacOS it-IT 32 m Netherlands/Amsterdam [{'site': 'slack.com', 'length': 74}, {'site':... 2017-01-27 10:27:00+00:00 2017-01-27 11:27:00 2017 1 27 4 11 0.143493 -0.989651 Netherlands Amsterdam 267
79997 Firefox Ubuntu ru_RU 0 m USA/Chicago [{'site': 'vk.com', 'length': 44}, {'site': 's... 2017-03-13 17:55:00+00:00 2017-03-13 12:55:00 2017 3 13 0 12 -0.237686 -0.971342 USA Chicago 698
79998 Firefox Windows 10 ru_RU 56 m France/Paris [{'site': 'lenta.ru', 'length': 82}, {'site': ... 2016-12-06 14:17:00+00:00 2016-12-06 15:17:00 2016 12 6 1 15 -0.757565 -0.652760 France Paris 696
79999 Firefox Windows 10 pt-BR 113 f New Zealand/Auckland [{'site': 'baidu.com', 'length': 60}, {'site':... 2016-02-18 06:57:00+00:00 2016-02-18 19:57:00 2016 2 18 3 19 -0.872496 0.488621 New Zealand Auckland 1597

80000 rows × 19 columns

Vectorize top sites using TF-IDF

In [17]:
n_top = 100

# Accumulate, per site, the total "length" recorded across Joe's sessions (user_id == 0).
joe_cnt = Counter()
for sites_session in user_sessions.query('user_id == 0')['sites']:
    for site_entry in sites_session:
        joe_cnt[site_entry['site']] += site_entry['length']

# Keep only the site names of the n_top largest totals. A generator is used
# instead of zip(*...) so an empty counter gives an empty tuple rather than
# an unpacking error, and IPython's `_` is not overwritten.
joe_top_sites = tuple(site for site, _ in joe_cnt.most_common(n_top))

print("Total sites joe visited: ", len(joe_cnt))
# Slice by n_top (not a literal) so the label and the printed list always agree.
print(f"Top {n_top} sites joe visited: \n", joe_top_sites[:n_top])
Total sites joe visited:  1166
Top 100 sites joe visited: 
 ('lenta.ru', 'toptal.com', 'mail.google.com', 'slack.com', 'vk.com', 'youtube.com', 'mairie-gruson.fr', 'tdg.ch', 'smbg.fr', 'multiplayer.com', 'free.fr', 'play3-live.com', '127.107', 'rollingstone.com', 'crous-clermont.fr', 'machine-outil.com', 'starbucks.com', 'arooze.com', 'jeux-mini.com', 'games.la', 'fntp.fr', 'yale.edu', 'ireasoning.com', 'marianne.net', 'doctrine-project.org', 'annonceetudiant.com', 'dico-ecolo.com', 'getadblock.com', 'tecnitude.com', 'alluserpics.com', 'iufm.fr', 'geowiki.fr', 'moonbasa.com', 'loreal-finance.com', 'apogee-systems.com', 'mgc-prevention.fr', 'lacoope.org', 'alexgorbatchev.com', 'bookryanair.com', 'wikio.fr', 'cpubenchmark.net', 'thequestionsnetwork.org', 'seloger.com', 'lektorat.de', 'coza.net', 'linuxplanet.com', 'gagnantduprix.com', 'smart-tribune.com', 'onescreen.net', 'obspm.fr', 'granthweb.com', 'restotel.net', 'citea.info', 'virginmobile.fr', 'imaginetonfutur.com', 'encyclopediadramatica.com', 'joueurdugrenier.fr', 'jobanim.com', 'epresspack-dev.net', 'arrondirmesfinsdemois.com', 'abaenglish.com', 'media-imdb.com', 'copy.com', 'corsematin.com', 'videogamereviewerkid.com', 'eklablog.fr', 'roundcube.net', 'breizh-portal.com', 'wallpaperzet.com', 'lecture-en-ligne.com', 'ruvr.ru', 'covoiturage.fr', 'nordlittoral.fr', 'tu-dresden.de', 'intuitwebsites.com', 'frontierstrategygroup.com', 'instanttimezone.com', 'rive-gauche.fr', 'jobthread.com', '365euros.com', 'cinejaude.fr', 'infolignes.com', 'jminformatique.biz', 'ebmeditions.fr', 'lacoccinelle.net', 'fromquarkstoquasars.com', 'ssbwiki.com', 'likefood.us', 'cicic.ca', 'dalloz.fr', 'edublogawards.com', 'science-et-vie.com', 'chartsinfrance.net', 'yowindow.com', 'itdevspace.com', 'epresse.fr', 'galaxys5.fr', 'biologycorner.com', 'hdslb.com', 'stgbssint.com')
In [18]:
n_top = 100

# Accumulate, per site, the total "length" recorded across every user's sessions.
all_cnt = Counter()
for sites_session in user_sessions['sites']:
    for site_entry in sites_session:
        all_cnt[site_entry['site']] += site_entry['length']

# Keep only the site names of the n_top largest totals. A generator is used
# instead of zip(*...) so an empty counter gives an empty tuple rather than
# an unpacking error, and IPython's `_` is not overwritten.
all_top_sites = tuple(site for site, _ in all_cnt.most_common(n_top))
print("Total sites all users visited: ", len(all_cnt))
# Slice by n_top (not a literal) so the label and the printed list always agree.
print(f"Top {n_top} sites all users visited: \n", all_top_sites[:n_top])
Total sites all users visited:  11132
Top 100 sites all users visited: 
 ('youtube.com', 'toptal.com', 'slack.com', 'lenta.ru', 'vk.com', 'mail.google.com', 'oracle.com', 'wikimedia.org', 'googleapis.com', 'vimeo.com', 'airbnb.com', 'geotrust.com', 'google.com', 'booking.com', 'facebook.com', 'live.com', 'cedexis.com', 'ggpht.com', 'baidu.com', 'ytimg.com', 'facebook.net', 'mangafox.me', 'googlevideo.com', 'verisign.com', 'lijit.com', 'yahoo.com', 'openclassrooms.com', 'twitter.com', 'cloudfront.net', 'meduza.org', 'digicert.com', 'disqus.com', 'jboss.org', 'microsoft.com', 'instagram.com', 'wikipedia.org', 'bing.com', 'bing.net', 'skyscanner.com', 'com.cn', 'cnn.com', 'allpostersimages.com', 'ecns.cn', 'csdn.net', 'food-4tots.com', 'trafiz.net', 'ca-centrefrance.fr', 'designmodo.com', 'stid-france.com', 'technoratimedia.com', 'mlmd.fr', 'irs01.net', 'daxon.fr', 'synten.com', 'ac-mayotte.fr', 'retetedesuflet.ro', 'dmca.com', 'megaportail.eu', 'autotitre.com', 'letudiant.fr', 'bookryanair.com', 'mibdepot.com', 'webartex.ru', 'mathon.fr', 'filedanstachambre.com', 'toutestfacile.com', 'ldd.fr', 'cfasup2000.net', 'back-end.dk', 'sg-autorepondeur.com', 'mal-au-dos.be', 'horaires-mairie.fr', 'wayne.edu', 'cbao.fr', 'edf.com', 'lafistiniere.com', 'geoplay.fr', 'eternia-fr.net', 'kejet.net', 'sosav.fr', 'consulfrance-montreal.org', 'yourdressmaker.com', 'kingsandlegends.com', 'gralon.net', 'activolcans.info', 'fileformat.info', 'bestinlinux.com', 'studyrama.be', 'lonelyplanet.com', 'tisserant.org', 'telechargervideoyoutube.com', 'joomladay.fr', 'crawl-anywhere.com', 'dress-for-less.com', 'clickintext.net', 'dartfish.tv', 'man7.org', 'ipage.com', 'adriagate.com', 'autrement-ussel.fr')
In [20]:
def get_topsites_length(session_sites: list, top_sites=None):
    """Return the per-site total length of one session, restricted to ``top_sites``.

    Args:
        session_sites: list of ``{'site': ..., 'length': ...}`` entries.
        top_sites: iterable of site names that defines the output order.
            When omitted, the module-level ``all_top_sites`` is looked up at
            call time, so re-computing the top-site list takes effect without
            having to re-run this definition.

    Returns:
        A list with one number per distinct site in ``top_sites`` (in that
        order): the summed ``length`` of the session's entries for that site,
        or 0 if the session never visited it. Sites outside ``top_sites`` are
        ignored.
    """
    if top_sites is None:
        top_sites = all_top_sites
    topsites_len_dict = dict.fromkeys(top_sites, 0)
    for site_entry in session_sites:
        site = site_entry['site']
        if site in topsites_len_dict:
            topsites_len_dict[site] += site_entry['length']
    return list(topsites_len_dict.values())

# One list per session: total length spent on each of the top sites.
topsites_length = user_sessions['sites'].map(get_topsites_length)
topsites_length
Out[20]:
0        [67, 132, 65, 59, 0, 50, 0, 0, 0, 0, 0, 0, 0, ...
1        [202, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
2        [109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
3        [0, 0, 191, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112,...
4        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                               ...                        
79995    [46, 0, 63, 0, 126, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
79996    [43, 0, 74, 0, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
79997    [0, 0, 71, 0, 44, 54, 0, 0, 0, 0, 0, 0, 0, 0, ...
79998    [251, 103, 0, 82, 133, 127, 0, 0, 0, 0, 0, 0, ...
79999    [0, 0, 0, 0, 0, 0, 118, 0, 0, 0, 0, 0, 145, 0,...
Name: sites, Length: 80000, dtype: object
In [21]:
# Treat the per-site lengths as term counts and re-weight them with TF-IDF.
tfidf = TfidfTransformer()
topsites_matrix = topsites_length.tolist()
topsites_tfidf = tfidf.fit_transform(topsites_matrix)

# Densify only the first two rows for a quick look.
topsites_tfidf[:2].toarray()
Out[21]:
array([[0.3356709 , 0.73258813, 0.36172768, 0.35462643, 0.        ,
        0.30664778, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.57288012, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.35902355,
        0.        , 0.        , 0.        , 0.        , 0.18299643,
        0.        , 0.28371111, 0.        , 0.        , 0.        ,
        0.60967766, 0.        , 0.        , 0.        , 0.23921522,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

Add binary class label: Joe=0, Other users=1

In [22]:
# Joe is user_id 0 -> target 0; every other user -> target 1.
is_other_user = user_sessions['user_id'].ne(0)
user_sessions['target'] = is_other_user.astype(int)
user_sessions
Out[22]:
browser os locale user_id gender location sites start_dt local_time year month day weekday start_hour start_sin start_cos country city length_session target
0 Firefox Ubuntu ru_RU 105 m USA/Chicago [{'site': 'mail.google.com', 'length': 50}, {'... 2017-01-08 09:03:00+00:00 2017-01-08 03:03:00 2017 1 8 6 3 0.716302 0.697790 USA Chicago 481 1
1 Firefox Windows 8 pl-PL 11 m USA/Chicago [{'site': 'meduza.org', 'length': 40}, {'site'... 2016-10-05 13:57:00+00:00 2016-10-05 08:57:00 2016 10 5 2 8 0.716302 -0.697790 USA Chicago 1076 1
2 Chrome Ubuntu zh-CN 17 m Singapore/Singapore [{'site': 'facebook.net', 'length': 74}, {'sit... 2017-03-28 02:06:00+00:00 2017-03-28 10:06:00 2017 3 28 1 10 0.477159 -0.878817 Singapore Singapore 1280 1
3 Chrome Windows 10 pt-BR 134 f Australia/Sydney [{'site': 'verisign.com', 'length': 111}, {'si... 2017-06-25 21:49:00+00:00 2017-06-26 07:49:00 2017 6 26 0 7 0.889017 -0.457874 Australia Sydney 1323 1
4 Firefox Windows 10 en-SG 92 f USA/Chicago [{'site': 'live.com', 'length': 79}, {'site': ... 2016-02-10 00:05:00+00:00 2016-02-09 18:05:00 2016 2 9 1 18 -0.999762 0.021815 USA Chicago 224 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
79995 Chrome Windows 10 pt-PT 178 m New Zealand/Auckland [{'site': 'vk.com', 'length': 126}, {'site': '... 2016-12-30 06:02:00+00:00 2016-12-30 19:02:00 2016 12 30 4 19 -0.963630 0.267238 New Zealand Auckland 509 1
79996 Safari MacOS it-IT 32 m Netherlands/Amsterdam [{'site': 'slack.com', 'length': 74}, {'site':... 2017-01-27 10:27:00+00:00 2017-01-27 11:27:00 2017 1 27 4 11 0.143493 -0.989651 Netherlands Amsterdam 267 1
79997 Firefox Ubuntu ru_RU 0 m USA/Chicago [{'site': 'vk.com', 'length': 44}, {'site': 's... 2017-03-13 17:55:00+00:00 2017-03-13 12:55:00 2017 3 13 0 12 -0.237686 -0.971342 USA Chicago 698 0
79998 Firefox Windows 10 ru_RU 56 m France/Paris [{'site': 'lenta.ru', 'length': 82}, {'site': ... 2016-12-06 14:17:00+00:00 2016-12-06 15:17:00 2016 12 6 1 15 -0.757565 -0.652760 France Paris 696 1
79999 Firefox Windows 10 pt-BR 113 f New Zealand/Auckland [{'site': 'baidu.com', 'length': 60}, {'site':... 2016-02-18 06:57:00+00:00 2016-02-18 19:57:00 2016 2 18 3 19 -0.872496 0.488621 New Zealand Auckland 1597 1

80000 rows × 20 columns

Visual inspection

Histogram / Count plot

In [1]:
def set_xlabel_rotation(ax, deg=90):
    """Rotate every x-axis tick label of ``ax`` by ``deg`` degrees.

    Parameters
    ----------
    ax : object exposing ``get_xticklabels()`` (e.g. a matplotlib Axes)
        Axes whose x tick labels are rotated in place.
    deg : float, default 90
        Rotation angle passed to each label's ``set_rotation``.

    Returns
    -------
    None
    """
    for label in ax.get_xticklabels():
        label.set_rotation(deg)
In [23]:
# Figure 1: histograms of the two numeric columns.
# `user_id` uses one bar per integer value (discrete=True); `length_session` uses 200 bins.
fig, ax = plt.subplots(1, 2, figsize=(16, 4))
sns.histplot(user_sessions[['user_id']], ax=ax.flatten()[0], discrete=True)
sns.histplot(user_sessions[['length_session']], ax=ax.flatten()[1], bins=200)

# Figure 2: count plots for the client-side categorical columns.
fig, ax = plt.subplots(1, 3, figsize=(21, 4))
for sub_ax, feat in zip(ax.flatten(), ['browser', 'os', 'locale']):
    sns.countplot(data=user_sessions, x=feat, ax=sub_ax)
# Only the locale panel gets vertical tick labels, via the shared helper
# instead of a hand-written loop.
set_xlabel_rotation(ax.flatten()[2], 90)

# Figure 3: count plots for gender and location columns.
fig, ax = plt.subplots(1, 3, figsize=(21, 4))
for sub_ax, feat in zip(ax.flatten(), ['gender', 'city', 'country']):
    sns.countplot(data=user_sessions, x=feat, ax=sub_ax)
    if feat != 'gender':
        # city / country tick labels are rotated; gender is left horizontal.
        set_xlabel_rotation(sub_ax, 90)
In [1]:
# Figure 1: session counts per calendar component (year / month / day).
fig, ax = plt.subplots(1, 3, figsize=(16, 4))
for sub_ax, feat in zip(ax.flatten(), ['year', 'month', 'day']):
    sns.countplot(data=user_sessions, x=feat, ax=sub_ax)
    if feat == 'day':
        # Only the day panel gets vertical tick labels; use the shared helper
        # rather than repeating the rotation loop inline.
        set_xlabel_rotation(sub_ax, 90)

# Figure 2: session counts per weekday and per starting hour.
fig, ax = plt.subplots(1, 2, figsize=(16, 4))
for sub_ax, feat in zip(ax.flatten(), ['weekday', 'start_hour']):
    sns.countplot(data=user_sessions, x=feat, ax=sub_ax)

Joe’s Characteristics

In [1]:
# Box plots of `length_session` per browser / os / locale, one panel each,
# with boxes split by the `target` column.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))
panels = ax.flatten()
for sub_ax, feat in zip(panels, ['browser', 'os', 'locale']):
    p = sns.boxplot(
        data=user_sessions,
        x=feat,
        y='length_session',
        hue='target',
        palette='rainbow',
        ax=sub_ax,
    )
# Third panel (locale): draw the tick labels vertically.
set_xlabel_rotation(panels[2], 90)
In [1]:
# Box plots of `length_session` per gender / city / country, split by `target`.
features = ['gender', 'city', 'country']
# Panels whose tick labels are drawn vertically (everything except gender).
rotated_features = {'city', 'country'}

fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for sub_ax, feat in zip(ax.flatten(), features):
    p = sns.boxplot(
        data=user_sessions,
        x=feat,
        y='length_session',
        hue='target',
        palette='rainbow',
        ax=sub_ax,
    )
    if feat in rotated_features:
        set_xlabel_rotation(sub_ax, 90)
In [1]:
# Box plots of `length_session` per year / month / day, split by `target`.
# Keyword arguments shared by every panel.
box_kwargs = dict(y='length_session', hue='target', data=user_sessions, palette='rainbow')

fig, ax = plt.subplots(1, 3, figsize=(20, 4))
for sub_ax, feat in zip(ax.flatten(), ['year', 'month', 'day']):
    p = sns.boxplot(x=feat, ax=sub_ax, **box_kwargs)
    if feat != 'day':
        continue
    # Only the day panel gets vertical tick labels.
    set_xlabel_rotation(sub_ax, 90)
In [1]:
# Box plots of `length_session` per weekday and per starting hour, split by `target`.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 4))
for sub_ax, feat in zip(ax.flatten(), ['weekday', 'start_hour']):
    p = sns.boxplot(
        data=user_sessions,
        x=feat,
        y='length_session',
        hue='target',
        palette='rainbow',
        ax=sub_ax,
    )

Features / target correlation

Categorical features vs. target correlation with contingency analysis (Cramer’s V and Theil’s U)

Cramer’s V

In [24]:
# Categorical feature columns whose association with `user_id` is measured.
cat_cols = [
    'browser', 'os', 'locale', 'gender', 'country', 'city',
    'year', 'month', 'day', 'weekday', 'start_hour',
]

# Cramer's V between each categorical column and `user_id`,
# keyed by column name in `cat_cols` order.
crv_by_feature = {
    col: cramers_v(user_sessions[col], user_sessions['user_id'])
    for col in cat_cols
}

# Strongest association first.
cat_feat_target_crv = pd.Series(crv_by_feature, name='CramersV').sort_values(ascending=False)
cat_feat_target_crv
Out[24]:
gender        0.998762
browser       0.876672
locale        0.867128
os            0.854367
country       0.780666
city          0.777243
start_hour    0.423355
year          0.000000
month         0.000000
day           0.000000
weekday       0.000000
Name: CramersV, dtype: float64

Theil’s U

In [25]:
# Categorical feature columns whose association with `user_id` is measured.
cat_cols = [
    'browser', 'os', 'locale', 'gender', 'country', 'city',
    'year', 'month', 'day', 'weekday', 'start_hour',
]

# Theil's U between each categorical column and `user_id`.
# NOTE(review): Theil's U is asymmetric; with theils_u(feature, user_id) the score
# presumably expresses how much `user_id` tells about the feature (gender scores 1.0)
# — confirm this is the intended direction.
thu_by_feature = {
    col: theils_u(user_sessions[col], user_sessions['user_id'])
    for col in cat_cols
}

# Strongest association first.
cat_feat_target_thu = pd.Series(thu_by_feature, name='TheilsU').sort_values(ascending=False)
cat_feat_target_thu
Out[25]:
gender        1.000000
locale        0.900952
os            0.827354
city          0.813216
country       0.800623
browser       0.799798
start_hour    0.377777
day           0.003827
weekday       0.001455
month         0.001136
year          0.000548
Name: TheilsU, dtype: float64

Plot

In [1]:
# Build one long-format frame with a row per (feature, statistic) pair so that
# seaborn can draw the two statistics as side-by-side bars.
crv_corr = pd.DataFrame({'Correlation': cat_feat_target_crv})
crv_corr['Stats'] = "Cramer's V"
thu_corr = pd.DataFrame({'Correlation': cat_feat_target_thu})
thu_corr['Stats'] = "Theil's U"

# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. The feature names live in the (unnamed) index, so
# reset_index() exposes them as an 'index' column that is renamed to 'Features'.
cat_target_corr = (
    pd.concat([crv_corr, thu_corr])
    .reset_index()
    .rename(columns={'index': 'Features'})
)

fig, ax = plt.subplots(1, 1, figsize=(8.5, 4))
p = sns.barplot(
    data=cat_target_corr,
    x='Features',
    y='Correlation',
    hue='Stats',
    palette=sns.color_palette('rainbow', 3),
    ax=ax,
)
plt.title('Correlation between Categorical Features and Target user_id');