!pip install -q -U kaggle
!pip install --upgrade --force-reinstall --no-deps kaggle
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d mkechinov/ecommerce-behavior-data-from-multi-category-store

!gdown --id 1qZIwMbMgMmgDC5EoMdJ8aI9lQPsWA3-P
!gdown --id 1x5ohrrZNhWQN4Q-zww0RmXOwctKHH9PT
!gdown --id 1-Rov9fFtGJqb7_ePc6qH-Rhzxn0cIcKB
!gdown --id 1zr_RXpGvOWN2PrWI6itWL8HnRsCpyqz8
!gdown --id 1g5WoIgLe05UMdREbxAjh0bEFgVCjA1UL

import os
import gc
import glob
import pandas as pd

!unzip /content/ecommerce-behavior-data-from-multi-category-store.zip
!rm /content/ecommerce-behavior-data-from-multi-category-store.zip

list_gz_files = glob.glob('/content/*.gz')
list_gz_files

for file in list_gz_files:
  print(file)
  !gunzip $file

snapshot = pd.read_csv("/content/2019-Oct.csv").sample(1000)
snapshot.head()

!mkdir -p /content/data/tmp
snapshot.to_csv('/content/data/tmp/snapshot.csv', index=False)

gc.collect()

import os
import gc
import glob
import pandas as pd
from pathlib import Path

file = '/content/data/tmp/snapshot.csv'

df_tmp = pd.read_csv(file)
df_tmp['session_purchase'] =  df_tmp['user_session'] + '_' + df_tmp['product_id'].astype(str)
df_purchase = df_tmp[df_tmp['event_type']=='purchase']
df_cart = df_tmp[df_tmp['event_type']=='cart']
df_purchase = df_purchase[df_purchase['session_purchase'].isin(df_cart['session_purchase'])]
df_cart = df_cart[~(df_cart['session_purchase'].isin(df_purchase['session_purchase']))]
df_cart['target'] = 0
df_purchase['target'] = 1
df = pd.concat([df_cart, df_purchase])
df = df.drop('category_id', axis=1)
df = df.drop('session_purchase', axis=1)
df.head()

df[['cat_0', 'cat_1', 'cat_2']] = df['category_code'].str.split("\.", n = 3, expand = True).fillna('NA')
# df[['cat_0', 'cat_1', 'cat_2', 'cat_3']] = df['category_code'].str.split("\.", n = 3, expand = True).fillna('NA')
df['brand'] = df['brand'].fillna('NA')
df = df.drop('category_code', axis=1)
df['timestamp'] = pd.to_datetime(df['event_time'].str.replace(' UTC', ''))
df['ts_hour'] = df['timestamp'].dt.hour
df['ts_minute'] = df['timestamp'].dt.minute
df['ts_weekday'] = df['timestamp'].dt.weekday
df['ts_day'] = df['timestamp'].dt.day
df['ts_month'] = df['timestamp'].dt.month
df['ts_year'] = df['timestamp'].dt.year
df.head()

list_files = glob.glob('/content/*.csv')
list_files

def process_files(df_tmp, chunkname):
    df_tmp['session_purchase'] =  df_tmp['user_session'] + '_' + df_tmp['product_id'].astype(str)
    df_purchase = df_tmp[df_tmp['event_type']=='purchase']
    df_cart = df_tmp[df_tmp['event_type']=='cart']
    df_purchase = df_purchase[df_purchase['session_purchase'].isin(df_cart['session_purchase'])]
    df_cart = df_cart[~(df_cart['session_purchase'].isin(df_purchase['session_purchase']))]
    df_cart['target'] = 0
    df_purchase['target'] = 1
    df = pd.concat([df_cart, df_purchase])
    df = df.drop('category_id', axis=1)
    df = df.drop('session_purchase', axis=1)
    # df[['cat_0', 'cat_1', 'cat_2', 'cat_3']] = df['category_code'].str.split("\.", n = 3, expand = True).fillna('NA')
    df['brand'] = df['brand'].fillna('NA')
    # df = df.drop('category_code', axis=1)
    df['timestamp'] = pd.to_datetime(df['event_time'].str.replace(' UTC', ''))
    df['ts_hour'] = df['timestamp'].dt.hour
    df['ts_minute'] = df['timestamp'].dt.minute
    df['ts_weekday'] = df['timestamp'].dt.weekday
    df['ts_day'] = df['timestamp'].dt.day
    df['ts_month'] = df['timestamp'].dt.month
    df['ts_year'] = df['timestamp'].dt.year
    df.to_csv(chunkname, index=False)

base_path_silver = "/content/data/silver"
!mkdir -p $base_path_silver

for idx, chunk in enumerate(list_files[:2]):
  chunkname = os.path.join(base_path_silver, Path(chunk).stem + '-' + str(idx) + '.csv')
  print(chunkname)

chunksize = 10 ** 6

for file in list_files:
    print(file)
    for idx, chunk in enumerate(pd.read_csv(file, chunksize=chunksize)):
      chunkname = os.path.join(base_path_silver, Path(file).stem + '-' + str(idx) + '.csv')
      print(chunkname)
      if not os.path.exists(chunkname):
        process_files(chunk, chunkname)

for file in list_files:
  !rm $file

list_chunks = glob.glob(os.path.join(base_path_silver,'*.csv'))
list_chunks[:10]

!cd $base_path_silver && zip /content/data_silver.zip ./*.csv

!cp data_silver.zip /content/drive/MyDrive/Recommendation

lp = []
for file in list_chunks:
  lp.append(pd.read_csv(file))

df = pd.concat(lp)
df.shape

df.head()

# df2 = df['category_code'].str.split("\.", n=3, expand=True).fillna('NA')
# df2.columns = ['cat_{}'.format(x+1) for x in df2.columns]
# df2.to_parquet('/content/data/silver_l2/df_cat.parquet', index=False)

df_test = df[df['ts_month']==4]
df_valid = df[df['ts_month']==3]
df_train = df[(df['ts_month']!=3)&(df['ts_month']!=4)]

df_train.shape, df_valid.shape, df_test.shape

!mkdir -p /content/data/silver_l2
df_train.to_parquet('/content/data/silver_l2/train.parquet', index=False)
df_valid.to_parquet('/content/data/silver_l2/valid.parquet', index=False)
df_test.to_parquet('/content/data/silver_l2/test.parquet', index=False)

!cd /content/data/silver_l2 && zip /content/data_silver_l2.zip ./*.parquet
!cp /content/data_silver_l2.zip /content/drive/MyDrive/Recommendation

import pandas as pd

df_train = pd.read_parquet('/content/data/silver_l2/train.parquet')
df_valid = pd.read_parquet('/content/data/silver_l2/valid.parquet')
df_test = pd.read_parquet('/content/data/silver_l2/test.parquet')

df = pd.concat([df_train, df_valid, df_test],ignore_index=True)

df.shape

import IPython

import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

df.dtypes

df['timestamp'] = pd.to_datetime(df['timestamp'])

df.target.mean()

df['event_type'].value_counts(normalize=True)

print('# of datapoints:' + str(df.shape))
print('# of unique users:' + str(df['user_id'].drop_duplicates().shape))
print('# of unique products:' + str(df['product_id'].drop_duplicates().shape))
print('# of unique sessions:' + str(df['user_session'].drop_duplicates().shape))

def plot_sparse(df, col):
    stats = df[[col, 'target']].groupby(col).agg(['count', 'mean', 'sum'])
    stats = stats.reset_index()
    stats.columns = [col, 'count', 'mean', 'sum']
    stats_sort = stats['count'].value_counts().reset_index()
    stats_sort = stats_sort.sort_values('index')
    plt.figure(figsize=(15,4))
    plt.bar(stats_sort['index'].astype(str).values[0:20], stats_sort['count'].values[0:20])
    plt.title('Frequency of ' + str(col))
    plt.xlabel('Number frequency')
    plt.ylabel('Frequency')

plot_sparse(df, 'product_id')

plot_sparse(df, 'user_id')

plot_sparse(df, 'brand')

def plot_top20(df, col):
    stats = df[[col, 'target']].groupby(col).agg(['count', 'mean', 'sum'])
    stats = stats.reset_index()
    stats.columns = [col, 'count', 'mean', 'sum']
    stats = stats.sort_values('count', ascending=False)
    fig, ax1 = plt.subplots(figsize=(15,4))
    ax2 = ax1.twinx()
    ax1.bar(stats[col].astype(str).values[0:20], stats['count'].values[0:20])
    ax1.set_xticklabels(stats[col].astype(str).values[0:20], rotation='vertical')
    ax2.plot(stats['mean'].values[0:20], color='red')
    ax2.set_ylim(0,1)
    ax2.set_ylabel('Mean Target')
    ax1.set_ylabel('Frequency')
    ax1.set_xlabel(col)
    ax1.set_title('Top20 ' + col + 's based on frequency')

plot_top20(df, 'product_id')

plot_top20(df, 'user_id')

plot_top20(df, 'brand')

df['date'] = pd.to_datetime(df['timestamp']).dt.date

plt.figure(figsize=(15,4))
plt.plot(df[['date', 'target']].groupby('date').target.mean())
plt.ylabel('average mean')
plt.xlabel('date')
plt.xticks(df[['date', 'target']].groupby('date').target.mean().index[::3], rotation='vertical')
print('')

df[['date', 'target']].groupby('date').target.mean().sort_values().head(20)