!pip install -q -U kaggle !pip install --upgrade --force-reinstall --no-deps kaggle !mkdir ~/.kaggle !cp /content/drive/MyDrive/kaggle.json ~/.kaggle/ !chmod 600 ~/.kaggle/kaggle.json !kaggle datasets download -d mkechinov/ecommerce-behavior-data-from-multi-category-store !gdown --id 1qZIwMbMgMmgDC5EoMdJ8aI9lQPsWA3-P !gdown --id 1x5ohrrZNhWQN4Q-zww0RmXOwctKHH9PT !gdown --id 1-Rov9fFtGJqb7_ePc6qH-Rhzxn0cIcKB !gdown --id 1zr_RXpGvOWN2PrWI6itWL8HnRsCpyqz8 !gdown --id 1g5WoIgLe05UMdREbxAjh0bEFgVCjA1UL import os import gc import glob import pandas as pd !unzip /content/ecommerce-behavior-data-from-multi-category-store.zip !rm /content/ecommerce-behavior-data-from-multi-category-store.zip list_gz_files = glob.glob('/content/*.gz') list_gz_files for file in list_gz_files: print(file) !gunzip $file snapshot = pd.read_csv("/content/2019-Oct.csv").sample(1000) snapshot.head() !mkdir -p /content/data/tmp snapshot.to_csv('/content/data/tmp/snapshot.csv', index=False) gc.collect() import os import gc import glob import pandas as pd from pathlib import Path file = '/content/data/tmp/snapshot.csv' df_tmp = pd.read_csv(file) df_tmp['session_purchase'] = df_tmp['user_session'] + '_' + df_tmp['product_id'].astype(str) df_purchase = df_tmp[df_tmp['event_type']=='purchase'] df_cart = df_tmp[df_tmp['event_type']=='cart'] df_purchase = df_purchase[df_purchase['session_purchase'].isin(df_cart['session_purchase'])] df_cart = df_cart[~(df_cart['session_purchase'].isin(df_purchase['session_purchase']))] df_cart['target'] = 0 df_purchase['target'] = 1 df = pd.concat([df_cart, df_purchase]) df = df.drop('category_id', axis=1) df = df.drop('session_purchase', axis=1) df.head() df[['cat_0', 'cat_1', 'cat_2']] = df['category_code'].str.split("\.", n = 3, expand = True).fillna('NA') # df[['cat_0', 'cat_1', 'cat_2', 'cat_3']] = df['category_code'].str.split("\.", n = 3, expand = True).fillna('NA') df['brand'] = df['brand'].fillna('NA') df = df.drop('category_code', axis=1) df['timestamp'] = pd.to_datetime(df['event_time'].str.replace(' UTC', '')) df['ts_hour'] = df['timestamp'].dt.hour df['ts_minute'] = df['timestamp'].dt.minute df['ts_weekday'] = df['timestamp'].dt.weekday df['ts_day'] = df['timestamp'].dt.day df['ts_month'] = df['timestamp'].dt.month df['ts_year'] = df['timestamp'].dt.year df.head() list_files = glob.glob('/content/*.csv') list_files def process_files(df_tmp, chunkname): df_tmp['session_purchase'] = df_tmp['user_session'] + '_' + df_tmp['product_id'].astype(str) df_purchase = df_tmp[df_tmp['event_type']=='purchase'] df_cart = df_tmp[df_tmp['event_type']=='cart'] df_purchase = df_purchase[df_purchase['session_purchase'].isin(df_cart['session_purchase'])] df_cart = df_cart[~(df_cart['session_purchase'].isin(df_purchase['session_purchase']))] df_cart['target'] = 0 df_purchase['target'] = 1 df = pd.concat([df_cart, df_purchase]) df = df.drop('category_id', axis=1) df = df.drop('session_purchase', axis=1) # df[['cat_0', 'cat_1', 'cat_2', 'cat_3']] = df['category_code'].str.split("\.", n = 3, expand = True).fillna('NA') df['brand'] = df['brand'].fillna('NA') # df = df.drop('category_code', axis=1) df['timestamp'] = pd.to_datetime(df['event_time'].str.replace(' UTC', '')) df['ts_hour'] = df['timestamp'].dt.hour df['ts_minute'] = df['timestamp'].dt.minute df['ts_weekday'] = df['timestamp'].dt.weekday df['ts_day'] = df['timestamp'].dt.day df['ts_month'] = df['timestamp'].dt.month df['ts_year'] = df['timestamp'].dt.year df.to_csv(chunkname, index=False) base_path_silver = "/content/data/silver" !mkdir -p $base_path_silver for idx, chunk in enumerate(list_files[:2]): chunkname = os.path.join(base_path_silver, Path(chunk).stem + '-' + str(idx) + '.csv') print(chunkname) chunksize = 10 ** 6 for file in list_files: print(file) for idx, chunk in enumerate(pd.read_csv(file, chunksize=chunksize)): chunkname = os.path.join(base_path_silver, Path(file).stem + '-' + str(idx) + '.csv') print(chunkname) if not os.path.exists(chunkname): process_files(chunk, chunkname) for file in list_files: !rm $file list_chunks = glob.glob(os.path.join(base_path_silver,'*.csv')) list_chunks[:10] !cd $base_path_silver && zip /content/data_silver.zip ./*.csv !cp data_silver.zip /content/drive/MyDrive/Recommendation lp = [] for file in list_chunks: lp.append(pd.read_csv(file)) df = pd.concat(lp) df.shape df.head() # df2 = df['category_code'].str.split("\.", n=3, expand=True).fillna('NA') # df2.columns = ['cat_{}'.format(x+1) for x in df2.columns] # df2.to_parquet('/content/data/silver_l2/df_cat.parquet', index=False) df_test = df[df['ts_month']==4] df_valid = df[df['ts_month']==3] df_train = df[(df['ts_month']!=3)&(df['ts_month']!=4)] df_train.shape, df_valid.shape, df_test.shape !mkdir -p /content/data/silver_l2 df_train.to_parquet('/content/data/silver_l2/train.parquet', index=False) df_valid.to_parquet('/content/data/silver_l2/valid.parquet', index=False) df_test.to_parquet('/content/data/silver_l2/test.parquet', index=False) !cd /content/data/silver_l2 && zip /content/data_silver_l2.zip ./*.parquet !cp /content/data_silver_l2.zip /content/drive/MyDrive/Recommendation import pandas as pd df_train = pd.read_parquet('/content/data/silver_l2/train.parquet') df_valid = pd.read_parquet('/content/data/silver_l2/valid.parquet') df_test = pd.read_parquet('/content/data/silver_l2/test.parquet') df = pd.concat([df_train, df_valid, df_test],ignore_index=True) df.shape import IPython import pandas as pd import matplotlib.pyplot as plt import warnings warnings.filterwarnings("ignore") %matplotlib inline df.dtypes df['timestamp'] = pd.to_datetime(df['timestamp']) df.target.mean() df['event_type'].value_counts(normalize=True) print('# of datapoints:' + str(df.shape)) print('# of unique users:' + str(df['user_id'].drop_duplicates().shape)) print('# of unique products:' + str(df['product_id'].drop_duplicates().shape)) print('# of unique sessions:' + str(df['user_session'].drop_duplicates().shape)) def plot_sparse(df, col): stats = df[[col, 'target']].groupby(col).agg(['count', 'mean', 'sum']) stats = stats.reset_index() stats.columns = [col, 'count', 'mean', 'sum'] stats_sort = stats['count'].value_counts().reset_index() stats_sort = stats_sort.sort_values('index') plt.figure(figsize=(15,4)) plt.bar(stats_sort['index'].astype(str).values[0:20], stats_sort['count'].values[0:20]) plt.title('Frequency of ' + str(col)) plt.xlabel('Number frequency') plt.ylabel('Frequency') plot_sparse(df, 'product_id') plot_sparse(df, 'user_id') plot_sparse(df, 'brand') def plot_top20(df, col): stats = df[[col, 'target']].groupby(col).agg(['count', 'mean', 'sum']) stats = stats.reset_index() stats.columns = [col, 'count', 'mean', 'sum'] stats = stats.sort_values('count', ascending=False) fig, ax1 = plt.subplots(figsize=(15,4)) ax2 = ax1.twinx() ax1.bar(stats[col].astype(str).values[0:20], stats['count'].values[0:20]) ax1.set_xticklabels(stats[col].astype(str).values[0:20], rotation='vertical') ax2.plot(stats['mean'].values[0:20], color='red') ax2.set_ylim(0,1) ax2.set_ylabel('Mean Target') ax1.set_ylabel('Frequency') ax1.set_xlabel(col) ax1.set_title('Top20 ' + col + 's based on frequency') plot_top20(df, 'product_id') plot_top20(df, 'user_id') plot_top20(df, 'brand') df['date'] = pd.to_datetime(df['timestamp']).dt.date plt.figure(figsize=(15,4)) plt.plot(df[['date', 'target']].groupby('date').target.mean()) plt.ylabel('average mean') plt.xlabel('date') plt.xticks(df[['date', 'target']].groupby('date').target.mean().index[::3], rotation='vertical') print('') df[['date', 'target']].groupby('date').target.mean().sort_values().head(20)