# Check Python Version
!python --version

# Check Ubuntu Version
!lsb_release -a

# Check CUDA/cuDNN Version
!nvcc -V && which nvcc

# Check GPU
!nvidia-smi

# This gets the RAPIDS-Colab install files and checks your GPU. Run this and the next cell only.
# Please read the output of this cell. If your Colab instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/env-check.py

# This will update the Colab environment and restart the kernel. Don't run the next cell until you see the session crash.
!bash rapidsai-csp-utils/colab/update_gcc.sh
import os
os._exit(00)

# This will install CondaColab and restart your kernel one last time. Run this cell by itself and only run the next cell once you see the session crash.
import condacolab
condacolab.install()

# You can now run the rest of the cells as normal.
import condacolab
condacolab.check()

# Installing RAPIDS is now 'python rapidsai-csp-utils/colab/install_rapids.py <release> <packages>'.
# The <release> options are 'stable' and 'nightly'. Leaving it blank or adding any other word will default to stable.
# The <packages> options are blank (default) or 'core'. By default, we install RAPIDSAI and BlazingSQL; the 'core' option will install only RAPIDSAI and not include BlazingSQL.
!python rapidsai-csp-utils/colab/install_rapids.py stable

# import IPython
# import pandas as pd
# import cudf
# import numpy as np
# import cupy
# import matplotlib.pyplot as plt

# !cp /content/drive/MyDrive/Recommendation/data_silver_l2.zip /content
# !unzip /content/data_silver_l2.zip

# df_train = cudf.read_parquet('/content/train.parquet')
# df_valid = cudf.read_parquet('/content/valid.parquet')
# df_test = cudf.read_parquet('/content/test.parquet')

# df_train.isna().sum()

# Split the dot-separated 'category_code' column into cat_0..cat_3 levels and drop the original.
# _temp = df_train['category_code'].str.split(".", n=3, expand=True).fillna('NA')
# _temp.columns = ['cat_{}'.format(x) for x in _temp.columns]
# df_train.drop('category_code', axis=1, inplace=True)
# df_train = df_train.join(_temp)

# _temp = df_valid['category_code'].str.split(".", n=3, expand=True).fillna('NA')
# _temp.columns = ['cat_{}'.format(x) for x in _temp.columns]
# df_valid.drop('category_code', axis=1, inplace=True)
# df_valid = df_valid.join(_temp)

# _temp = df_test['category_code'].str.split(".", n=3, expand=True).fillna('NA')
# _temp.columns = ['cat_{}'.format(x) for x in _temp.columns]
# df_test.drop('category_code', axis=1, inplace=True)
# df_test = df_test.join(_temp)

# !mkdir -p /content/data/silver_l3
# df_train.to_parquet('/content/data/silver_l3/train.parquet', index=False)
# df_valid.to_parquet('/content/data/silver_l3/valid.parquet', index=False)
# df_test.to_parquet('/content/data/silver_l3/test.parquet', index=False)

# !cd /content/data/silver_l3 && zip /content/data_silver_l3.zip ./*.parquet
# !cp /content/data_silver_l3.zip /content/drive/MyDrive/Recommendation

!pip install nvtabular

import glob
import nvtabular as nvt
from nvtabular import ops

!cp /content/drive/MyDrive/Recommendation/data_silver_l3.zip /content
!unzip /content/data_silver_l3.zip

train_paths = glob.glob('/content/train.parquet')
valid_paths = glob.glob('/content/valid.parquet')

train_dataset = nvt.Dataset(train_paths, engine='parquet', part_mem_fraction=0.15)
valid_dataset = nvt.Dataset(valid_paths, engine='parquet', part_mem_fraction=0.15)

# Peek at the data through the underlying dask_cudf DataFrame.
train_dataset.to_ddf().head()
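# Optional sanity check (a minimal sketch, not required for the workflow to run): the
# workflow defined below assumes the cat_0..cat_3, ts_*, price, timestamp and target
# columns produced by the earlier preprocessing step are present in the parquet files.
expected_cols = {
    'product_id', 'brand', 'user_id', 'user_session',
    'cat_0', 'cat_1', 'cat_2', 'cat_3',
    'ts_hour', 'ts_minute', 'ts_weekday', 'ts_day', 'ts_month', 'ts_year',
    'price', 'timestamp', 'target',
}
assert expected_cols.issubset(set(train_dataset.to_ddf().columns))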
# Note: this notebook targets the older NVTabular 0.x Workflow API
# (cat_names/cont_names/label_name plus add_feature); newer releases use a different,
# column-selector-based API.
proc = nvt.Workflow(
    cat_names=['product_id', 'brand', 'user_id', 'user_session',
               'cat_0', 'cat_1', 'cat_2', 'cat_3',
               'ts_hour', 'ts_minute', 'ts_weekday', 'ts_day', 'ts_month', 'ts_year'],
    cont_names=['price', 'timestamp'],
    label_name=['target']
)

proc.add_feature([
    # Cross selected columns with user_id, e.g. product_id_user_id, brand_user_id, ...
    ops.LambdaOp(
        op_name='user_id',
        f=lambda col, gdf: col.astype(str) + '_' + gdf['user_id'].astype(str),
        columns=['product_id', 'brand', 'ts_hour', 'ts_minute'],
        replace=False
    ),
    # Cross selected columns with user_id and brand, e.g. ts_hour_user_id_brand, ...
    ops.LambdaOp(
        op_name='user_id_brand',
        f=lambda col, gdf: col.astype(str) + '_' + gdf['user_id'].astype(str) + '_' + gdf['brand'].astype(str),
        columns=['ts_hour', 'ts_weekday', 'cat_0', 'cat_1', 'cat_2'],
        replace=False
    ),
    # Encode the base and crossed categorical columns as contiguous integer ids;
    # categories occurring fewer than 15 times share a single low-frequency id.
    ops.Categorify(
        freq_threshold=15,
        columns=[x + '_user_id' for x in ['product_id', 'brand', 'ts_hour', 'ts_minute']]
                + [x + '_user_id_brand' for x in ['ts_hour', 'ts_weekday', 'cat_0', 'cat_1', 'cat_2']]
                + ['product_id', 'brand', 'user_id', 'user_session',
                   'cat_0', 'cat_1', 'cat_2', 'cat_3',
                   'ts_hour', 'ts_minute', 'ts_weekday', 'ts_day', 'ts_month', 'ts_year']
    ),
    # Cross selected columns with product_id, e.g. brand_product_id, user_id_product_id, cat_0_product_id
    ops.LambdaOp(
        op_name='product_id',
        f=lambda col, gdf: col.astype(str) + '_' + gdf['product_id'].astype(str),
        columns=['brand', 'user_id', 'cat_0'],
        replace=False
    ),
    # Join group-by statistics for the categorical columns back onto each row
    # (count statistics only, since no continuous columns are passed).
    ops.JoinGroupby(
        cont_names=[]
    ),
    # Out-of-fold target encoding of selected categorical columns and combinations.
    ops.TargetEncoding(
        cat_groups=['brand', 'user_id', 'product_id', 'cat_2', ['ts_weekday', 'ts_day']],
        cont_target='target',
        kfold=5,
        fold_seed=42,
        p_smooth=20,
    )
])
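# A minimal sketch of applying the workflow with this older NVTabular Workflow API:
# statistics (Categorify mappings, group-by counts, target-encoding folds) are computed
# on the training set and reused for validation. The output paths below are hypothetical
# and the exact apply() parameters may differ between NVTabular versions.
!mkdir -p /content/data/gold/train /content/data/gold/valid

# Fit the statistics on the training data and write the transformed parquet files.
proc.apply(train_dataset, record_stats=True, output_path='/content/data/gold/train')

# Transform the validation data with the statistics recorded from the training set.
proc.apply(valid_dataset, record_stats=False, output_path='/content/data/gold/valid')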