Loading and transforming the Epinions user-item interaction dataset
!pip install lenskit
!wget -q --show-progress https://github.com/RecoHut-Datasets/epinions/raw/v1/trust_data.txt
trust_data.txt 100%[===================>] 6.06M --.-KB/s in 0.08s
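A quick peek at the raw file helps confirm the format before parsing (it is read below as space-separated triples; this `head` call just prints the first few lines):
!head -n 3 trust_data.txt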
import pandas as pd
import lenskit.crossfold as xf
import numpy as np
import json
# The raw file is space-separated with no header row.
ratings = pd.read_csv('trust_data.txt', header=None, index_col=None, sep=' ')
# Drop all-NaN columns left over from trailing separators.
ratings.dropna(axis=1, how='all', inplace=True)
ratings.columns = ['user', 'item', 'rating']
print(ratings.head())
    user   item  rating
0  22605  42915       1
1  22605   5052       1
2  22605  42913       1
3  22605  18420       1
4  22605  42914       1
n_user = len(pd.unique(ratings.user))
n_item = len(pd.unique(ratings.item))
print("Num_of_users: {}\nNum_of_items: {}".format(n_user, n_item))
Num_of_users: 33960
Num_of_items: 49288
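From these counts we can gauge how sparse the interaction matrix is; a minimal sketch using the variables defined above:
# Density: observed user-item pairs over all possible pairs.
density = len(ratings) / (n_user * n_item)
print("Matrix density: {:.6f} ({:.4%})".format(density, density))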
ratings.head()
|   | user  | item  | rating |
|---|-------|-------|--------|
| 0 | 22605 | 42915 | 1      |
| 1 | 22605 | 5052  | 1      |
| 2 | 22605 | 42913 | 1      |
| 3 | 22605 | 18420 | 1      |
| 4 | 22605 | 42914 | 1      |
# Keep only users with at least 25 interactions.
user_counts = ratings.user.value_counts()
df_25 = ratings[ratings.user.isin(user_counts[user_counts >= 25].index)]
df_25 = df_25.reset_index(drop=True)
print("\033[4mCount after only keeping users with at least 25 relevant interactions\033[0m")
print("Num_of_users: {}\nNum_of_items: {}\nTotal_interactions: {}".format(len(pd.unique(df_25.user)), len(pd.unique(df_25.item)), len(df_25)))
Count after only keeping users with at least 25 relevant interactions
Num_of_users: 4718
Num_of_items: 36165
Total_interactions: 346035
print(df_25.head())
   user   item  rating
0  2824   2696       1
1  2824  14915       1
2  2824  18333       1
3  2824   2143       1
4  2824  10308       1
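A quick sanity check that the filter behaved as intended, using `df_25` and `ratings` from above:
# Every remaining user should have at least 25 interactions.
min_count = df_25.user.value_counts().min()
assert min_count >= 25, min_count
print("Retained {:.2%} of the original interactions".format(len(df_25) / len(ratings)))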
def get_unique_id(data_pd: pd.DataFrame, column: str) -> (dict, pd.DataFrame):
    """
    Map the raw values in `column` to dense integer ids.
    :param data_pd: pd.DataFrame to re-index
    :param column: name of the column to map
    :return: ({value: id} mapping, DataFrame with a new '<column>_id' column)
    """
    new_column = '{}_id'.format(column)
    assert new_column not in data_pd.columns
    # Unique values in order of first appearance; the reset index becomes the id.
    temp = data_pd.loc[:, [column]].drop_duplicates().reset_index(drop=True)
    temp[new_column] = temp.index
    temp.index = temp[column]
    del temp[column]
    data_pd = pd.merge(left=data_pd, right=temp,
                       left_on=column, right_index=True, how='left')
    return temp[new_column].to_dict(), data_pd
_, df_25 = get_unique_id(df_25, 'user')
_, df_25 = get_unique_id(df_25, 'item')
print(df_25.head())
   user   item  rating  user_id  item_id
0  2824   2696       1        0        0
1  2824  14915       1        0        1
2  2824  18333       1        0        2
3  2824   2143       1        0        3
4  2824  10308       1        0        4
n_user = df_25.user_id.drop_duplicates().size
n_item = df_25.item_id.drop_duplicates().size
print(n_user, n_item)
4718 36165
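Because `get_unique_id` assigns ids from a freshly reset index, the new ids should be dense integers covering `[0, n)`; a minimal check:
# Ids should be contiguous: min 0, max n-1, with n distinct values.
assert df_25.user_id.min() == 0 and df_25.user_id.max() == n_user - 1
assert df_25.item_id.min() == 0 and df_25.item_id.max() == n_item - 1
assert df_25.user_id.nunique() == n_user and df_25.item_id.nunique() == n_item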
import os

dataset_meta_info = {'dataset_size': len(df_25),
                     'user_size': n_user,
                     'item_size': n_item}

with open('dataset_meta_info.json', 'w') as f:
    json.dump(dataset_meta_info, f)
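Reloading the file is a cheap way to confirm the metadata round-trips:
with open('dataset_meta_info.json') as f:
    print(json.load(f))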
seeds = [1, 777, 1992, 2003, 2020]
for seed in seeds:
    for tp in xf.partition_users(df_25, partitions=1, method=xf.SampleN(20), rng_spec=seed):
        save_path = '.'
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # SampleN(20) puts 20 interactions per user into tp.test; those form the
        # training split here, and the remainder is held out for testing.
        train = tp.test
        test = tp.train
        # Note: each seed overwrites the files written by the previous one.
        train.to_csv(os.path.join(save_path, 'train.csv'))
        test.to_csv(os.path.join(save_path, 'test.csv'))
        print(len(tp.train))
        print(len(tp.test))
251675
94360
251675
94360
251675
94360
251675
94360
251675
94360
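Since `SampleN(20)` draws exactly 20 interactions per user into the sampled partition (used here as the training split), each user should appear exactly 20 times in `train.csv`, and the two files should partition the rows of `df_25`; a minimal check (`split_train` and `split_test` are throwaway names for this sketch):
split_train = pd.read_csv('train.csv', index_col=0)
split_test = pd.read_csv('test.csv', index_col=0)
# Exactly 20 training interactions per user.
assert split_train.groupby('user').size().eq(20).all()
# The preserved row indices of the two splits should be disjoint.
assert len(split_train.index.intersection(split_test.index)) == 0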
train_df = pd.read_csv('train.csv', index_col=0)
train_df.head()
|       | user | item | rating | user_id | item_id |
|-------|------|------|--------|---------|---------|
| 86621 | 1    | 14   | 1      | 1201    | 5233    |
| 86470 | 1    | 77   | 1      | 1201    | 2226    |
| 86531 | 1    | 163  | 1      | 1201    | 1312    |
| 86603 | 1    | 297  | 1      | 1201    | 6344    |
| 86451 | 1    | 319  | 1      | 1201    | 426     |
train_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 94360 entries, 86621 to 238809
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   user     94360 non-null  int64
 1   item     94360 non-null  int64
 2   rating   94360 non-null  int64
 3   user_id  94360 non-null  int64
 4   item_id  94360 non-null  int64
dtypes: int64(5)
memory usage: 4.3 MB
train_df.describe().T
|         | count   | mean        | std         | min | 25%    | 50%    | 75%    | max     |
|---------|---------|-------------|-------------|-----|--------|--------|--------|---------|
| user    | 94360.0 | 6666.024375 | 6900.551097 | 1.0 | 1652.0 | 4199.0 | 9581.0 | 47624.0 |
| item    | 94360.0 | 6524.838650 | 9194.674987 | 1.0 | 729.0  | 2287.0 | 8658.0 | 49046.0 |
| rating  | 94360.0 | 1.000000    | 0.000000    | 1.0 | 1.0    | 1.0    | 1.0    | 1.0     |
| user_id | 94360.0 | 2358.500000 | 1361.976471 | 0.0 | 1179.0 | 2358.5 | 3538.0 | 4717.0  |
| item_id | 94360.0 | 4957.928116 | 7146.040509 | 0.0 | 684.0  | 2012.0 | 5690.0 | 36158.0 |
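The summary confirms that `rating` is constant at 1, so these are implicit-feedback interactions rather than graded ratings; a one-line check:
# All ratings are 1 (implicit feedback).
assert (train_df.rating == 1).all()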
!ls -al .
total 15684
drwxr-xr-x 1 root root    4096 Nov 25 14:04 .
drwxr-xr-x 1 root root    4096 Nov 25 13:08 ..
drwxr-xr-x 4 root root    4096 Nov 18 14:35 .config
-rw-r--r-- 1 root root      63 Nov 25 14:04 dataset_meta_info.json
drwxr-xr-x 1 root root    4096 Nov 18 14:36 sample_data
-rw-r--r-- 1 root root 7020158 Nov 25 14:04 test.csv
-rw-r--r-- 1 root root 2655420 Nov 25 14:04 train.csv
-rw-r--r-- 1 root root 6357397 Nov 25 13:13 trust_data.txt
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d
Author: Sparsh A.

Last updated: 2021-11-25 12:56:37

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

numpy  : 1.19.5
pandas : 1.1.5
lenskit: 0.13.1
json   : 2.0.9
IPython: 5.5.0
END