!wget -q --show-progress https://files.grouplens.org/datasets/movielens/ml-1m.zip
ml-1m.zip 100%[===================>] 5.64M 5.84MB/s in 1.0s
import os
import shutil
import zipfile
import numpy as np
import scipy.sparse as sp
from collections import Counter
class GroupGenerator(object):
"""
Group Data Generator
"""
def __init__(self, data_path, output_path, rating_threshold, num_groups,
group_sizes, min_num_ratings, train_ratio, val_ratio,
negative_sample_size, verbose=False):
self.rating_threshold = rating_threshold
self.negative_sample_size = negative_sample_size
users_path = os.path.join(data_path, 'users.dat')
items_path = os.path.join(data_path, 'movies.dat')
ratings_path = os.path.join(data_path, 'ratings.dat')
users = self.load_users_file(users_path)
items = self.load_items_file(items_path)
rating_mat, timestamp_mat = \
self.load_ratings_file(ratings_path, max(users), max(items))
groups, group_ratings, groups_rated_items_dict, groups_rated_items_set = \
self.generate_group_ratings(users, rating_mat, timestamp_mat,
num_groups=num_groups,
group_sizes=group_sizes,
min_num_ratings=min_num_ratings)
members, group_ratings_train, group_ratings_val, group_ratings_test, \
group_negative_items_val, group_negative_items_test, \
user_ratings_train, user_ratings_val, user_ratings_test, \
user_negative_items_val, user_negative_items_test = \
self.split_ratings(group_ratings, rating_mat, timestamp_mat,
groups, groups_rated_items_dict, groups_rated_items_set,
train_ratio=train_ratio, val_ratio=val_ratio)
groups_path = os.path.join(output_path, 'groupMember.dat')
group_ratings_train_path = os.path.join(output_path, 'groupRatingTrain.dat')
group_ratings_val_path = os.path.join(output_path, 'groupRatingVal.dat')
group_ratings_test_path = os.path.join(output_path, 'groupRatingTest.dat')
group_negative_items_val_path = os.path.join(output_path, 'groupRatingValNegative.dat')
group_negative_items_test_path = os.path.join(output_path, 'groupRatingTestNegative.dat')
user_ratings_train_path = os.path.join(output_path, 'userRatingTrain.dat')
user_ratings_val_path = os.path.join(output_path, 'userRatingVal.dat')
user_ratings_test_path = os.path.join(output_path, 'userRatingTest.dat')
user_negative_items_val_path = os.path.join(output_path, 'userRatingValNegative.dat')
user_negative_items_test_path = os.path.join(output_path, 'userRatingTestNegative.dat')
self.save_groups(groups_path, groups)
self.save_ratings(group_ratings_train, group_ratings_train_path)
self.save_ratings(group_ratings_val, group_ratings_val_path)
self.save_ratings(group_ratings_test, group_ratings_test_path)
self.save_negative_samples(group_negative_items_val, group_negative_items_val_path)
self.save_negative_samples(group_negative_items_test, group_negative_items_test_path)
self.save_ratings(user_ratings_train, user_ratings_train_path)
self.save_ratings(user_ratings_val, user_ratings_val_path)
self.save_ratings(user_ratings_test, user_ratings_test_path)
self.save_negative_samples(user_negative_items_val, user_negative_items_val_path)
self.save_negative_samples(user_negative_items_test, user_negative_items_test_path)
shutil.copyfile(src=os.path.join(data_path, 'movies.dat'), dst=os.path.join(output_path, 'movies.dat'))
shutil.copyfile(src=os.path.join(data_path, 'users.dat'), dst=os.path.join(output_path, 'users.dat'))
if verbose:
num_group_ratings = len(group_ratings)
num_user_ratings = len(user_ratings_train) + len(user_ratings_val) + len(user_ratings_test)
num_rated_items = len(groups_rated_items_set)
print('Save data: ' + output_path)
print('# Users: ' + str(len(members)))
print('# Items: ' + str(num_rated_items))
print('# Groups: ' + str(len(groups)))
print('# U-I ratings: ' + str(num_user_ratings))
print('# G-I ratings: ' + str(num_group_ratings))
print('Avg. # ratings / user: {:.2f}'.format(num_user_ratings / len(members)))
print('Avg. # ratings / group: {:.2f}'.format(num_group_ratings / len(groups)))
print('Avg. group size: {:.2f}'.format(np.mean(list(map(len, groups)))))
def load_users_file(self, users_path):
users = []
with open(users_path, 'r') as file:
for line in file.readlines():
users.append(int(line.split('::')[0]))
return users
def load_items_file(self, items_path):
items = []
with open(items_path, 'r', encoding='iso-8859-1') as file:
for line in file.readlines():
items.append(int(line.split('::')[0]))
return items
def load_ratings_file(self, ratings_path, max_num_users, max_num_items):
rating_mat = sp.dok_matrix((max_num_users + 1, max_num_items + 1),
dtype=np.int)
timestamp_mat = rating_mat.copy()
with open(ratings_path, 'r') as file:
for line in file.readlines():
arr = line.replace('\n', '').split('::')
user, item, rating, timestamp = \
int(arr[0]), int(arr[1]), int(arr[2]), int(arr[3])
rating_mat[user, item] = rating
timestamp_mat[user, item] = timestamp
return rating_mat, timestamp_mat
def generate_group_ratings(self, users, rating_mat, timestamp_mat,
num_groups, group_sizes, min_num_ratings):
np.random.seed(0)
groups = set()
groups_ratings = []
groups_rated_items_dict = {}
groups_rated_items_set = set()
while len(groups) < num_groups:
group_id = len(groups) + 1
while True:
group = tuple(np.sort(
np.random.choice(users, np.random.choice(group_sizes),
replace=False)))
if group not in groups:
break
pos_group_rating_counter = Counter()
neg_group_rating_counter = Counter()
group_rating_list = []
group_rated_items = set()
for member in group:
_, items = rating_mat[member, :].nonzero()
pos_items = [item for item in items
if rating_mat[member, item] >= self.rating_threshold]
neg_items = [item for item in items
if rating_mat[member, item] < self.rating_threshold]
pos_group_rating_counter.update(pos_items)
neg_group_rating_counter.update(neg_items)
for item, num_ratings in pos_group_rating_counter.items():
if num_ratings == len(group):
timestamp = max([timestamp_mat[member, item]
for member in group])
group_rated_items.add(item)
group_rating_list.append((group_id, item, 1, timestamp))
for item, num_ratings in neg_group_rating_counter.items():
if (num_ratings == len(group)) \
or (num_ratings + pos_group_rating_counter[item] == len(group)):
timestamp = max([timestamp_mat[member, item]
for member in group])
group_rated_items.add(item)
group_rating_list.append((group_id, item, 0, timestamp))
if len(group_rating_list) >= min_num_ratings:
groups.add(group)
groups_rated_items_dict[group_id] = group_rated_items
groups_rated_items_set.update(group_rated_items)
for group_rating in group_rating_list:
groups_ratings.append(group_rating)
return list(groups), groups_ratings, groups_rated_items_dict, groups_rated_items_set
def split_ratings(self, group_ratings, rating_mat, timestamp_mat,
groups, groups_rated_items_dict, groups_rated_items_set, train_ratio, val_ratio):
num_group_ratings = len(group_ratings)
num_train = int(num_group_ratings * train_ratio)
num_test = int(num_group_ratings * (1 - train_ratio - val_ratio))
group_ratings = \
sorted(group_ratings, key=lambda group_rating: group_rating[-1])
group_ratings_train = group_ratings[:num_train]
group_ratings_val = group_ratings[num_train:-num_test]
group_ratings_test = group_ratings[-num_test:]
timestamp_split_train = group_ratings_train[-1][-1]
timestamp_split_val = group_ratings_val[-1][-1]
user_ratings_train = []
user_ratings_val = []
user_ratings_test = []
members = set()
users_rated_items_dict = {}
for group in groups:
for member in group:
if member in members:
continue
members.add(member)
user_rated_items = set()
_, items = rating_mat[member, :].nonzero()
for item in items:
if item not in groups_rated_items_set:
continue
user_rated_items.add(item)
if rating_mat[member, item] >= self.rating_threshold:
rating_tuple = (member, item, 1,
timestamp_mat[member, item])
else:
rating_tuple = (member, item, 0,
timestamp_mat[member, item])
if timestamp_mat[member, item] <= timestamp_split_train:
user_ratings_train.append(rating_tuple)
elif timestamp_split_train < timestamp_mat[member, item] <= timestamp_split_val:
user_ratings_val.append(rating_tuple)
else:
user_ratings_test.append(rating_tuple)
users_rated_items_dict[member] = user_rated_items
np.random.seed(0)
user_negative_items_val = self.get_negative_samples(
user_ratings_val, groups_rated_items_set, users_rated_items_dict)
user_negative_items_test = self.get_negative_samples(
user_ratings_test, groups_rated_items_set, users_rated_items_dict)
group_negative_items_val = self.get_negative_samples(
group_ratings_val, groups_rated_items_set, groups_rated_items_dict)
group_negative_items_test = self.get_negative_samples(
group_ratings_test, groups_rated_items_set, groups_rated_items_dict)
return members, group_ratings_train, group_ratings_val, group_ratings_test, \
group_negative_items_val, group_negative_items_test, \
user_ratings_train, user_ratings_val, user_ratings_test, \
user_negative_items_val, user_negative_items_test
def get_negative_samples(self, ratings, groups_rated_items_set, rated_items_dict):
negative_items_list = []
for sample in ratings:
sample_id, item, _, _ = sample
missed_items = groups_rated_items_set - rated_items_dict[sample_id]
negative_items = \
np.random.choice(list(missed_items), self.negative_sample_size,
replace=(len(missed_items) < self.negative_sample_size))
negative_items_list.append((sample_id, item, negative_items))
return negative_items_list
def save_groups(self, groups_path, groups):
with open(groups_path, 'w') as file:
for i, group in enumerate(groups):
file.write(str(i + 1) + ' '
+ ','.join(map(str, list(group))) + '\n')
def save_ratings(self, ratings, ratings_path):
with open(ratings_path, 'w') as file:
for rating in ratings:
file.write(' '.join(map(str, list(rating))) + '\n')
def save_negative_samples(self, negative_items, negative_items_path):
with open(negative_items_path, 'w') as file:
for samples in negative_items:
user, item, negative_items = samples
file.write('({},{}) '.format(user, item)
+ ' '.join(map(str, list(negative_items))) + '\n')
import pandas as pd
files_info = pd.DataFrame({
'Files': ['movies.dat', 'users.dat', 'groupMember.dat',
'group(user)RatingTrain.dat', 'group(user)RatingVal(Test).dat',
'group(user)RatingVal(Test)Negative.dat'],
'Description': ['Movie information file from MovieLens-1M',
'User information file from MovieLens-1M',
'File including group members. Each line is a group instance: groupID userID1,userID2,...',
'Train file. Each line is a training instance: groupID(userID) itemID rating timestamp',
'group (user) validation (test) file (positive instances). Each line is a validation (test) instance: groupID(userID) itemID rating timestamp',
'group (user) validation (test) file (negative instances). Each line corresponds to the line of group(user)RatingVal(Test).dat, containing 100 negative samples. Each line is in the format: (groupID(userID),itemID) negativeItemID1, negativeItemID2, ...',
]})
print(files_info.to_markdown())
| | Files | Description | |---:|:---------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | 0 | movies.dat | Movie information file from MovieLens-1M | | 1 | users.dat | User information file from MovieLens-1M | | 2 | groupMember.dat | File including group members. Each line is a group instance: groupID userID1,userID2,... | | 3 | group(user)RatingTrain.dat | Train file. Each line is a training instance: groupID(userID) itemID rating timestamp | | 4 | group(user)RatingVal(Test).dat | group (user) validation (test) file (positive instances). Each line is a validation (test) instance: groupID(userID) itemID rating timestamp | | 5 | group(user)RatingVal(Test)Negative.dat | group (user) validation (test) file (negative instances). Each line corresponds to the line of group(user)RatingVal(Test).dat, containing 100 negative samples. Each line is in the format: (groupID(userID),itemID) negativeItemID1, negativeItemID2, ... |
print('\nTakes approx. 5 mins...')
'Takes approx. 5 mins...'
data_folder_path = '.'
data_path = os.path.join(data_folder_path, 'ml-1m')
data_zip_path = os.path.join(data_folder_path, 'ml-1m.zip')
output_path = os.path.join(data_folder_path, 'MovieLens-Rand')
if not os.path.exists(data_path):
with zipfile.ZipFile(data_zip_path, 'r') as data_zip:
data_zip.extractall(data_folder_path)
print('Unzip file: ' + data_zip_path)
if not os.path.exists(output_path):
os.mkdir(output_path)
group_generator = GroupGenerator(data_path, output_path,
rating_threshold=4,
num_groups=1000,
group_sizes=[2, 3, 4, 5],
min_num_ratings=20,
train_ratio=0.7,
val_ratio=0.1,
negative_sample_size=100,
verbose=True)
Save data: ./MovieLens-Rand # Users: 1626 # Items: 1998 # Groups: 1000 # U-I ratings: 438129 # G-I ratings: 53248 Avg. # ratings / user: 269.45 Avg. # ratings / group: 53.25 Avg. group size: 2.19
!apt-get -qq install tree
Selecting previously unselected package tree. (Reading database ... 155222 files and directories currently installed.) Preparing to unpack .../tree_1.7.0-5_amd64.deb ... Unpacking tree (1.7.0-5) ... Setting up tree (1.7.0-5) ... Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
!tree --du -h -C .
. ├── [ 24M] ml-1m │ ├── [167K] movies.dat │ ├── [ 23M] ratings.dat │ ├── [5.4K] README │ └── [131K] users.dat ├── [5.6M] ml-1m.zip └── [ 55M] MovieLens-Rand ├── [ 14K] groupMember.dat ├── [218K] groupRatingTest.dat ├── [4.9M] groupRatingTestNegative.dat ├── [749K] groupRatingTrain.dat ├── [107K] groupRatingVal.dat ├── [2.5M] groupRatingValNegative.dat ├── [167K] movies.dat ├── [1.1M] userRatingTest.dat ├── [ 24M] userRatingTestNegative.dat ├── [7.4M] userRatingTrain.dat ├── [561K] userRatingVal.dat ├── [ 13M] userRatingValNegative.dat └── [131K] users.dat 84M used in 2 directories, 18 files
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d
Author: Sparsh A. Last updated: 2021-11-25 17:34:14 Compiler : GCC 7.5.0 OS : Linux Release : 5.4.104+ Machine : x86_64 Processor : x86_64 CPU cores : 2 Architecture: 64bit IPython: 5.5.0 numpy : 1.19.5 scipy : 1.4.1
END