# Split the BookCrossing explicit and implicit rating data into training and
# test sets for a train-test sweep.
library(readr)
library(dplyr)
# Session display setting. `repr.matrix.max.rows` is presumably the
# repr/IRkernel option that limits how many rows of a table are rendered in
# notebook output -- TODO confirm.
options(repr.matrix.max.rows = 20)
# ---- Explicit ratings: train/test split ----
#
# Reads build/bx-ratings.csv, samples a set of test users, holds out a fixed
# number of randomly chosen ratings for each of them, and writes the held-out
# ratings and the remaining ratings to separate CSV files.
#
# NOTE: sampling is unseeded, so every run produces a different split.

# Split parameters (named so the two filters below cannot drift apart).
n.test.users = 5000     # number of users sampled for testing
min.user.ratings = 10   # minimum ratings a user needs to be eligible
n.test.ratings = 5      # ratings held out per test user

# Load the ratings (two integer columns and one double) and use the
# `user` / `item` column names for the rest of the script.
ratings = read_csv(
  "build/bx-ratings.csv",
  col_names = TRUE,
  col_types = "iid"
) %>%
  rename(user = userID, item = bookID)
dim(ratings)

# Eligible users have at least `min.user.ratings` ratings; sample the test
# users from them. sample_n() draws without replacement, so it fails if fewer
# than `n.test.users` users are eligible.
explicit.test.users = ratings %>%
  group_by(user) %>%
  summarize(nratings = n()) %>%
  filter(nratings >= min.user.ratings) %>%
  sample_n(n.test.users)
explicit.test.users
dim(explicit.test.users)

# Give every rating a random rank within its user (a permutation of 1..n),
# so that "rank <= k" selects k random ratings per user.
ratings.group = ratings %>%
  group_by(user) %>%
  mutate(urid = sample(n())) %>%
  ungroup()

# Test set: for each sampled user, the ratings ranked 1..n.test.ratings.
# The join key is given explicitly instead of relying on a natural join.
test.ratings = explicit.test.users %>%
  select(user) %>%
  inner_join(ratings.group, by = "user") %>%
  filter(urid <= n.test.ratings) %>%
  select(user, item, rating)
dim(test.ratings)

# Training set: everything else. After the right join, `nratings` is NA for
# users that were not sampled, so all of their ratings are kept; sampled
# users keep only the ratings ranked above the holdout cut-off.
train.ratings = explicit.test.users %>%
  right_join(ratings.group, by = "user") %>%
  filter(urid > n.test.ratings | is.na(nratings)) %>%
  select(user, item, rating)
dim(train.ratings)

write_csv(test.ratings, "build/bx-ratings-test.csv")
write_csv(train.ratings, "build/bx-ratings-train.csv")
# ---- Implicit ratings: train/test split ----
#
# Same procedure as the explicit split above, applied to
# build/bx-implicit.csv. This section reassigns `ratings`, `ratings.group`,
# `test.ratings` and `train.ratings`.
#
# NOTE: sampling is unseeded, so every run produces a different split.

# Split parameters (named so the two filters below cannot drift apart).
n.test.users = 5000     # number of users sampled for testing
min.user.ratings = 10   # minimum ratings a user needs to be eligible
n.test.ratings = 5      # ratings held out per test user

# Load the implicit data (two integer columns and one double) and use the
# `user` / `item` column names for the rest of the script.
ratings = read_csv(
  "build/bx-implicit.csv",
  col_names = TRUE,
  col_types = "iid"
) %>%
  rename(user = userID, item = bookID)
dim(ratings)

# Eligible users have at least `min.user.ratings` rows; sample the test users
# from them. sample_n() draws without replacement, so it fails if fewer than
# `n.test.users` users are eligible.
implicit.test.users = ratings %>%
  group_by(user) %>%
  summarize(nratings = n()) %>%
  filter(nratings >= min.user.ratings) %>%
  sample_n(n.test.users)
implicit.test.users
dim(implicit.test.users)

# Give every row a random rank within its user (a permutation of 1..n),
# so that "rank <= k" selects k random rows per user.
ratings.group = ratings %>%
  group_by(user) %>%
  mutate(urid = sample(n())) %>%
  ungroup()

# Test set: for each sampled user, the rows ranked 1..n.test.ratings, with
# the rating value overwritten by a constant 1.
# The join key is given explicitly instead of relying on a natural join.
test.ratings = implicit.test.users %>%
  select(user) %>%
  inner_join(ratings.group, by = "user") %>%
  filter(urid <= n.test.ratings) %>%
  select(user, item, rating) %>%
  mutate(rating = 1)
dim(test.ratings)

# Training set: everything else. After the right join, `nratings` is NA for
# users that were not sampled, so all of their rows are kept; sampled users
# keep only the rows ranked above the holdout cut-off.
# NOTE(review): unlike the test set, the training rows keep the `rating`
# values read from the file rather than being set to 1 -- confirm that the
# consumers of the training file expect this.
train.ratings = implicit.test.users %>%
  right_join(ratings.group, by = "user") %>%
  filter(urid > n.test.ratings | is.na(nratings)) %>%
  select(user, item, rating)
dim(train.ratings)

write_csv(test.ratings, "build/bx-implicit-test.csv")
write_csv(train.ratings, "build/bx-implicit-train.csv")