# Quietly evaluate an expression: suppresses warnings, messages, and
# package startup messages. Used to keep library loading output clean.
# NOTE: the name deliberately differs from base::suppressWarnings (one 'p')
# so the broader wrapper does not shadow the base function.
supressWarnings <- function(expr) {
  suppressPackageStartupMessages(suppressWarnings(suppressMessages(expr)))
}
supressWarnings({
  # plyr is loaded BEFORE tidyverse so that dplyr's verbs (mutate,
  # summarise, rename, ...) mask plyr's versions rather than the reverse —
  # loading plyr after dplyr is a well-known source of silent breakage.
  library(plyr)
  library(tidyverse)
  library(httr)
  library(jsonlite)
  library(glue)
  library(tictoc)
})
# Compact numeric printing and no scientific notation in rendered output.
options(digits = 3, scipen = 50)
# Raw ContentTranslation dataset (private; kept out of version control).
cx_data <- read.delim('secrets/cx_data_raw.tsv', sep='\t')
# Check whether machine translation is available for a language pair
# via the Wikimedia cxserver API.
#
# @param lang1 Source language code (e.g. 'en').
# @param lang2 Target language code (e.g. 'es').
# @return TRUE if at least one MT service supports the pair, FALSE otherwise.
#         Raises an error on HTTP failure instead of parsing an error page.
check_mt_availability <- function(lang1, lang2) {
  url <- glue('https://cxserver.wikimedia.org/v1/list/mt/{lang1}/{lang2}')
  response <- GET(url)
  # Fail loudly on 4xx/5xx responses rather than feeding an error body
  # to fromJSON(), which would produce a confusing parse error.
  stop_for_status(response)
  # Explicit encoding avoids httr's "No encoding supplied" guessing.
  services <- fromJSON(content(response, 'text', encoding = 'UTF-8'))
  length(services) > 0
}
# Derive plain language codes from the wiki database names
# (e.g. 'enwiki' -> 'en'); sub() replaces only the first occurrence.
cx_data$source_lang_code <- sub("wiki", "", cx_data$source_db)
cx_data$target_lang_code <- sub("wiki", "", cx_data$target_db)
# Unique (source, target) language pairs to query MT availability for;
# distinct() with column arguments keeps only those two columns.
lang_pairs <- cx_data %>%
  distinct(source_lang_code, target_lang_code)
# Query cxserver once per language pair (network-bound; timed with tictoc).
tic()
# NOTE(review): mapply's default USE.NAMES = TRUE attaches the source
# language codes as names on the result vector; harmless here but confirm.
lang_pairs$is_mt_available <- mapply(check_mt_availability, lang_pairs$source_lang_code, lang_pairs$target_lang_code)
toc()
# Cache results so the API need not be re-queried on notebook re-runs.
write.table(lang_pairs, "mt_availability.tsv", sep = "\t", row.names = FALSE, quote = FALSE)
# Reload the cached availability table and attach it to the main dataset.
lang_pairs <- read.delim('mt_availability.tsv', sep = '\t')
# Key columns share names on both sides, so a single `by` suffices.
cx_data <- merge(cx_data, lang_pairs,
                 by = c('source_lang_code', 'target_lang_code'))
# Wiki comparison snapshot: per-project size metrics from wikimedia-research.
wiki_comp_path <- 'https://raw.githubusercontent.com/wikimedia-research/wiki-comparison/main/data-collection/snapshots/Jan_2024.tsv'
wiki_comp <- read.delim(wiki_comp_path, sep = '\t')
# Keep only Wikipedia projects and the columns needed for ranking.
wp_comp <- wiki_comp[wiki_comp$project.code == 'wikipedia',
                     c('database.code', 'overall.size.rank', 'project.code')]
# Re-rank 1..n among Wikipedias only; assumes the snapshot rows are already
# ordered by overall size — TODO confirm against the source data.
wp_comp$overall.size.rank <- seq_len(nrow(wp_comp))
# Bin the overall size rank into coarse buckets. The source and target
# wikis use identical bins, so the cut() is computed only once.
rank_bin_edges <- c(0, 5, 10, 20, 50, Inf)
rank_bin_labels <- c('1-5', '6-10', '11-20', '21-50', '51-max')
rank_bins <- cut(
  wp_comp$overall.size.rank,
  breaks = rank_bin_edges,
  labels = rank_bin_labels,
  include.lowest = TRUE
)
wp_comp$target_wp_rank_bin <- rank_bins
wp_comp$source_wp_rank_bin <- rank_bins
# Attach the rank bins to each translation by target and source wiki.
# NOTE(review): merge() defaults to an inner join, so rows whose wiki is
# absent from the comparison snapshot are silently dropped here — confirm
# that is intended.
cx_data <- merge(cx_data, wp_comp[, c('database.code', 'target_wp_rank_bin')], by.x='target_db', by.y='database.code')
cx_data <- merge(cx_data, wp_comp[, c('database.code', 'source_wp_rank_bin')], by.x='source_db', by.y='database.code')
# Parse the compact yyyymmddHHMMSS timestamps into POSIXct (UTC).
parse_cx_ts <- function(ts) {
  as.POSIXct(as.character(ts), format = '%Y%m%d%H%M%S', tz = 'UTC')
}
cx_data$cx_start_ts <- parse_cx_ts(cx_data$cx_start_ts)
cx_data$cx_update_ts <- parse_cx_ts(cx_data$cx_update_ts)
# Translation session duration in minutes.
cx_data$duration_mins <- as.numeric(difftime(cx_data$cx_update_ts, cx_data$cx_start_ts, units = 'min'))
# Per-column NA counts, keeping only columns that actually contain NAs.
na_counts <- colSums(is.na(cx_data))
nulls <- data.frame(
  col = names(na_counts),
  null_count = na_counts
) %>% filter(null_count > 0)
nulls
| col | null_count |
|---|---|
| target_bytes | 35 |
| time_since_prev_edit | 12663 |
| source_quality | 4120 |
| is_source_std_quality | 3563 |
| target_quality | 37239 |
| is_target_std_quality | 31326 |
## Summary

Null values were found in `time_since_prev_edit` and `is_target_std_quality`, which will be explored and addressed below. Although `source_quality` and `target_quality` data has been gathered, they won't be used in the final analysis, so the null values in those columns are not a concern. The same applies to the `target_bytes` and `is_source_std_quality`
columns, which will be omitted during the final export.

prev_edit_nulls <- cx_data[is.na(cx_data$time_since_prev_edit), ]
# How many rows lack time_since_prev_edit, and how many of those belong to
# users making their very first edit (no previous edit exists by definition).
paste('Number of observations having null values for time since previous edit:', nrow(prev_edit_nulls))
# sum() is used instead of table(...)[['True']] so this does not error
# when no row has is_first_edit == 'True'.
paste('^ of the above number of observations belonging to users making the first edit:', sum(prev_edit_nulls$is_first_edit == 'True', na.rm = TRUE))
All of the null values in the `time_since_prev_edit` column exist because the edit was the first edit by the user. These nulls will be replaced with 0.
cx_data <- cx_data %>% mutate(time_since_prev_edit = coalesce(time_since_prev_edit, 0))
The null values in `is_target_std_quality` will be filled based on `is_page_deleted`: most of them occur because the page had already been deleted by the time the content gap metrics were calculated.
# Impute missing target-quality flags: deleted pages could not be scored,
# so assume below-standard (0) when the page was deleted, standard (1)
# otherwise.
# NOTE(review): if_else() is type-strict — this assumes is_target_std_quality
# holds numeric 0/1 (not 'True'/'False' strings); confirm upstream schema.
cx_data <- cx_data %>%
mutate(is_target_std_quality = if_else(is.na(is_target_std_quality),
if_else(is_page_deleted == 'True', 0, 1),
is_target_std_quality))
# Columns retained for modelling, grouped by what they describe.
final_cols <- c(
  # unique identifier
  'target_rev_id',
  # source article related variables
  'source_bytes', 'is_source_std_quality', 'is_source_human', 'source_wp_rank_bin',
  # target article related variables
  'target_bytes', 'is_target_std_quality', 'target_wp_rank_bin',
  # translation related variables
  'is_mt_available', 'mt_pct', 'human_pct', 'duration_mins', 'is_mobile_edit',
  # user related variables
  'is_first_edit', 'user_edit_bucket', 'user_rights_level', 'time_since_prev_edit',
  # translations by user in preceding time frames
  'creations_1hr', 'creations_6hr', 'creations_24hr', 'creations_72hr',
  'creations_7days', 'creations_15days', 'creations_30days',
  # outcome variable
  'is_page_deleted'
)
cx_data_final <- cx_data[, final_cols]
# The column's unit is seconds; rename to make that explicit downstream.
colnames(cx_data_final)[colnames(cx_data_final) == 'time_since_prev_edit'] <- 'secs_since_prev_edit'
# Normalise mixed truthy encodings ('True'/'False' strings, 0/1 numerics)
# into a plain logical vector. Values matching neither encoding — and NAs —
# map to NA, mirroring the nested-ifelse behaviour this replaces.
convert_to_logical <- function(values) {
  truthy <- values == 'True' | values == 1
  falsy <- values == 'False' | values == 0
  out <- rep(NA, length(values))
  out[which(falsy)] <- FALSE
  out[which(truthy)] <- TRUE
  as.logical(out)
}
# Columns stored with mixed truthy encodings — convert them all at once.
logical_cols <- c('is_source_std_quality', 'is_source_human', 'is_target_std_quality',
                  'is_mt_available', 'is_first_edit', 'is_mobile_edit', 'is_page_deleted')
cx_data_final[logical_cols] <- lapply(cx_data_final[logical_cols], convert_to_logical)
# Ordered categorical levels, listed from lowest to highest.
edit_buckets <- c('1-10', '11-99', '100-999', '1000-4999', '5000+')
wp_ranks <- c('1-5', '6-10', '11-20', '21-50', '51-max')
user_rights <- c('none', 'confirmed', 'extended')
# Small helper: coerce a column to an ordered factor with the given levels.
as_ordered <- function(x, lv) factor(x, levels = lv, ordered = TRUE)
cx_data_final$source_wp_rank_bin <- as_ordered(cx_data_final$source_wp_rank_bin, wp_ranks)
cx_data_final$target_wp_rank_bin <- as_ordered(cx_data_final$target_wp_rank_bin, wp_ranks)
cx_data_final$user_edit_bucket <- as_ordered(cx_data_final$user_edit_bucket, edit_buckets)
cx_data_final$user_rights_level <- as_ordered(cx_data_final$user_rights_level, user_rights)
# Final type coercions, plus scaling the MT/human fractions to percentages.
cx_data_final$target_bytes <- as.integer(cx_data_final$target_bytes)
cx_data_final$duration_mins <- as.numeric(cx_data_final$duration_mins)
cx_data_final$mt_pct <- 100 * cx_data_final$mt_pct
cx_data_final$human_pct <- 100 * cx_data_final$human_pct
# Re-check NA counts after imputation and type conversion.
na_counts_final <- colSums(is.na(cx_data_final))
nulls_final <- data.frame(
  col = names(na_counts_final),
  null_count = na_counts_final
) %>% filter(null_count > 0)
nulls_final
| col | null_count |
|---|---|
| is_source_std_quality | 3563 |
| is_source_human | 3404 |
| target_bytes | 35 |
# Drop the remaining rows with NAs in columns that cannot be imputed.
cx_data_final <- cx_data_final %>%
  drop_na(is_source_std_quality, target_bytes, is_source_human)
# Share of the collected data lost to the NA handling above.
final_null_pct <- round((1 - nrow(cx_data_final) / nrow(cx_data)) * 100, 2)
glue('Percentage of collected data omitted due to null values: {final_null_pct}%')
row.names(cx_data_final) <- NULL
# Persist both an RDS copy (types preserved) and a TSV copy (portable).
saveRDS(cx_data_final, 'secrets/cx_data.rds')
write.table(cx_data_final, "secrets/cx_data.tsv", sep = "\t", row.names = FALSE, quote = FALSE)
# Show the final column classes as a sanity check.
sapply(cx_data_final, class)