library(dplyr, warn=F)
library(ggplot2)
# Read history dates for all articles
delay_df = file.path('data', 'delays.tsv.gz') %>%
readr::read_tsv(col_types = list(date = readr::col_date())) %>%
dplyr::mutate(year = lubridate::year(date)) %>%
dplyr::mutate(date_decimal = lubridate::decimal_date(date))
head(delay_df, 2)
journal_nlm_id | pubmed_id | delay_type | date | delay | year | date_decimal | |
---|---|---|---|---|---|---|---|
1 | 0001027 | 22221113 | Acceptance | 2011-11-15 | 111 | 2011 | 2011.871 |
2 | 0001027 | 22221113 | Publication | 2012-01-05 | 51 | 2012 | 2012.011 |
journal_df = file.path('data', 'pubmed-journals.tsv') %>%
readr::read_tsv() %>%
dplyr::transmute(journal_nlm_id = NlmId, journal_abbrev = MedAbbr, journal = JournalTitle) %>%
dplyr::inner_join(
delay_df %>%
dplyr::group_by(journal_nlm_id, delay_type) %>%
dplyr::summarize(
n_articles = n(),
median_delay = median(delay),
mean_delay = signif(mean(delay), 6),
n_unique_dates = length(unique(date)),
n_unique_years = length(unique(year)),
n_unique_delays = length(unique(delay))
)
)
head(journal_df, 2)
Joining by: "journal_nlm_id"
journal_nlm_id | journal_abbrev | journal | delay_type | n_articles | median_delay | mean_delay | n_unique_dates | n_unique_years | n_unique_delays | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 0001027 | Aust N Z J Obstet Gynaecol | The Australian & New Zealand journal of obstetrics & gynaecology | Acceptance | 397 | 111 | 122.453 | 258 | 5 | 183 |
2 | 0001027 | Aust N Z J Obstet Gynaecol | The Australian & New Zealand journal of obstetrics & gynaecology | Publication | 325 | 56 | 68.3015 | 138 | 4 | 107 |
journal_df %>%
readr::write_tsv(file.path('data', 'journal-summaries.tsv'))
year_summary_df = delay_df %>%
dplyr::group_by(delay_type, year) %>%
dplyr::summarize(
n_journals = n_distinct(journal_nlm_id),
n_articles = n(),
median_delay = median(delay),
mean_delay = signif(mean(delay), 6)
)
year_summary_df %>%
readr::write_tsv(file.path('data', 'yearly-summaries.tsv'))
head(year_summary_df, 2)
delay_type | year | n_journals | n_articles | median_delay | mean_delay | |
---|---|---|---|---|---|---|
1 | Acceptance | 1965 | 1 | 85 | 91 | 114.788 |
2 | Acceptance | 1966 | 1 | 191 | 60 | 87.0785 |
summarize_year <- function(df) {
# Return delay percentiles from df
probs = seq(0, 1, 0.025)
dplyr::data_frame(
percentile = 100 * probs,
delay = quantile(df$delay, probs=probs)
)
}
year_percentile_df = delay_df %>%
dplyr::group_by(delay_type, year) %>%
dplyr::do(summarize_year(.))
year_percentile_df %>%
format(digits=3) %>%
readr::write_tsv(file.path('data', 'yearly-percentiles.tsv'))
head(year_percentile_df, 2)
delay_type | year | percentile | delay | |
---|---|---|---|---|
1 | Acceptance | 1965 | 0 | 16 |
2 | Acceptance | 1965 | 2.5 | 23.3 |
xlab_accept = 'Date accepted'
xlab_publish = 'Date published online'
ylab_accept = 'Acceptance delay (days)'
ylab_publish = 'Publication delay (days)'
plot_years <- function(percentile_df, delay_df, min_year, max_delay, xlab, ylab) {
pctl_col = '#00693E'
gg = percentile_df %>%
ggplot2::ggplot(aes(x = year, y = delay, group = percentile)) +
ggplot2::geom_smooth(aes(group=NULL), data=delay_df, color='#B4B3B3', fill = '#B4B3B3', alpha=1) +
ggplot2::geom_line(size=0.1, color = pctl_col) +
ggplot2::geom_line(data = dplyr::filter(percentile_df, percentile == 50), size=0.8, color = pctl_col) +
ggplot2::geom_line(data = dplyr::filter(percentile_df, percentile %in% c(25, 75)), size=0.5, color = pctl_col) +
ggplot2::coord_cartesian(xlim = c(min_year, 2015), ylim = c(0, max_delay), expand = FALSE) +
ggplot2::theme_bw() +
ggplot2::xlab(xlab) +
ggplot2::ylab(ylab) +
ggplot2::theme(plot.margin=grid::unit(c(2, 10, 2, 2), 'points'))
return(gg)
}
# Acceptance delay plot
gg = plot_years(
percentile_df = year_percentile_df %>% dplyr::filter(delay_type == 'Acceptance'),
delay_df = delay_df %>% dplyr::filter(delay_type == 'Acceptance'),
min_year = 1981,
max_delay = 205,
xlab = xlab_accept,
ylab = ylab_accept
)
file.path('viz', 'acceptance-by-article.png') %>%
ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)
file.path('viz', 'acceptance-by-article.pdf') %>%
ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)
# Publication delay plot
gg = plot_years(
percentile_df = year_percentile_df %>% dplyr::filter(delay_type == 'Publication'),
delay_df = delay_df %>% dplyr::filter(delay_type == 'Publication'),
min_year = 2000,
max_delay = 105,
xlab = xlab_publish,
ylab = ylab_publish
)
file.path('viz', 'publication-by-article.png') %>%
ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)
file.path('viz', 'publication-by-article.pdf') %>%
ggplot2::ggsave(plot = gg, width = 5.5, height = 0.75 * 5.5)
journal_subset_df = journal_df %>%
dplyr::filter(n_articles >= 100) %>%
dplyr::filter(n_unique_dates >= 5) %>%
dplyr::filter(n_unique_delays >= 5) %>%
dplyr::select(journal_nlm_id:delay_type)
table(journal_subset_df$delay_type)
Acceptance Publication 3086 2756
plot_journal_history <- function(delay_df, journal_abbrev, xlab, ylab) {
# Plot all delays for a single journal and delay_type
x_box = boxplot.stats(delay_df$date_decimal, coef = 1.5)
xlimits = x_box$stats[c(1, 5)]
if(length(x_box$out) >= 35) {
xlimits = boxplot.stats(delay_df$date_decimal, coef = 2.75)$stats[c(1, 5)]
}
ylimits = c(0, boxplot.stats(delay_df$delay, coef = 2.8)$stats[5])
alpha = log(nrow(delay_df)) ^ -1.03 - 0.02
gg = delay_df %>%
ggplot2::ggplot(aes(x = date_decimal, y = delay)) +
ggplot2::geom_point(alpha=alpha, color='#e60000', size=0.8) +
ggplot2::geom_smooth(linetype=0, fill='#454343', alpha=0.26) +
ggplot2::coord_cartesian(xlim = xlimits, ylim = ylimits) +
ggplot2::theme_bw() +
ggplot2::xlab(xlab) +
ggplot2::ylab(ylab) +
ggplot2::theme(plot.margin=grid::unit(c(5, 12, 2, 2), 'points')) +
ggplot2::annotate('text', x = Inf, y = Inf, label = journal_abbrev, fontface='italic', hjust=1.12, vjust=2)
return(gg)
}
property_map = list(
Acceptance = list(directory = 'accept', xlab = xlab_accept, ylab = ylab_accept),
Publication = list(directory = 'publish', xlab = xlab_publish, ylab = ylab_publish)
)
for (i in 1:nrow(journal_subset_df)) {
# Iterate through journal-delay_type pairs
delay_type_ = journal_subset_df$delay_type[i]
props = property_map[[delay_type_]]
nlm_id = journal_subset_df$journal_nlm_id[i]
journal_abbrev = journal_subset_df$journal_abbrev[i]
gg = delay_df %>%
dplyr::filter(delay_type == delay_type_) %>%
dplyr::filter(journal_nlm_id == nlm_id) %>%
plot_journal_history(
journal_abbrev = journal_abbrev,
xlab = props$xlab,
ylab = props$ylab
)
path = file.path('viz', 'journal', props$directory, sprintf('%s.png', nlm_id))
ggplot2::ggsave(filename = path, plot = gg, width = 5.5, height = 3.8, dpi = 150)
}
get_beta = function(df) {
# Regress delay against date and return model information
model = lm(delay ~ date_decimal, data = df)
model_df = broom::tidy(model)
row_df = dplyr::data_frame(
slope = model_df$estimate[2],
slope_as_percent = 100 * model_df$estimate[2] / mean(df$delay),
p_value = model_df$p.value[2]
)
return(row_df)
}
slope_df = journal_subset_df %>%
dplyr::group_by(journal_nlm_id, delay_type) %>%
dplyr::inner_join(delay_df) %>%
dplyr::do(get_beta(.))
slope_df %>%
format(digits=3, scientific=3) %>%
readr::write_tsv(file.path('data', 'slopes.tsv'))
head(slope_df, 2)
journal_nlm_id | delay_type | slope | slope_as_percent | p_value | |
---|---|---|---|---|---|
1 | 0001027 | Acceptance | 2.38085 | 1.94429 | 0.385 |
2 | 0001027 | Publication | 14.75634 | 21.60469 | 4.01e-20 |
# Violin plots of per-journal change in delays
gg = slope_df %>%
ggplot2::ggplot(aes(x = delay_type, y = slope)) +
ggplot2::geom_violin(linetype=0, fill='#80a5f9') +
ggplot2::stat_summary(fun.y=mean, geom='point', size=6, color="white", shape='+') +
ggplot2::stat_summary(fun.y=quantile, geom='point', size=6, color="white", shape='|') +
ggplot2::geom_hline(yintercept = 0, linetype = 'dashed') +
ggplot2::theme_bw() +
ggplot2::ylab('Δ days of delay per year') +
ggplot2::scale_x_discrete(name = NULL) +
ggplot2::coord_flip(ylim = c(-20, 20)) +
ggplot2::theme(plot.margin=grid::unit(c(2, 2, 2, 2), 'points'))
file.path('viz', 'slope-distributions.png') %>%
ggplot2::ggsave(plot = gg, width = 5.5, height = 2.3)
file.path('viz', 'slope-distributions.pdf') %>%
ggplot2::ggsave(plot = gg, width = 5.5, height = 2.3, device = cairo_pdf)
# Medians of violins
slope_df %>%
dplyr::group_by(delay_type) %>%
dplyr::summarize(
percent_decreasing = mean(slope < 0),
lower = t.test(slope, conf.level = 0.99)$conf.int[1],
upper = t.test(slope, conf.level = 0.99)$conf.int[2]
)
delay_type | percent_decreasing | lower | upper | |
---|---|---|---|---|
1 | Acceptance | 0.4656513 | 0.02669577 | 1.379158 |
2 | Publication | 0.6901306 | -5.369522 | -3.420653 |