quiet_library <- function(...) { suppressPackageStartupMessages(library(...)) }
quiet_library(hise)
quiet_library(ArchR)
quiet_library(dplyr)
quiet_library(purrr)
quiet_library(ggplot2)
/ | / \ . / |. \\\ / |. \\\ / `|. \\\ / |. \ / |\ \\#####\ / || ==###########> / || \\##==......\ / || ______ = =|__ /__ || \\\ ,--' ,----`-,__ ___/' --,-`-===================##========> \ ' ##_______ _____ ,--,__,=##,__ /// , __== ___,-,__,--'#' ===' `-' | ##,-/ -,____,---' \\####\\________________,--\\_##,/ ___ .______ ______ __ __ .______ / \ | _ \ / || | | | | _ \ / ^ \ | |_) | | ,----'| |__| | | |_) | / /_\ \ | / | | | __ | | / / _____ \ | |\ \\___ | `----.| | | | | |\ \\___. /__/ \__\ | _| `._____| \______||__| |__| | _| `._____|
Now, we'll use the HISE SDK package to retrieve the DEG results and motif annotations based on their file UUIDs. These will be placed in the cache/
subdirectory by default.
file_uuids <- list(
"fc83b89f-fd26-43b8-ac91-29c539703a45", # MAST DEG results
"1f1d68ad-a7dc-45fb-9f4b-67ce1e49790d", # Motif-Peak annotation matrices
"a7ecb9ba-26d3-4221-8610-bfc60c6c5c4b",
"66d7a494-2e0a-449e-8bfb-eea1e7ca1a25",
"f1f82eb9-201c-403a-86cd-5d1f5fec877d",
"cae2811c-3fdc-44ab-ae44-a365fb35ccb2",
"5d53e1f1-4f10-4b35-81ee-602b4a491160",
"aa410188-cfdc-4234-ba4a-21c40a8110c7", # Peak-to-gene correlations
"ea924bd2-e3f5-40b7-9d49-20219cc6eea2",
"4cd86b32-cba5-4721-a559-66f1a4ccb1f3",
"652db815-097b-460a-b314-202e578a37eb",
"e601997d-904b-4746-aadc-b22a582c6179",
"e7bab98a-7a15-421a-8e09-574a340dd499"
)
file_res <- cacheFiles(file_uuids)
p2g_files <- list.files(
"cache",
pattern = "peak-to-gene",
full.names = TRUE,
recursive = TRUE
)
p2g_types <- sub(".+-t(.+)_20.+", "t\\1", p2g_files)
type_p2g <- map(
p2g_files,
read.csv
)
names(type_p2g) <- p2g_types
anno_files <- list.files(
"cache",
pattern = "peak-motif-matches",
full.names = TRUE,
recursive = TRUE
)
anno_types <- sub(".+-t(.+)_20.+", "t\\1", anno_files)
type_peak_anno <- map(
anno_files,
readRDS
)
names(type_peak_anno) <- anno_types
We'll filter based on peak2gene correlation scores
cor_cutoff <- 0.3
motif_gene_sets <- map2(
type_peak_anno,
type_p2g,
function(peak_anno, p2g) {
res <- map(colnames(peak_anno),
function(motif) {
tf_peak_idx <- which(peak_anno[,motif])
target_p2g <- p2g %>%
filter(abs(Correlation) > cor_cutoff) %>%
filter(idxATAC %in% tf_peak_idx)
unique(target_p2g$gene)
})
names(res) <- colnames(peak_anno)
res
}
)
n_motif_genes_list <- map(motif_gene_sets, map_int, length)
all_deg <- read.csv("cache/fc83b89f-fd26-43b8-ac91-29c539703a45/all_mast_deg_2023-09-06.csv")
all_deg$test_group <- paste0(all_deg$fg, "_", all_deg$timepoint, "_", all_deg$aifi_cell_type)
split_deg <- split(all_deg, all_deg$test_group)
deg <- split_deg[["dexamethasone_4_t_cd4_naive"]]
deg <- deg %>%
filter(!is.na(logFC)) %>%
mutate(weight = abs(logFC))
deg %>%
filter(gene == "IKZF1")
aifi_cell_type | timepoint | fg | bg | n_sample | gene | coef_C | coef_D | logFC | nomP | adjP | test_group | weight |
---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <int> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> |
t_cd4_naive | 4 | dexamethasone | dmso | 1739 | IKZF1 | 0.07850894 | 0.3771654 | 0.1317326 | 1.557141e-05 | 0.0002793878 | dexamethasone_4_t_cd4_naive | 0.1317326 |
anno <- type_peak_anno[["t_cd4_naive"]]
p2g <- type_p2g[["t_cd4_naive"]]
all_genes <- intersect(deg$gene, p2g$gene)
all_tf_ids <- colnames(anno)
all_tf_genes <- sub("_.+","",all_tf_ids)
expressed_tf_genes <- all_tf_genes[all_tf_genes %in% all_genes]
expressed_tf_ids <- all_tf_ids[all_tf_genes %in% all_genes]
expressed_tf_anno <- anno[,expressed_tf_ids]
link_df <- map_dfr(
colnames(expressed_tf_anno),
function(tf_id) {
idx <- which(expressed_tf_anno[,tf_id])
p2g_weights <- p2g %>%
filter(idxATAC %in% idx) %>%
group_by(gene) %>%
summarise(max_cor = max(abs(Correlation)),
sum_cor = sum(abs(Correlation))) %>%
mutate(weight = max_cor)
genes <- p2g_weights$gene
genes <- genes[genes %in% all_genes]
data.frame(
from = sub("_.+","",tf_id),
to = genes,
weight = abs(p2g_weights$weight[match(genes, p2g_weights$gene)])
)
}
)
link_df <- link_df %>%
mutate(weight = scale(weight))
head(link_df)
from | to | weight | |
---|---|---|---|
<chr> | <chr> | <dbl[,1]> | |
1 | ARID5B | AAK1 | -1.0283430 |
2 | ARID5B | ABCA1 | 2.6476449 |
3 | ARID5B | ABCA7 | 0.7534332 |
4 | ARID5B | ABCB8 | -0.8212072 |
5 | ARID5B | ABCC10 | -0.6602297 |
6 | ARID5B | ABCD2 | 0.7298766 |
remove self-references
link_df <- link_df %>%
filter(from != to)
suppressPackageStartupMessages(library(igraph))
g <- graph_from_data_frame(
link_df
)
g
IGRAPH 1b7cab9 DNW- 6569 600143 -- + attr: name (v/c), weight (e/n) + edges from 1b7cab9 (vertex names): [1] ARID5B->AAK1 ARID5B->ABCA1 ARID5B->ABCA7 ARID5B->ABCB8 [5] ARID5B->ABCC10 ARID5B->ABCD2 ARID5B->ABHD10 ARID5B->ABHD13 [9] ARID5B->ABHD16A ARID5B->ABHD17A ARID5B->ABHD17B ARID5B->ABHD2 [13] ARID5B->ABLIM2 ARID5B->ABRACL ARID5B->ACAA1 ARID5B->ACAD9 [17] ARID5B->ACADM ARID5B->ACAP1 ARID5B->ACAP3 ARID5B->ACIN1 [21] ARID5B->ACOT13 ARID5B->ACOX1 ARID5B->ACSL5 ARID5B->ACTA2 [25] ARID5B->ACTB ARID5B->ACTG1 ARID5B->ACTN1 ARID5B->ACTN4 [29] ARID5B->ACTR1A ARID5B->ACVR1 ARID5B->ACVR1C ARID5B->ADAM10 + ... omitted several edges
pr <- page_rank(g, vids = unique(link_df$from))
pr <- pr$vector[order(pr$vector, decreasing = TRUE)]
head(pr)
names(pr)[1:20]
which(names(pr) == "NR3C2")
which(names(pr) == "IKZF1")
which(names(pr) == "CTCF")
sessionInfo()
R version 4.3.1 (2023-06-16) Platform: x86_64-conda-linux-gnu (64-bit) Running under: Ubuntu 20.04.6 LTS Matrix products: default BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.24.so; LAPACK version 3.11.0 locale: [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C time zone: Etc/UTC tzcode source: system (glibc) attached base packages: [1] stats4 grid stats graphics grDevices utils datasets [8] methods base other attached packages: [1] igraph_1.5.1 purrr_1.0.2 [3] dplyr_1.1.3 rhdf5_2.44.0 [5] SummarizedExperiment_1.30.2 Biobase_2.60.0 [7] MatrixGenerics_1.12.3 Rcpp_1.0.11 [9] Matrix_1.6-1.1 GenomicRanges_1.52.0 [11] GenomeInfoDb_1.36.3 IRanges_2.34.1 [13] S4Vectors_0.38.1 BiocGenerics_0.46.0 [15] matrixStats_1.0.0 data.table_1.14.8 [17] stringr_1.5.0 plyr_1.8.8 [19] magrittr_2.0.3 ggplot2_3.4.3 [21] gtable_0.3.4 gtools_3.9.4 [23] gridExtra_2.3 ArchR_1.0.2 [25] hise_2.16.0 loaded via a namespace (and not attached): [1] utf8_1.2.3 generics_0.1.3 bitops_1.0-7 [4] stringi_1.7.12 lattice_0.21-8 digest_0.6.33 [7] evaluate_0.21 pbdZMQ_0.3-10 fastmap_1.1.1 [10] jsonlite_1.8.7 httr_1.4.7 fansi_1.0.4 [13] scales_1.2.1 abind_1.4-5 cli_3.6.1 [16] rlang_1.1.1 crayon_1.5.2 XVector_0.40.0 [19] munsell_0.5.0 DelayedArray_0.26.7 base64enc_0.1-3 [22] withr_2.5.1 repr_1.1.6.9000 S4Arrays_1.0.6 [25] tools_4.3.1 uuid_1.1-1 Rhdf5lib_1.22.1 [28] colorspace_2.1-0 GenomeInfoDbData_1.2.10 curl_5.0.2 [31] assertthat_0.2.1 IRdisplay_1.1 vctrs_0.6.3 [34] R6_2.5.1 lifecycle_1.0.3 zlibbioc_1.46.0 [37] pkgconfig_2.0.3 pillar_1.9.0 glue_1.6.2 [40] tibble_3.2.1 tidyselect_1.2.0 rhdf5filters_1.12.1 [43] IRkernel_1.3.2 htmltools_0.5.6 compiler_4.3.1 [46] RCurl_1.98-1.12