In [1]:

quiet_library <- function(...) { suppressPackageStartupMessages(library(...)) }
quiet_library(hise)
quiet_library(ArchR)
quiet_library(dplyr)
quiet_library(purrr)
quiet_library(ggplot2)

                                                   / |
                                                 /    \
            .                                  /      |.
            \\\                              /        |.
              \\\                          /           `|.
                \\\                      /              |.
                  \                    /                |\
                  \\#####\           /                  ||
                ==###########>      /                   ||
                 \\##==......\    /                     ||
            ______ =       =|__ /__                     ||      \\\
        ,--' ,----`-,__ ___/'  --,-`-===================##========>
       \               '        ##_______ _____ ,--,__,=##,__   ///
        ,    __==    ___,-,__,--'#'  ==='      `-'    | ##,-/
        -,____,---'       \\####\\________________,--\\_##,/
           ___      .______        ______  __    __  .______      
          /   \     |   _  \      /      ||  |  |  | |   _  \     
         /  ^  \    |  |_)  |    |  ,----'|  |__|  | |  |_)  |    
        /  /_\  \   |      /     |  |     |   __   | |      /     
       /  _____  \  |  |\  \\___ |  `----.|  |  |  | |  |\  \\___.
      /__/     \__\ | _| `._____| \______||__|  |__| | _| `._____|

Retrieve files¶

Now, we'll use the HISE SDK package to retrieve the DEG results and motif annotations based on their file UUIDs. These will be placed in the cache/ subdirectory by default.

In [2]:

file_uuids <- list(
    "fc83b89f-fd26-43b8-ac91-29c539703a45", # MAST DEG results
    "1f1d68ad-a7dc-45fb-9f4b-67ce1e49790d", # Motif-Peak annotation matrices
    "a7ecb9ba-26d3-4221-8610-bfc60c6c5c4b",
    "66d7a494-2e0a-449e-8bfb-eea1e7ca1a25",
    "f1f82eb9-201c-403a-86cd-5d1f5fec877d",
    "cae2811c-3fdc-44ab-ae44-a365fb35ccb2",
    "5d53e1f1-4f10-4b35-81ee-602b4a491160",
    "aa410188-cfdc-4234-ba4a-21c40a8110c7", # Peak-to-gene correlations
    "ea924bd2-e3f5-40b7-9d49-20219cc6eea2",
    "4cd86b32-cba5-4721-a559-66f1a4ccb1f3",
    "652db815-097b-460a-b314-202e578a37eb",
    "e601997d-904b-4746-aadc-b22a582c6179",
    "e7bab98a-7a15-421a-8e09-574a340dd499"
)

In [3]:

file_res <- cacheFiles(file_uuids)

Get peak annotations and metadata per cell type¶

In [4]:

p2g_files <- list.files(
    "cache",
    pattern = "peak-to-gene",
    full.names = TRUE,
    recursive = TRUE
)
p2g_types <- sub(".+-t(.+)_20.+", "t\\1", p2g_files)

In [5]:

type_p2g <- map(
    p2g_files,
    read.csv
)
names(type_p2g) <- p2g_types

In [6]:

anno_files <- list.files(
    "cache",
    pattern = "peak-motif-matches",
    full.names = TRUE,
    recursive = TRUE
)
anno_types <- sub(".+-t(.+)_20.+", "t\\1", anno_files)

In [7]:

type_peak_anno <- map(
    anno_files,
    readRDS
)
names(type_peak_anno) <- anno_types

Generate gene lists for each motif¶

We'll filter based on peak2gene correlation scores

In [9]:

cor_cutoff <- 0.3

In [10]:

motif_gene_sets <- map2(
    type_peak_anno,
    type_p2g,
    function(peak_anno, p2g) {
        res <- map(colnames(peak_anno),
            function(motif) {
                tf_peak_idx <- which(peak_anno[,motif])
                
                target_p2g <- p2g %>%
                  filter(abs(Correlation) > cor_cutoff) %>%
                  filter(idxATAC %in% tf_peak_idx)
                
                unique(target_p2g$gene)
        })
        names(res) <- colnames(peak_anno)
        res
    }
)

In [11]:

n_motif_genes_list <- map(motif_gene_sets, map_int, length)

Get DEG per condition¶

In [12]:

all_deg <- read.csv("cache/fc83b89f-fd26-43b8-ac91-29c539703a45/all_mast_deg_2023-09-06.csv")

In [13]:

all_deg$test_group <- paste0(all_deg$fg, "_", all_deg$timepoint, "_", all_deg$aifi_cell_type)

In [14]:

split_deg <- split(all_deg, all_deg$test_group)

In [15]:

deg <- split_deg[["dexamethasone_4_t_cd4_naive"]]
deg <- deg %>%
  filter(!is.na(logFC)) %>%
  mutate(weight = abs(logFC))

In [16]:

deg %>%
  filter(gene == "IKZF1")

A data.frame: 1 × 13
aifi_cell_type	timepoint	fg	bg	n_sample	gene	coef_C	coef_D	logFC	nomP	adjP	test_group	weight
<chr>	<int>	<chr>	<chr>	<int>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	<dbl>
t_cd4_naive	4	dexamethasone	dmso	1739	IKZF1	0.07850894	0.3771654	0.1317326	1.557141e-05	0.0002793878	dexamethasone_4_t_cd4_naive	0.1317326

In [17]:

anno <- type_peak_anno[["t_cd4_naive"]]
p2g <- type_p2g[["t_cd4_naive"]]

In [18]:

all_genes <- intersect(deg$gene, p2g$gene)

In [19]:

all_tf_ids <- colnames(anno)
all_tf_genes <- sub("_.+","",all_tf_ids)
expressed_tf_genes <- all_tf_genes[all_tf_genes %in% all_genes]
expressed_tf_ids <- all_tf_ids[all_tf_genes %in% all_genes]

In [20]:

expressed_tf_anno <- anno[,expressed_tf_ids]

In [21]:

link_df <- map_dfr(
    colnames(expressed_tf_anno),
    function(tf_id) {
        idx <- which(expressed_tf_anno[,tf_id])
        p2g_weights <- p2g %>%
          filter(idxATAC %in% idx) %>%
          group_by(gene) %>%
          summarise(max_cor = max(abs(Correlation)),
                    sum_cor = sum(abs(Correlation))) %>%
          mutate(weight = max_cor)
        genes <- p2g_weights$gene
        genes <- genes[genes %in% all_genes]
        data.frame(
            from = sub("_.+","",tf_id),
            to = genes,
            weight = abs(p2g_weights$weight[match(genes, p2g_weights$gene)])
        )
    }
)

In [22]:

link_df <- link_df %>%
  mutate(weight = scale(weight))

In [23]:

head(link_df)

A data.frame: 6 × 3
	from	to	weight
	<chr>	<chr>	<dbl[,1]>
1	ARID5B	AAK1	-1.0283430
2	ARID5B	ABCA1	2.6476449
3	ARID5B	ABCA7	0.7534332
4	ARID5B	ABCB8	-0.8212072
5	ARID5B	ABCC10	-0.6602297
6	ARID5B	ABCD2	0.7298766

remove self-references

In [24]:

link_df <- link_df %>%
  filter(from != to)

In [25]:

suppressPackageStartupMessages(library(igraph))

In [26]:

g <- graph_from_data_frame(
    link_df
)

In [27]:

IGRAPH 1b7cab9 DNW- 6569 600143 -- 
+ attr: name (v/c), weight (e/n)
+ edges from 1b7cab9 (vertex names):
 [1] ARID5B->AAK1    ARID5B->ABCA1   ARID5B->ABCA7   ARID5B->ABCB8  
 [5] ARID5B->ABCC10  ARID5B->ABCD2   ARID5B->ABHD10  ARID5B->ABHD13 
 [9] ARID5B->ABHD16A ARID5B->ABHD17A ARID5B->ABHD17B ARID5B->ABHD2  
[13] ARID5B->ABLIM2  ARID5B->ABRACL  ARID5B->ACAA1   ARID5B->ACAD9  
[17] ARID5B->ACADM   ARID5B->ACAP1   ARID5B->ACAP3   ARID5B->ACIN1  
[21] ARID5B->ACOT13  ARID5B->ACOX1   ARID5B->ACSL5   ARID5B->ACTA2  
[25] ARID5B->ACTB    ARID5B->ACTG1   ARID5B->ACTN1   ARID5B->ACTN4  
[29] ARID5B->ACTR1A  ARID5B->ACVR1   ARID5B->ACVR1C  ARID5B->ADAM10 
+ ... omitted several edges

In [28]:

pr <- page_rank(g, vids = unique(link_df$from))

In [29]:

pr <- pr$vector[order(pr$vector, decreasing = TRUE)]

In [30]:

head(pr)

ETS1: 0.000215854553709126
JUND: 0.000202327775954216
JUNB: 0.000196925225611133
FOS: 0.000194984893671836
IRF1: 0.000190818317021198
KLF6: 0.000189996576454598

In [31]:

names(pr)[1:20]

'ETS1'
'JUND'
'JUNB'
'FOS'
'IRF1'
'KLF6'
'STAT1'
'TCF7'
'MYB'
'HIF1A'
'BHLHE40'
'KLF7'
'ARID5B'
'IRF4'
'ELK3'
'STAT2'
'AHR'
'ARNT'
'NFAT5'
'RUNX1'

In [32]:

which(names(pr) == "NR3C2")

81

In [33]:

which(names(pr) == "IKZF1")

33

In [35]:

which(names(pr) == "CTCF")

150

In [34]:

sessionInfo()

R version 4.3.1 (2023-06-16)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS/LAPACK: /opt/conda/lib/libopenblasp-r0.3.24.so;  LAPACK version 3.11.0

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats4    grid      stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] igraph_1.5.1                purrr_1.0.2                
 [3] dplyr_1.1.3                 rhdf5_2.44.0               
 [5] SummarizedExperiment_1.30.2 Biobase_2.60.0             
 [7] MatrixGenerics_1.12.3       Rcpp_1.0.11                
 [9] Matrix_1.6-1.1              GenomicRanges_1.52.0       
[11] GenomeInfoDb_1.36.3         IRanges_2.34.1             
[13] S4Vectors_0.38.1            BiocGenerics_0.46.0        
[15] matrixStats_1.0.0           data.table_1.14.8          
[17] stringr_1.5.0               plyr_1.8.8                 
[19] magrittr_2.0.3              ggplot2_3.4.3              
[21] gtable_0.3.4                gtools_3.9.4               
[23] gridExtra_2.3               ArchR_1.0.2                
[25] hise_2.16.0                

loaded via a namespace (and not attached):
 [1] utf8_1.2.3              generics_0.1.3          bitops_1.0-7           
 [4] stringi_1.7.12          lattice_0.21-8          digest_0.6.33          
 [7] evaluate_0.21           pbdZMQ_0.3-10           fastmap_1.1.1          
[10] jsonlite_1.8.7          httr_1.4.7              fansi_1.0.4            
[13] scales_1.2.1            abind_1.4-5             cli_3.6.1              
[16] rlang_1.1.1             crayon_1.5.2            XVector_0.40.0         
[19] munsell_0.5.0           DelayedArray_0.26.7     base64enc_0.1-3        
[22] withr_2.5.1             repr_1.1.6.9000         S4Arrays_1.0.6         
[25] tools_4.3.1             uuid_1.1-1              Rhdf5lib_1.22.1        
[28] colorspace_2.1-0        GenomeInfoDbData_1.2.10 curl_5.0.2             
[31] assertthat_0.2.1        IRdisplay_1.1           vctrs_0.6.3            
[34] R6_2.5.1                lifecycle_1.0.3         zlibbioc_1.46.0        
[37] pkgconfig_2.0.3         pillar_1.9.0            glue_1.6.2             
[40] tibble_3.2.1            tidyselect_1.2.0        rhdf5filters_1.12.1    
[43] IRkernel_1.3.2          htmltools_0.5.6         compiler_4.3.1         
[46] RCurl_1.98-1.12