import pandas as pd
pd.set_option('display.max_rows', 100)
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
sc.settings.set_figure_params(dpi=300)
def rank_genes_groups_to_df(adata, cell):
dic = adata.uns['rank_genes_groups']
df = pd.DataFrame({
'names' : dic['names'][cell],
'scores' : dic['scores'][cell],
'pvals' : dic['pvals'][cell],
'pvals_adj' : dic['pvals_adj'][cell],
'logfoldchanges' : dic['logfoldchanges'][cell]
})
return df
# rank_genes_groups_to_df(sanp, 'SAN_P_cell')
# based on sctk.pseudo_bulk
# https://github.com/Teichlab/sctk/blob/c6d0c7b2ffe3243d83d743a3beb04680150cb6e7/sctk/_utils.py#L632
import scipy.sparse as sp
def get_exp_proportion(
adata, groupby, use_rep='X'):
"""get proportion of expressing cells from grouped sc data
"""
if adata.obs[groupby].dtype.name == 'category':
group_attr = adata.obs[groupby].values
groups = adata.obs[groupby].cat.categories.values
else:
group_attr = adata.obs[groupby].astype(str).values
groups = np.unique(group_attr)
n_level = len(groups)
if use_rep == 'X':
x = adata.X
features = adata.var_names.values
elif use_rep == 'raw':
x = adata.raw.X
features = adata.raw.var_names.values
elif use_rep in adata.layers.keys():
x = adata.layers[use_rep]
features = adata.var_names.values
elif use_rep in adata.obsm.keys():
x = adata.obsm[use_rep]
features = np.arange(x.shape[1])
elif (isinstance(use_rep, np.ndarray) and
use_rep.shape[0] == adata.shape[0]):
x = use_rep
features = np.arange(x.shape[1])
else:
raise KeyError(f'{use_rep} invalid.')
summarised = np.zeros((n_level, x.shape[1]))
for i, grp in enumerate(groups):
k_grp = group_attr == grp
if sp.issparse(x):
summarised[i] = np.sum(x[k_grp,:]>0,axis=0)/x[k_grp,:].shape[0]
else:
print('X is not sparce')
################# added here #################
summarised[i] = np.sum(x[k_grp,:]>0,axis=0)/x[k_grp,:].shape[0]
##############################################
return pd.DataFrame(summarised.T, columns=groups, index=features)
# pyscenic output
import yaml
with open("/nfs/team205/heart/pyscenic/regulons/adult_multiome_aCM_regulons.yaml", "r") as f:
regulons_all = yaml.safe_load(f)
# select positive or negative regulons
pos_or_neg = '(+)'
regulons = {}
for k,v in regulons_all.items():
if pos_or_neg in k:
regulons[k.replace(pos_or_neg,'')] = v
len(regulons.keys())
221
# Identified targetable genes basd on ATAC data
import json
path = '/nfs/team205/heart/anndata_objects/8regions/ArchR/project_output/TF_downstreamGene/AtrialCardiomyocyte_p2gCorr-0.2.json'
with open(path) as f:
atac_nw = json.load(f)
len(atac_nw)
870
# read in rna anndata
adata_rna = sc.read_h5ad('/nfs/team205/heart/anndata_objects/8regions/RNA_adult-8reg_full_raw_cellstate-annotated.h5ad')
# set gene name as var_names
adata_rna.var['gene_id']=adata_rna.var_names.copy()
adata_rna.var.set_index('gene_name-new',inplace=True)
adata_rna.var_names=adata_rna.var_names.astype('str')
adata_rna.var_names_make_unique()
# remove unclassified cells
adata_rna=adata_rna[adata_rna.obs['cell_state']!='unclassified']
# filter and lognormalised
sc.pp.filter_genes(adata_rna, min_cells=3)
sc.pp.normalize_total(adata_rna, target_sum=1e4)
sc.pp.log1p(adata_rna)
adata_rna
/home/jovyan/my-conda-envs/cellpymc/lib/python3.7/site-packages/scanpy/preprocessing/_simple.py:251: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual. adata.var['n_cells'] = number
AnnData object with n_obs × n_vars = 690558 × 31460 obs: 'sangerID', 'combinedID', 'donor', 'donor_type', 'region', 'region_finest', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score', 'scrublet_leiden', 'cluster_scrublet_score', 'doublet_pval', 'doublet_bh_pval', 'batch_key', 'leiden_scVI', 'cell_type', 'cell_state_HCAv1', 'cell_state_scNym', 'cell_state_scNym_confidence', 'cell_state', 'latent_RT_efficiency', 'latent_cell_probability', 'latent_scale', 'n_counts', '_scvi_batch', '_scvi_labels', 'clus20', 'doublet_cls', 'original_or_new', 'batch', 'scANVI_predictions', 'leiden_scArches' var: 'gene_name_scRNA-0-original', 'gene_name_snRNA-1-original', 'gene_name_multiome-2-original', 'gene_id', 'n_cells' uns: 'age_colors', 'cell_or_nuclei_colors', 'cell_state_colors', 'cell_type_colors', 'donor_colors', 'donor_type_colors', 'facility_colors', 'flushed_colors', 'gender_colors', 'kit_10x_colors', 'leiden', 'modality_colors', 'neighbors', 'original_or_new_colors', 'region_colors', 'region_finest_colors', 'scANVI_predictions_colors', 'umap', 'log1p' obsm: 'X_scArches', 'X_umap' obsp: 'connectivities', 'distances'
# subset only multiome data (since 'P_cell' is purely from multiome)
adata_rna = adata_rna[adata_rna.obs['modality']=='Multiome-RNA']
# combine P cells
adata_rna.obs['cell_state_2'] = adata_rna.obs['cell_state'].astype('str')
adata_rna.obs.replace({'cell_state_2':{
'AVN_P_cell':'P_cell',
'SAN_P_cell':'P_cell'
}}, inplace=True)
/home/jovyan/my-conda-envs/cellpymc/lib/python3.7/site-packages/ipykernel_launcher.py:2: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.
# subset cell types of interest
target_celltype = 'P_cell'
celltype_group = ['P_cell','aCM1','aCM2','aCM3','aCM4','aCM5']
adata_rna = adata_rna[adata_rna.obs['cell_state_2'].isin(celltype_group)]
adata_rna.obs['modality'].value_counts()
Multiome-RNA 19337 Name: modality, dtype: int64
adata_rna.obs['cell_state_2'].value_counts()
aCM2 9083 aCM1 5004 aCM4 3388 aCM3 1462 P_cell 400 Name: cell_state_2, dtype: int64
%%time
# DEG calculation
sc.tl.rank_genes_groups(adata_rna, 'cell_state_2',
groups=[target_celltype], reference='rest',
method='wilcoxon')
degs_df = rank_genes_groups_to_df(adata_rna, target_celltype)
/home/jovyan/my-conda-envs/cellpymc/lib/python3.7/site-packages/anndata/_core/anndata.py:1235: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual. df[key] = c
CPU times: user 31.8 s, sys: 3.53 s, total: 35.3 s Wall time: 35.9 s
# select degs
FC_thresh = 0
pval_adj_thresh = 0.05
degs_df = degs_df[degs_df['pvals_adj'] < pval_adj_thresh]
rna_degs = list(degs_df[degs_df['logfoldchanges'] > FC_thresh]['names'])
len(rna_degs)
847
# select genes which has larger proportion in a group of interest
# here 10%
exp_prop = get_exp_proportion(adata_rna, groupby='cell_state_2')
exp_genes = exp_prop.index[exp_prop['P_cell'] > 0.1]
len(exp_genes)
8660
# select target genes and TFs from the regulons
key_TF_TG = {}
for tf, tg in regulons.items():
if (tf in rna_degs)&(tf in exp_genes):
key_tgs = list(set(tg).intersection(exp_genes).intersection(rna_degs))
if len(key_tgs)>0:
key_TF_TG[tf] = key_tgs
del key_tgs
# prepare dataframe of TF and TGs and save
tftg_df = pd.DataFrame({'TF':[],'TF_logFC':[],'TargetGene':[],'TargetGene_logFC':[],'ATAC_interaction':[]})
for k,v in key_TF_TG.items():
# get logFC for TF
tf_logfc = degs_df.set_index('names').loc[k,'logfoldchanges']
for vv in v:
# get logFC for TG
tg_logfc = degs_df.set_index('names').loc[vv,'logfoldchanges']
# check whether the interaction in ATAC-network and make dataframe
if k in atac_nw.keys():
atac_nw_tg = atac_nw[k]
if vv in atac_nw_tg:
df = pd.DataFrame({'TF':[k],'TF_logFC':[tf_logfc],'TargetGene':[vv],'TargetGene_logFC':[tg_logfc],'ATAC_interaction':['Yes']})
else:
df = pd.DataFrame({'TF':[k],'TF_logFC':[tf_logfc],'TargetGene':[vv],'TargetGene_logFC':[tg_logfc],'ATAC_interaction':['No']})
else:
df = pd.DataFrame({'TF':[k],'TF_logFC':[tf_logfc],'TargetGene':[vv],'TargetGene_logFC':[tg_logfc],'ATAC_interaction':['No']})
# concatenate
tftg_df = pd.concat([tftg_df,df],axis=0)
print(len(tftg_df))
# save
tftg_df.to_csv('/home/jovyan/projects/P61_Adult-heart_pyscenic/notebooks/csv/Pcell_GRN_UP_corr02.csv', index=False)
719
# set a threshold of logFC (for both TF and TG) for plotting
FC_thresh_for_plot = 0.5
df = tftg_df[(tftg_df['TF_logFC'] > FC_thresh_for_plot)&(tftg_df['TargetGene_logFC'] > FC_thresh_for_plot)]
# make TF:TG dictionary for plotting
key_TF_TG_plot = {}
for tf in set(df['TF']):
key_TF_TG_plot[tf] = list(df[df['TF']==tf]['TargetGene'])
len(df)
221
# definition of function
def generate_edges(graph):
edges = []
# for each node in graph
for node in graph:
# for each neighbour node of a single node
for neighbour in graph[node]:
# if edge exists then append
edges.append((node, neighbour))
return edges
# prepare GPCR and Ion channel genes
# for coloring TG nodes
gpcr = list(pd.read_csv('/nfs/team205/kk18/data/geneset/HGNC/HGNC_GID139_G-protein-coupled-receptors.txt',sep='\t')['Approved symbol'].unique())
ion = list(pd.read_csv('/nfs/team205/kk18/data/geneset/HGNC/HGNC_GID177_Ion-channels.txt',sep='\t')['Approved symbol'].unique())
tfs = list(pd.read_csv('/nfs/team205/kk18/data/geneset/humanTFs_Lambert_Cell_2018.csv')['Gene'].unique())
# solutecarrier = list(pd.read_csv('/nfs/team205/kk18/data/geneset/HGNC/HGNC_GID752_SoluteCarriers.txt',sep='\t')['Approved symbol'].unique())
# snare = list(pd.read_csv('/nfs/team205/kk18/data/geneset/HGNC/HGNC_GID1124_SNAREs.txt',sep='\t')['Approved symbol'].unique())
import networkx as nx
import random
random.seed(12) # or any integer
import numpy
numpy.random.seed(1)
import math
G = nx.Graph()
G.add_edges_from(generate_edges(key_TF_TG_plot))
col_map = {}
size_map = {}
edgecolors_map = {}
for k in key_TF_TG_plot.keys():
col_map[k]='silver'
size_map[k]=1000
edgecolors_map[k]='black' # '#FF000000'
colors = [col_map.get(node, 'white') for node in G.nodes()]
# add color if gpcr
colors = ['lightgreen' if tup[1] in gpcr else tup[0] for tup in zip(colors,G.nodes())]
# add color if ion channel
colors = ['deepskyblue' if tup[1] in ion else tup[0] for tup in zip(colors,G.nodes())]
colors = ['gold' if (tup[1] in tfs)&(tup[1] not in key_TF_TG_plot.keys()) else tup[0] for tup in zip(colors,G.nodes())]
sizes = [size_map.get(node, 300) for node in G.nodes()]
# change edgecolor if its in ATAC-network
## edges of the nodes
genes_matchs_with_atacnetwork = []
for k,v in key_TF_TG_plot.items():
if k in atac_nw.keys():
g = list(set(v).intersection(atac_nw[k]))
genes_matchs_with_atacnetwork = genes_matchs_with_atacnetwork + g
genes_matchs_with_atacnetwork = list(set(genes_matchs_with_atacnetwork))
edgecolors= [edgecolors_map.get(node, 'grey') for node in G.nodes()]
edgecolors = ['red' if (tup[1] in genes_matchs_with_atacnetwork)&(tup[1] not in key_TF_TG_plot.keys()) else tup[0] for tup in zip(edgecolors,G.nodes())]
## edges between nodes
edges = G.edges(data=True)
edge_color_list = []
edge_width_list = []
for x in edges:
if x[0] in key_TF_TG_plot.keys():
if x[0] in atac_nw.keys():
if x[1] in atac_nw[x[0]]:
edge_color_list.append('red')
edge_width_list.append(1)
else:
edge_color_list.append('dimgrey')
edge_width_list.append(0.5)
else:
edge_color_list.append('dimgrey')
edge_width_list.append(0.5)
elif x[1] in key_TF_TG_plot.keys():
if x[1] in atac_nw.keys():
if x[0] in atac_nw[x[1]]:
edge_color_list.append('red')
edge_width_list.append(1)
else:
edge_color_list.append('dimgrey')
edge_width_list.append(0.5)
else:
edge_color_list.append('dimgrey')
edge_width_list.append(0.5)
pos = nx.spring_layout(G,k=0.2,scale=2)
#https://networkx.org/documentation/latest/reference/generated/networkx.drawing.layout.spring_layout.html
plt.figure(figsize = (10,12))
nx.draw(G, pos, node_color = colors, node_size = sizes, edgecolors=edgecolors,
with_labels = True, width=0.4, font_size=7, linewidths=0.7)
# https://networkx.org/documentation/stable/auto_examples/drawing/plot_labels_and_colors.html
nx.draw_networkx_edges(G, pos, edge_color=edge_color_list, width=edge_width_list)
plt.savefig('/home/jovyan/projects/P61_Adult-heart_pyscenic/notebooks/figures/Pcell_GRN_UP_corr02.pdf')
plt.show()
key_TF_TG_plot.keys()
dict_keys(['SHOX2', 'FOXP2', 'CREM', 'NR6A1', 'CREB5', 'ELK4', 'TCF4', 'BCL11A', 'SREBF2', 'PRDM6'])
adata_rna.X.data[:10]
array([0.55767274, 0.40395418, 3.9371305 , 1.7454512 , 3.7030277 , 0.91356945, 3.7152216 , 4.345797 , 3.9798887 , 0.91356945], dtype=float32)
sc.pl.dotplot(adata_rna,
list(key_TF_TG_plot.keys()),
groupby='cell_state',
dendrogram=False,
# use_raw=True,
standard_scale="var",
color_map="Reds",
swap_axes=False,
)
sc.pl.dotplot(adata_rna,
list(key_TF_TG_plot.keys()), # + ['MSX2','FOXC1','OSR2','GATA2','DEAF1'],
groupby='cell_state_2',
dendrogram=False,
# use_raw=True,
standard_scale="var",
color_map="Reds",
swap_axes=False,
# title='Neurexin receptors'
)