from clustergrammer2 import net
# Registry of named tables: each processing step below stores its result
# in this dict under a string key (e.g. 'gex-ini', 'adt-ini').
df = {}
import numpy as np
import pandas as pd
clustergrammer2 backend version 0.2.9
import matplotlib.pyplot as plt
from copy import deepcopy
from scipy.spatial.distance import pdist
import itertools as it
# Read the gzip-compressed RNA UMI table; the first CSV column becomes the
# row index.
filename = '../data/CITE-seq_data/GSE100866_CBMC_8K_13AB_10X-RNA_umi_HUMAN.csv.gz'
df['gex-ini'] = pd.read_csv(
    filename,
    index_col=0,
    compression='gzip',
)
df['gex-ini'].shape
(20400, 7339)
# Load the ADT table into the clustergrammer2 `net` object, then pull it back
# out as a DataFrame. The two calls are order-dependent: export_df() is called
# right after load_file() so that it refers to this file.
net.load_file('../data/CITE-seq_data/adt_ashz_trim_cats.txt')
df['adt-ini'] = net.export_df()
df['adt-ini'].shape
(10, 7265)
# Each ADT column label is indexable; its first element is used as the cell
# identifier when selecting the matching columns of the gene-expression table.
cols = df['adt-ini'].columns.tolist()
keep_cells = [label[0] for label in cols]
print(len(keep_cells))
7265
# Remove the 'HUMAN_' tag from every row (gene) label of the raw table.
rows = df['gex-ini'].index.tolist()
new_rows = [gene.replace('HUMAN_', '') for gene in rows]
df['gex-ini'].index = new_rows

# Keep only the cells listed in keep_cells and apply arcsinh(x / 5).
df['gex-trim'] = np.arcsinh(df['gex-ini'][keep_cells] / 5)
print(df['gex-trim'].shape)

# Work on an independent copy so later filtering leaves 'gex-trim' intact.
df['gex'] = deepcopy(df['gex-trim'])
# Drop every gene whose name contains 'RPL' or 'RPS'. This is a substring
# match anywhere in the name, not a prefix match.
all_genes = df['gex'].index.tolist()
print(len(all_genes))
keep_genes = [
    gene for gene in all_genes
    if 'RPL' not in gene and 'RPS' not in gene
]
print(len(keep_genes))
df['gex'] = df['gex'].loc[keep_genes]
df['gex'].shape
# Removing Mitochondrial Genes
# A gene is dropped when its name starts with 'MT-' or when the part of its
# name before the first underscore is in the explicit list below.
list_mito_genes = [
    'MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6',
    'MTRNR2L7', 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3',
    'MTRNR2L4',
]
all_genes = df['gex'].index.tolist()
mito_genes = [
    gene for gene in all_genes
    if gene.startswith('MT-') or gene.partition('_')[0] in list_mito_genes
]
print(mito_genes)
keep_genes = [gene for gene in all_genes if gene not in mito_genes]
df['gex'] = df['gex'].loc[keep_genes]
print(df['gex'].shape)
# transfer categories
# Map each ADT column's first element (cell id) to its second element
# (category), then relabel every gex column as (cell id, 'Cell Type: <category>').
cols = df['adt-ini'].columns.tolist()
ct_dict = {label[0]: label[1] for label in cols}
cols = df['gex'].columns.tolist()
new_cols = [(cell, 'Cell Type: ' + ct_dict[cell]) for cell in cols]
df['gex'].columns = new_cols
# Scale every cell (column) by its own column total so each column sums to 1.
# NOTE(review): df['gex'] was arcsinh-transformed earlier, so this total is a
# sum of transformed values rather than a raw UMI count.
barcode_umi_sum = df['gex'].sum()
df['gex-umi'] = deepcopy(df['gex'].div(barcode_umi_sum, axis='columns'))
(20400, 7265) 20400 20223 ['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB', 'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5', 'MT-ND6', 'MT-RNR1', 'MT-RNR2', 'MT-TD', 'MT-TF', 'MT-TG', 'MT-TH', 'MT-TI', 'MT-TL1', 'MT-TP', 'MT-TT', 'MT-TW', 'MTRF1', 'MTRF1L', 'MTRNR2L1', 'MTRNR2L10', 'MTRNR2L11', 'MTRNR2L12', 'MTRNR2L3', 'MTRNR2L4', 'MTRNR2L5', 'MTRNR2L6', 'MTRNR2L7', 'MTRNR2L8'] (20187, 7265)
# Sanity check: column totals of the normalized table.
df['gex-umi'].sum()
(TTGGAACCATGTCTCC, Cell Type: Unknown_5) 1.0 (AACTCTTCAACTGGCC, Cell Type: Unknown_5) 1.0 (ACAGCTAAGCGATGAC, Cell Type: CD4 T cell) 1.0 (CATCGAAGTTCCACGG, Cell Type: Unknown_3) 1.0 (CCGGGATGTAACGACG, Cell Type: Unknown_3) 1.0 (GCTTGAAGTACCGTAT, Cell Type: CD4 T cell) 1.0 (GGCGACTGTCAGAATA, Cell Type: Unknown_2) 1.0 (CGGACGTGTCGAACAG, Cell Type: CD14+ Mono cell_1) 1.0 (CTCGTACAGCGGATCA, Cell Type: CD14+ Mono cell_1) 1.0 (TCGGTAACATGTTGAC, Cell Type: Unknown_5) 1.0 (TCTTTCCTCATTCACT, Cell Type: CD8 T cell) 1.0 (CTAGTGAAGCCTATGT, Cell Type: CD4 T cell) 1.0 (ACATCAGCAATGGACG, Cell Type: Unknown_5) 1.0 (GGCCGATTCACAGTAC, Cell Type: Unknown_3) 1.0 (TACAGTGAGAACAACT, Cell Type: CD4 T cell) 1.0 (CTACATTTCAGTCAGT, Cell Type: CD14+ Mono cell_2) 1.0 (CTGCCTAGTGTTGGGA, Cell Type: CD4 T cell) 1.0 (ATTTCTGCATGCAACT, Cell Type: CD4 T cell) 1.0 (TTTATGCAGGAGTTGC, Cell Type: Unknown_3) 1.0 (CTCGTCACACATAACC, Cell Type: CD16+ Mono cell) 1.0 (AGGTCATCAGTGACAG, Cell Type: Unknown_3) 1.0 (GATGAAAAGCCCAGCT, Cell Type: Unknown_2) 1.0 (TAAGTGCCAACACCTA, Cell Type: Unknown_1) 1.0 (GTCGGGTTCTTCCTTC, Cell Type: CD34+ cell) 1.0 (GATCGATAGACAGAGA, Cell Type: CD4 T cell) 1.0 (GCTGCAGAGCTAGCCC, Cell Type: Unknown_2) 1.0 (TCGCGTTTCGTAGATC, Cell Type: CD14+ Mono cell_2) 1.0 (CTTTGCGCAATCTGCA, Cell Type: CD14+ Mono cell_1) 1.0 (CCGTTCATCCAAACTG, Cell Type: CD4 T cell) 1.0 (ACGGCCAGTGGTGTAG, Cell Type: Unknown_5) 1.0 ... 
(AGCGTCGTCCCAAGTA, Cell Type: NK cell) 1.0 (GACGCGTGTAGAAAGG, Cell Type: Unknown_1) 1.0 (CGTGTAAGTGGCTCCA, Cell Type: pDC_1) 1.0 (TGCACCTGTTCGTTGA, Cell Type: NK cell) 1.0 (CGTGTAACACACATGT, Cell Type: NK cell) 1.0 (CGTCAGGAGTGGTCCC, Cell Type: CD14+ Mono cell_1) 1.0 (TCTCTAATCGCCTGAG, Cell Type: NK cell) 1.0 (CTGGTCTGTAAAGGAG, Cell Type: NK cell) 1.0 (AGGGTGAGTGATGCCC, Cell Type: CD4 T cell) 1.0 (GTCGGGTAGTTCGCAT, Cell Type: CD14+ Mono cell_2) 1.0 (TTGCGTCGTGACTCAT, Cell Type: CD4 T cell) 1.0 (GTTACAGAGCGTCTAT, Cell Type: CD14+ Mono cell_2) 1.0 (CGTGTAACAGGAATCG, Cell Type: NK cell) 1.0 (TTCGGTCCACTTACGA, Cell Type: NK cell) 1.0 (AGTCTTTAGCCAACAG, Cell Type: pDC_1) 1.0 (GGTGCGTAGCGATGAC, Cell Type: NK cell) 1.0 (GTCGGGTAGTACGACG, Cell Type: CD4 T cell) 1.0 (GGCGTGTAGGATTCGG, Cell Type: Unknown_1) 1.0 (GTCGGGTAGTGAATTG, Cell Type: CD4 T cell) 1.0 (GCGACCAGTCACTTCC, Cell Type: pDC_1) 1.0 (GCATACAAGCTGAACG, Cell Type: CD14+ Mono cell_2) 1.0 (CGATCGGAGCCGTCGT, Cell Type: CD14+ Mono cell_2) 1.0 (ATCTGCCTCTGACCTC, Cell Type: Unknown_1) 1.0 (CTGAAGTAGGGATCTG, Cell Type: NK cell) 1.0 (GGCGTGTAGAGTGAGA, Cell Type: CD14+ Mono cell_2) 1.0 (AGCGTCGAGTCAAGGC, Cell Type: Unknown_1) 1.0 (AGCGTCGAGTTACGGG, Cell Type: CD4 T cell) 1.0 (TCGCGAGGTAGCCTAT, Cell Type: Unknown_1) 1.0 (GTCGGGTAGTAGCCGA, Cell Type: Unknown_1) 1.0 (TTGCCGTGTAGATTAG, Cell Type: Unknown_1) 1.0 Length: 7265, dtype: float64
# Rank genes (rows) by their total across all cells and keep the 5000 largest.
ser_sum = df['gex'].sum(axis=1)
ser_sum = ser_sum.sort_values(ascending=False)
keep_genes = ser_sum.index[:5000].tolist()
df['gex-filt'] = df['gex'].loc[keep_genes]
df['gex-filt'].shape
(5000, 7265)
# Same top-5000 selection as for 'gex', but ranked on the column-normalized
# table, so the selected genes can differ.
ser_sum = df['gex-umi'].sum(axis=1)
ser_sum = ser_sum.sort_values(ascending=False)
keep_genes = ser_sum.index[:5000].tolist()
df['gex-umi-filt'] = df['gex-umi'].loc[keep_genes]
df['gex-umi-filt'].shape
(5000, 7265)
# Print the shape of each table that goes into the comparisons below.
for key in ('adt-ini', 'gex-filt', 'gex-umi-filt'):
    print(df[key].shape)
(10, 7265) (5000, 7265) (5000, 7265)
def corr_datasets(name_1, name_2):
    """Print how similarly two tables in ``df`` arrange their columns.

    For each of ``df[name_1]`` and ``df[name_2]`` the condensed vector of
    pairwise cosine distances between columns is computed. The printed value
    is one minus the correlation distance between those two vectors.

    The columns are not aligned by label here: both tables are assumed to
    list the same columns in the same order.

    Args:
        name_1: Key of the first table in the module-level ``df`` dict.
        name_2: Key of the second table in the module-level ``df`` dict.

    Returns:
        None. The result is printed as ``<name_1> vs <name_2> <value>``.
    """
    # One Series of column-to-column cosine distances per table, in call order.
    dist_series = [
        pd.Series(data=pdist(df[name].transpose(), metric='cosine'), name=name)
        for name in (name_1, name_2)
    ]
    df_dist = pd.concat(dist_series, axis=1)

    # With two columns the condensed result has exactly one entry.
    inst_corr = 1 - pdist(df_dist.transpose(), metric='correlation')
    print(name_1, 'vs', name_2, inst_corr[0])
corr_datasets('adt-ini', 'gex-filt')
adt-ini vs gex-filt 0.670118808363
corr_datasets('adt-ini', 'gex-umi-filt')
adt-ini vs gex-umi-filt 0.669929648801
# NOTE(review): same arguments as the preceding 'adt-ini' vs 'gex-umi-filt'
# comparison; this call repeats it.
corr_datasets('adt-ini', 'gex-umi-filt')
adt-ini vs gex-umi-filt 0.669929648801
# Row z-scoring of the ADT table is disabled (kept below for reference):
# # z-scored ADT
# net.load_df(df['adt-ini'])
# net.normalize(axis='row', norm_type='zscore')
# df['adt-z'] = net.export_df()
# With the block above commented out, 'adt-z' is NOT z-scored: it is a second
# key for the very same DataFrame object as 'adt-ini' (no copy is made).
df['adt-z'] = df['adt-ini']
def _zscore_rows(source_key, n_top=None):
    """Return ``df[source_key]`` after optional row filtering and row z-scoring.

    The table is loaded into the shared clustergrammer2 ``net`` object,
    optionally reduced with ``net.filter_N_top`` (rows ranked by ``'var'``),
    normalized with ``net.normalize(axis='row', norm_type='zscore')`` and
    exported again. ``net`` is left holding the processed table.

    Args:
        source_key: Key of the input table in the module-level ``df`` dict.
        n_top: Number of top-ranked rows to keep; ``None`` skips the filter.

    Returns:
        The DataFrame produced by ``net.export_df()``.
    """
    net.load_df(df[source_key])
    if n_top is not None:
        net.filter_N_top(inst_rc='row', N_top=n_top, rank_type='var')
    net.normalize(axis='row', norm_type='zscore')
    return net.export_df()


# (output key, source key, rows kept). Naming: 5K = the 5000-gene tables,
# 1K / 1H / 50 = top 1000 / 100 / 50 rows, 'umi' = column-normalized source.
# The order matches the original cell order so `net` ends in the same state.
_ZSCORE_JOBS = [
    ('gex-5K-z', 'gex-filt', None),
    ('gex-5K-umi-z', 'gex-umi-filt', None),
    ('gex-5K-1K-z', 'gex-filt', 1000),
    ('gex-5K-1K-umi-z', 'gex-umi-filt', 1000),
    ('gex-5K-1H-z', 'gex-filt', 100),
    ('gex-5K-1H-umi-z', 'gex-umi-filt', 100),
    ('gex-5K-50-z', 'gex-filt', 50),
    ('gex-5K-50-umi-z', 'gex-umi-filt', 50),
]

for out_key, source_key, n_top in _ZSCORE_JOBS:
    df[out_key] = _zscore_rows(source_key, n_top)
# ADT vs. row-z-scored gex using all 5000 filtered genes:
# first the plain table, then the column-normalized ('umi') one.
corr_datasets('adt-z', 'gex-5K-z')
corr_datasets('adt-z', 'gex-5K-umi-z')
adt-z vs gex-5K-z 0.577517466014 adt-z vs gex-5K-umi-z 0.661076306163
# ADT vs. row-z-scored gex restricted to the top 1000 rows by variance:
# first the plain table, then the column-normalized ('umi') one.
corr_datasets('adt-z', 'gex-5K-1K-z')
corr_datasets('adt-z', 'gex-5K-1K-umi-z')
adt-z vs gex-5K-1K-z 0.630417530262 adt-z vs gex-5K-1K-umi-z 0.706515334859
# ADT vs. row-z-scored gex restricted to the top 100 rows by variance:
# first the plain table, then the column-normalized ('umi') one.
corr_datasets('adt-z', 'gex-5K-1H-z')
corr_datasets('adt-z', 'gex-5K-1H-umi-z')
adt-z vs gex-5K-1H-z 0.685659326402 adt-z vs gex-5K-1H-umi-z 0.729042130795
# ADT vs. row-z-scored gex restricted to the top 50 rows by variance:
# first the plain table, then the column-normalized ('umi') one.
corr_datasets('adt-z', 'gex-5K-50-z')
corr_datasets('adt-z', 'gex-5K-50-umi-z')
adt-z vs gex-5K-50-z 0.680044573427 adt-z vs gex-5K-50-umi-z 0.72393620348
df['gex-5K-50-umi-z'].shape
(50, 7265)
# Show the 50-row, column-normalized, z-scored table in the interactive
# clustergrammer2 widget.
net.load_df(df['gex-5K-50-umi-z'])
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "S100A8", "ini": 50, "clust": 22, "rank": 20, "rankvar": 4, "gr…
# The column-normalized top-5000 table is stored under 'gex-umi-filt';
# 'gex-filt-umi' was never assigned and raised KeyError.
df['gex-umi-filt'].shape
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-27-e9822880376f> in <module>() ----> 1 df['gex-filt-umi'].shape KeyError: 'gex-filt-umi'
# Heatmap restricted to the cells whose category label contains 'CD14+ Mono'.
# The table key is 'gex-umi-filt'; the previous 'gex-filt-umi' does not exist
# in df and raised KeyError.
cols = df['gex-umi-filt'].columns.tolist()
keep_cols = [x for x in cols if 'CD14+ Mono' in x[1]]
print(len(keep_cols))
net.load_df(df['gex-umi-filt'][keep_cols])
# net.filter_cat(axis='col', cat_index=1, cat_name='Cell Type: NK cell')
# Keep the top 100 rows ranked by 'var', then z-score each row.
net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
net.widget()
# Heatmap restricted to the 'Cell Type: NK cell' column category.
# The table key is 'gex-umi-filt'; the previous 'gex-filt-umi' does not exist
# in df and raised KeyError.
net.load_df(df['gex-umi-filt'])
net.filter_cat(axis='col', cat_index=1, cat_name='Cell Type: NK cell')
# Keep the top 50 rows ranked by 'var', then z-score each row.
net.filter_N_top(inst_rc='row', N_top=50, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
net.widget()
# Show the ADT table in the widget as loaded: the row filter and z-score
# steps below are commented out, so neither is applied.
net.load_df(df['adt-ini'])
# net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
# net.normalize(axis='row', norm_type='zscore')
net.widget()
# Disabled variant for a 'gex-cat-filt' table; that key is not assigned
# anywhere in the code shown here.
# net.load_df(df['gex-cat-filt'])
# net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
# net.normalize(axis='row', norm_type='zscore')
# net.widget()