3.2 Human GEX Data¶

In [2]:

from clustergrammer2 import net
df = {}

import numpy as np
import pandas as pd

clustergrammer2 backend version 0.2.9

In [3]:

import matplotlib.pyplot as plt

In [4]:

from copy import deepcopy
from scipy.spatial.distance import pdist
import itertools as it

In [5]:

filename = '../data/CITE-seq_data/GSE100866_CBMC_8K_13AB_10X-RNA_umi_HUMAN.csv.gz'
df['gex-ini'] = pd.read_csv(filename, compression='gzip', index_col=0)
df['gex-ini'].shape

Out[5]:

(20400, 7339)

In [6]:

net.load_file('../data/CITE-seq_data/adt_ashz_trim_cats.txt')
df['adt-ini'] = net.export_df()
df['adt-ini'].shape

Out[6]:

(10, 7265)

In [7]:

cols = df['adt-ini'].columns.tolist()
keep_cells = [x[0] for x in cols]
print(len(keep_cells))

Remove HUMAN prefix from genes¶

In [8]:

rows = df['gex-ini'].index.tolist()
new_rows = [x.replace('HUMAN_','') for x in rows]
df['gex-ini'].index = new_rows

Filter for trimmed cells only and arcsinh transform¶

In [9]:

df['gex-trim'] = df['gex-ini'][keep_cells]
df['gex-trim'] = np.arcsinh(df['gex-trim']/5)

Drop ribosomal and mitochondrial genes¶

In [10]:

print(df['gex-trim'].shape)
df['gex'] = deepcopy(df['gex-trim'])
all_genes = df['gex'].index.tolist()
print(len(all_genes))
keep_genes = [x for x in all_genes if 'RPL' not in x]
keep_genes = [x for x in keep_genes if 'RPS' not in x]
print(len(keep_genes))

df['gex'] = df['gex'].loc[keep_genes]
df['gex'].shape

# Removing Mitochondrial Genes
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
                'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']

all_genes = df['gex'].index.tolist()
mito_genes = [x for x in all_genes if 'MT-' == x[:3] or 
             x.split('_')[0] in list_mito_genes]

print(mito_genes)

keep_genes = [x for x in all_genes if x not in mito_genes]
df['gex'] = df['gex'].loc[keep_genes]
print(df['gex'].shape)

# transfer categories
cols = df['adt-ini'].columns.tolist()
ct_dict = {}
for inst_col in cols:
    ct_dict[inst_col[0]] = inst_col[1]
    
cols = df['gex'].columns.tolist()
new_cols = [(x, 'Cell Type: ' + ct_dict[x]) for x in cols]
df['gex'].columns = new_cols    

# normalize by UMI count
barcode_umi_sum = df['gex'].sum()
df['gex-umi'] = deepcopy(df['gex'].div(barcode_umi_sum))

(20400, 7265)
20400
20223
['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB', 'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5', 'MT-ND6', 'MT-RNR1', 'MT-RNR2', 'MT-TD', 'MT-TF', 'MT-TG', 'MT-TH', 'MT-TI', 'MT-TL1', 'MT-TP', 'MT-TT', 'MT-TW', 'MTRF1', 'MTRF1L', 'MTRNR2L1', 'MTRNR2L10', 'MTRNR2L11', 'MTRNR2L12', 'MTRNR2L3', 'MTRNR2L4', 'MTRNR2L5', 'MTRNR2L6', 'MTRNR2L7', 'MTRNR2L8']
(20187, 7265)

In [11]:

df['gex-umi'].sum()

Out[11]:

(TTGGAACCATGTCTCC, Cell Type: Unknown_5)            1.0
(AACTCTTCAACTGGCC, Cell Type: Unknown_5)            1.0
(ACAGCTAAGCGATGAC, Cell Type: CD4 T cell)           1.0
(CATCGAAGTTCCACGG, Cell Type: Unknown_3)            1.0
(CCGGGATGTAACGACG, Cell Type: Unknown_3)            1.0
(GCTTGAAGTACCGTAT, Cell Type: CD4 T cell)           1.0
(GGCGACTGTCAGAATA, Cell Type: Unknown_2)            1.0
(CGGACGTGTCGAACAG, Cell Type: CD14+ Mono cell_1)    1.0
(CTCGTACAGCGGATCA, Cell Type: CD14+ Mono cell_1)    1.0
(TCGGTAACATGTTGAC, Cell Type: Unknown_5)            1.0
(TCTTTCCTCATTCACT, Cell Type: CD8 T cell)           1.0
(CTAGTGAAGCCTATGT, Cell Type: CD4 T cell)           1.0
(ACATCAGCAATGGACG, Cell Type: Unknown_5)            1.0
(GGCCGATTCACAGTAC, Cell Type: Unknown_3)            1.0
(TACAGTGAGAACAACT, Cell Type: CD4 T cell)           1.0
(CTACATTTCAGTCAGT, Cell Type: CD14+ Mono cell_2)    1.0
(CTGCCTAGTGTTGGGA, Cell Type: CD4 T cell)           1.0
(ATTTCTGCATGCAACT, Cell Type: CD4 T cell)           1.0
(TTTATGCAGGAGTTGC, Cell Type: Unknown_3)            1.0
(CTCGTCACACATAACC, Cell Type: CD16+ Mono cell)      1.0
(AGGTCATCAGTGACAG, Cell Type: Unknown_3)            1.0
(GATGAAAAGCCCAGCT, Cell Type: Unknown_2)            1.0
(TAAGTGCCAACACCTA, Cell Type: Unknown_1)            1.0
(GTCGGGTTCTTCCTTC, Cell Type: CD34+ cell)           1.0
(GATCGATAGACAGAGA, Cell Type: CD4 T cell)           1.0
(GCTGCAGAGCTAGCCC, Cell Type: Unknown_2)            1.0
(TCGCGTTTCGTAGATC, Cell Type: CD14+ Mono cell_2)    1.0
(CTTTGCGCAATCTGCA, Cell Type: CD14+ Mono cell_1)    1.0
(CCGTTCATCCAAACTG, Cell Type: CD4 T cell)           1.0
(ACGGCCAGTGGTGTAG, Cell Type: Unknown_5)            1.0
                                                   ... 
(AGCGTCGTCCCAAGTA, Cell Type: NK cell)              1.0
(GACGCGTGTAGAAAGG, Cell Type: Unknown_1)            1.0
(CGTGTAAGTGGCTCCA, Cell Type: pDC_1)                1.0
(TGCACCTGTTCGTTGA, Cell Type: NK cell)              1.0
(CGTGTAACACACATGT, Cell Type: NK cell)              1.0
(CGTCAGGAGTGGTCCC, Cell Type: CD14+ Mono cell_1)    1.0
(TCTCTAATCGCCTGAG, Cell Type: NK cell)              1.0
(CTGGTCTGTAAAGGAG, Cell Type: NK cell)              1.0
(AGGGTGAGTGATGCCC, Cell Type: CD4 T cell)           1.0
(GTCGGGTAGTTCGCAT, Cell Type: CD14+ Mono cell_2)    1.0
(TTGCGTCGTGACTCAT, Cell Type: CD4 T cell)           1.0
(GTTACAGAGCGTCTAT, Cell Type: CD14+ Mono cell_2)    1.0
(CGTGTAACAGGAATCG, Cell Type: NK cell)              1.0
(TTCGGTCCACTTACGA, Cell Type: NK cell)              1.0
(AGTCTTTAGCCAACAG, Cell Type: pDC_1)                1.0
(GGTGCGTAGCGATGAC, Cell Type: NK cell)              1.0
(GTCGGGTAGTACGACG, Cell Type: CD4 T cell)           1.0
(GGCGTGTAGGATTCGG, Cell Type: Unknown_1)            1.0
(GTCGGGTAGTGAATTG, Cell Type: CD4 T cell)           1.0
(GCGACCAGTCACTTCC, Cell Type: pDC_1)                1.0
(GCATACAAGCTGAACG, Cell Type: CD14+ Mono cell_2)    1.0
(CGATCGGAGCCGTCGT, Cell Type: CD14+ Mono cell_2)    1.0
(ATCTGCCTCTGACCTC, Cell Type: Unknown_1)            1.0
(CTGAAGTAGGGATCTG, Cell Type: NK cell)              1.0
(GGCGTGTAGAGTGAGA, Cell Type: CD14+ Mono cell_2)    1.0
(AGCGTCGAGTCAAGGC, Cell Type: Unknown_1)            1.0
(AGCGTCGAGTTACGGG, Cell Type: CD4 T cell)           1.0
(TCGCGAGGTAGCCTAT, Cell Type: Unknown_1)            1.0
(GTCGGGTAGTAGCCGA, Cell Type: Unknown_1)            1.0
(TTGCCGTGTAGATTAG, Cell Type: Unknown_1)            1.0
Length: 7265, dtype: float64

Keep top 5K genes by sum¶

In [11]:

ser_sum = df['gex'].sum(axis=1).sort_values(ascending=False)
keep_genes = ser_sum.index.tolist()[:5000]
df['gex-filt'] = df['gex'].loc[keep_genes]
df['gex-filt'].shape

Out[11]:

(5000, 7265)

In [12]:

ser_sum = df['gex-umi'].sum(axis=1).sort_values(ascending=False)
keep_genes = ser_sum.index.tolist()[:5000]
df['gex-umi-filt'] = df['gex-umi'].loc[keep_genes]
df['gex-umi-filt'].shape

Out[12]:

(5000, 7265)

In [14]:

print(df['adt-ini'].shape)
print(df['gex-filt'].shape)
print(df['gex-umi-filt'].shape)

(10, 7265)
(5000, 7265)
(5000, 7265)

Compare Sample-Sample Similarity Across Datasets¶

In [15]:

def corr_datasets(name_1, name_2):
    dist_arr_1 = pdist(df[name_1].transpose(), metric='cosine')
    ser_dist_1 = pd.Series(data=dist_arr_1, name=name_1)

    dist_arr_2 = pdist(df[name_2].transpose(), metric='cosine')
    ser_dist_2 = pd.Series(data=dist_arr_2, name=name_2)
    df_dist = pd.concat([ser_dist_1, ser_dist_2], axis=1)

    inst_corr = 1 - pdist(df_dist.transpose(), metric='correlation')
    print(name_1, 'vs', name_2, inst_corr[0])

ADT vs GEX¶

In [16]:

corr_datasets('adt-ini', 'gex-filt')

adt-ini vs gex-filt 0.670118808363

ADT GEX vs UMI¶

In [19]:

corr_datasets('adt-ini', 'gex-umi-filt')

adt-ini vs gex-umi-filt 0.669929648801

In [18]:

corr_datasets('adt-ini', 'gex-umi-filt')

adt-ini vs gex-umi-filt 0.669929648801

Make Z-scored versions of the data¶

In [20]:

# # z-scored ADT
# net.load_df(df['adt-ini'])
# net.normalize(axis='row', norm_type='zscore')
# df['adt-z'] = net.export_df()

df['adt-z'] = df['adt-ini']

# Z-scored 5K gex
net.load_df(df['gex-filt'])
net.normalize(axis='row', norm_type='zscore')
df['gex-5K-z'] = net.export_df()

# Z-scored 5KV-UMI gex
net.load_df(df['gex-umi-filt'])
net.normalize(axis='row', norm_type='zscore')
df['gex-5K-umi-z'] = net.export_df()

# Z-scored 5K-1K gex
net.load_df(df['gex-filt'])
net.filter_N_top(inst_rc='row', N_top=1000, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df['gex-5K-1K-z'] = net.export_df()

# Z-scored 5KV-1K-UMI gex
net.load_df(df['gex-umi-filt'])
net.filter_N_top(inst_rc='row', N_top=1000, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df['gex-5K-1K-umi-z'] = net.export_df()

# Z-scored 5K-1H gex
net.load_df(df['gex-filt'])
net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df['gex-5K-1H-z'] = net.export_df()

# Z-scored 5KV-1H-UMI gex
net.load_df(df['gex-umi-filt'])
net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df['gex-5K-1H-umi-z'] = net.export_df()

# Z-scored 5K-50 gex
net.load_df(df['gex-filt'])
net.filter_N_top(inst_rc='row', N_top=50, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df['gex-5K-50-z'] = net.export_df()

# Z-scored 5KV-50-UMI gex
net.load_df(df['gex-umi-filt'])
net.filter_N_top(inst_rc='row', N_top=50, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
df['gex-5K-50-umi-z'] = net.export_df()

In [21]:

corr_datasets('adt-z', 'gex-5K-z')
corr_datasets('adt-z', 'gex-5K-umi-z')

adt-z vs gex-5K-z 0.577517466014
adt-z vs gex-5K-umi-z 0.661076306163

In [22]:

corr_datasets('adt-z', 'gex-5K-1K-z')
corr_datasets('adt-z', 'gex-5K-1K-umi-z')

adt-z vs gex-5K-1K-z 0.630417530262
adt-z vs gex-5K-1K-umi-z 0.706515334859

In [23]:

corr_datasets('adt-z', 'gex-5K-1H-z')
corr_datasets('adt-z', 'gex-5K-1H-umi-z')

adt-z vs gex-5K-1H-z 0.685659326402
adt-z vs gex-5K-1H-umi-z 0.729042130795

In [24]:

corr_datasets('adt-z', 'gex-5K-50-z')
corr_datasets('adt-z', 'gex-5K-50-umi-z')

adt-z vs gex-5K-50-z 0.680044573427
adt-z vs gex-5K-50-umi-z 0.72393620348

In [25]:

df['gex-5K-50-umi-z'].shape

Out[25]:

(50, 7265)

In [26]:

net.load_df(df['gex-5K-50-umi-z'])
net.widget()

ExampleWidget(network='{"row_nodes": [{"name": "S100A8", "ini": 50, "clust": 22, "rank": 20, "rankvar": 4, "gr…

Cluster NK cells¶

In [27]:

df['gex-filt-umi'].shape

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-27-e9822880376f> in <module>()
----> 1 df['gex-filt-umi'].shape

KeyError: 'gex-filt-umi'

In [ ]:

cols = df['gex-filt-umi'].columns.tolist()
keep_cols = [x for x in cols if 'CD14+ Mono' in x[1]]
print(len(keep_cols))
net.load_df(df['gex-filt-umi'][keep_cols])
# net.filter_cat(axis='col', cat_index=1, cat_name='Cell Type: NK cell')
net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
net.widget()

In [ ]:

net.load_df(df['gex-filt-umi'])
net.filter_cat(axis='col', cat_index=1, cat_name='Cell Type: NK cell')
net.filter_N_top(inst_rc='row', N_top=50, rank_type='var')
net.normalize(axis='row', norm_type='zscore')
net.widget()

In [ ]:

net.load_df(df['adt-ini'])
# net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
# net.normalize(axis='row', norm_type='zscore')
net.widget()

In [ ]:

# net.load_df(df['gex-cat-filt'])
# net.filter_N_top(inst_rc='row', N_top=100, rank_type='var')
# net.normalize(axis='row', norm_type='zscore')
# net.widget()

In [ ]: