This notebook performs hierarchical clustering on antibody-derived tag (ADT) data, assigns tentative cell types based on the ADT clusters, transfers the cell type assignments to the gene expression (GEX) data, and saves the resulting labeled and filtered data to parquet files.
import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
%matplotlib inline
from clustergrammer2 import net
import helper_functions as hf
>> clustergrammer2 backend version 0.4.2
%%time
# Directory holding the filtered feature-barcode matrix to load.
inst_path = '../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/filtered_feature_bc_matrix/'
# Load the matrix through the project helper. Later cells read the result as
# feature_data['adt']['features'] and feature_data['adt']['mat'].
feature_data = hf.load_v3_comp_sparse_feat_matrix(inst_path)
CPU times: user 30.8 s, sys: 459 ms, total: 31.2 s Wall time: 30.7 s
# Drop ADT features whose name contains '_control', keeping the feature
# list and the rows of the ADT matrix in sync.
inst_features = feature_data['adt']['features']

# (row position, feature name) for every feature that is not a control.
retained = [
    (row, name)
    for row, name in enumerate(inst_features)
    if '_control' not in name
]
keep_indexes = [row for row, _ in retained]
keep_features = [name for _, name in retained]

feature_data['adt']['features'] = keep_features
feature_data['adt']['mat'] = feature_data['adt']['mat'][keep_indexes, :]

# Report the feature/barcode counts after filtering.
hf.check_feature_data_size(feature_data)
gex 33538 7865 (33538, 7865) adt 14 7865 (14, 7865)
%%time
df = hf.convert_feature_data_to_df_dict(feature_data, make_sparse=False)
print('adt', df['adt'].shape)
print('gex', df['gex'].shape)
adt (14, 7865) gex (33538, 7865) CPU times: user 385 ms, sys: 506 ms, total: 892 ms Wall time: 899 ms
# Apply arcsinh(x / 5) element-wise to the ADT matrix and store it under 'adt-ash'.
df['adt-ash'] = np.arcsinh(df['adt']/5)
df['adt-ash'].shape
# Wrap every column label in a 1-tuple; presumably so clustergrammer2 can
# append category labels to the tuple later — TODO confirm.
df['adt-ash'].columns = [(x,) for x in df['adt-ash'].columns]
# Load the transformed ADT matrix into clustergrammer2 and z-score normalize along rows.
net.load_df(df['adt-ash'])
net.normalize(axis='row', norm_type='zscore')
# Cluster, then attach column categories taken from the column dendrogram at
# level 4 (the exported column labels later read 'Group 4: cat-N').
net.cluster()
net.dendro_cats(axis='col', dendro_level=4)
# Cluster again after adding the categories — presumably so the widget
# reflects them; TODO confirm.
net.cluster()
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "CD3", "ini": 14, "clust": 9, "rank": 11, "rankvar": 9, "group"…
We are assigning the following cell type definitions to the clusters identified in ADT space:
1: CD8 T Cells
3: CD4+ CD45RA+ T Cells
4: CD4+ CD45RO+ T Cells
5: Myeloid (CD14)
6: CD15+ CD16+
7: B Cells (CD19)
8: NK Cells (CD56, CD16)
CD15 granulocyte, macrophage monocyte
# Map each dendrogram category kept from the ADT clustering to a cell-type
# label. 'Group 4: cat-2' is deliberately absent: those cells are dropped
# before relabeling.
# NOTE(review): the cat-3 label ends with a trailing space; it is kept as-is
# because the label is written into the saved column names.
ct_dict = {
    'Group 4: cat-1': 'Cell Type: CD8 T Cells',
    'Group 4: cat-3': 'Cell Type: CD4+ CD45RA+ T Cells ',
    'Group 4: cat-4': 'Cell Type: CD4+ CD45RO+ T Cells',
    'Group 4: cat-5': 'Cell Type: Myeloid CD14',
    'Group 4: cat-6': 'Cell Type: CD15+ CD16+',
    'Group 4: cat-7': 'Cell Type: B Cells CD19',
    'Group 4: cat-8': 'Cell Type: NK Cells CD56, CD16',
}
# Pull the clustered matrix back out of clustergrammer2. Its column labels are
# tuples whose second element is the dendrogram category.
df['adt-cat'] = net.export_df()
print(df['adt-cat'].shape)
# Drop every cell assigned to 'Group 4: cat-2', which has no entry in ct_dict.
keep_cols = [x for x in df['adt-cat'].columns.tolist() if x[1] != 'Group 4: cat-2']
df['adt-cat'] = df['adt-cat'][keep_cols]
df['adt-cat'].shape
(14, 7865)
(14, 7864)
# Replace each dendrogram category with its cell-type label from ct_dict;
# a category that is missing from ct_dict raises KeyError here.
df['adt-cat'].columns = [(x[0], ct_dict[x[1]]) for x in df['adt-cat'].columns.tolist()]
# First tuple element of every retained column (the barcode), in column order.
keep_barcodes = [x[0] for x in df['adt-cat'].columns.tolist()]
print(len(keep_barcodes))
keep_barcodes[:3]
7864
['AAACCCAAGATTGTGA', 'AAACCCACATCGGTTA', 'AAACCCAGTACCGCGT']
# Restrict the GEX matrix to the retained barcodes, in the same order as the
# ADT columns.
df['gex'] = df['gex'][keep_barcodes]
df['gex'].shape
(33538, 7864)
# Load the cell-type-labeled ADT matrix and display it in the widget.
net.load_df(df['adt-cat'])
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "CD3", "ini": 14, "clust": 9, "rank": 4, "rankvar": 6, "group":…
# Copy the GEX matrix and reuse the ADT column labels (barcode, cell type).
# This relies on df['gex'] having been subset with keep_barcodes, so both
# frames list the same cells in the same column order.
df['gex-cat'] = deepcopy(df['gex'])
df['gex-cat'].columns = df['adt-cat'].columns
def drop_ribo_mito(df):
    """Return ``df`` without ribosomal and mitochondrial gene rows.

    A row is dropped when its index label

    * contains the substring ``'RPL'`` or ``'RPS'`` (substring match, so a
      name such as ``'MRPL12'`` is dropped as well), or
    * starts with ``'MT-'``, or
    * has a prefix before the first ``'_'`` that appears in a fixed list of
      ``MTRNR2L*`` / ``MTRF1*`` names.

    Rows are selected by position rather than by label, so row order is
    preserved and duplicated index labels are not multiplied (label-based
    ``df.loc[list_of_labels]`` repeats rows when a label occurs more than
    once in the index).

    Prints the number of genes before and after the ribosomal filter, then
    the list of mitochondrial genes that are removed.

    Args:
        df: pandas DataFrame whose index holds gene names as strings.

    Returns:
        A new DataFrame containing only the retained rows; the input frame
        is not modified.
    """
    all_genes = df.index.tolist()
    print(len(all_genes))

    # Ribosomal filter: keep positions whose name contains neither marker.
    non_ribo_rows = [
        row for row, gene in enumerate(all_genes)
        if 'RPL' not in gene and 'RPS' not in gene
    ]
    print(len(non_ribo_rows))
    df = df.iloc[non_ribo_rows]

    # Removing Mitochondrial Genes
    # A set gives O(1) membership tests for the explicit name list.
    list_mito_genes = {
        'MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L',
        'MTRNR2L6', 'MTRNR2L7', 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5',
        'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4',
    }
    remaining_genes = df.index.tolist()
    is_mito = [
        gene[:3] == 'MT-' or gene.split('_')[0] in list_mito_genes
        for gene in remaining_genes
    ]

    mito_genes = [gene for gene, flag in zip(remaining_genes, is_mito) if flag]
    print(mito_genes)

    non_mito_rows = [row for row, flag in enumerate(is_mito) if not flag]
    return df.iloc[non_mito_rows]
# Remove ribosomal and mitochondrial genes from the labeled GEX matrix.
df['gex-cat'] = drop_ribo_mito(df['gex-cat'])
33538 33346 ['MTRNR2L11', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L7', 'MTRNR2L5', 'MTRF1', 'MTRNR2L4', 'MTRNR2L1', 'MTRNR2L3', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3', 'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB']
# Save one parquet file per variance cutoff: the labeled GEX matrix reduced
# to its N most variable genes (rows).
processed_dir = '../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/processed_data/'

for inst_top in (5000, 2500, 1000, 500, 250, 100):
    # Reload the full matrix each time so every cutoff filters the same input.
    net.load_df(df['gex-cat'])
    net.filter_N_top(inst_rc='row', N_top=inst_top, rank_type='var')
    inst_df = net.export_df()

    # Column labels are converted to plain strings before writing parquet.
    inst_df.columns = list(map(str, inst_df.columns))
    print(inst_top, inst_df.shape)

    out_file = processed_dir + 'gex-cat_' + str(inst_top) + '-var.parquet'
    inst_df.to_parquet(out_file)
5000 (5000, 7864) 2500 (2500, 7864) 1000 (1000, 7864) 500 (500, 7864) 250 (250, 7864) 100 (100, 7864)
# Write the cell-type-labeled ADT matrix to parquet. Work on a copy so the
# in-memory frame keeps its tuple column labels.
adt_out_file = '../data/10k_pbmc_feature_v3-chem_v3.0.0-cr/processed_data/adt-cat.parquet'

inst_df = deepcopy(df['adt-cat'])
# Column labels are converted to plain strings before writing parquet.
inst_df.columns = list(map(str, inst_df.columns))
inst_df.to_parquet(adt_out_file)