Working on a cell-type by developmental-stage downsampled view of the data. I'll downsample the data from each embryo into cell types by averaging the cells of each type. All cells from a given embryo share the same developmental stage.
from clustergrammer2 import net
# Empty dict; nothing in the visible part of this notebook reads it.
# NOTE(review): confirm whether later cells use `df` before removing it.
df = {}
>> clustergrammer2 backend version 0.4.2
import pandas as pd
from glob import glob
import os
from copy import deepcopy
def add_cats_from_meta(barcodes, df_meta, add_cat_list):
    '''
    Build category-annotated column tuples for a list of barcodes.

    Parameters
    ----------
    barcodes : list
        Row labels of ``df_meta`` to annotate, in the desired output order.
    df_meta : pandas.DataFrame
        Metadata table whose index contains the barcodes.
    add_cat_list : list of str
        Names of the ``df_meta`` columns to attach as categories.

    Returns
    -------
    list of tuple
        One tuple per barcode, shaped like
        ``(barcode, 'col1: value1', 'col2: value2', ...)`` with the
        categories in the order given by ``add_cat_list``.

    Notes
    -----
    Label lookup is delegated to ``DataFrame.loc``; how missing barcodes or
    columns are reported depends on the installed pandas version.
    '''
    # select the rows and columns of interest in a single .loc call rather
    # than chaining .loc[rows][cols], which first materialises every column
    df_cats = df_meta.loc[barcodes, add_cat_list]

    new_cols = []
    for barcode, values in zip(barcodes, df_cats.values):
        # prefix every value with its column title, e.g. 'Main_cell_type: Lens'
        titled = [str(title) + ': ' + str(value)
                  for title, value in zip(add_cat_list, values)]
        new_cols.append((barcode, *titled))

    return new_cols
# Collect the cell metadata of every embryo and the cell-type label of every
# cell across all embryos.
list_cell_types = []
meta_list = []

# one folder per embryo; sorted() gives a reproducible processing order
new_samples = sorted(glob('../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files_binder/*'))

for inst_sample in new_samples:
    df_meta = pd.read_parquet(inst_sample + '/meta_cell.parquet')
    meta_list.append(df_meta)
    # Series.get_values() was removed in pandas 1.0; tolist() yields the same
    # labels and works on both old and new pandas versions
    list_cell_types.extend(df_meta['Main_cell_type'].tolist())

# total number of cells over all embryos
print(len(list_cell_types))
1386587
# Count how many cells carry each cell-type label.
ser_cell_types = pd.Series(list_cell_types)

# value_counts() is computed once and reused (it was previously evaluated
# three times, once as a discarded bare expression)
ser_pop = ser_cell_types.value_counts()
ser_pop.name = 'Population'

# cell-type names in the order returned by value_counts()
all_cell_types = ser_pop.index.tolist()
print('there are', len(all_cell_types), 'cell types')
there are 38 cell types
# Write the per-cell-type population counts as a tab-separated text file.
ser_pop.to_csv(
    '../data/cao_2million-cell_2019_61-embryo_downsample/cell_type_dist.txt',
    sep='\t',
)

# notebook display: shape of the counts Series (one entry per cell type)
ser_pop.shape
(38,)
# Notebook display: the per-cell-type population counts.
ser_pop
Chondrocytes & osteoblasts 104698 Connective tissue progenitors 98964 Intermediate Mesoderm 89518 Jaw and tooth progenitors 82289 Early mesenchyme 71949 Excitatory neurons 68567 Epithelial cells 66209 Radial glia 65428 Neural progenitor cells 58332 Postmitotic premature neurons 56033 Oligodendrocyte Progenitors 54606 Isthmic organizer cells 48498 Neural Tube 45985 Inhibitory neurons 44658 Myocytes 43197 Definitive erythroid lineage 34205 Chondroctye progenitors 33539 Inhibitory neuron progenitors 31214 Premature oligodendrocyte 29538 Limb mesenchyme 26559 Sensory neurons 26477 Endothelial cells 26431 Stromal cells 23259 Osteoblasts 23223 Schwann cell precursor 23145 Granule neurons 16131 Notochord cells 15481 Primitive erythroid lineage 15138 Inhibitory interneurons 13533 Hepatocytes 11229 White blood cells 9202 Ependymal cell 8566 Cholinergic neurons 7060 Cardiac muscle lineages 4867 Megakaryocytes 3572 Melanocytes 2827 Lens 1954 Neutrophils 506 Name: Population, dtype: int64
Add categories for: cell type, developmental stage, embryo ID, and population (the number of cells of that type in the embryo).
# Map each developmental stage to a label prefixed with its rank, so the
# labels sort chronologically ('1-E9.5' < '2-E10.5' < ...).
dev_dict = {
    'E9.5': '1-E9.5',
    'E10.5': '2-E10.5',
    'E11.5': '3-E11.5',
    'E12.5': '4-E12.5',
    'E13.5': '5-E13.5',
}
%%time
# For every embryo, average the expression of all cells of the same cell type
# and label each resulting column with cell type, developmental stage,
# embryo id and population size.
df_mean_list = []
for inst_sample_path in new_samples:

    df_gex = pd.read_parquet(inst_sample_path + '/gex.parquet')
    df_meta = pd.read_parquet(inst_sample_path + '/meta_cell.parquet')

    # folder names look like 'embryo-<id>-<stage>', e.g. 'embryo-1-E9.5'
    inst_sample = inst_sample_path.split('/')[-1]
    sample_parts = inst_sample.split('-')
    inst_embryo = 'embryo-' + sample_parts[1]
    inst_dev = sample_parts[2]

    # annotate every cell barcode with its cell type and developmental stage
    new_cols = add_cats_from_meta(df_gex.columns.tolist(),
                                  df_meta,
                                  ['Main_cell_type', 'development_stage'])

    # save number of cells in each cluster: count the 'Main_cell_type: <name>'
    # entries, then strip the title so the index is the bare cell-type name
    ct_pop = pd.Series([x[1] for x in new_cols]).value_counts()
    ct_pop.index = [x.split(': ')[1] for x in ct_pop.index.tolist()]

    # work on a copy so df_gex keeps its plain barcode column names
    df_cat = deepcopy(df_gex)
    df_cat.columns = new_cols

    # clustergrammer2 helper; presumably converts the tuple labels into a
    # MultiIndex with one level per category title -- the groupby below relies
    # on a level named 'Main_cell_type' (TODO confirm against clustergrammer2)
    df_mi = net.row_tuple_to_multiindex(df_cat.transpose())
    df_mean_ini = df_mi.groupby(level='Main_cell_type').mean().transpose()

    rows = df_mean_ini.index.tolist()

    # loop-invariant pieces of the new column labels
    sample_suffix = '_' + inst_sample.replace('embryo', 'e')
    dev_label = 'Dev Stage: ' + dev_dict[inst_dev]
    embryo_label = 'Embryo: ' + inst_embryo

    cols = [(x + sample_suffix,
             'Cell Type: ' + x,
             dev_label,
             embryo_label,
             'Pop: ' + str(ct_pop[x])) for x in df_mean_ini.columns.tolist()]

    # DataFrame.get_values() was removed in pandas 1.0; .values returns the
    # same array and is available in both old and new pandas versions
    mat = df_mean_ini.values

    df_mean = pd.DataFrame(index=rows, columns=cols, data=mat)

    print(inst_sample, df_cat.shape, df_mean.shape)

    df_mean_list.append(df_mean)
embryo-1-E9.5 (5000, 15666) (5000, 36) embryo-10-E11.5 (5000, 32449) (5000, 38) embryo-11-E12.5 (5000, 10270) (5000, 37) embryo-12-E12.5 (5000, 27090) (5000, 38) embryo-13-E12.5 (5000, 12436) (5000, 37) embryo-14-E12.5 (5000, 27450) (5000, 38) embryo-15-E13.5 (5000, 23136) (5000, 38) embryo-16-E13.5 (5000, 13434) (5000, 36) embryo-17-E13.5 (5000, 17306) (5000, 36) embryo-19-E9.5 (5000, 4026) (5000, 30) embryo-20-E9.5 (5000, 2525) (5000, 27) embryo-21-E9.5 (5000, 11550) (5000, 35) embryo-22-E9.5 (5000, 5818) (5000, 31) embryo-24-E10.5 (5000, 28100) (5000, 38) embryo-25-E10.5 (5000, 14498) (5000, 37) embryo-26-E10.5 (5000, 24664) (5000, 38) embryo-27-E11.5 (5000, 42106) (5000, 38) embryo-28-E11.5 (5000, 37761) (5000, 38) embryo-29-E11.5 (5000, 33185) (5000, 38) embryo-3-E9.5 (5000, 8086) (5000, 34) embryo-31-E12.5 (5000, 24208) (5000, 38) embryo-33-E12.5 (5000, 57625) (5000, 38) embryo-34-E12.5 (5000, 39619) (5000, 38) embryo-35-E13.5 (5000, 17118) (5000, 38) embryo-36-E13.5 (5000, 22222) (5000, 38) embryo-37-E13.5 (5000, 21655) (5000, 37) embryo-38-E13.5 (5000, 22056) (5000, 36) embryo-39-E9.5 (5000, 7064) (5000, 30) embryo-4-E10.5 (5000, 12559) (5000, 38) embryo-40-E9.5 (5000, 7017) (5000, 33) embryo-41-E9.5 (5000, 3885) (5000, 28) embryo-42-E9.5 (5000, 8541) (5000, 35) embryo-43-E10.5 (5000, 19422) (5000, 38) embryo-44-E10.5 (5000, 26715) (5000, 38) embryo-46-E10.5 (5000, 30976) (5000, 38) embryo-47-E11.5 (5000, 37763) (5000, 38) embryo-48-E11.5 (5000, 43105) (5000, 38) embryo-49-E11.5 (5000, 36490) (5000, 38) embryo-5-E10.5 (5000, 21987) (5000, 38) embryo-50-E11.5 (5000, 37226) (5000, 38) embryo-51-E12.5 (5000, 18053) (5000, 38) embryo-52-E12.5 (5000, 23163) (5000, 38) embryo-53-E13.5 (5000, 16348) (5000, 38) embryo-55-E9.5 (5000, 4397) (5000, 33) embryo-56-E9.5 (5000, 7770) (5000, 33) embryo-57-E9.5 (5000, 10115) (5000, 35) embryo-58-E9.5 (5000, 8048) (5000, 35) embryo-59-E10.5 (5000, 25696) (5000, 38) embryo-6-E10.5 (5000, 27174) (5000, 38) embryo-60-E10.5 
(5000, 33564) (5000, 38) embryo-61-E11.5 (5000, 36558) (5000, 38) embryo-62-E11.5 (5000, 33504) (5000, 38) embryo-63-E9.5 (5000, 10729) (5000, 36) embryo-64-E12.5 (5000, 44238) (5000, 38) embryo-65-E13.5 (5000, 19457) (5000, 38) embryo-66-E13.5 (5000, 38067) (5000, 38) embryo-67-E13.5 (5000, 17780) (5000, 38) embryo-68-E13.5 (5000, 27869) (5000, 38) embryo-7-E11.5 (5000, 35416) (5000, 38) embryo-8-E11.5 (5000, 32655) (5000, 38) embryo-9-E11.5 (5000, 27177) (5000, 38) CPU times: user 8min 23s, sys: 4min 9s, total: 12min 32s Wall time: 7min 1s
# Place the per-embryo cell-type averages side by side, one column per
# (cell type, embryo) pair.
df_merge = pd.concat(df_mean_list, axis='columns')

# notebook display: shape of the merged matrix
df_merge.shape
(5000, 2229)
# Save the merged matrix to parquet. The column tuples are converted to their
# string form on a copy (presumably because to_parquet needs string column
# names), so df_merge itself keeps its tuple columns.
df_save = df_merge.copy(deep=True)
df_save.columns = list(map(str, df_save.columns.tolist()))
df_save.to_parquet(
    '../data/cao_2million-cell_2019_61-embryo_downsample/cao_embryo_cell-type_downsample.parquet'
)

# notebook display: the first three column tuples of the merged matrix
df_merge.columns.tolist()[:3]
[('Cardiac muscle lineages_e-1-E9.5', 'Cell Type: Cardiac muscle lineages', 'Dev Stage: 1-E9.5', 'Embryo: embryo-1', 'Pop: 263'), ('Cholinergic neurons_e-1-E9.5', 'Cell Type: Cholinergic neurons', 'Dev Stage: 1-E9.5', 'Embryo: embryo-1', 'Pop: 76'), ('Chondroctye progenitors_e-1-E9.5', 'Cell Type: Chondroctye progenitors', 'Dev Stage: 1-E9.5', 'Embryo: embryo-1', 'Pop: 388')]
# Visualize the full cell-type by embryo matrix with clustergrammer2.
# NOTE(review): step descriptions below are read off the argument names;
# confirm against the clustergrammer2 API.
net.load_df(df_merge)
# keep 250 rows, ranked by variance ('var')
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
# z-score along rows
net.normalize(axis='row', norm_type='zscore')
# clip values to the range [-5, 5]
net.clip(-5,5)
# render the interactive widget as the cell output
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "Gm42418", "ini": 250, "clust": 5, "rank": 79, "rankvar": 87, "…
# Same visualization, restricted to the 'Sensory neurons' columns.
# NOTE(review): step descriptions below are read off the argument names;
# confirm against the clustergrammer2 API.
net.load_df(df_merge)
# keep only columns whose category at index 1 (the 'Cell Type: ...' entry of
# the column tuples) is 'Cell Type: Sensory neurons'
net.filter_cat(axis='col', cat_index=1, cat_name='Cell Type: Sensory neurons')
# keep 250 rows, ranked by variance ('var')
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
# z-score along rows
net.normalize(axis='row', norm_type='zscore')
# clip values to the range [-5, 5]
net.clip(-5,5)
# render the interactive widget as the cell output
net.widget()
ExampleWidget(network='{"row_nodes": [{"name": "mt-Rnr2", "ini": 250, "clust": 9, "rank": 135, "rankvar": 190,…
# Sanity check: add up the 'Pop: <n>' category (fifth element of every column
# tuple) to get the number of cells represented by the downsampled matrix.
cols = df_merge.columns.tolist()
total_cells = sum(int(col_tuple[4].split(': ')[1]) for col_tuple in cols)
print('total number of cells represented: ', total_cells)
total number of cells represented: 1386587