https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads
import pandas as pd
from glob import glob
import os
# Collect the per-embryo parquet directories in sorted (lexicographic) order.
sample_glob = '../data/big_data/cao_2million-cell_2019_61-embryo_parquet_files/*'
all_samples = glob(sample_glob)
all_samples.sort()

# Number of sample directories found (shown as the cell output in a notebook).
len(all_samples)
61
def make_dir(directory):
    """Create ``directory`` if it does not already exist.

    Missing parent directories are created too, and calling this on a
    directory that already exists is a no-op.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    import os

    # makedirs(exist_ok=True) replaces the exists()-then-mkdir() sequence: it
    # has no window between the check and the creation, and it does not fail
    # when an intermediate directory is missing.
    os.makedirs(directory, exist_ok=True)
def calc_gene_mean(inst_sample):
    """Return the row-wise mean of a sample's ``gex.parquet`` table.

    Reads ``gex.parquet`` from the sample directory ``inst_sample``, prints the
    shape of the loaded frame, and averages across the columns (``axis=1``),
    giving one value per row of the table.

    Args:
        inst_sample: Path (string) of the sample directory.

    Returns:
        A pandas Series holding the mean of each row.
    """
    gex_path = inst_sample + '/gex.parquet'
    expression = pd.read_parquet(gex_path)
    print(expression.shape)
    return expression.mean(axis=1)
# List the per-cell metadata columns. The frame is read directly from the
# first sample because ``df_meta`` is only assigned further down, so the
# original ``df_meta.columns.tolist()`` raised NameError when the notebook was
# run top to bottom. Row filtering does not alter the columns, so the listing
# is the same as for the filtered frame.
pd.read_parquet(all_samples[0] + '/meta_cell.parquet').columns.tolist()
['Unnamed: 0', 'all_exon_count', 'all_intron_count', 'all_read_count', 'intergenic_rate', 'embryo_id', 'embryo_sex', 'nuclei_extraction_date', 'development_stage', 'Total_mRNAs', 'num_genes_expressed', 'Size_Factor', 'Main_Cluster', 'Main_cluster_tsne_1', 'Main_cluster_tsne_2', 'Sub_cluster', 'Sub_cluster_tsne_1', 'Sub_cluster_tsne_2', 'doublet_score', 'detected_doublet', 'doublet_cluster', 'sub_cluster_id', 'Main_cell_type', 'Main_trajectory', 'Main_trajectory_umap_1', 'Main_trajectory_umap_2', 'Main_trajectory_umap_3', 'Main_trajectory_refined_by_cluster', 'Main_trajectory_refined_umap_1', 'Main_trajectory_refined_umap_2', 'Main_trajectory_refined_umap_3', 'Sub_trajectory_name', 'Sub_trajectory_umap_1', 'Sub_trajectory_umap_2', 'Sub_trajectory_louvain_component', 'Sub_trajectory_Pseudotime']
# Load the per-cell metadata of the first sample and keep only the cells whose
# 'doublet_cluster' entry compares equal to False.
first_sample = all_samples[0]
df_meta_ini = pd.read_parquet(first_sample + '/meta_cell.parquet')

is_not_doublet = df_meta_ini['doublet_cluster'].eq(False)
ser_doublet = df_meta_ini['doublet_cluster'].loc[is_not_doublet]

# Index labels of the retained cells; reused below to select expression columns.
keep_cells = ser_doublet.index.tolist()
df_meta = df_meta_ini.loc[keep_cells]

# Shape of the filtered metadata (shown as the cell output in a notebook).
df_meta.shape
(15666, 36)
# Load the first sample's expression table, restricted to the retained cells
# (cells are the parquet columns, selected via ``columns=keep_cells``).
df_gex = pd.read_parquet(all_samples[0] + '/gex.parquet', columns=keep_cells)
print(df_gex.shape)

# Step 1: rank rows by their mean across cells and keep the first 10,000.
inst_mean = df_gex.mean(axis=1)
ranked_by_mean = inst_mean.sort_values(ascending=False)
top_genes = ranked_by_mean.index.tolist()[:10000]

# Step 2: among those, rank by variance across cells and keep the first 5,000.
variance_of_top = df_gex.loc[top_genes].var(axis=1)
ranked_by_variance = variance_of_top.sort_values(ascending=False)
keep_genes = ranked_by_variance.index.tolist()[:5000]
(26183, 15666)
The top 5,000 most variable genes (chosen from the 10,000 genes with the highest mean expression) were defined using the gene expression data of a single sample, embryo-1-E9.5.
# Write a reduced copy of every sample under ``base_dir``: cells flagged in
# 'doublet_cluster' are dropped and only the rows listed in ``keep_genes`` are
# kept in the expression table.
base_dir = '../data/cao_2million-cell_2019_61-embryo_parquet_files_binder/'

# Make sure the output root exists before any per-sample directory is created.
make_dir(base_dir)

for inst_sample_path in all_samples:

    # basename/normpath instead of split('/')[-1]: works with OS-specific
    # separators and does not yield '' when the path ends with a separator.
    inst_sample = os.path.basename(os.path.normpath(inst_sample_path))
    new_sample_dir = os.path.join(base_dir, inst_sample)

    # drop doublets
    df_meta_ini = pd.read_parquet(os.path.join(inst_sample_path, 'meta_cell.parquet'))
    ser_doublet = df_meta_ini['doublet_cluster']
    # NOTE(review): '== False' is kept as in the original; the dtype of this
    # column is not visible here, so it is not rewritten as a boolean negation.
    ser_doublet = ser_doublet[ser_doublet == False]
    keep_cells = ser_doublet.index.tolist()
    df_meta = df_meta_ini.loc[keep_cells]

    # load gene expression for the retained cells, then keep the selected genes
    df_gex = pd.read_parquet(
        os.path.join(inst_sample_path, 'gex.parquet'), columns=keep_cells
    ).loc[keep_genes]

    # save filtered data for mybinder
    make_dir(new_sample_dir)
    gex_out_path = os.path.join(new_sample_dir, 'gex.parquet')
    df_meta.to_parquet(os.path.join(new_sample_dir, 'meta_cell.parquet'))
    df_gex.to_parquet(gex_out_path)

    # check file size (MB, using 1 MB = 1,000,000 bytes)
    size_mb = os.path.getsize(gex_out_path) / 1000000
    print(inst_sample, df_gex.shape, size_mb)
embryo-1-E9.5 (5000, 15666) 26.023213 embryo-10-E11.5 (5000, 32449) 54.726554 embryo-11-E12.5 (5000, 10270) 14.867986 embryo-12-E12.5 (5000, 27090) 44.015724 embryo-13-E12.5 (5000, 12436) 18.081453 embryo-14-E12.5 (5000, 27450) 44.172131 embryo-15-E13.5 (5000, 23136) 35.579287 embryo-16-E13.5 (5000, 13434) 19.351804 embryo-17-E13.5 (5000, 17306) 25.060539 embryo-19-E9.5 (5000, 4026) 7.010098 embryo-20-E9.5 (5000, 2525) 4.080514 embryo-21-E9.5 (5000, 11550) 18.417185 embryo-22-E9.5 (5000, 5818) 10.062801 embryo-24-E10.5 (5000, 28100) 46.77209 embryo-25-E10.5 (5000, 14498) 23.030284 embryo-26-E10.5 (5000, 24664) 41.623712 embryo-27-E11.5 (5000, 42106) 73.731097 embryo-28-E11.5 (5000, 37761) 65.197931 embryo-29-E11.5 (5000, 33185) 57.158152 embryo-3-E9.5 (5000, 8086) 14.005555 embryo-31-E12.5 (5000, 24208) 36.324759 embryo-33-E12.5 (5000, 57625) 84.85053 embryo-34-E12.5 (5000, 39619) 60.933913 embryo-35-E13.5 (5000, 17118) 24.931888 embryo-36-E13.5 (5000, 22222) 32.530816 embryo-37-E13.5 (5000, 21655) 31.964647 embryo-38-E13.5 (5000, 22056) 33.00234 embryo-39-E9.5 (5000, 7064) 12.366449 embryo-4-E10.5 (5000, 12559) 20.919848 embryo-40-E9.5 (5000, 7017) 11.685474 embryo-41-E9.5 (5000, 3885) 6.920301 embryo-42-E9.5 (5000, 8541) 14.419699 embryo-43-E10.5 (5000, 19422) 29.989311 embryo-44-E10.5 (5000, 26715) 44.829101 embryo-46-E10.5 (5000, 30976) 54.765808 embryo-47-E11.5 (5000, 37763) 67.915082 embryo-48-E11.5 (5000, 43105) 73.141142 embryo-49-E11.5 (5000, 36490) 64.551681 embryo-5-E10.5 (5000, 21987) 36.750677 embryo-50-E11.5 (5000, 37226) 62.984305 embryo-51-E12.5 (5000, 18053) 26.987817 embryo-52-E12.5 (5000, 23163) 36.57017 embryo-53-E13.5 (5000, 16348) 24.086259 embryo-55-E9.5 (5000, 4397) 8.662424 embryo-56-E9.5 (5000, 7770) 13.214076 embryo-57-E9.5 (5000, 10115) 18.238266 embryo-58-E9.5 (5000, 8048) 14.08015 embryo-59-E10.5 (5000, 25696) 42.005196 embryo-6-E10.5 (5000, 27174) 44.938129 embryo-60-E10.5 (5000, 33564) 54.053156 embryo-61-E11.5 (5000, 36558) 61.10102 
embryo-62-E11.5 (5000, 33504) 53.756219 embryo-63-E9.5 (5000, 10729) 18.63547 embryo-64-E12.5 (5000, 44238) 66.844447 embryo-65-E13.5 (5000, 19457) 32.948022 embryo-66-E13.5 (5000, 38067) 64.048408 embryo-67-E13.5 (5000, 17780) 26.249178 embryo-68-E13.5 (5000, 27869) 42.631713 embryo-7-E11.5 (5000, 35416) 59.146557 embryo-8-E11.5 (5000, 32655) 53.663686 embryo-9-E11.5 (5000, 27177) 44.994238
Compare with the processed data, after removing cells from doublet-derived clusters, available at https://oncoscape.v3.sttrcancer.org/atlas.gs.washington.edu.mouse.rna/downloads.
# Count the cells retained across all filtered samples.
# The samples are read back from ``base_dir`` (the directory the filtering
# loop writes to). The original globbed '../data/big_data/..._binder/*', which
# is a different directory from the one that loop populates.
# NOTE(review): confirm that ``base_dir`` is the intended location to verify.
new_samples = sorted(glob(base_dir + '*'))

total_cells = 0
for inst_sample in new_samples:
    df_meta = pd.read_parquet(inst_sample + '/meta_cell.parquet')
    # one metadata row per cell
    total_cells += df_meta.shape[0]

print(total_cells)
1386587