%load_ext pretty_jupyter
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
from datetime import date
import hisepy
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import scanpy as sc
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
def cache_uuid_path(uuid):
    """Return the path of the locally cached file for a HISE file UUID.

    The cache directory is /home/jupyter/cache/<uuid>. When that directory
    does not exist, hisepy.reader.cache_files is called for the UUID, which
    is expected to create and populate it.

    Parameters
    ----------
    uuid : str
        HISE file UUID; also used as the name of the cache sub-directory.

    Returns
    -------
    str
        Path to the first file (in sorted name order) in the cache directory.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or contains no entries.
    """
    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    if not os.path.isdir(cache_path):
        # Cache miss: fetch the file. The return value is not needed here.
        hisepy.reader.cache_files([uuid])
    # Sort so the chosen entry does not depend on filesystem listing order.
    filenames = sorted(os.listdir(cache_path))
    if not filenames:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = cache_path)
        )
    return '{p}/{f}'.format(p = cache_path, f = filenames[0])
def read_csv_uuid(uuid):
    """Read the cached file for a HISE UUID as CSV and return a DataFrame."""
    return pd.read_csv(cache_uuid_path(uuid))
def read_parquet_uuid(uuid):
    """Read the cached file for a HISE UUID as Parquet and return a DataFrame."""
    return pd.read_parquet(cache_uuid_path(uuid))
def read_adata_uuid(uuid):
    """Read the cached .h5ad file for a HISE UUID with scanpy and return it."""
    return sc.read_h5ad(cache_uuid_path(uuid))
def backed_adata_uuid(uuid):
    """Open the cached .h5ad file for a HISE UUID in backed, read-only mode."""
    h5ad_path = cache_uuid_path(uuid)
    return sc.read_h5ad(h5ad_path, backed = 'r')
def rm_cache_uuid(uuid):
    """Delete the local cache directory for a HISE file UUID.

    Removes /home/jupyter/cache/<uuid> and everything below it. The path is
    removed directly rather than through a shell command, so characters such
    as spaces or ';' in `uuid` are never interpreted by a shell.

    Parameters
    ----------
    uuid : str
        HISE file UUID whose cache directory should be removed.

    Returns
    -------
    None
        Removal is best-effort: a missing directory or a failed deletion does
        not raise.
    """
    import shutil

    cache_path = '/home/jupyter/cache/{u}'.format(u = uuid)
    # ignore_errors keeps the non-raising behaviour of the previous `rm -r` call.
    shutil.rmtree(cache_path, ignore_errors = True)
def format_cell_type(cell_type):
    """Convert a cell type label into a filename-safe token.

    '+' becomes 'pos', '-' becomes 'neg' and spaces become underscores,
    applied in that order.
    """
    substitutions = (
        ('\\+', 'pos'),
        ('-', 'neg'),
        (' ', '_'),
    )
    for pattern, replacement in substitutions:
        cell_type = re.sub(pattern, replacement, cell_type)
    return cell_type
def filename_cell_type(filename):
    """Recover the cell type label encoded in a file name.

    Strips everything up to and including the last 'L3_', strips '_2024'
    and whatever follows it, then turns underscores into spaces, 'pos'
    into '+' and 'neg' into '-'.
    """
    steps = (
        ('.+L3_', ''),      # leading path / prefix
        ('_2024.+', ''),    # trailing date stamp and extension
        ('_', ' '),
        ('pos', '+'),
        ('neg', '-'),
    )
    label = filename
    for pattern, replacement in steps:
        label = re.sub(pattern, replacement, label)
    return label
def add_labels(adata, labels):
    """Left-join `labels` onto `adata.obs` by the 'barcodes' column.

    The existing obs index is discarded, the merged table is re-indexed by
    'barcodes' (the column is kept as well), and the result replaces
    `adata.obs`. The same `adata` object is returned.
    """
    merged = (
        adata.obs
        .reset_index(drop = True)
        .merge(labels, on = 'barcodes', how = 'left')
        .set_index('barcodes', drop = False)
    )
    adata.obs = merged
    return adata
def element_id(n = 3):
    """Build a random identifier from `n` element names joined by '-'.

    Each name is drawn independently, so names may repeat.
    """
    import periodictable
    from random import randrange

    # NOTE(review): indices are drawn from 0..117; index 0 in periodictable
    # is presumably the neutron rather than a chemical element - confirm.
    names = [
        periodictable.elements[randrange(0,118)].name
        for _ in range(n)
    ]
    return '-'.join(names)
def cluster_means(adata, clusters):
    """Compute the mean of `adata.X` over the rows of each group.

    Rows are grouped by the `adata.obs` column(s) named in `clusters`.
    Returns a DataFrame indexed by `adata.var.index` with one column per
    group, holding the float64 column means of that group's rows.
    """
    grouped = adata.obs.groupby(clusters)
    group_names = list(grouped.groups.keys())
    means = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns = group_names,
        index = adata.var.index
    )
    for name, rows in grouped.indices.items():
        # ravel flattens the (1, n_vars) result that matrix-like X returns
        means[name] = np.ravel(adata.X[rows].mean(axis=0, dtype=np.float64))
    return means
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a linkage matrix from `model.children_`, `model.distances_` and
    the number of samples under each merge, then passes it to scipy's
    `dendrogram`. Extra keyword arguments are forwarded to `dendrogram`.
    """
    n_leaves = len(model.labels_)
    subtree_sizes = np.zeros(model.children_.shape[0])
    for node, children in enumerate(model.children_):
        size = 0
        for child in children:
            # Indices below n_leaves are original samples; larger indices
            # refer to an earlier merge whose size is already known.
            size += 1 if child < n_leaves else subtree_sizes[child - n_leaves]
        subtree_sizes[node] = size

    linkage = np.column_stack(
        [model.children_, model.distances_, subtree_sizes]
    ).astype(float)

    dendrogram(linkage, **kwargs)
# Marker genes for broad cell populations, used for the first dot plot of
# each cell type below.
broad_markers = [
    'CD3D', 'CD3E',     # T cells (CD3E: T cells/NK)
    'FCN1',             # Monocytes/Myeloid
    'HBB',              # Erythrocytes
    'MS4A1', 'CD79A',   # B cells
    'PPBP',             # Platelets
    'IFI44L',           # ISG-high
]
# Genes used for the second dot plot of each cell type below.
class_markers = ['CD4', 'CD8A', 'CD27', 'GZMB', 'IFI44L', 'KLRF1', 'SOX4']
We ran a preliminary analysis of our dataset that wasn't tracked for reproducibility. Here, we'll retrieve the labels for visualization and comparison to check for consistency.
# Labels from the earlier, untracked analysis: keep the barcode plus the three
# AIFI label levels and rename the levels with an 'original_' prefix.
original_uuid = '3868592c-0087-4ed8-98b2-4bf1b8676111'
original_df = read_parquet_uuid(original_uuid)[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
original_df = original_df.rename(
    columns = {
        'AIFI_L1': 'original_L1',
        'AIFI_L2': 'original_L2',
        'AIFI_L3': 'original_L3',
    }
)
original_df.head()
barcodes | original_L1 | original_L2 | original_L3 | |
---|---|---|---|---|
0 | 05ea9806794211eb93b836d1cb6129eb | DC | cDC1 | cDC1 |
1 | e225c914794011eb9282e2ceeb91ba52 | DC | cDC1 | cDC1 |
2 | b1379eae795411eb958b0245821e6993 | DC | cDC1 | cDC1 |
3 | b13d3a8a795411eb958b0245821e6993 | DC | cDC1 | cDC1 |
4 | b1430d16795411eb958b0245821e6993 | DC | cDC1 | cDC1 |
# Read the hierarchy table (CSV) for this UUID from the cache
hierarchy_uuid = '1a44252c-8cab-4c8f-92c9-d8f3af633790'
hierarchy_df = read_csv_uuid(hierarchy_uuid)
# Identifier used below to select the matching files by name
search_id = 'niobium-cerium-barium'
Retrieve files stored in our HISE project store
# List the files in the 'cohorts' project store, keeping only id and name
ps_df = hisepy.list_files_in_project_store('cohorts')[['id', 'name']]
Filter for files from the previous notebook using our search_id
# Keep only files whose name contains search_id. The ID is a literal string,
# so regex matching is switched off; .copy() gives an independent frame so the
# column added below is never written to a slice of ps_df.
search_df = ps_df[ps_df['name'].str.contains(search_id, regex = False)].copy()
search_df = search_df.sort_values('name')
# Recover the AIFI_L3 cell type label encoded in each file name
search_df['AIFI_L3'] = search_df['name'].map(filename_cell_type)
search_df['AIFI_L3'].tolist()
['CM CD4 T cell', 'Core naive CD4 T cell', 'GZMB- CD27- EM CD4 T cell', 'GZMB- CD27+ EM CD4 T cell', 'ISG+ memory CD4 T cell', 'ISG+ naive CD4 T cell', 'KLRF1- GZMB+ CD27- memory CD4 T cell', 'SOX4+ naive CD4 T cell']
# Look up the file ID for this cell type (first match in search_df)
cell_type = 'CM CD4 T cell'
type_uuid = search_df.loc[search_df['AIFI_L3'] == cell_type, 'id'].tolist()[0]
type_uuid
'f3a014af-7e16-4671-9da8-01e603bea5ed'
# Load the AnnData object for this cell type from the cache
adata = read_adata_uuid(type_uuid)
# UMAP colored by the 'leiden_2' clusters, with labels drawn on the plot
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
# Dot plots of the broad and class marker genes per cluster
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# CD8A and CD79A expression on the UMAP; color scale capped at the 99th percentile
sc.pl.umap(adata, color = ['CD8A','CD79A'], vmax = 'p99')
# Number of cells per cluster
adata.obs['leiden_2'].value_counts()
leiden_2 0 128947 1 126207 2 111083 3 109609 4 104215 5 102100 6 100191 7 94879 8 88030 9 77068 10 69552 11 66893 12 66769 13 61331 14 51777 15 28947 16 25697 17 6341 18 2969 19 1680 20 9 21 4 22 3 Name: count, dtype: int64
# Rank genes for cluster 17 against the remaining cells.
# The method is passed explicitly: 't-test' is what scanpy falls back to when
# no method is given, so results are unchanged but the call no longer depends
# on (or warns about) the library default.
# NOTE(review): scanpy also warns that the input looks like raw counts -
# confirm whether log-normalized values should be used for this test.
sc.tl.rank_genes_groups(
    adata,
    groupby = 'leiden_2',
    groups = ['17'],
    method = 't-test'
)
sc.pl.rank_genes_groups(adata)
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var' WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
# Attach the labels from the original analysis to each cell by barcode
adata = add_labels(adata, original_df)
# UMAP colored by the original L3 label
sc.pl.umap(adata, color = 'original_L3')
# Cells per original L3 label; NaN counts cells without an original label
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 CM CD4 T cell 1236442 NaN 61425 GZMB- CD27- EM CD4 T cell 38525 Core naive CD4 T cell 36213 GZMB- CD27+ EM CD4 T cell 24295 GZMK+ CD27+ EM CD8 T cell 7075 Naive CD4 Treg 4756 CM CD8 T cell 4656 GZMK+ memory CD4 Treg 3455 KLRB1+ memory CD4 Treg 1948 Memory CD4 Treg 1666 ISG+ memory CD4 T cell 899 Core naive CD8 T cell 755 ISG+ naive CD4 T cell 663 Proliferating T cell 619 DN T cell 194 Memory CD8 Treg 188 GZMK- CD27+ EM CD8 T cell 132 Naive Vd1 gdT 87 SOX4+ naive CD4 T cell 68 KLRB1+ memory CD8 Treg 36 Erythrocyte 30 ISG+ memory CD8 T cell 28 KLRF1- GZMB+ CD27- memory CD4 T cell 26 CD8 MAIT 25 ILC 22 SOX4+ Vd1 gdT 21 CD8aa 20 CD4 MAIT 10 SOX4+ naive CD8 T cell 9 KLRF1- GZMB+ CD27- EM CD8 T cell 6 GZMK+ Vd2 gdT 3 ISG+ MAIT 2 ISG+ naive CD8 T cell 1 HLA-DRhi cDC2 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object
adata.obs['original_L3'].value_counts(dropna = False).div(len(adata.obs))
original_L3 CM CD4 T cell 8.681044e-01 NaN 4.312642e-02 GZMB- CD27- EM CD4 T cell 2.704836e-02 Core naive CD4 T cell 2.542510e-02 GZMB- CD27+ EM CD4 T cell 1.705749e-02 GZMK+ CD27+ EM CD8 T cell 4.967349e-03 Naive CD4 Treg 3.339182e-03 CM CD8 T cell 3.268972e-03 GZMK+ memory CD4 Treg 2.425751e-03 KLRB1+ memory CD4 Treg 1.367688e-03 Memory CD4 Treg 1.169697e-03 ISG+ memory CD4 T cell 6.311868e-04 Core naive CD8 T cell 5.300846e-04 ISG+ naive CD4 T cell 4.654915e-04 Proliferating T cell 4.345991e-04 DN T cell 1.362072e-04 Memory CD8 Treg 1.319946e-04 GZMK- CD27+ EM CD8 T cell 9.267704e-05 Naive Vd1 gdT 6.108259e-05 SOX4+ naive CD4 T cell 4.774272e-05 KLRB1+ memory CD8 Treg 2.527556e-05 Erythrocyte 2.106296e-05 ISG+ memory CD8 T cell 1.965877e-05 KLRF1- GZMB+ CD27- memory CD4 T cell 1.825457e-05 CD8 MAIT 1.755247e-05 ILC 1.544617e-05 SOX4+ Vd1 gdT 1.474407e-05 CD8aa 1.404198e-05 CD4 MAIT 7.020988e-06 SOX4+ naive CD8 T cell 6.318889e-06 KLRF1- GZMB+ CD27- EM CD8 T cell 4.212593e-06 GZMK+ Vd2 gdT 2.106296e-06 ISG+ MAIT 1.404198e-06 ISG+ naive CD8 T cell 7.020988e-07 HLA-DRhi cDC2 7.020988e-07 Name: count, dtype: float64
# Look up the file ID for this cell type (first match in search_df)
cell_type = 'Core naive CD4 T cell'
type_uuid = search_df.loc[search_df['AIFI_L3'] == cell_type, 'id'].tolist()[0]
type_uuid
'abfe5e61-ee6d-47cf-906b-faff06c2ce67'
# Load the AnnData object for this cell type from the cache
adata = read_adata_uuid(type_uuid)
# UMAPs colored by 'leiden_2' cluster and by batch_id, labels drawn on the plot
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
sc.pl.umap(adata, color = 'batch_id', legend_loc = 'on data')
# Dot plots of the broad and class marker genes per cluster
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# Number of cells per cluster
adata.obs['leiden_2'].value_counts()
leiden_2 0 313158 1 262452 2 256468 3 221468 4 221035 ... 71 2 72 2 73 2 74 2 75 2 Name: count, Length: 76, dtype: int64
# Rank genes for clusters 14 and 17, each against the remaining cells.
# The method is passed explicitly: 't-test' is what scanpy falls back to when
# no method is given, so results are unchanged but the call no longer depends
# on (or warns about) the library default.
# NOTE(review): scanpy also warns that the input looks like raw counts -
# confirm whether log-normalized values should be used for this test.
sc.tl.rank_genes_groups(
    adata,
    groupby = 'leiden_2',
    groups = ['14','17'],
    method = 't-test'
)
sc.pl.rank_genes_groups(adata)
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var' WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
# Expression of four selected genes on the UMAP, two panels per row,
# color scale capped at the 99th percentile
sc.pl.umap(adata, color = ['CXCR3','CCL5','YBX3','PFN1'], vmax = 'p99', ncols = 2)
# Attach the labels from the original analysis to each cell by barcode
adata = add_labels(adata, original_df)
# UMAP colored by the original L3 label
sc.pl.umap(adata, color = 'original_L3')
# Cells per original L3 label; NaN counts cells without an original label
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 Core naive CD4 T cell 2602702 NaN 60790 CM CD4 T cell 26244 SOX4+ naive CD4 T cell 4401 Naive CD4 Treg 2995 ISG+ naive CD4 T cell 2600 Core naive CD8 T cell 2548 GZMB- CD27+ EM CD4 T cell 2262 Erythrocyte 1011 CM CD8 T cell 809 GZMB- CD27- EM CD4 T cell 602 Naive Vd1 gdT 332 GZMK+ CD27+ EM CD8 T cell 196 Memory CD4 Treg 71 GZMK+ memory CD4 Treg 69 DN T cell 66 SOX4+ naive CD8 T cell 44 GZMK- CD27+ EM CD8 T cell 39 CD8 MAIT 37 ISG+ memory CD4 T cell 33 SOX4+ Vd1 gdT 26 KLRB1+ memory CD4 Treg 24 KLRF1- GZMB+ CD27- EM CD8 T cell 20 Core CD14 monocyte 12 KLRF1- GZMB+ CD27- memory CD4 T cell 12 GZMK+ Vd2 gdT 8 GZMB+ Vd2 gdT 7 GZMK- CD56dim NK cell 6 Core naive B cell 6 Core memory B cell 4 ISG+ memory CD8 T cell 4 ILC 3 CD8aa 3 ISG+ naive CD8 T cell 3 Proliferating T cell 2 Plasma cell 2 HLA-DRhi cDC2 2 Memory CD8 Treg 2 KLRF1+ GZMB+ CD27- EM CD8 T cell 2 Intermediate monocyte 2 Core CD16 monocyte 2 Platelet 1 ISG+ CD14 monocyte 1 CMP cell 1 CD4 MAIT 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object
adata.obs['original_L3'].value_counts(dropna = False).div(len(adata.obs))
original_L3 Core naive CD4 T cell 9.611135e-01 NaN 2.244824e-02 CM CD4 T cell 9.691260e-03 SOX4+ naive CD4 T cell 1.625180e-03 Naive CD4 Treg 1.105979e-03 ISG+ naive CD4 T cell 9.601157e-04 Core naive CD8 T cell 9.409134e-04 GZMB- CD27+ EM CD4 T cell 8.353006e-04 Erythrocyte 3.733373e-04 CM CD8 T cell 2.987437e-04 GZMB- CD27- EM CD4 T cell 2.223037e-04 Naive Vd1 gdT 1.225994e-04 GZMK+ CD27+ EM CD8 T cell 7.237795e-05 Memory CD4 Treg 2.621854e-05 GZMK+ memory CD4 Treg 2.547999e-05 DN T cell 2.437217e-05 SOX4+ naive CD8 T cell 1.624811e-05 GZMK- CD27+ EM CD8 T cell 1.440174e-05 CD8 MAIT 1.366318e-05 ISG+ memory CD4 T cell 1.218608e-05 SOX4+ Vd1 gdT 9.601157e-06 KLRB1+ memory CD4 Treg 8.862606e-06 KLRF1- GZMB+ CD27- EM CD8 T cell 7.385505e-06 Core CD14 monocyte 4.431303e-06 KLRF1- GZMB+ CD27- memory CD4 T cell 4.431303e-06 GZMK+ Vd2 gdT 2.954202e-06 GZMB+ Vd2 gdT 2.584927e-06 GZMK- CD56dim NK cell 2.215652e-06 Core naive B cell 2.215652e-06 Core memory B cell 1.477101e-06 ISG+ memory CD8 T cell 1.477101e-06 ILC 1.107826e-06 CD8aa 1.107826e-06 ISG+ naive CD8 T cell 1.107826e-06 Proliferating T cell 7.385505e-07 Plasma cell 7.385505e-07 HLA-DRhi cDC2 7.385505e-07 Memory CD8 Treg 7.385505e-07 KLRF1+ GZMB+ CD27- EM CD8 T cell 7.385505e-07 Intermediate monocyte 7.385505e-07 Core CD16 monocyte 7.385505e-07 Platelet 3.692753e-07 ISG+ CD14 monocyte 3.692753e-07 CMP cell 3.692753e-07 CD4 MAIT 3.692753e-07 Name: count, dtype: float64
# Cells that have no label from the original analysis, counted per cluster
dropped_obs = adata.obs[adata.obs['original_L3'].isna()]
dropped_obs['leiden_2'].value_counts()
leiden_2 10 16537 0 6420 6 4598 4 4432 5 4166 ... 44 0 45 0 46 0 49 0 75 0 Name: count, Length: 76, dtype: int64
# Look up the file ID for this cell type (first match in search_df)
cell_type = 'GZMB- CD27- EM CD4 T cell'
type_uuid = search_df.loc[search_df['AIFI_L3'] == cell_type, 'id'].tolist()[0]
type_uuid
'506a449a-10a0-4fd6-a4c7-efe34bb80ee9'
# Load the AnnData object for this cell type from the cache
adata = read_adata_uuid(type_uuid)
# UMAP colored by the 'leiden_2' clusters, with labels drawn on the plot
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
# Dot plots of the broad and class marker genes per cluster
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# CD8A and CD79A expression on the UMAP; color scale capped at the 99th percentile
sc.pl.umap(adata, color = ['CD8A','CD79A'], vmax = 'p99')
# Number of cells per cluster
adata.obs['leiden_2'].value_counts()
leiden_2 0 54116 1 52492 2 46112 3 45890 4 43845 5 43051 6 34643 7 33887 8 32004 9 31794 10 30792 11 27230 12 22068 13 18207 14 11384 15 3180 16 2797 17 124 18 71 19 21 20 11 21 8 22 6 23 6 24 4 25 3 26 2 Name: count, dtype: int64
# Attach the labels from the original analysis to each cell by barcode
adata = add_labels(adata, original_df)
# UMAP colored by the original L3 label
sc.pl.umap(adata, color = 'original_L3')
# Cells per original L3 label; NaN counts cells without an original label
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 GZMB- CD27- EM CD4 T cell 355617 CM CD4 T cell 129834 NaN 32293 GZMB- CD27+ EM CD4 T cell 13402 Memory CD4 Treg 745 KLRB1+ memory CD4 Treg 700 Core naive CD4 T cell 312 CM CD8 T cell 289 ISG+ memory CD4 T cell 255 Naive CD4 Treg 99 GZMK+ CD27+ EM CD8 T cell 46 GZMK+ memory CD4 Treg 36 CD8 MAIT 35 Memory CD8 Treg 22 GZMK- CD27+ EM CD8 T cell 21 Naive Vd1 gdT 16 CD4 MAIT 7 KLRB1+ memory CD8 Treg 4 Proliferating T cell 3 ISG+ memory CD8 T cell 2 GZMK+ Vd2 gdT 2 CD8aa 2 ILC 2 ISG+ naive CD4 T cell 1 KLRF1- GZMB+ CD27- EM CD8 T cell 1 KLRF1- GZMB+ CD27- memory CD4 T cell 1 Core naive CD8 T cell 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object
adata.obs['original_L3'].value_counts(dropna = False).div(len(adata.obs))
original_L3 GZMB- CD27- EM CD4 T cell 0.666264 CM CD4 T cell 0.243250 NaN 0.060502 GZMB- CD27+ EM CD4 T cell 0.025109 Memory CD4 Treg 0.001396 KLRB1+ memory CD4 Treg 0.001311 Core naive CD4 T cell 0.000585 CM CD8 T cell 0.000541 ISG+ memory CD4 T cell 0.000478 Naive CD4 Treg 0.000185 GZMK+ CD27+ EM CD8 T cell 0.000086 GZMK+ memory CD4 Treg 0.000067 CD8 MAIT 0.000066 Memory CD8 Treg 0.000041 GZMK- CD27+ EM CD8 T cell 0.000039 Naive Vd1 gdT 0.000030 CD4 MAIT 0.000013 KLRB1+ memory CD8 Treg 0.000007 Proliferating T cell 0.000006 ISG+ memory CD8 T cell 0.000004 GZMK+ Vd2 gdT 0.000004 CD8aa 0.000004 ILC 0.000004 ISG+ naive CD4 T cell 0.000002 KLRF1- GZMB+ CD27- EM CD8 T cell 0.000002 KLRF1- GZMB+ CD27- memory CD4 T cell 0.000002 Core naive CD8 T cell 0.000002 Name: count, dtype: float64
# Look up the file ID for this cell type (first match in search_df)
cell_type = 'GZMB- CD27+ EM CD4 T cell'
type_uuid = search_df.loc[search_df['AIFI_L3'] == cell_type, 'id'].tolist()[0]
type_uuid
'e42f91dc-c563-4009-a828-6bbfdf1f20b6'
# Load the AnnData object for this cell type from the cache
adata = read_adata_uuid(type_uuid)
# UMAPs colored by 'leiden_2' cluster and by batch_id, labels drawn on the plot
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
sc.pl.umap(adata, color = 'batch_id', legend_loc = 'on data')
# Dot plots of the broad and class marker genes per cluster
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# CD8A expression on the UMAP; color scale capped at the 99th percentile
sc.pl.umap(adata, color = ['CD8A'], vmax = 'p99')
# Rank genes for cluster 2 against the remaining cells.
# The method is passed explicitly: 't-test' is what scanpy falls back to when
# no method is given, so results are unchanged but the call no longer depends
# on (or warns about) the library default.
# NOTE(review): scanpy also warns that the input looks like raw counts -
# confirm whether log-normalized values should be used for this test.
sc.tl.rank_genes_groups(
    adata,
    groupby = 'leiden_2',
    groups = ['2'],
    method = 't-test'
)
sc.pl.rank_genes_groups(adata)
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var' WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
# IL7R expression on the UMAP; color scale capped at the 99th percentile
sc.pl.umap(adata, color = ['IL7R'], vmax = 'p99')
# Number of cells per cluster
adata.obs['leiden_2'].value_counts()
leiden_2 0 51766 1 50336 2 45958 3 43040 4 42515 5 37540 6 36524 7 35534 8 35133 9 34240 10 30874 11 30628 12 29839 13 28043 14 24461 15 17779 16 16741 17 13595 18 4389 19 14 Name: count, dtype: int64
# Attach the labels from the original analysis to each cell by barcode
adata = add_labels(adata, original_df)
# UMAP colored by the original L3 label
sc.pl.umap(adata, color = 'original_L3')
# Cells per original L3 label; NaN counts cells without an original label
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 GZMB- CD27+ EM CD4 T cell 476860 GZMB- CD27- EM CD4 T cell 67969 CM CD4 T cell 30203 NaN 17467 GZMK+ CD27+ EM CD8 T cell 5069 CD4 MAIT 3552 CM CD8 T cell 3182 CD8 MAIT 1627 GZMK- CD27+ EM CD8 T cell 773 Core naive CD4 T cell 603 KLRF1- GZMB+ CD27- memory CD4 T cell 520 GZMK+ memory CD4 Treg 518 ISG+ memory CD4 T cell 119 GZMK+ Vd2 gdT 105 ISG+ memory CD8 T cell 69 KLRB1+ memory CD4 Treg 64 Memory CD4 Treg 46 ISG+ MAIT 43 Naive CD4 Treg 36 CD8aa 25 KLRB1+ memory CD8 Treg 23 Erythrocyte 13 Proliferating T cell 12 ISG+ naive CD4 T cell 12 Naive Vd1 gdT 10 ILC 9 Core naive CD8 T cell 8 KLRF1- GZMB+ CD27- EM CD8 T cell 6 DN T cell 3 Memory CD8 Treg 1 GZMB+ Vd2 gdT 1 SOX4+ Vd1 gdT 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object
adata.obs['original_L3'].value_counts(dropna = False).div(len(adata.obs))
original_L3 GZMB- CD27+ EM CD4 T cell 0.783087 GZMB- CD27- EM CD4 T cell 0.111617 CM CD4 T cell 0.049599 NaN 0.028684 GZMK+ CD27+ EM CD8 T cell 0.008324 CD4 MAIT 0.005833 CM CD8 T cell 0.005225 CD8 MAIT 0.002672 GZMK- CD27+ EM CD8 T cell 0.001269 Core naive CD4 T cell 0.000990 KLRF1- GZMB+ CD27- memory CD4 T cell 0.000854 GZMK+ memory CD4 Treg 0.000851 ISG+ memory CD4 T cell 0.000195 GZMK+ Vd2 gdT 0.000172 ISG+ memory CD8 T cell 0.000113 KLRB1+ memory CD4 Treg 0.000105 Memory CD4 Treg 0.000076 ISG+ MAIT 0.000071 Naive CD4 Treg 0.000059 CD8aa 0.000041 KLRB1+ memory CD8 Treg 0.000038 Erythrocyte 0.000021 Proliferating T cell 0.000020 ISG+ naive CD4 T cell 0.000020 Naive Vd1 gdT 0.000016 ILC 0.000015 Core naive CD8 T cell 0.000013 KLRF1- GZMB+ CD27- EM CD8 T cell 0.000010 DN T cell 0.000005 Memory CD8 Treg 0.000002 GZMB+ Vd2 gdT 0.000002 SOX4+ Vd1 gdT 0.000002 Name: count, dtype: float64
# Look up the file ID for this cell type (first match in search_df)
cell_type = 'ISG+ memory CD4 T cell'
type_uuid = search_df.loc[search_df['AIFI_L3'] == cell_type, 'id'].tolist()[0]
type_uuid
'285bbefb-3cc3-4768-b183-33fe47cbd902'
# Load the AnnData object for this cell type from the cache
adata = read_adata_uuid(type_uuid)
# UMAP colored by the 'leiden_2' clusters, with labels drawn on the plot
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
# Dot plots of the broad and class marker genes per cluster
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# Attach the labels from the original analysis to each cell by barcode
adata = add_labels(adata, original_df)
# UMAP colored by the original L3 label
sc.pl.umap(adata, color = 'original_L3')
# Cells per original L3 label; NaN counts cells without an original label
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 ISG+ memory CD4 T cell 27006 CM CD4 T cell 3614 ISG+ naive CD4 T cell 2060 GZMB- CD27- EM CD4 T cell 1503 NaN 1169 ISG+ memory CD8 T cell 569 GZMB- CD27+ EM CD4 T cell 479 Memory CD4 Treg 197 KLRB1+ memory CD4 Treg 81 Core naive CD4 T cell 57 GZMK+ memory CD4 Treg 33 GZMK+ CD27+ EM CD8 T cell 29 Naive CD4 Treg 27 ISG+ naive CD8 T cell 18 CM CD8 T cell 11 ISG+ CD14 monocyte 11 ISG+ MAIT 6 ISG+ naive B cell 5 DN T cell 4 GZMK- CD27+ EM CD8 T cell 3 KLRF1- GZMB+ CD27- memory CD4 T cell 3 Proliferating T cell 3 ILC 2 ISG+ cDC2 1 ISG+ CD56dim NK cell 1 ISG+ CD16 monocyte 1 KLRB1+ memory CD8 Treg 1 KLRF1- GZMB+ CD27- EM CD8 T cell 1 Memory CD8 Treg 1 Core naive CD8 T cell 1 CD56bright NK cell 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object
adata.obs['original_L3'].value_counts(dropna = False).div(len(adata.obs))
original_L3 ISG+ memory CD4 T cell 0.731910 CM CD4 T cell 0.097946 ISG+ naive CD4 T cell 0.055830 GZMB- CD27- EM CD4 T cell 0.040734 NaN 0.031682 ISG+ memory CD8 T cell 0.015421 GZMB- CD27+ EM CD4 T cell 0.012982 Memory CD4 Treg 0.005339 KLRB1+ memory CD4 Treg 0.002195 Core naive CD4 T cell 0.001545 GZMK+ memory CD4 Treg 0.000894 GZMK+ CD27+ EM CD8 T cell 0.000786 Naive CD4 Treg 0.000732 ISG+ naive CD8 T cell 0.000488 CM CD8 T cell 0.000298 ISG+ CD14 monocyte 0.000298 ISG+ MAIT 0.000163 ISG+ naive B cell 0.000136 DN T cell 0.000108 GZMK- CD27+ EM CD8 T cell 0.000081 KLRF1- GZMB+ CD27- memory CD4 T cell 0.000081 Proliferating T cell 0.000081 ILC 0.000054 ISG+ cDC2 0.000027 ISG+ CD56dim NK cell 0.000027 ISG+ CD16 monocyte 0.000027 KLRB1+ memory CD8 Treg 0.000027 KLRF1- GZMB+ CD27- EM CD8 T cell 0.000027 Memory CD8 Treg 0.000027 Core naive CD8 T cell 0.000027 CD56bright NK cell 0.000027 Name: count, dtype: float64
# Look up the file ID for this cell type (first match in search_df)
cell_type = 'ISG+ naive CD4 T cell'
type_uuid = search_df.loc[search_df['AIFI_L3'] == cell_type, 'id'].tolist()[0]
type_uuid
'1f3d6703-17e5-4bbf-9949-e5f93febc067'
# Load the AnnData object for this cell type from the cache
adata = read_adata_uuid(type_uuid)
# UMAPs colored by 'leiden_2' cluster and by batch_id, labels drawn on the plot
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
sc.pl.umap(adata, color = 'batch_id', legend_loc = 'on data')
# Dot plots of the broad and class marker genes per cluster
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# Attach the labels from the original analysis to each cell by barcode
adata = add_labels(adata, original_df)
# UMAP colored by the original L3 label
sc.pl.umap(adata, color = 'original_L3')
# Cells per original L3 label; NaN counts cells without an original label
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 ISG+ naive CD4 T cell 37392 Core naive CD4 T cell 3769 ISG+ memory CD4 T cell 1258 NaN 531 CM CD4 T cell 361 Naive CD4 Treg 95 ISG+ naive CD8 T cell 90 SOX4+ naive CD4 T cell 77 ISG+ memory CD8 T cell 57 Core naive CD8 T cell 25 GZMB- CD27+ EM CD4 T cell 21 CM CD8 T cell 10 Naive Vd1 gdT 10 Memory CD4 Treg 7 GZMK+ memory CD4 Treg 6 GZMK+ CD27+ EM CD8 T cell 4 GZMB- CD27- EM CD4 T cell 4 SOX4+ Vd1 gdT 3 DN T cell 3 ISG+ naive B cell 3 SOX4+ naive CD8 T cell 2 CD56bright NK cell 2 CLP cell 2 ISG+ CD14 monocyte 2 KLRB1+ memory CD4 Treg 1 Intermediate monocyte 1 Core naive B cell 1 Core CD16 monocyte 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object
adata.obs['original_L3'].value_counts(dropna = False).div(len(adata.obs))
original_L3 ISG+ naive CD4 T cell 0.854909 Core naive CD4 T cell 0.086172 ISG+ memory CD4 T cell 0.028762 NaN 0.012140 CM CD4 T cell 0.008254 Naive CD4 Treg 0.002172 ISG+ naive CD8 T cell 0.002058 SOX4+ naive CD4 T cell 0.001760 ISG+ memory CD8 T cell 0.001303 Core naive CD8 T cell 0.000572 GZMB- CD27+ EM CD4 T cell 0.000480 CM CD8 T cell 0.000229 Naive Vd1 gdT 0.000229 Memory CD4 Treg 0.000160 GZMK+ memory CD4 Treg 0.000137 GZMK+ CD27+ EM CD8 T cell 0.000091 GZMB- CD27- EM CD4 T cell 0.000091 SOX4+ Vd1 gdT 0.000069 DN T cell 0.000069 ISG+ naive B cell 0.000069 SOX4+ naive CD8 T cell 0.000046 CD56bright NK cell 0.000046 CLP cell 0.000046 ISG+ CD14 monocyte 0.000046 KLRB1+ memory CD4 Treg 0.000023 Intermediate monocyte 0.000023 Core naive B cell 0.000023 Core CD16 monocyte 0.000023 Name: count, dtype: float64
# Look up the file ID for this cell type (first match in search_df)
cell_type = 'KLRF1- GZMB+ CD27- memory CD4 T cell'
type_uuid = search_df.loc[search_df['AIFI_L3'] == cell_type, 'id'].tolist()[0]
type_uuid
'd8196be8-1606-47ff-b1e9-3ed08ff18837'
# Load the AnnData object for this cell type from the cache
adata = read_adata_uuid(type_uuid)
# UMAPs colored by 'leiden_2' cluster and by subject, labels drawn on the plot
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
sc.pl.umap(adata, color = 'subject.subjectGuid', legend_loc = 'on data')
# Dot plots of the broad and class marker genes per cluster
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# CD8A expression on the UMAP; color scale capped at the 99th percentile
sc.pl.umap(adata, color = ['CD8A'], vmax = 'p99')
# Attach the labels from the original analysis to each cell by barcode
adata = add_labels(adata, original_df)
# UMAP colored by the original L3 label
sc.pl.umap(adata, color = 'original_L3')
# Cells per original L3 label; NaN counts cells without an original label
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 KLRF1- GZMB+ CD27- memory CD4 T cell 147546 NaN 7003 KLRF1- GZMB+ CD27- EM CD8 T cell 5130 GZMB- CD27+ EM CD4 T cell 506 GZMK+ CD27+ EM CD8 T cell 389 GZMB+ Vd2 gdT 76 KLRF1- effector Vd1 gdT 70 KLRF1+ GZMB+ CD27- EM CD8 T cell 52 KLRB1+ memory CD8 Treg 23 GZMB- CD27- EM CD4 T cell 22 Proliferating T cell 14 KLRF1+ effector Vd1 gdT 10 Adaptive NK cell 10 CD4 MAIT 9 GZMK+ Vd2 gdT 9 CM CD8 T cell 4 CD8 MAIT 4 CD8aa 3 ISG+ MAIT 2 ISG+ memory CD8 T cell 2 GZMK+ memory CD4 Treg 1 CM CD4 T cell 1 SOX4+ Vd1 gdT 1 GZMK- CD56dim NK cell 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object
adata.obs['original_L3'].value_counts(dropna = False).div(len(adata.obs))
original_L3 KLRF1- GZMB+ CD27- memory CD4 T cell 0.917073 NaN 0.043527 KLRF1- GZMB+ CD27- EM CD8 T cell 0.031886 GZMB- CD27+ EM CD4 T cell 0.003145 GZMK+ CD27+ EM CD8 T cell 0.002418 GZMB+ Vd2 gdT 0.000472 KLRF1- effector Vd1 gdT 0.000435 KLRF1+ GZMB+ CD27- EM CD8 T cell 0.000323 KLRB1+ memory CD8 Treg 0.000143 GZMB- CD27- EM CD4 T cell 0.000137 Proliferating T cell 0.000087 KLRF1+ effector Vd1 gdT 0.000062 Adaptive NK cell 0.000062 CD4 MAIT 0.000056 GZMK+ Vd2 gdT 0.000056 CM CD8 T cell 0.000025 CD8 MAIT 0.000025 CD8aa 0.000019 ISG+ MAIT 0.000012 ISG+ memory CD8 T cell 0.000012 GZMK+ memory CD4 Treg 0.000006 CM CD4 T cell 0.000006 SOX4+ Vd1 gdT 0.000006 GZMK- CD56dim NK cell 0.000006 Name: count, dtype: float64
# Look up the file ID for this cell type (first match in search_df)
cell_type = 'SOX4+ naive CD4 T cell'
type_uuid = search_df.loc[search_df['AIFI_L3'] == cell_type, 'id'].tolist()[0]
type_uuid
'96dcc9a0-d880-4a77-930f-de8a4d07bd07'
# Load the AnnData object for this cell type from the cache
adata = read_adata_uuid(type_uuid)
# UMAP colored by the 'leiden_2' clusters, with labels drawn on the plot
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
# Same UMAP restricted to clusters 14 and 15
sc.pl.umap(adata, color = 'leiden_2', groups = ['14','15'])
# Dot plots of the broad and class marker genes per cluster
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# Rank genes for cluster 14 against the remaining cells.
# The method is passed explicitly: 't-test' is what scanpy falls back to when
# no method is given, so results are unchanged but the call no longer depends
# on (or warns about) the library default.
# NOTE(review): scanpy also warns that the input looks like raw counts -
# confirm whether log-normalized values should be used for this test.
sc.tl.rank_genes_groups(
    adata,
    groupby = 'leiden_2',
    groups = ['14'],
    method = 't-test'
)
sc.pl.rank_genes_groups(adata)
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var' WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
# CD8A and TRDC expression on the UMAP; color scale capped at the 99th percentile
sc.pl.umap(adata, color = ['CD8A','TRDC'], vmax = 'p99')
# Attach the labels from the original analysis to each cell by barcode
adata = add_labels(adata, original_df)
# UMAP colored by the original L3 label
sc.pl.umap(adata, color = 'original_L3')
# Cells per original L3 label; NaN counts cells without an original label
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 SOX4+ naive CD4 T cell 75041 Core naive CD4 T cell 21253 NaN 2891 SOX4+ naive CD8 T cell 357 ISG+ naive CD4 T cell 65 SOX4+ Vd1 gdT 62 Naive CD4 Treg 46 Core naive CD8 T cell 43 CM CD4 T cell 12 Naive Vd1 gdT 8 CM CD8 T cell 5 Core naive B cell 3 GZMB- CD27+ EM CD4 T cell 3 DN T cell 2 CLP cell 1 KLRB1+ memory CD4 Treg 1 GZMK- CD27+ EM CD8 T cell 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object
adata.obs['original_L3'].value_counts(dropna = False).div(len(adata.obs))
original_L3 SOX4+ naive CD4 T cell 0.751959 Core naive CD4 T cell 0.212969 NaN 0.028970 SOX4+ naive CD8 T cell 0.003577 ISG+ naive CD4 T cell 0.000651 SOX4+ Vd1 gdT 0.000621 Naive CD4 Treg 0.000461 Core naive CD8 T cell 0.000431 CM CD4 T cell 0.000120 Naive Vd1 gdT 0.000080 CM CD8 T cell 0.000050 Core naive B cell 0.000030 GZMB- CD27+ EM CD4 T cell 0.000030 DN T cell 0.000020 CLP cell 0.000010 KLRB1+ memory CD4 Treg 0.000010 GZMK- CD27+ EM CD8 T cell 0.000010 Name: count, dtype: float64
# Render this notebook to HTML with nbconvert using the 'pj' template;
# the displayed value is the command's exit status
os.system('jupyter nbconvert --to html --template pj 11b-Python_review_L3_cd4_t_cell_data.ipynb')
[NbConvertApp] WARNING | Config option `kernel_spec_manager_class` not recognized by `NbConvertApp`. [NbConvertApp] Converting notebook 11b-Python_review_L3_cd4_t_cell_data.ipynb to html [NbConvertApp] Writing 17409861 bytes to 11b-Python_review_L3_cd4_t_cell_data.html
0
# Record the package versions used in this session
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 matplotlib 3.8.0 numpy 1.24.0 pandas 2.1.4 scanpy 1.9.6 scipy 1.11.4 session_info 1.0.0 sklearn 1.3.2 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA bs4 4.12.2 cachetools 5.3.1 cerberus 1.3.5 certifi 2024.02.02 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 entrypoints 0.4 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_pygments 0.3.0 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lxml 5.2.1 lxml_html_clean NA lz4 4.3.2 markupsafe 2.1.3 matplotlib_inline 0.1.6 mistune 0.8.4 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbclient 0.8.0 nbconvert 6.5.4 nbformat 5.9.2 numba 0.58.0 opentelemetry NA overrides NA packaging 23.2 pandocfilters NA parso 0.8.3 patsy 0.5.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 pretty_jupyter NA prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pycparser 2.21 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA send2trash NA shapely 1.8.5.post1 six 1.16.0 sniffio 1.3.0 socks 1.7.1 soupsieve 2.5 sparse 0.14.0 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 statsmodels 0.14.0 sympy 1.12 termcolor NA texttable 1.7.0 threadpoolctl 3.2.0 torch 2.1.2+cu121 
torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 1.26.18 wcwidth 0.2.12 webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 2023.12.0 yaml 6.0.1 zipp NA zmq 25.1.2 zoneinfo NA zstandard 0.22.0
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.1.5 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1058-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-04-22 16:46