%load_ext pretty_jupyter
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
from datetime import date
import hisepy
import numpy as np
import os
import pandas as pd
import re
import scanpy as sc
def cache_uuid_path(uuid, cache_dir = '/home/jupyter/cache'):
    """
    Return the path of the cached file that belongs to a file UUID.

    When the per-UUID cache directory is missing or empty, the file is first
    requested with `hisepy.reader.cache_files([uuid])`.

    Parameters:
        uuid: UUID string of the file.
        cache_dir: directory that holds one sub-directory per cached UUID.
            Defaults to the path that used to be hard-coded in this function.
            NOTE(review): presumably this has to be the location hisepy
            writes to -- confirm before passing a different value.

    Returns:
        The path '<cache_dir>/<uuid>/<filename>' of the first entry (sorted
        by name) found in the UUID's cache directory.

    Raises:
        FileNotFoundError: the cache directory is still missing or empty
            after hisepy was asked to cache the file.
    """
    cache_path = os.path.join(cache_dir, uuid)

    # An existing but empty directory is treated like a missing one, so the
    # download is attempted instead of failing on an empty listing.
    if not (os.path.isdir(cache_path) and os.listdir(cache_path)):
        hisepy.reader.cache_files([uuid])

    # Sort so the chosen entry does not depend on the file system's order.
    filenames = sorted(os.listdir(cache_path)) if os.path.isdir(cache_path) else []
    if not filenames:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = cache_path)
        )

    return os.path.join(cache_path, filenames[0])
def read_csv_uuid(uuid):
    """Read the cached file for `uuid` with `pd.read_csv` and return the result."""
    return pd.read_csv(cache_uuid_path(uuid))
def read_parquet_uuid(uuid):
    """Read the cached file for `uuid` with `pd.read_parquet` and return the result."""
    return pd.read_parquet(cache_uuid_path(uuid))
def read_adata_uuid(uuid):
    """Read the cached file for `uuid` with `sc.read_h5ad` and return the result."""
    return sc.read_h5ad(cache_uuid_path(uuid))
def backed_adata_uuid(uuid):
    """Open the cached file for `uuid` with `sc.read_h5ad(..., backed = 'r')`."""
    h5ad_path = cache_uuid_path(uuid)
    return sc.read_h5ad(h5ad_path, backed = 'r')
def rm_cache_uuid(uuid, cache_dir = '/home/jupyter/cache'):
    """
    Delete the cache directory of a file UUID, including its contents.

    The directory is removed with `shutil.rmtree` rather than a shell
    command, so the UUID is never interpreted by a shell.

    Parameters:
        uuid: UUID string of the cached file; must not be empty.
        cache_dir: directory that holds one sub-directory per cached UUID.
            Defaults to the path that used to be hard-coded in this function.

    Returns:
        None. A directory that does not exist is silently ignored, so the
        call stays best-effort.

    Raises:
        ValueError: `uuid` is empty, which would otherwise address the
            cache root itself.
    """
    import shutil

    if not uuid:
        raise ValueError('uuid must be a non-empty string')

    cache_path = os.path.join(cache_dir, uuid)
    shutil.rmtree(cache_path, ignore_errors = True)
def format_cell_type(cell_type):
    """
    Rewrite a cell type label without '+', '-' or space characters.

    Every '+' becomes 'pos', every '-' becomes 'neg' and every space becomes
    '_', applied in that order.
    """
    for symbol, token in (('+', 'pos'), ('-', 'neg'), (' ', '_')):
        cell_type = cell_type.replace(symbol, token)
    return cell_type
def filename_cell_type(filename):
    """
    Recover a cell type label from a file name.

    Everything up to and including the last 'L3_' (when preceded by at least
    one character) is removed, then everything from the first '_2024' that
    is followed by at least one character. In what remains, '_' becomes a
    space, 'pos' becomes '+' and 'neg' becomes '-'.

    Note that the year 2024 is part of the pattern, so names carrying a
    different year are not trimmed at that point.
    """
    without_prefix = re.sub('.+L3_', '', filename)
    stem = re.sub('_2024.+', '', without_prefix)
    return stem.replace('_', ' ').replace('pos', '+').replace('neg', '-')
def add_labels(adata, labels):
    """
    Left-join `labels` onto `adata.obs` using the 'barcodes' column.

    The existing index of `adata.obs` is discarded before the merge; after
    it, 'barcodes' is set as the index and also kept as a column. Rows
    without a match in `labels` get missing values in the added columns.

    `adata.obs` is replaced on the object that was passed in, and that same
    object is returned.
    """
    adata.obs = (
        adata.obs
        .reset_index(drop = True)
        .merge(labels, on = 'barcodes', how = 'left')
        .set_index('barcodes', drop = False)
    )
    return adata
def element_id(n = 3):
    """
    Build a random identifier from `n` names joined by '-'.

    Each name is the `.name` of `periodictable.elements[i]` for an index `i`
    drawn with `randrange(0, 118)`, i.e. from 0 to 117 inclusive.
    NOTE(review): index 0 may not be a chemical element in periodictable,
    and index 118 is never drawn -- confirm this range is intended.
    """
    from random import randrange
    import periodictable

    return '-'.join(
        periodictable.elements[randrange(0, 118)].name
        for _ in range(n)
    )
We ran a preliminary analysis of our dataset that wasn't tracked for reproducibility. Here, we'll retrieve the labels for visualization and comparison to check for consistency.
# UUID of the file with the labels from the earlier, untracked analysis.
original_uuid = '3868592c-0087-4ed8-98b2-4bf1b8676111'
original_df = read_parquet_uuid(original_uuid)
# Keep the merge key ('barcodes') and the three label columns only.
original_df = original_df[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
# Rename the label columns with an 'original_' prefix so they can be told
# apart from any AIFI_L* columns in the objects they are merged into later.
original_df = original_df.rename({'AIFI_L1': 'original_L1', 'AIFI_L2': 'original_L2', 'AIFI_L3': 'original_L3'}, axis = 1)
original_df.head()
barcodes | original_L1 | original_L2 | original_L3 | |
---|---|---|---|---|
0 | 05ea9806794211eb93b836d1cb6129eb | DC | cDC1 | cDC1 |
1 | e225c914794011eb9282e2ceeb91ba52 | DC | cDC1 | cDC1 |
2 | b1379eae795411eb958b0245821e6993 | DC | cDC1 | cDC1 |
3 | b13d3a8a795411eb958b0245821e6993 | DC | cDC1 | cDC1 |
4 | b1430d16795411eb958b0245821e6993 | DC | cDC1 | cDC1 |
# Genes passed as `var_names` to the per-cluster dot plots below; the
# trailing comment on each entry names the population it is used to spot.
broad_markers = [
'CD3D', # T cells
'CD3E', # T cells/NK
'FCN1', # Monocytes/Myeloid
'HBB', # Erythrocytes
'IL7R', # T cells
'MS4A1', # B cells
'CD79A', # B cells
'PPBP', # Platelets
'IFI44L' # ISG-high
]
# Second gene list, shown in its own dot plot for most cell types below.
class_markers = [
'CD44'
]
# Read as CSV; not referenced again in the visible part of this notebook.
hierarchy_uuid = '1a44252c-8cab-4c8f-92c9-d8f3af633790'
hierarchy_df = read_csv_uuid(hierarchy_uuid)
# Substring used below to select project store files by name.
search_id = 'germanium-einsteinium-indium'
Retrieve files stored in our HISE project store
# List the files of the 'cohorts' project store and keep only the two
# columns used below: the file id and the file name.
ps_df = hisepy.list_files_in_project_store('cohorts')
ps_df = ps_df[['id', 'name']]
Filter for files from the previous notebook using our search_id
# Keep files whose name contains `search_id`. `str.contains` treats the
# pattern as a regular expression by default.
search_df = ps_df[ps_df['name'].str.contains(search_id)]
search_df = search_df.sort_values('name')
# Derive a cell type label from each file name.
search_df['AIFI_L3'] = [filename_cell_type(f) for f in search_df['name']]
# Show the labels that were found.
search_df['AIFI_L3'].tolist()
['BaEoMaP cell', 'CLP cell', 'CMP cell', 'Erythrocyte', 'Platelet']
# Select the cell type to review and look up the id of its file; the first
# matching row is used.
cell_type = 'BaEoMaP cell'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]
type_uuid
'84b4de9b-3bad-4a0e-b345-ee95b45f1e23'
# Load the object for the selected cell type.
adata = read_adata_uuid(type_uuid)
# UMAPs colored by the 'leiden_2' and 'subject.subjectGuid' columns.
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
sc.pl.umap(adata, color = 'subject.subjectGuid', legend_loc = 'on data')
# Broad marker genes per 'leiden_2' group.
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
# Number of cells per 'leiden_2' group.
adata.obs['leiden_2'].value_counts()
leiden_2 0 63 1 56 2 54 3 54 4 48 5 48 6 40 7 38 8 37 9 35 10 30 11 30 12 29 13 25 14 22 15 5 Name: count, dtype: int64
# Attach the labels from the earlier analysis by barcode, plot them on the
# UMAP, and count cells per label (cells without a match count as NaN).
adata = add_labels(adata, original_df)
sc.pl.umap(adata, color = 'original_L3')
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 BaEoMaP cell 554 NaN 36 CMP cell 24 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object.
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]
original_L3 BaEoMaP cell 0.902280 NaN 0.058632 CMP cell 0.039088 Name: count, dtype: float64
# Select the cell type to review and look up the id of its file; the first
# matching row is used.
cell_type = 'CLP cell'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]
type_uuid
'bf5350bd-0aff-468e-9736-c3baea442fd8'
# Load the object for the selected cell type.
adata = read_adata_uuid(type_uuid)
# UMAPs colored by the 'leiden_2', 'batch_id' and 'subject.subjectGuid' columns.
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
sc.pl.umap(adata, color = 'batch_id', legend_loc = 'on data')
sc.pl.umap(adata, color = 'subject.subjectGuid', legend_loc = 'on data')
# Marker genes per 'leiden_2' group: the broad list, the class list, and
# IRF4/IRF8 in a separate plot.
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = ['IRF4', 'IRF8'], swap_axes = True)
# Number of cells per 'leiden_2' group.
adata.obs['leiden_2'].value_counts()
leiden_2 0 183 1 172 2 152 3 149 4 142 5 129 6 127 7 115 8 114 9 106 10 100 11 97 12 97 13 91 14 87 15 61 16 36 17 27 18 18 Name: count, dtype: int64
# Attach the labels from the earlier analysis by barcode, plot them on the
# UMAP, and count cells per label (cells without a match count as NaN).
adata = add_labels(adata, original_df)
sc.pl.umap(adata, color = 'original_L3')
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 CLP cell 1955 NaN 44 CMP cell 3 HLA-DRhi cDC2 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object.
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]
original_L3 CLP cell 0.976036 NaN 0.021967 CMP cell 0.001498 HLA-DRhi cDC2 0.000499 Name: count, dtype: float64
# Select the cell type to review and look up the id of its file; the first
# matching row is used.
cell_type = 'CMP cell'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]
type_uuid
'9b23db05-6e1f-45df-b01e-0bbaa5a0280b'
# Load the object for the selected cell type.
adata = read_adata_uuid(type_uuid)
# UMAPs colored by the 'leiden_2', 'batch_id' and 'subject.subjectGuid' columns.
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
sc.pl.umap(adata, color = 'batch_id', legend_loc = 'on data')
sc.pl.umap(adata, color = 'subject.subjectGuid', legend_loc = 'on data')
# Marker genes per 'leiden_2' group: the broad list, then the class list.
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# Number of cells per 'leiden_2' group.
adata.obs['leiden_2'].value_counts()
leiden_2 0 605 1 541 2 518 3 473 4 458 5 451 6 450 7 450 8 439 9 437 10 406 11 391 12 379 13 377 14 350 15 306 16 295 17 240 18 217 19 84 Name: count, dtype: int64
# Attach the labels from the earlier analysis by barcode, plot them on the
# UMAP, and count cells per label (cells without a match count as NaN).
adata = add_labels(adata, original_df)
sc.pl.umap(adata, color = 'original_L3')
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 CMP cell 7839 NaN 17 BaEoMaP cell 5 CLP cell 4 Core naive CD4 T cell 1 GZMB- CD27- EM CD4 T cell 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object.
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]
original_L3 CMP cell 0.996441 NaN 0.002161 BaEoMaP cell 0.000636 CLP cell 0.000508 Core naive CD4 T cell 0.000127 GZMB- CD27- EM CD4 T cell 0.000127 Name: count, dtype: float64
No changes suggested
# Select the cell type to review and look up the id of its file; the first
# matching row is used.
cell_type = 'Erythrocyte'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]
type_uuid
'69447a1b-0f14-4554-8909-f8992b88d785'
# Load the object for the selected cell type.
adata = read_adata_uuid(type_uuid)
# UMAP colored by the 'leiden_2' column.
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
# Marker genes per 'leiden_2' group: the broad list, then the class list.
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# UMAPs colored by CD3D and three hemoglobin genes, two panels per row,
# with the color scale capped at the 99th percentile (vmax = 'p99').
sc.pl.umap(adata, color = ['CD3D','HBB','HBA1','HBA2'], vmax = 'p99', ncols = 2)
# Number of cells per 'leiden_2' group.
adata.obs['leiden_2'].value_counts()
leiden_2 0 1482 1 1334 2 1305 3 1305 4 1259 5 1204 6 1160 7 1133 8 1103 9 1056 10 993 11 976 12 800 13 655 14 627 15 541 16 340 17 180 18 166 Name: count, dtype: int64
# Attach the labels from the earlier analysis by barcode, plot them on the
# UMAP, and count cells per label (cells without a match count as NaN).
adata = add_labels(adata, original_df)
sc.pl.umap(adata, color = 'original_L3')
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 Erythrocyte 16797 NaN 509 Platelet 156 Core naive CD4 T cell 60 GZMB- CD27- EM CD4 T cell 14 CD8 MAIT 9 Core CD14 monocyte 9 Core naive B cell 9 GZMB- CD27+ EM CD4 T cell 5 KLRF1- GZMB+ CD27- EM CD8 T cell 4 CM CD4 T cell 4 Core CD16 monocyte 4 GZMK+ CD27+ EM CD8 T cell 4 GZMK- CD56dim NK cell 3 Adaptive NK cell 3 GZMB+ Vd2 gdT 3 CD27- effector B cell 2 Core memory B cell 2 GZMK+ Vd2 gdT 2 Naive CD4 Treg 2 ISG+ naive CD4 T cell 2 KLRF1- effector Vd1 gdT 2 pDC 1 SOX4+ naive CD4 T cell 1 Proliferating T cell 1 CD27+ effector B cell 1 KLRF1- GZMB+ CD27- memory CD4 T cell 1 Intermediate monocyte 1 KLRF1+ effector Vd1 gdT 1 Core naive CD8 T cell 1 ISG+ memory CD4 T cell 1 HLA-DRhi cDC2 1 CD56bright NK cell 1 GZMK- CD27+ EM CD8 T cell 1 CM CD8 T cell 1 GZMK+ CD56dim NK cell 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object.
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]
original_L3 Erythrocyte 0.953346 NaN 0.028889 Platelet 0.008854 Core naive CD4 T cell 0.003405 GZMB- CD27- EM CD4 T cell 0.000795 CD8 MAIT 0.000511 Core CD14 monocyte 0.000511 Core naive B cell 0.000511 GZMB- CD27+ EM CD4 T cell 0.000284 KLRF1- GZMB+ CD27- EM CD8 T cell 0.000227 CM CD4 T cell 0.000227 Core CD16 monocyte 0.000227 GZMK+ CD27+ EM CD8 T cell 0.000227 GZMK- CD56dim NK cell 0.000170 Adaptive NK cell 0.000170 GZMB+ Vd2 gdT 0.000170 CD27- effector B cell 0.000114 Core memory B cell 0.000114 GZMK+ Vd2 gdT 0.000114 Naive CD4 Treg 0.000114 ISG+ naive CD4 T cell 0.000114 KLRF1- effector Vd1 gdT 0.000114 pDC 0.000057 SOX4+ naive CD4 T cell 0.000057 Proliferating T cell 0.000057 CD27+ effector B cell 0.000057 KLRF1- GZMB+ CD27- memory CD4 T cell 0.000057 Intermediate monocyte 0.000057 KLRF1+ effector Vd1 gdT 0.000057 Core naive CD8 T cell 0.000057 ISG+ memory CD4 T cell 0.000057 HLA-DRhi cDC2 0.000057 CD56bright NK cell 0.000057 GZMK- CD27+ EM CD8 T cell 0.000057 CM CD8 T cell 0.000057 GZMK+ CD56dim NK cell 0.000057 Name: count, dtype: float64
# Select the cell type to review and look up the id of its file; the first
# matching row is used.
cell_type = 'Platelet'
type_uuid = search_df['id'].loc[search_df['AIFI_L3'] == cell_type].tolist()[0]
type_uuid
'947f1743-d4a7-410a-b226-8f873663445a'
# Load the object for the selected cell type.
adata = read_adata_uuid(type_uuid)
# UMAPs colored by the 'leiden_2' and 'batch_id' columns.
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data')
sc.pl.umap(adata, color = 'batch_id', legend_loc = 'on data')
# Marker genes per 'leiden_2' group: the broad list, then the class list.
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = broad_markers, swap_axes = True)
sc.pl.dotplot(adata, groupby = 'leiden_2', var_names = class_markers, swap_axes = True)
# UMAPs colored by CD3D and three hemoglobin genes, two panels per row,
# with the color scale capped at the 99th percentile (vmax = 'p99').
sc.pl.umap(adata, color = ['CD3D','HBB','HBA1','HBA2'], vmax = 'p99', ncols = 2)
# Number of cells per 'leiden_2' group.
adata.obs['leiden_2'].value_counts()
leiden_2 0 5030 1 4730 2 4659 3 4538 4 4438 5 4355 6 4003 7 3638 8 3321 9 2937 10 2711 11 2177 12 1857 13 1681 14 1665 15 1452 16 1444 17 1073 18 905 19 799 20 794 21 381 22 220 23 105 24 31 Name: count, dtype: int64
# Highlight 'leiden_2' group 22 on the UMAP.
sc.pl.umap(adata, color = 'leiden_2', legend_loc = 'on data', groups = ['22'])
# Rank genes for group 22 only; no `method` is passed, so scanpy's default
# test is used.
# NOTE(review): the recorded output of this cell warns that the data look
# like raw counts and that the default method changed to 't-test' --
# confirm the input is log-normalized and consider passing `method`
# explicitly.
sc.tl.rank_genes_groups(
adata,
groupby = 'leiden_2',
groups = ['22']
)
# Plot the ranking that was just computed.
sc.pl.rank_genes_groups(adata, ncols = 2)
WARNING: Default of the method has been changed to 't-test' from 't-test_overestim_var' WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
# Attach the labels from the earlier analysis by barcode, plot them on the
# UMAP, and count cells per label (cells without a match count as NaN).
adata = add_labels(adata, original_df)
sc.pl.umap(adata, color = 'original_L3')
adata.obs['original_L3'].value_counts(dropna = False)
original_L3 Platelet 58563 NaN 353 Erythrocyte 20 GZMK- CD56dim NK cell 2 CMP cell 1 Core CD14 monocyte 1 Core CD16 monocyte 1 Core naive CD4 T cell 1 ILC 1 Naive CD4 Treg 1 Name: count, dtype: int64
# Same counts expressed as a fraction of all cells in this object.
adata.obs['original_L3'].value_counts(dropna = False) / adata.obs.shape[0]
original_L3 Platelet 0.993536 NaN 0.005989 Erythrocyte 0.000339 GZMK- CD56dim NK cell 0.000034 CMP cell 0.000017 Core CD14 monocyte 0.000017 Core CD16 monocyte 0.000017 Core naive CD4 T cell 0.000017 ILC 0.000017 Naive CD4 Treg 0.000017 Name: count, dtype: float64
# Render this notebook to HTML with the 'pj' nbconvert template; the value
# displayed for the cell is the command's exit status.
os.system('jupyter nbconvert --to html --template pj 11g-Python_review_L3_other_data.ipynb')
[NbConvertApp] WARNING | Config option `kernel_spec_manager_class` not recognized by `NbConvertApp`. [NbConvertApp] Converting notebook 11g-Python_review_L3_other_data.ipynb to html [NbConvertApp] Writing 10225604 bytes to 11g-Python_review_L3_other_data.html
0
# Record the package versions of this session for reproducibility.
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 numpy 1.24.0 pandas 2.1.4 scanpy 1.9.6 session_info 1.0.0 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA bs4 4.12.2 cachetools 5.3.1 cerberus 1.3.5 certifi 2024.02.02 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 entrypoints 0.4 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_pygments 0.3.0 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lxml 5.2.1 lxml_html_clean NA lz4 4.3.2 markupsafe 2.1.3 matplotlib 3.8.0 matplotlib_inline 0.1.6 mistune 0.8.4 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbclient 0.8.0 nbconvert 6.5.4 nbformat 5.9.2 numba 0.58.0 opentelemetry NA overrides NA packaging 23.2 pandocfilters NA parso 0.8.3 patsy 0.5.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 pretty_jupyter NA prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pycparser 2.21 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA scipy 1.11.4 send2trash NA shapely 1.8.5.post1 six 1.16.0 sklearn 1.3.2 sniffio 1.3.0 socks 1.7.1 soupsieve 2.5 sparse 0.14.0 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 statsmodels 0.14.0 sympy 1.12 termcolor NA texttable 1.7.0 
threadpoolctl 3.2.0 torch 2.1.2+cu121 torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 1.26.18 wcwidth 0.2.12 webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 2023.12.0 yaml 6.0.1 zipp NA zmq 25.1.2 zoneinfo NA zstandard 0.22.0
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.1.5 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1058-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-04-22 04:35