After assembling the full dataset, a small number of cells are labeled as Contamination or Doublets. Removing them gives a version that is a good representation of PBMC cell types, although it is a less faithful representation of the data as they come straight off the pipeline.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
def read_adata_uuid(h5ad_uuid):
    """Read the .h5ad file cached under a HISE file UUID.

    The file is looked up in ``/home/jupyter/cache/<h5ad_uuid>``. When that
    directory does not exist, ``hisepy.reader.cache_files`` is called with
    the UUID first (presumably this downloads the file into the cache
    directory -- confirm against the hisepy documentation).

    Parameters
    ----------
    h5ad_uuid
        UUID of the file to load.

    Returns
    -------
    The object returned by ``sc.read_h5ad`` for the cached file.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or contains no files after the
        caching step.
    """
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        hisepy.reader.cache_files([h5ad_uuid])

    # os.listdir() returns entries in arbitrary order; sort so that the same
    # file is picked on every run if the directory holds more than one entry.
    cached_files = sorted(os.listdir(h5ad_path))
    if not cached_files:
        # An empty directory (e.g. left by an interrupted download) would
        # otherwise surface as an uninformative IndexError.
        raise FileNotFoundError(
            'No cached file found in {p}; remove the directory and retry'.format(p = h5ad_path)
        )

    h5ad_file = os.path.join(h5ad_path, cached_files[0])
    return sc.read_h5ad(h5ad_file)
# Load the earlier ("old") version of the labeled dataset and show its
# dimensions as the cell output.
old_h5ad_uuid = '23445c96-7b17-41ae-ad53-60bdc3fdb09e'
old_adata = read_adata_uuid(old_h5ad_uuid)
old_adata.shape
(1828803, 1351)
# Load the current ("new") version of the labeled dataset and show its
# dimensions as the cell output.
h5ad_uuid = '94541689-2483-4fd4-bcc1-55d8a8eef866'
adata = read_adata_uuid(h5ad_uuid)
adata.shape
(1952128, 1236)
# One L3 category carries a stray trailing space; rename it to the clean
# spelling (presumably so it compares equal to the label in the old data).
l3_category_fixes = {'Core naive CD8 T cell ': 'Core naive CD8 T cell'}
adata.obs['AIFI_L3'] = adata.obs['AIFI_L3'].cat.rename_categories(l3_category_fixes)
# Barcode + L3 label from the old dataset, with the label column renamed so
# it does not collide with the new dataset's column after the merge.
old_l3 = (
    old_adata.obs[['barcodes', 'AIFI_L3']]
    .rename(columns={'AIFI_L3': 'old_l3'})
    .reset_index(drop=True)
)
# Plain strings rather than categoricals, so the two label columns can be
# compared element-wise later on.
old_l3['old_l3'] = old_l3['old_l3'].astype(str)

# Same extraction for the new dataset.
new_l3 = adata.obs[['barcodes', 'AIFI_L3']].reset_index(drop=True)
new_l3['AIFI_L3'] = new_l3['AIFI_L3'].astype(str)

# Inner join: only barcodes present in both datasets are kept.
comp = pd.merge(old_l3, new_l3, on='barcodes', how='inner')
comp.shape
(1828265, 3)
# Preview the first five rows of the old/new label comparison table.
comp.head(5)
barcodes | old_l3 | AIFI_L3 | |
---|---|---|---|
0 | cf71f47048b611ea8957bafe6d70929e | CD8 MAIT | CD8 MAIT |
1 | cf71fb7848b611ea8957bafe6d70929e | CM CD4 T cell | CM CD4 T cell |
2 | cf7216a848b611ea8957bafe6d70929e | GZMB- CD27+ EM CD4 T cell | GZMB- CD27+ EM CD4 T cell |
3 | cf72178448b611ea8957bafe6d70929e | Core naive CD4 T cell | SOX4+ naive CD4 T cell |
4 | cf721a4048b611ea8957bafe6d70929e | Proliferating NK cell | Proliferating NK cell |
# Rows of the comparison table where the old and new L3 labels disagree.
label_changed = comp['old_l3'] != comp['AIFI_L3']
diff = comp[label_changed]
diff.shape
(190671, 3)
# Old dataset restricted to cells whose label changed, colored by the old
# L3 label.
changed_in_old = old_adata.obs['barcodes'].isin(diff['barcodes'])
diff_old_adata = old_adata[changed_in_old]
sc.pl.umap(diff_old_adata, color='AIFI_L3')
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# Same UMAP of changed cells in the old dataset, restricted to the
# 'DN T cell' group via the `groups` argument.
sc.pl.umap(
    diff_old_adata,
    color='AIFI_L3',
    groups=['DN T cell'],
)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# New dataset restricted to cells whose label changed, colored by the new
# L3 label.
changed_in_new = adata.obs['barcodes'].isin(diff['barcodes'])
diff_new_adata = adata[changed_in_new]
sc.pl.umap(diff_new_adata, color='AIFI_L3')
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# Number of relabeled cells per old L3 label; show the 20 largest groups.
changed_per_old_label = diff['old_l3'].value_counts()
changed_per_old_label.head(20)
old_l3 CM CD4 T cell 35175 Core naive CD4 T cell 23918 ISG+ CD14 monocyte 12455 GZMB- CD27- EM CD4 T cell 9636 Intermediate monocyte 9057 KLRF1- GZMB+ CD27- EM CD8 T cell 8723 Core naive B cell 7405 Core naive CD8 T cell 6945 GZMB- CD27+ EM CD4 T cell 5536 Core CD14 monocyte 4939 GZMK- CD56dim NK cell 4522 GZMK+ CD27+ EM CD8 T cell 3963 Memory CD4 Treg 3311 SOX4+ naive CD8 T cell 3220 CM CD8 T cell 3121 Core memory B cell 2942 KLRF1+ GZMB+ CD27- EM CD8 T cell 2887 GZMB+ Vd2 gdT 2773 Transitional B cell 2417 GZMK- CD27+ EM CD8 T cell 2377 Name: count, dtype: int64
# Fraction of each old L3 label whose cells were relabeled. The denominator
# counts every cell of the old dataset with that label; labels with no
# relabeled cells come out as NaN from the index-aligned division.
n_changed = diff['old_l3'].value_counts()
n_in_old = old_l3['old_l3'].value_counts()
diff_frac = n_changed / n_in_old
diff_frac.sort_values(ascending=False).head(20)
old_l3 DN T cell 1.000000 Memory CD8 Treg 0.997717 GZMK+ memory CD4 Treg 0.833488 CD4 MAIT 0.672085 Early memory B cell 0.628980 Intermediate monocyte 0.496737 SOX4+ naive CD8 T cell 0.484065 ISG+ MAIT 0.387068 GZMK- CD27+ EM CD8 T cell 0.354617 CD14+ cDC2 0.313081 CD95 memory B cell 0.262089 Transitional B cell 0.252112 ISG+ CD14 monocyte 0.235814 Type 2 polarized memory B cell 0.233358 SOX4+ Vd1 gdT 0.214665 Memory CD4 Treg 0.211417 CM CD4 T cell 0.202647 ISG+ CD56dim NK cell 0.195097 KLRF1+ effector Vd1 gdT 0.179784 CD27+ effector B cell 0.168687 Name: count, dtype: float64
# New labels assigned to relabeled cells that used to be
# 'Core naive CD8 T cell'.
diff.loc[diff['old_l3'] == 'Core naive CD8 T cell', 'AIFI_L3'].value_counts()
AIFI_L3 Core naive CD8 T cell 110260 Naive CD8 T cell Platelet Doublets 4006 SOX4+ naive CD8 T cell 1734 CM CD8 T cell 565 Core naive CD4 T cell 386 CM CD4 T cell 103 ISG+ naive CD8 T cell 86 Contamination 18 Naive Vd1 gdT 13 SOX4+ naive CD4 T cell 11 ISG+ naive CD4 T cell 10 Memory CD4 Treg 3 CD4 naive Platelet Doublets 3 Naive CD4 Treg 2 GZMB- CD27+ EM CD4 T cell 1 ISG+ memory CD4 T cell 1 GZMK+ memory CD4 Treg 1 GZMK- CD27+ EM CD8 T cell 1 GZMK+ CD27+ EM CD8 T cell 1 Name: count, dtype: int64
# Full table of relabeled-cell counts per old L3 label.
diff.loc[:, 'old_l3'].value_counts()
old_l3 Core naive CD8 T cell 117205 CM CD4 T cell 35175 Core naive CD4 T cell 23918 ISG+ CD14 monocyte 12455 GZMB- CD27- EM CD4 T cell 9636 ... Plasma cell 20 cDC1 14 CMP cell 9 ASDC 6 CLP cell 1 Name: count, Length: 70, dtype: int64
CM CD4 T cells
# New labels assigned to relabeled cells that used to be 'CM CD4 T cell'.
diff.loc[diff['old_l3'] == 'CM CD4 T cell', 'AIFI_L3'].value_counts()
AIFI_L3 GZMB- CD27- EM CD4 T cell 27658 Core naive CD4 T cell 3784 GZMB- CD27+ EM CD4 T cell 2283 ISG+ memory CD4 T cell 522 Naive CD4 Treg 244 CM CD8 T cell 242 KLRB1+ memory CD4 Treg 104 Memory CD4 Treg 97 ISG+ naive CD4 T cell 66 Contamination 53 ISG+ memory CD8 T cell 32 Core naive CD8 T cell 17 Naive CD8 T cell Platelet Doublets 16 Naive Vd1 gdT 12 GZMK+ CD27+ EM CD8 T cell 7 GZMK+ memory CD4 Treg 6 CD4 naive Platelet Doublets 6 SOX4+ naive CD4 T cell 5 T+Erythocytes doublet 5 Doublet 3 Proliferating T cell 2 SOX4+ Vd1 gdT 2 T_B doublet 2 GZMK+ Vd2 gdT 2 CD4 MAIT 1 CD8aa 1 GZMK- CD27+ EM CD8 T cell 1 KLRF1- GZMB+ CD27- memory CD4 T cell 1 CD8 MAIT 1 Name: count, dtype: int64
Looks like a shift from the general CD4 CM population towards the GZMB- CD27- EM population.
Does this make sense based on expression?
New version:
# Marker genes used to tell the CM / EM CD4 populations apart.
markers = ['CD4', 'GZMB', 'CD27']

# Cells of the new dataset that were 'CM CD4 T cell' in the old dataset and
# have since been relabeled.
# NOTE(review): the original call was `.isin()` with no argument, which
# raises a TypeError; this selection is inferred from the section topic --
# confirm it is the intended subset.
cm_cd4_changed_barcodes = diff.loc[diff['old_l3'] == 'CM CD4 T cell', 'barcodes']
diff_set = adata[adata.obs['barcodes'].isin(cm_cd4_changed_barcodes)]

# All cells of the new dataset carrying one of the three labels of interest.
check_set = adata[adata.obs['AIFI_L3'].isin(['CM CD4 T cell', 'GZMB- CD27- EM CD4 T cell', 'GZMB- CD27+ EM CD4 T cell'])]

# Marker expression per label in the new dataset.
sc.pl.dotplot(
    check_set,
    var_names = markers,
    groupby = 'AIFI_L3',
)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
# UMAP panels for the new dataset: one colored by L3 label, then one per
# marker gene, stacked in a single column with the color scale capped at
# the 95th percentile.
umap_panels = ['AIFI_L3'] + markers
sc.pl.umap(
    check_set,
    color=umap_panels,
    vmax='p95',
    size=3,
    ncols=1,
)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
Old version:
# Marker genes used to tell the CM / EM CD4 populations apart.
markers = ['CD4', 'GZMB', 'CD27']

# All cells of the old dataset carrying one of the three labels of interest.
cd4_labels_of_interest = [
    'CM CD4 T cell',
    'GZMB- CD27- EM CD4 T cell',
    'GZMB- CD27+ EM CD4 T cell',
]
in_old_labels = old_adata.obs['AIFI_L3'].isin(cd4_labels_of_interest)
old_set = old_adata[in_old_labels]

# Marker expression per label in the old dataset.
sc.pl.dotplot(old_set, var_names=markers, groupby='AIFI_L3')
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored dot_ax.scatter(x, y, **kwds)
# UMAP panels for the old dataset: one colored by L3 label, then one per
# marker gene, stacked in a single column with the color scale capped at
# the 95th percentile.
old_umap_panels = ['AIFI_L3'] + markers
sc.pl.umap(
    old_set,
    color=old_umap_panels,
    vmax='p95',
    size=3,
    ncols=1,
)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
Core Naive CD4 T cells
# New labels assigned to relabeled cells that used to be
# 'Core naive CD4 T cell'. Printed explicitly: a notebook cell only displays
# its last expression, so without print() this breakdown is computed and
# silently discarded.
print(diff.loc[diff['old_l3'] == 'Core naive CD4 T cell', 'AIFI_L3'].value_counts())

# For reference: new labels across ALL relabeled cells, whatever their old
# label was.
diff['AIFI_L3'].value_counts()
AIFI_L3 Core naive CD8 T cell 114126 GZMB- CD27- EM CD4 T cell 30052 CM CD4 T cell 20749 Core CD14 monocyte 18575 SOX4+ naive CD4 T cell 8983 ... Platelet 4 B+Erythocytes doublet 2 pDC 1 NK+Erythocytes doublet 1 CLP cell 1 Name: count, Length: 86, dtype: int64
This is pretty weird - lots of assignment to other types, including monocytes?
# Record the package versions of this session at the end of the notebook.
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 pandas 2.1.4 scanpy 1.9.6 session_info 1.0.0 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA cachetools 5.3.1 certifi 2023.11.17 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lz4 4.3.2 markupsafe 2.1.3 matplotlib 3.8.0 matplotlib_inline 0.1.6 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbformat 5.9.2 numba 0.58.0 numpy 1.24.0 opentelemetry NA overrides NA packaging 23.2 parso 0.8.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA scipy 1.11.4 send2trash NA shapely 1.8.5.post1 six 1.16.0 sklearn 1.3.2 sniffio 1.3.0 socks 1.7.1 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 sympy 1.12 termcolor NA texttable 1.7.0 threadpoolctl 3.2.0 torch 2.1.2+cu121 torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 1.26.18 wcwidth 0.2.12 webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 2023.12.0 yaml 6.0.1 zipp NA 
zmq 25.1.2 zoneinfo NA zstandard 0.22.0
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.0.10 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1052-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-03-01 01:44