Assemble clean version¶

After assembling the full dataset, a smattering of cells are labeled as Contamination or Doublets. Removing them gives a version that is a good representation of PBMC cell types, though a less faithful representation of the data as they come straight off the pipeline.

In [1]:

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

In [2]:

def read_adata_uuid(h5ad_uuid):
    """Read the AnnData object stored in the cache directory for a file UUID.

    The file is looked up under ``/home/jupyter/cache/<h5ad_uuid>``. If that
    directory does not exist yet, ``hisepy.reader.cache_files`` is called with
    the UUID first (presumably this populates the directory; the code reads
    from it immediately afterwards).

    Parameters
    ----------
    h5ad_uuid : str
        UUID naming the cache sub-directory that holds the .h5ad file.

    Returns
    -------
    The object returned by ``sc.read_h5ad`` for the selected file.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or contains no files.
    """
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        hisepy.reader.cache_files([h5ad_uuid])

    # os.listdir returns entries in arbitrary order; sort so the same file is
    # chosen on every run when the directory holds more than one entry.
    entries = sorted(os.listdir(h5ad_path))
    if not entries:
        raise FileNotFoundError(
            'No files found in cache directory {p}'.format(p = h5ad_path)
        )

    # Prefer entries with an .h5ad extension; fall back to whatever is there so
    # a single cached file without the extension is still read as before.
    h5ad_candidates = [f for f in entries if f.endswith('.h5ad')]
    h5ad_filename = (h5ad_candidates or entries)[0]

    h5ad_file = os.path.join(h5ad_path, h5ad_filename)
    return sc.read_h5ad(h5ad_file)

Read old labeled data¶

In [3]:

old_h5ad_uuid = '23445c96-7b17-41ae-ad53-60bdc3fdb09e'

In [4]:

old_adata = read_adata_uuid(old_h5ad_uuid)

In [5]:

old_adata.shape

Out[5]:

(1828803, 1351)

Read new labeled data¶

In [6]:

h5ad_uuid = '94541689-2483-4fd4-bcc1-55d8a8eef866'

In [7]:

adata = read_adata_uuid(h5ad_uuid)

In [8]:

adata.shape

Out[8]:

(1952128, 1236)

In [44]:

# One AIFI_L3 category is stored with a trailing space ('Core naive CD8 T cell ');
# rename it to the un-padded label so string comparisons against other label sets match.
adata.obs['AIFI_L3'] = adata.obs['AIFI_L3'].cat.rename_categories({'Core naive CD8 T cell ':'Core naive CD8 T cell'})

Compare L3 labels¶

In [45]:

# Barcode / label table for the old object: label column renamed to 'old_l3',
# positional index, labels converted to plain strings.
old_l3 = (
    old_adata.obs[['barcodes', 'AIFI_L3']]
    .rename(columns = {'AIFI_L3': 'old_l3'})
    .reset_index(drop = True)
    .assign(old_l3 = lambda labels: labels['old_l3'].astype(str))
)

In [46]:

# Barcode / label table for the new object: positional index, labels as plain strings.
new_l3 = (
    adata.obs[['barcodes', 'AIFI_L3']]
    .reset_index(drop = True)
    .assign(AIFI_L3 = lambda labels: labels['AIFI_L3'].astype(str))
)

In [47]:

# Inner join on barcode: only barcodes present in both tables are kept,
# with the old label ('old_l3') and new label ('AIFI_L3') side by side.
comp = old_l3.merge(new_l3, on = 'barcodes', how = 'inner')

In [48]:

comp.shape

Out[48]:

(1828265, 3)

In [49]:

comp.head()

Out[49]:

	barcodes	old_l3	AIFI_L3
0	cf71f47048b611ea8957bafe6d70929e	CD8 MAIT	CD8 MAIT
1	cf71fb7848b611ea8957bafe6d70929e	CM CD4 T cell	CM CD4 T cell
2	cf7216a848b611ea8957bafe6d70929e	GZMB- CD27+ EM CD4 T cell	GZMB- CD27+ EM CD4 T cell
3	cf72178448b611ea8957bafe6d70929e	Core naive CD4 T cell	SOX4+ naive CD4 T cell
4	cf721a4048b611ea8957bafe6d70929e	Proliferating NK cell	Proliferating NK cell

In [50]:

diff = comp[comp['old_l3'] != comp['AIFI_L3']]

In [51]:

diff.shape

Out[51]:

(190671, 3)

In [52]:

diff_old_adata = old_adata[old_adata.obs['barcodes'].isin(diff['barcodes'])]

In [53]:

sc.pl.umap(diff_old_adata, color = 'AIFI_L3')

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

In [69]:

sc.pl.umap(diff_old_adata, color = 'AIFI_L3', groups = ['DN T cell'])

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

In [54]:

diff_new_adata = adata[adata.obs['barcodes'].isin(diff['barcodes'])]

In [55]:

sc.pl.umap(diff_new_adata, color = 'AIFI_L3')

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

In [58]:

diff['old_l3'].value_counts().head(20)

Out[58]:

old_l3
CM CD4 T cell                       35175
Core naive CD4 T cell               23918
ISG+ CD14 monocyte                  12455
GZMB- CD27- EM CD4 T cell            9636
Intermediate monocyte                9057
KLRF1- GZMB+ CD27- EM CD8 T cell     8723
Core naive B cell                    7405
Core naive CD8 T cell                6945
GZMB- CD27+ EM CD4 T cell            5536
Core CD14 monocyte                   4939
GZMK- CD56dim NK cell                4522
GZMK+ CD27+ EM CD8 T cell            3963
Memory CD4 Treg                      3311
SOX4+ naive CD8 T cell               3220
CM CD8 T cell                        3121
Core memory B cell                   2942
KLRF1+ GZMB+ CD27- EM CD8 T cell     2887
GZMB+ Vd2 gdT                        2773
Transitional B cell                  2417
GZMK- CD27+ EM CD8 T cell            2377
Name: count, dtype: int64

In [68]:

# Per old label: number of relabelled cells divided by the number of cells
# carrying that label in the old table. Labels absent from `diff` come out as NaN.
changed_per_label = diff['old_l3'].value_counts()
total_per_label = old_l3['old_l3'].value_counts()
diff_frac = changed_per_label.div(total_per_label)
diff_frac.sort_values(ascending = False).head(20)

Out[68]:

old_l3
DN T cell                         1.000000
Memory CD8 Treg                   0.997717
GZMK+ memory CD4 Treg             0.833488
CD4 MAIT                          0.672085
Early memory B cell               0.628980
Intermediate monocyte             0.496737
SOX4+ naive CD8 T cell            0.484065
ISG+ MAIT                         0.387068
GZMK- CD27+ EM CD8 T cell         0.354617
CD14+ cDC2                        0.313081
CD95 memory B cell                0.262089
Transitional B cell               0.252112
ISG+ CD14 monocyte                0.235814
Type 2 polarized memory B cell    0.233358
SOX4+ Vd1 gdT                     0.214665
Memory CD4 Treg                   0.211417
CM CD4 T cell                     0.202647
ISG+ CD56dim NK cell              0.195097
KLRF1+ effector Vd1 gdT           0.179784
CD27+ effector B cell             0.168687
Name: count, dtype: float64

In [17]:

# New labels given to relabelled cells whose old label was 'Core naive CD8 T cell'.
diff.loc[diff['old_l3'] == 'Core naive CD8 T cell', 'AIFI_L3'].value_counts()

Out[17]:

AIFI_L3
Core naive CD8 T cell                 110260
Naive CD8 T cell Platelet Doublets      4006
SOX4+ naive CD8 T cell                  1734
CM CD8 T cell                            565
Core naive CD4 T cell                    386
CM CD4 T cell                            103
ISG+ naive CD8 T cell                     86
Contamination                             18
Naive Vd1 gdT                             13
SOX4+ naive CD4 T cell                    11
ISG+ naive CD4 T cell                     10
Memory CD4 Treg                            3
CD4 naive Platelet Doublets                3
Naive CD4 Treg                             2
GZMB- CD27+ EM CD4 T cell                  1
ISG+ memory CD4 T cell                     1
GZMK+ memory CD4 Treg                      1
GZMK- CD27+ EM CD8 T cell                  1
GZMK+ CD27+ EM CD8 T cell                  1
Name: count, dtype: int64

In [18]:

diff['old_l3'].value_counts()

Out[18]:

old_l3
Core naive CD8 T cell        117205
CM CD4 T cell                 35175
Core naive CD4 T cell         23918
ISG+ CD14 monocyte            12455
GZMB- CD27- EM CD4 T cell      9636
                              ...  
Plasma cell                      20
cDC1                             14
CMP cell                          9
ASDC                              6
CLP cell                          1
Name: count, Length: 70, dtype: int64

CM CD4 T cells

In [19]:

# New labels given to relabelled cells whose old label was 'CM CD4 T cell'.
diff.loc[diff['old_l3'] == 'CM CD4 T cell', 'AIFI_L3'].value_counts()

Out[19]:

AIFI_L3
GZMB- CD27- EM CD4 T cell               27658
Core naive CD4 T cell                    3784
GZMB- CD27+ EM CD4 T cell                2283
ISG+ memory CD4 T cell                    522
Naive CD4 Treg                            244
CM CD8 T cell                             242
KLRB1+ memory CD4 Treg                    104
Memory CD4 Treg                            97
ISG+ naive CD4 T cell                      66
Contamination                              53
ISG+ memory CD8 T cell                     32
Core naive CD8 T cell                      17
Naive CD8 T cell Platelet Doublets         16
Naive Vd1 gdT                              12
GZMK+ CD27+ EM CD8 T cell                   7
GZMK+ memory CD4 Treg                       6
CD4 naive Platelet Doublets                 6
SOX4+ naive CD4 T cell                      5
T+Erythocytes doublet                       5
Doublet                                     3
Proliferating T cell                        2
SOX4+ Vd1 gdT                               2
T_B doublet                                 2
GZMK+ Vd2 gdT                               2
CD4 MAIT                                    1
CD8aa                                       1
GZMK- CD27+ EM CD8 T cell                   1
KLRF1- GZMB+ CD27- memory CD4 T cell        1
CD8 MAIT                                    1
Name: count, dtype: int64

Looks like a shift from the general CD4 CM population towards the GZMB- CD27- EM population.

Does this make sense based on expression?

New version:

In [24]:

markers = ['CD4', 'GZMB', 'CD27']
cd4_memory_labels = ['CM CD4 T cell', 'GZMB- CD27- EM CD4 T cell', 'GZMB- CD27+ EM CD4 T cell']

# NOTE(review): the original called `.isin()` with no argument, which raises
# TypeError and stops the cell. Restricting to cells whose old label was
# 'CM CD4 T cell' and whose label changed is the presumed intent - confirm.
cm_changed_barcodes = diff.loc[diff['old_l3'] == 'CM CD4 T cell', 'barcodes']
diff_set = adata[adata.obs['barcodes'].isin(cm_changed_barcodes)]

# Cells carrying any of the three CD4 memory labels in the new annotation.
check_set = adata[adata.obs['AIFI_L3'].isin(cd4_memory_labels)]

sc.pl.dotplot(
    check_set,
    var_names = markers,
    groupby = 'AIFI_L3',
)

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)

In [32]:

sc.pl.umap(check_set,
           color = ['AIFI_L3'] + markers,
           vmax = 'p95',
           size = 3,
           ncols = 1)

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

Old version:

In [36]:

markers = ['CD4', 'GZMB', 'CD27']

# Cells carrying any of the three CD4 memory labels in the old annotation.
old_cd4_memory_labels = [
    'CM CD4 T cell',
    'GZMB- CD27- EM CD4 T cell',
    'GZMB- CD27+ EM CD4 T cell',
]
has_cd4_memory_label = old_adata.obs['AIFI_L3'].isin(old_cd4_memory_labels)
old_set = old_adata[has_cd4_memory_label]

sc.pl.dotplot(old_set, var_names = markers, groupby = 'AIFI_L3')

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_dotplot.py:747: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
  dot_ax.scatter(x, y, **kwds)

In [37]:

sc.pl.umap(old_set,
           color = ['AIFI_L3'] + markers,
           vmax = 'p95',
           size = 3,
           ncols = 1)

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

Core Naive CD4 T cells

In [ ]:

# New labels given to relabelled cells whose old label was 'Core naive CD4 T cell'.
diff.loc[diff['old_l3'] == 'Core naive CD4 T cell', 'AIFI_L3'].value_counts()

In [33]:

diff['AIFI_L3'].value_counts()

Out[33]:

AIFI_L3
Core naive CD8 T cell        114126
GZMB- CD27- EM CD4 T cell     30052
CM CD4 T cell                 20749
Core CD14 monocyte            18575
SOX4+ naive CD4 T cell         8983
                              ...  
Platelet                          4
B+Erythocytes doublet             2
pDC                               1
NK+Erythocytes doublet            1
CLP cell                          1
Name: count, Length: 86, dtype: int64

This is pretty weird - lots of assignment to other types, including monocytes?

In [36]:

import session_info
session_info.show()

Out[36]:

Click to view session information

-----
anndata             0.10.3
hisepy              0.3.0
pandas              2.1.4
scanpy              1.9.6
session_info        1.0.0
-----

Click to view modules imported as dependencies

PIL                         10.0.1
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.2.0
attrs                       23.2.0
babel                       2.14.0
beatrix_jupyterlab          NA
brotli                      NA
cachetools                  5.3.1
certifi                     2023.11.17
cffi                        1.16.0
charset_normalizer          3.3.2
cloudpickle                 2.2.1
colorama                    0.4.6
comm                        0.1.4
cryptography                41.0.7
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
db_dtypes                   1.1.1
debugpy                     1.8.0
decorator                   5.1.1
defusedxml                  0.7.1
deprecated                  1.2.14
exceptiongroup              1.2.0
executing                   2.0.1
fastjsonschema              NA
fqdn                        NA
google                      NA
greenlet                    2.0.2
grpc                        1.58.0
grpc_status                 NA
h5py                        3.10.0
idna                        3.6
igraph                      0.10.8
importlib_metadata          NA
ipykernel                   6.28.0
ipython_genutils            0.2.0
ipywidgets                  8.1.1
isoduration                 NA
jedi                        0.19.1
jinja2                      3.1.2
joblib                      1.3.2
json5                       NA
jsonpointer                 2.4
jsonschema                  4.20.0
jsonschema_specifications   NA
jupyter_events              0.9.0
jupyter_server              2.12.1
jupyterlab_server           2.25.2
jwt                         2.8.0
kiwisolver                  1.4.5
leidenalg                   0.10.1
llvmlite                    0.41.0
lz4                         4.3.2
markupsafe                  2.1.3
matplotlib                  3.8.0
matplotlib_inline           0.1.6
mpl_toolkits                NA
mpmath                      1.3.0
natsort                     8.4.0
nbformat                    5.9.2
numba                       0.58.0
numpy                       1.24.0
opentelemetry               NA
overrides                   NA
packaging                   23.2
parso                       0.8.3
pexpect                     4.8.0
pickleshare                 0.7.5
pkg_resources               NA
platformdirs                4.1.0
plotly                      5.18.0
prettytable                 3.9.0
prometheus_client           NA
prompt_toolkit              3.0.42
proto                       NA
psutil                      NA
ptyprocess                  0.7.0
pure_eval                   0.2.2
pyarrow                     13.0.0
pydev_ipython               NA
pydevconsole                NA
pydevd                      2.9.5
pydevd_file_utils           NA
pydevd_plugins              NA
pydevd_tracing              NA
pygments                    2.17.2
pynvml                      NA
pyparsing                   3.1.1
pyreadr                     0.5.0
pythonjsonlogger            NA
pytz                        2023.3.post1
referencing                 NA
requests                    2.31.0
rfc3339_validator           0.1.4
rfc3986_validator           0.1.1
rpds                        NA
scipy                       1.11.4
send2trash                  NA
shapely                     1.8.5.post1
six                         1.16.0
sklearn                     1.3.2
sniffio                     1.3.0
socks                       1.7.1
sql                         NA
sqlalchemy                  2.0.21
sqlparse                    0.4.4
stack_data                  0.6.2
sympy                       1.12
termcolor                   NA
texttable                   1.7.0
threadpoolctl               3.2.0
torch                       2.1.2+cu121
torchgen                    NA
tornado                     6.3.3
tqdm                        4.66.1
traitlets                   5.9.0
typing_extensions           NA
uri_template                NA
urllib3                     1.26.18
wcwidth                     0.2.12
webcolors                   1.13
websocket                   1.7.0
wrapt                       1.15.0
xarray                      2023.12.0
yaml                        6.0.1
zipp                        NA
zmq                         25.1.2
zoneinfo                    NA
zstandard                   0.22.0

-----
IPython             8.19.0
jupyter_client      8.6.0
jupyter_core        5.6.1
jupyterlab          4.0.10
notebook            6.5.4
-----
Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
Linux-5.15.0-1052-gcp-x86_64-with-glibc2.31
-----
Session information updated at 2024-03-01 01:44

In [ ]: