Assign Other annotations¶

To assemble our annotations, we'll read our Other cell data and assign our expert annotations to those clusters. We'll then inspect the annotations in our UMAP projections, and output final labels for these cells.

In [1]:

import warnings
# Silence FutureWarning and RuntimeWarning for the whole session; the filters are
# installed before the imports below so they are active while those modules load
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc

Helper function¶

This function makes it easy to read a CSV file stored in HISE into a pandas DataFrame.

In [2]:

def read_csv_uuid(csv_uuid, cache_dir = '/home/jupyter/cache'):
    """Read a CSV file from the local HISE cache into a pandas DataFrame.

    The file is looked up in the directory `<cache_dir>/<csv_uuid>`. If that
    directory does not exist, `hisepy.reader.cache_files()` is called with
    the UUID before the directory is read.

    Parameters
    ----------
    csv_uuid : str
        HISE file UUID; also the name of the cache sub-directory.
    cache_dir : str, optional
        Root of the local cache. Defaults to '/home/jupyter/cache'.
        NOTE(review): hisepy is assumed to write to the default location, so
        pass another root only for files that are already present there.

    Returns
    -------
    pandas.DataFrame
        Contents of the cached file, read with its first column as the index.

    Raises
    ------
    FileNotFoundError
        If the cache directory for `csv_uuid` is missing or contains no files.
    """
    csv_path = os.path.join(cache_dir, csv_uuid)
    if not os.path.isdir(csv_path):
        hisepy.reader.cache_files([csv_uuid])

    # os.listdir() returns entries in arbitrary order and includes
    # sub-directories, so keep regular files only and sort them to make the
    # choice deterministic
    csv_files = sorted(
        f for f in os.listdir(csv_path)
        if os.path.isfile(os.path.join(csv_path, f))
    )
    if not csv_files:
        raise FileNotFoundError(
            'No files found in cache directory {p}'.format(p = csv_path)
        )

    csv_file = os.path.join(csv_path, csv_files[0])
    return pd.read_csv(csv_file, index_col = 0)

Read subclustering results from HISE¶

In [3]:

# Cell class label; used below to build the output file names
cell_class = 'other'

In [4]:

# HISE UUID of the .h5ad file and the local cache directory named after it
h5ad_uuid = '1eb6ca8c-b8ed-4968-b515-c954497441dc'
cache_dir_template = '/home/jupyter/cache/{u}'
h5ad_path = cache_dir_template.format(u = h5ad_uuid)

In [5]:

# Ask hisepy to cache the file only when its cache directory is not already present
if not os.path.isdir(h5ad_path):
    hise_res = hisepy.reader.cache_files([h5ad_uuid])

In [6]:

# os.listdir() returns entries in arbitrary order and includes sub-directories,
# so keep regular files only, sort them, and fail with a clear message when the
# cache directory is empty instead of raising a bare IndexError
h5ad_candidates = sorted(
    f for f in os.listdir(h5ad_path)
    if os.path.isfile(os.path.join(h5ad_path, f))
)
if not h5ad_candidates:
    raise FileNotFoundError(
        'No files found in cache directory {p}'.format(p = h5ad_path)
    )
h5ad_filename = h5ad_candidates[0]
h5ad_file = os.path.join(h5ad_path, h5ad_filename)

In [7]:

# Load the cached .h5ad file into memory as an AnnData object
adata = sc.read_h5ad(h5ad_file)

In [8]:

# Display a summary of the object's dimensions and the fields it carries
adata

Out[8]:

AnnData object with n_obs × n_vars = 24603 × 4305
    obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'file.id', 'subject.cmv', 'subject.bmi', 'celltypist.low', 'seurat.l1', 'seurat.l1.score', 'seurat.l2', 'seurat.l2.score', 'seurat.l2.5', 'seurat.l2.5.score', 'seurat.l3', 'seurat.l3.score', 'predicted_doublet', 'doublet_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden', 'leiden_resolution_1', 'leiden_resolution_1.5', 'leiden_resolution_2'
    var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'celltypist.low_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'seurat.l2.5_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

Read annotations¶

In [9]:

# HISE UUID of the CSV holding the expert labels, read with read_csv_uuid()
anno_uuid = '03817547-e2e3-412b-b36e-538d2bc74c87'
anno = read_csv_uuid(anno_uuid)

downloading fileID: 03817547-e2e3-412b-b36e-538d2bc74c87
Files have been successfully downloaded!

In [10]:

# Preview the first rows of the annotation table
anno.head()

Out[10]:

	leiden_resolution_1	AIFI_L3	AIFI_L1	AIFI_L1_Final	AIFI_L2	AIFI_L2_Final	AIFI_L3_Final
0	0	Platelet	Platelet	Yes	Platelet	Yes	Yes
1	1	T+Erythocytes doublet	T+Erythocytes doublet	Yes	T+Erythocytes doublet	Yes	Yes
2	2	Platelet	Platelet	Yes	Platelet	Yes	Yes
3	3	T+Erythocytes doublet	T+Erythocytes doublet	Yes	T+Erythocytes doublet	Yes	Yes
4	4	Monocytes+Erythocytes doublet	Monocytes+Erythocytes doublet	Yes	Monocytes+Erythocytes doublet	Yes	Yes

In [11]:

# Cluster column present in both adata.obs and anno; the labels are joined on it
join_col = 'leiden_resolution_1'

In [12]:

# Convert the cluster ids to string-valued categories before joining.
# NOTE(review): presumably this matches the dtype of adata.obs[join_col] - confirm
anno[join_col] = anno[join_col].astype('string').astype('category')

In [13]:

# Per-cell metadata table that the annotations will be joined to
obs = adata.obs

In [14]:

# Number of cells whose cluster id has a row in the annotation table;
# this equals the number of rows in obs when every cluster is annotated
int(obs[join_col].isin(anno[join_col]).sum())

Out[14]:

In [15]:

# Left join: every cell is kept, in its original order, and receives the labels
# of its cluster. validate = 'many_to_one' raises a MergeError if a cluster id
# appears more than once in anno, which would otherwise silently duplicate cells
# and break the row-for-row correspondence with adata.
obs_anno = obs.merge(anno, how = 'left', on = join_col, validate = 'many_to_one')

In [16]:

# The merge returns a frame with a fresh positional index, so restore the cell
# barcodes as the index before handing the frame to AnnData. Assigning once
# avoids an intermediate state in which adata.obs is indexed by row position.
# drop = False keeps 'barcodes' available as a regular column as well.
adata.obs = obs_anno.set_index('barcodes', drop = False)

In [17]:

# Preview the cell metadata, which now includes the annotation columns
adata.obs.head()

Out[17]:

	barcodes	batch_id	cell_name	cell_uuid	chip_id	hto_barcode	hto_category	n_genes	n_mito_umis	n_reads	...	leiden	leiden_resolution_1	leiden_resolution_1.5	leiden_resolution_2	AIFI_L3	AIFI_L1	AIFI_L1_Final	AIFI_L2	AIFI_L2_Final	AIFI_L3_Final
barcodes
cf7341b848b611ea8957bafe6d70929e	cf7341b848b611ea8957bafe6d70929e	B001	chalky_guileless_waterdogs	cf7341b848b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	1482	176	17043	...	22	3	2	13	T+Erythocytes doublet	T+Erythocytes doublet	Yes	T+Erythocytes doublet	Yes	Yes
cf7400bc48b611ea8957bafe6d70929e	cf7400bc48b611ea8957bafe6d70929e	B001	illadvised_cogitative_bluejay	cf7400bc48b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	1089	62	12523	...	22	4	12	14	Monocytes+Erythocytes doublet	Monocytes+Erythocytes doublet	Yes	Monocytes+Erythocytes doublet	Yes	Yes
cf763b8448b611ea8957bafe6d70929e	cf763b8448b611ea8957bafe6d70929e	B001	tricksome_sombrous_cats	cf763b8448b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	1146	89	11185	...	21	1	1	1	T+Erythocytes doublet	T+Erythocytes doublet	Yes	T+Erythocytes doublet	Yes	Yes
cf76501a48b611ea8957bafe6d70929e	cf76501a48b611ea8957bafe6d70929e	B001	pensive_queasy_tadpole	cf76501a48b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	1994	108	22387	...	29	9	11	8	CMP cell	Progenitor cell	Yes	Progenitor cell	Yes	Yes
cf83c0ba48b611ea8957bafe6d70929e	cf83c0ba48b611ea8957bafe6d70929e	B001	sodalite_foreign_puffin	cf83c0ba48b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	841	44	7236	...	21	1	1	19	T+Erythocytes doublet	T+Erythocytes doublet	Yes	T+Erythocytes doublet	Yes	Yes

5 rows × 59 columns

In [18]:

# UMAP colored by each of the three annotation levels, one panel per row
sc.pl.umap(adata, color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'], ncols = 1)

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

In [19]:

# UMAP colored by the three Leiden clustering columns, one panel per row
sc.pl.umap(adata, 
           color = ['leiden_resolution_1',
                    'leiden_resolution_1.5',
                    'leiden_resolution_2'],
           ncols = 1)

/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(

Output final annotations¶

In [20]:

# Copy of the cell metadata with a positional 0..n-1 index; the barcode index is
# discarded (barcodes remain available as a column)
obs = adata.obs.reset_index(drop = True)

In [21]:

# Attach the two UMAP coordinates to the metadata table. Both frames carry a
# positional index here, so the column assignment lines up row by row.
umap_mat = adata.obsm['X_umap']
umap_cols = ['umap_1', 'umap_2']
umap_df = pd.DataFrame(umap_mat, columns = umap_cols)
for umap_col in umap_cols:
    obs[umap_col] = umap_df[umap_col]

In [22]:

# Preview the table to be written out, including the new UMAP columns
obs.head()

Out[22]:

	barcodes	batch_id	cell_name	cell_uuid	chip_id	hto_barcode	hto_category	n_genes	n_mito_umis	n_reads	...	leiden_resolution_1.5	leiden_resolution_2	AIFI_L3	AIFI_L1	AIFI_L1_Final	AIFI_L2	AIFI_L2_Final	AIFI_L3_Final	umap_1	umap_2
0	cf7341b848b611ea8957bafe6d70929e	B001	chalky_guileless_waterdogs	cf7341b848b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	1482	176	17043	...	2	13	T+Erythocytes doublet	T+Erythocytes doublet	Yes	T+Erythocytes doublet	Yes	Yes	-2.371720	1.574610
1	cf7400bc48b611ea8957bafe6d70929e	B001	illadvised_cogitative_bluejay	cf7400bc48b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	1089	62	12523	...	12	14	Monocytes+Erythocytes doublet	Monocytes+Erythocytes doublet	Yes	Monocytes+Erythocytes doublet	Yes	Yes	7.942333	-5.333513
2	cf763b8448b611ea8957bafe6d70929e	B001	tricksome_sombrous_cats	cf763b8448b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	1146	89	11185	...	1	1	T+Erythocytes doublet	T+Erythocytes doublet	Yes	T+Erythocytes doublet	Yes	Yes	-0.506895	2.742198
3	cf76501a48b611ea8957bafe6d70929e	B001	pensive_queasy_tadpole	cf76501a48b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	1994	108	22387	...	11	8	CMP cell	Progenitor cell	Yes	Progenitor cell	Yes	Yes	12.154108	9.127453
4	cf83c0ba48b611ea8957bafe6d70929e	B001	sodalite_foreign_puffin	cf83c0ba48b611ea8957bafe6d70929e	B001-P1C1	TGATGGCCTATTGGG	singlet	841	44	7236	...	1	19	T+Erythocytes doublet	T+Erythocytes doublet	Yes	T+Erythocytes doublet	Yes	Yes	-0.071208	2.975142

5 rows × 61 columns

In [23]:

# Directory for the output files; created if it does not exist yet
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)

In [24]:

# Full cell metadata with UMAP coordinates, written as CSV under a name stamped
# with the cell class and today's date
obs_out_csv = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
obs.to_csv(obs_out_csv, index = False)

In [25]:

# Same table as the CSV above, written as Parquet
obs_out_parquet = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.parquet'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
obs.to_parquet(obs_out_parquet, index = False)

In [26]:

# Compact label table: the cell barcode plus the three annotation levels
bc_anno = obs.loc[:, ['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]

In [27]:

# Barcode-to-label table written as CSV under a name stamped with the cell
# class and today's date
label_out_csv = '{p}/ref_pbmc_{c}_barcode_labels_{d}.csv'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
bc_anno.to_csv(label_out_csv, index = False)

In [28]:

# Same barcode-to-label table as the CSV above, written as Parquet
label_out_parquet = '{p}/ref_pbmc_{c}_barcode_labels_{d}.parquet'.format(
    p = out_dir,
    c = cell_class,
    d = date.today()
)
bc_anno.to_parquet(label_out_parquet, index = False)

Upload annotations to HISE¶

Finally, we'll use hisepy.upload.upload_files() to send a copy of our output to HISE to use for downstream analysis steps.

In [29]:

# Study space that receives the upload, and a title stamped with today's date
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'Other barcode annotations {d}'.format(d = date.today())

In [30]:

# UUIDs of the HISE files read by this notebook; passed as input_file_ids on upload
in_files = [h5ad_uuid, anno_uuid]

In [31]:

# Display the input file UUIDs
in_files

Out[31]:

['1eb6ca8c-b8ed-4968-b515-c954497441dc',
 '03817547-e2e3-412b-b36e-538d2bc74c87']

In [32]:

# Paths of the four files written above (metadata and labels, CSV and Parquet)
out_files = [obs_out_csv, obs_out_parquet,
             label_out_csv, label_out_parquet]

In [33]:

# Display the output file paths
out_files

Out[33]:

['output/ref_pbmc_other_labeled_meta_umap_2024-03-01.csv',
 'output/ref_pbmc_other_labeled_meta_umap_2024-03-01.parquet',
 'output/ref_pbmc_other_barcode_labels_2024-03-01.csv',
 'output/ref_pbmc_other_barcode_labels_2024-03-01.parquet']

In [34]:

# Upload the output files to the study space under the dated title, passing the
# UUIDs of the files read above as input_file_ids.
# NOTE(review): the captured output shows this call prompting for the notebook
# and for confirmation, so it looks interactive - confirm before running unattended
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)

Cannot determine the current notebook.
1) /home/jupyter/scRNA-Reference-IH-A/05-Assembly/20-Python_assign_Other_cells.ipynb
2) /home/jupyter/scRNA-Reference-IH-A/05-Assembly/19-Python_assign_NK_cells.ipynb
3) /home/jupyter/scRNA-Reference-IH-A/05-Assembly/18-Python_assign_Myeloid_cells.ipynb
Please select (1-3)

you are trying to upload file_ids... ['output/ref_pbmc_other_labeled_meta_umap_2024-03-01.csv', 'output/ref_pbmc_other_labeled_meta_umap_2024-03-01.parquet', 'output/ref_pbmc_other_barcode_labels_2024-03-01.csv', 'output/ref_pbmc_other_barcode_labels_2024-03-01.parquet']. Do you truly want to proceed?

Out[34]:

{'trace_id': '9661fa36-3916-41ca-b413-5660c8461106',
 'files': ['output/ref_pbmc_other_labeled_meta_umap_2024-03-01.csv',
  'output/ref_pbmc_other_labeled_meta_umap_2024-03-01.parquet',
  'output/ref_pbmc_other_barcode_labels_2024-03-01.csv',
  'output/ref_pbmc_other_barcode_labels_2024-03-01.parquet']}

In [35]:

# Report the package versions used in this session; session_info is imported
# here, at its only point of use
import session_info
session_info.show()

Out[35]:

Click to view session information

-----
anndata             0.10.3
hisepy              0.3.0
pandas              2.1.4
scanpy              1.9.6
session_info        1.0.0
-----

Click to view modules imported as dependencies

PIL                         10.0.1
anyio                       NA
arrow                       1.3.0
asttokens                   NA
attr                        23.2.0
attrs                       23.2.0
babel                       2.14.0
beatrix_jupyterlab          NA
brotli                      NA
cachetools                  5.3.1
certifi                     2023.11.17
cffi                        1.16.0
charset_normalizer          3.3.2
cloudpickle                 2.2.1
colorama                    0.4.6
comm                        0.1.4
cryptography                41.0.7
cycler                      0.10.0
cython_runtime              NA
dateutil                    2.8.2
db_dtypes                   1.1.1
debugpy                     1.8.0
decorator                   5.1.1
defusedxml                  0.7.1
deprecated                  1.2.14
exceptiongroup              1.2.0
executing                   2.0.1
fastjsonschema              NA
fqdn                        NA
google                      NA
greenlet                    2.0.2
grpc                        1.58.0
grpc_status                 NA
h5py                        3.10.0
idna                        3.6
igraph                      0.10.8
importlib_metadata          NA
ipykernel                   6.28.0
ipython_genutils            0.2.0
ipywidgets                  8.1.1
isoduration                 NA
jedi                        0.19.1
jinja2                      3.1.2
joblib                      1.3.2
json5                       NA
jsonpointer                 2.4
jsonschema                  4.20.0
jsonschema_specifications   NA
jupyter_events              0.9.0
jupyter_server              2.12.1
jupyterlab_server           2.25.2
jwt                         2.8.0
kiwisolver                  1.4.5
leidenalg                   0.10.1
llvmlite                    0.41.0
lz4                         4.3.2
markupsafe                  2.1.3
matplotlib                  3.8.0
matplotlib_inline           0.1.6
mpl_toolkits                NA
mpmath                      1.3.0
natsort                     8.4.0
nbformat                    5.9.2
numba                       0.58.0
numpy                       1.24.0
opentelemetry               NA
overrides                   NA
packaging                   23.2
parso                       0.8.3
pexpect                     4.8.0
pickleshare                 0.7.5
pkg_resources               NA
platformdirs                4.1.0
plotly                      5.18.0
prettytable                 3.9.0
prometheus_client           NA
prompt_toolkit              3.0.42
proto                       NA
psutil                      NA
ptyprocess                  0.7.0
pure_eval                   0.2.2
pyarrow                     13.0.0
pydev_ipython               NA
pydevconsole                NA
pydevd                      2.9.5
pydevd_file_utils           NA
pydevd_plugins              NA
pydevd_tracing              NA
pygments                    2.17.2
pynvml                      NA
pyparsing                   3.1.1
pyreadr                     0.5.0
pythonjsonlogger            NA
pytz                        2023.3.post1
referencing                 NA
requests                    2.31.0
rfc3339_validator           0.1.4
rfc3986_validator           0.1.1
rpds                        NA
scipy                       1.11.4
send2trash                  NA
shapely                     1.8.5.post1
six                         1.16.0
sklearn                     1.3.2
sniffio                     1.3.0
socks                       1.7.1
sql                         NA
sqlalchemy                  2.0.21
sqlparse                    0.4.4
stack_data                  0.6.2
sympy                       1.12
termcolor                   NA
texttable                   1.7.0
threadpoolctl               3.2.0
torch                       2.1.2+cu121
torchgen                    NA
tornado                     6.3.3
tqdm                        4.66.1
traitlets                   5.9.0
typing_extensions           NA
uri_template                NA
urllib3                     1.26.18
wcwidth                     0.2.12
webcolors                   1.13
websocket                   1.7.0
wrapt                       1.15.0
xarray                      2023.12.0
yaml                        6.0.1
zipp                        NA
zmq                         25.1.2
zoneinfo                    NA
zstandard                   0.22.0

-----
IPython             8.19.0
jupyter_client      8.6.0
jupyter_core        5.6.1
jupyterlab          4.0.10
notebook            6.5.4
-----
Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
Linux-5.15.0-1052-gcp-x86_64-with-glibc2.31
-----
Session information updated at 2024-03-01 01:56

In [ ]: