To assemble our annotations, we'll read our clustered T cell data and assign our expert annotations to those clusters. We'll then inspect the annotations in our UMAP projections, and output final labels for these cells.
For T cells, we have multiple groups of cells to label. We clustered all T cells, then subclustered selected cell types for additional resolution. So, we'll load these sets, remove overlapping cells so each barcode is assigned only once, assign identities based on the clusters in each set, and finally concatenate all of the cell barcodes.
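The key constraint is that every cell barcode should end up with exactly one label, even though some cells were clustered more than once. As a minimal, hypothetical sketch of that pattern (toy DataFrames and labels, not our real data):

import pandas as pd

# Hypothetical barcode tables for two subclusterings that share some cells
cd8_em = pd.DataFrame({'barcodes': ['a', 'b', 'c'], 'AIFI_L3': ['CD8 EM'] * 3})
t_gd = pd.DataFrame({'barcodes': ['c', 'd'], 'AIFI_L3': ['gdT'] * 2})

# Give the gdT subclustering priority: drop its barcodes from the other subset
cd8_em = cd8_em[~cd8_em['barcodes'].isin(t_gd['barcodes'])]

# Stack the de-duplicated subsets into one barcode-level label table
combined = pd.concat([cd8_em, t_gd], ignore_index = True)

# Every cell barcode should now appear exactly once
assert combined['barcodes'].is_unique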
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
These functions make it easy to pull CSV and .h5ad files stored in HISE into pandas DataFrames.
def read_csv_uuid(csv_uuid):
    # Cache the CSV from HISE if it isn't already present, then read it as a DataFrame
    csv_path = '/home/jupyter/cache/{u}'.format(u = csv_uuid)
    if not os.path.isdir(csv_path):
        hise_res = hisepy.reader.cache_files([csv_uuid])
    csv_filename = os.listdir(csv_path)[0]
    csv_file = '{p}/{f}'.format(p = csv_path, f = csv_filename)
    df = pd.read_csv(csv_file, index_col = 0)
    return df
def read_obs_uuid(h5ad_uuid):
    # Cache the .h5ad from HISE if needed, read it in backed mode, and return only the obs table
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        hise_res = hisepy.reader.cache_files([h5ad_uuid])
    h5ad_filename = os.listdir(h5ad_path)[0]
    h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)
    adata = sc.read_h5ad(h5ad_file, backed = 'r')
    obs = adata.obs.copy()
    return obs
cell_class = 't-cells'
h5ad_uuid = 'd6ebc576-34ea-4394-a569-e35e16f20253'
h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
if not os.path.isdir(h5ad_path):
    hise_res = hisepy.reader.cache_files([h5ad_uuid])
h5ad_filename = os.listdir(h5ad_path)[0]
h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)
adata = sc.read_h5ad(h5ad_file)
adata.shape
(1191327, 1487)
iter_uuids = {
't-cd4-naive': '70651e60-282b-4ed0-96f6-414547297232',
't-cd8-mait': '0f821486-866b-4c08-b0b8-508a5c544547',
't-cd8-cm': '6c1dff43-ddc5-437b-8e3d-dd5a32553b16',
't-cd8-em': 'b671c53a-2698-41c1-a886-9ab939306716',
'treg': '35b11bcf-7a45-4714-b470-9f7627d6fbbd',
't-cd8-naive': '5ae29893-5a77-4081-86d1-523713a237e6',
't-proliferating': '90a71622-5713-47f7-82e8-18e164ca9454',
't-gd': '71d79aee-5600-4f3f-a3d1-e3f830e1c0ff',
't-isg-high': 'd33ef147-59db-4fb6-950c-1dd8af242d4f',
't-other': 'bda4fe2f-1d8a-4ec5-9ce7-6bee1a158d7b'
}
iter_obs = {}
for cell_type, uuid in iter_uuids.items():
    obs = read_obs_uuid(uuid)
    iter_obs[cell_type] = obs
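Before manipulating these subsets, a quick per-subset cell count can confirm everything loaded as expected. This is an optional check added here, not part of the original outputs:

for cell_type, sub_obs in iter_obs.items():
    # Report how many cell barcodes each subclustering result contributes
    print('{c}: {n} cells'.format(c = cell_type, n = sub_obs.shape[0]))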
For gdT cell subclustering, we included some cells that initially clustered with MAIT, CD8 CM, and CD8 EM cells. Here, we'll identify our gdT cells, then drop those barcodes from the other subclustering results so we don't have duplicate cells.
gdt_bc = iter_obs["t-gd"]['barcodes'].tolist()
len(gdt_bc)
54113
drop_set = ['t-cd8-mait', 't-cd8-cm', 't-cd8-em']
for cell_type in drop_set:
    obs = iter_obs[cell_type]
    n_start = obs.shape[0]
    # Keep only cells that are not in the gdT barcode list
    keep_bc = ~obs['barcodes'].isin(gdt_bc)
    obs = obs[keep_bc]
    n_end = obs.shape[0]
    print('{c}; N Start: {s}; N End: {e}'.format(c = cell_type, s = str(n_start), e = str(n_end)))
    iter_obs[cell_type] = obs
t-cd8-mait; N Start: 50823; N End: 48084
t-cd8-cm; N Start: 43289; N End: 37568
t-cd8-em; N Start: 118291; N End: 105726
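With the overlapping cells removed, a short assertion can confirm that none of the pruned subsets still share barcodes with the gdT set. This is an optional sanity check using the objects defined above, not part of the original notebook:

gdt_set = set(gdt_bc)
for cell_type in drop_set:
    # No barcodes should remain in common with the gdT subclustering
    overlap = set(iter_obs[cell_type]['barcodes']) & gdt_set
    assert len(overlap) == 0, '{c} still overlaps t-gd'.format(c = cell_type)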
Now, we'll join cell type labels from our cluster annotations to our cell barcode-level observations.
anno_uuids = {
't-gd': '18df15b0-769e-4af5-a2b9-2df0413c4519',
'treg': '401874cf-6700-4721-803d-acf66d9db321',
't-cd8-mait': '6b77ef51-9b88-427c-bfce-a596a97610a8',
't-cd4-naive': '8dd06068-ec33-4bbd-ab10-00944db3d304',
't-cd8-naive': '921181ac-ceaf-4508-bc87-3165dddb0451',
't-cd8-cm': 'ae96a58d-e0da-4838-a473-86c59d786e02',
't-other': 'b0207abb-6e23-457e-aa63-72b0c04d57dd',
't-proliferating': 'cc09cd70-2b40-44d5-836a-bbae87eb7e8f',
't-isg-high': 'dbded4c8-644a-4b81-a6b2-c13e97f3733b',
't-cd8-em': 'df6cef89-3647-4a67-87de-daa9d9bf8171',
}
iter_anno = {}
for cell_type, uuid in anno_uuids.items():
    iter_anno[cell_type] = read_csv_uuid(uuid)
iter_bc_anno = {}
for cell_type, sub_obs in iter_obs.items():
    sub_anno = iter_anno[cell_type]
    # Join on the cluster column, which is the first column of each annotation table
    join_col = sub_anno.columns[0]
    sub_anno[join_col] = sub_anno[join_col].astype(str).astype('category')
    sub_obs = sub_obs.merge(sub_anno, on = join_col, how = 'left')
    sub_obs = sub_obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
    iter_bc_anno[cell_type] = sub_obs
all_anno = pd.concat(iter_bc_anno)
all_anno.shape
(1191327, 4)
adata.shape
(1191327, 1487)
sum(adata.obs['barcodes'].isin(all_anno['barcodes']))
1191327
obs = adata.obs
obs = obs.reset_index(drop = True)
obs = obs.merge(all_anno, on = 'barcodes', how = 'left')
obs = obs.set_index('barcodes', drop = True)
adata.obs = obs
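Before plotting, it's worth confirming that the left join didn't leave any cells unlabeled, which could happen if a cluster were missing from an annotation table. A small added check (not part of the original notebook); all three counts should be 0:

# Count cells lacking a label at each annotation level
adata.obs[['AIFI_L1', 'AIFI_L2', 'AIFI_L3']].isna().sum()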
sc.pl.umap(adata, color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'], ncols = 1)
sc.pl.umap(adata,
color = ['leiden_resolution_1',
'leiden_resolution_1.5',
'leiden_resolution_2'],
ncols = 1)
obs = adata.obs
obs = obs.reset_index(drop = False)
umap_mat = adata.obsm['X_umap']
umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])
obs['umap_1'] = umap_df['umap_1']
obs['umap_2'] = umap_df['umap_2']
obs.head()
 | barcodes | batch_id | cell_name | cell_uuid | chip_id | hto_barcode | hto_category | n_genes | n_mito_umis | n_reads | ... | pct_counts_mito | leiden | leiden_resolution_1 | leiden_resolution_1.5 | leiden_resolution_2 | AIFI_L1 | AIFI_L2 | AIFI_L3 | umap_1 | umap_2
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | cf71f47048b611ea8957bafe6d70929e | B001 | weathered_pernicious_polliwog | cf71f47048b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1081 | 115 | 9307 | ... | 4.032258 | 13 | 8 | 11 | 12 | T cell | MAIT | CD8 MAIT | 6.384859 | -4.117041 |
1 | cf71f54248b611ea8957bafe6d70929e | B001 | untidy_emulsive_hamadryad | cf71f54248b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1923 | 178 | 22729 | ... | 2.731737 | 2 | 12 | 16 | 18 | T cell | Naive CD4 T cell | Core naive CD4 T cell | -4.634521 | 3.974752 |
2 | cf71fb7848b611ea8957bafe6d70929e | B001 | long_weakminded_roebuck | cf71fb7848b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1118 | 77 | 12990 | ... | 2.071006 | 1 | 5 | 6 | 4 | T cell | Memory CD4 T cell | CM CD4 T cell | -0.504904 | 1.140365 |
3 | cf7216a848b611ea8957bafe6d70929e | B001 | bimetallic_returnable_pony | cf7216a848b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1836 | 249 | 21279 | ... | 4.098091 | 11 | 2 | 8 | 7 | T cell | Memory CD4 T cell | GZMB- CD27+ EM CD4 T cell | 2.885731 | -0.563941 |
4 | cf72178448b611ea8957bafe6d70929e | B001 | newsworthy_copacetic_halicore | cf72178448b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 928 | 60 | 9412 | ... | 2.130682 | 0 | 1 | 0 | 0 | T cell | Naive CD4 T cell | SOX4+ naive CD4 T cell | -4.179520 | 0.825014 |
5 rows × 58 columns
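Because the UMAP coordinates now travel with the labels in this table, downstream summaries can be drawn straight from the metadata without reloading the full .h5ad. A rough matplotlib sketch of that idea (plotting choices here are arbitrary and not part of the original notebook):

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize = (6, 6))
# One scatter layer per L2 label, using the stored UMAP coordinates
for label, group in obs.groupby('AIFI_L2'):
    ax.scatter(group['umap_1'], group['umap_2'], s = 1, label = label)
ax.set_xlabel('UMAP 1')
ax.set_ylabel('UMAP 2')
ax.legend(markerscale = 10, fontsize = 'small')
plt.show()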
out_dir = 'output'
if not os.path.isdir(out_dir):
os.makedirs(out_dir)
obs_out_csv = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.csv'.format(p = out_dir, c = cell_class, d = date.today())
obs.to_csv(obs_out_csv, index = False)
obs_out_parquet = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.parquet'.format(p = out_dir, c = cell_class, d = date.today())
obs.to_parquet(obs_out_parquet, index = False)
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
label_out_csv = '{p}/ref_pbmc_{c}_barcode_labels_{d}.csv'.format(p = out_dir, c = cell_class, d = date.today())
bc_anno.to_csv(label_out_csv, index = False)
label_out_parquet = '{p}/ref_pbmc_{c}_barcode_labels_{d}.parquet'.format(p = out_dir, c = cell_class, d = date.today())
bc_anno.to_parquet(label_out_parquet, index = False)
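As a last check before upload, the parquet output can be read back to confirm it has one row per cell. This is optional and assumes a parquet engine such as pyarrow is installed (it appears in the session info below):

# Re-read the barcode-level labels and confirm the row count matches the cell count
check_df = pd.read_parquet(label_out_parquet)
assert check_df.shape[0] == adata.shape[0]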
Finally, we'll use hisepy.upload.upload_files() to send a copy of our output to HISE for use in downstream analysis steps.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'T cell barcode annotations {d}'.format(d = date.today())
iter_h5ad_uuids = list(iter_uuids.values())
iter_anno_uuids = list(anno_uuids.values())
in_files = [h5ad_uuid] + iter_h5ad_uuids + iter_anno_uuids
in_files
['d6ebc576-34ea-4394-a569-e35e16f20253', '70651e60-282b-4ed0-96f6-414547297232', '0f821486-866b-4c08-b0b8-508a5c544547', '6c1dff43-ddc5-437b-8e3d-dd5a32553b16', 'b671c53a-2698-41c1-a886-9ab939306716', '35b11bcf-7a45-4714-b470-9f7627d6fbbd', '5ae29893-5a77-4081-86d1-523713a237e6', '90a71622-5713-47f7-82e8-18e164ca9454', '71d79aee-5600-4f3f-a3d1-e3f830e1c0ff', 'd33ef147-59db-4fb6-950c-1dd8af242d4f', 'bda4fe2f-1d8a-4ec5-9ce7-6bee1a158d7b', '18df15b0-769e-4af5-a2b9-2df0413c4519', '401874cf-6700-4721-803d-acf66d9db321', '6b77ef51-9b88-427c-bfce-a596a97610a8', '8dd06068-ec33-4bbd-ab10-00944db3d304', '921181ac-ceaf-4508-bc87-3165dddb0451', 'ae96a58d-e0da-4838-a473-86c59d786e02', 'b0207abb-6e23-457e-aa63-72b0c04d57dd', 'cc09cd70-2b40-44d5-836a-bbae87eb7e8f', 'dbded4c8-644a-4b81-a6b2-c13e97f3733b', 'df6cef89-3647-4a67-87de-daa9d9bf8171']
We should have 21 input files: the parent T cell .h5ad, 10 subclustering .h5ad files, and 10 cluster annotation .csv files.
len(in_files)
21
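If we want to be defensive, a quick assertion ties that count to the dictionaries above (an added guard, not in the original notebook):

# One parent h5ad, plus one h5ad and one annotation CSV per T cell subset
assert len(in_files) == 1 + len(iter_uuids) + len(anno_uuids)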
out_files = [obs_out_csv, obs_out_parquet,
label_out_csv, label_out_parquet]
out_files
['output/ref_pbmc_t-cells_labeled_meta_umap_2024-03-05.csv', 'output/ref_pbmc_t-cells_labeled_meta_umap_2024-03-05.parquet', 'output/ref_pbmc_t-cells_barcode_labels_2024-03-05.csv', 'output/ref_pbmc_t-cells_barcode_labels_2024-03-05.parquet']
hisepy.upload.upload_files(
files = out_files,
study_space_id = study_space_uuid,
title = title,
input_file_ids = in_files
)
output/ref_pbmc_t-cells_labeled_meta_umap_2024-03-05.csv
output/ref_pbmc_t-cells_labeled_meta_umap_2024-03-05.parquet
output/ref_pbmc_t-cells_barcode_labels_2024-03-05.csv
output/ref_pbmc_t-cells_barcode_labels_2024-03-05.parquet
you are trying to upload file_ids... ['output/ref_pbmc_t-cells_labeled_meta_umap_2024-03-05.csv', 'output/ref_pbmc_t-cells_labeled_meta_umap_2024-03-05.parquet', 'output/ref_pbmc_t-cells_barcode_labels_2024-03-05.csv', 'output/ref_pbmc_t-cells_barcode_labels_2024-03-05.parquet']. Do you truly want to proceed?
{'trace_id': 'e083ff64-6e3e-4914-9fdc-7612ae7472cc', 'files': ['output/ref_pbmc_t-cells_labeled_meta_umap_2024-03-05.csv', 'output/ref_pbmc_t-cells_labeled_meta_umap_2024-03-05.parquet', 'output/ref_pbmc_t-cells_barcode_labels_2024-03-05.csv', 'output/ref_pbmc_t-cells_barcode_labels_2024-03-05.parquet']}
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 pandas 2.1.4 scanpy 1.9.6 session_info 1.0.0 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA cachetools 5.3.1 certifi 2024.02.02 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lz4 4.3.2 markupsafe 2.1.3 matplotlib 3.8.0 matplotlib_inline 0.1.6 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbformat 5.9.2 numba 0.58.0 numpy 1.24.0 opentelemetry NA overrides NA packaging 23.2 parso 0.8.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA scipy 1.11.4 send2trash NA shapely 1.8.5.post1 six 1.16.0 sklearn 1.3.2 sniffio 1.3.0 socks 1.7.1 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 sympy 1.12 termcolor NA texttable 1.7.0 threadpoolctl 3.2.0 torch 2.1.2+cu121 torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 1.26.18 wcwidth 0.2.12 webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 2023.12.0 yaml 6.0.1 zipp NA zmq 25.1.2 zoneinfo NA zstandard 0.22.0
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.1.2 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1052-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-03-05 19:18