To assemble our annotations, we'll read our Other cell data and assign our expert annotations to its clusters. We'll then inspect the annotations in our UMAP projections and output final labels for these cells.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
This function makes it easy to read CSV files stored in HISE into a pandas DataFrame.
def read_csv_uuid(csv_uuid, cache_dir = '/home/jupyter/cache'):
    """Read a CSV file stored in HISE into a pandas DataFrame.

    The file is looked up in ``<cache_dir>/<csv_uuid>``. If that directory
    does not exist yet, ``hisepy.reader.cache_files`` is called for the
    UUID before reading.

    Parameters
    ----------
    csv_uuid : str
        HISE file UUID of the CSV file.
    cache_dir : str, optional
        Directory that holds one sub-directory per cached file UUID.
        NOTE(review): a non-default value only helps if
        ``hisepy.reader.cache_files`` writes to the same location -- confirm
        before overriding.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, with the first column used as the index.
    """
    csv_path = os.path.join(cache_dir, csv_uuid)
    if not os.path.isdir(csv_path):
        # Not cached yet: ask HISE for the file. The return value is not needed.
        hisepy.reader.cache_files([csv_uuid])

    # os.listdir() returns entries in arbitrary order; sort so the file that
    # gets read is deterministic if the directory ever holds more than one.
    csv_filename = sorted(os.listdir(csv_path))[0]
    csv_file = os.path.join(csv_path, csv_filename)
    return pd.read_csv(csv_file, index_col = 0)
# Label for this group of cells; reused when naming the output files.
cell_class = 'other'

# HISE file UUID of the clustered AnnData (.h5ad) object for these cells.
h5ad_uuid = '1eb6ca8c-b8ed-4968-b515-c954497441dc'

# Each cached file lives in its own directory named after its UUID.
h5ad_path = f'/home/jupyter/cache/{h5ad_uuid}'
if not os.path.isdir(h5ad_path):
    # Not cached yet: fetch it from HISE.
    hise_res = hisepy.reader.cache_files([h5ad_uuid])

# Read the first entry found in the cache directory.
h5ad_filename = os.listdir(h5ad_path)[0]
h5ad_file = f'{h5ad_path}/{h5ad_filename}'
adata = sc.read_h5ad(h5ad_file)

# Display a summary of the loaded object.
adata
AnnData object with n_obs × n_vars = 24603 × 4305 obs: 'barcodes', 'batch_id', 'cell_name', 'cell_uuid', 'chip_id', 'hto_barcode', 'hto_category', 'n_genes', 'n_mito_umis', 'n_reads', 'n_umis', 'original_barcodes', 'pbmc_sample_id', 'pool_id', 'well_id', 'sample.sampleKitGuid', 'cohort.cohortGuid', 'subject.subjectGuid', 'subject.biologicalSex', 'subject.race', 'subject.ethnicity', 'subject.birthYear', 'sample.visitName', 'sample.drawDate', 'file.id', 'subject.cmv', 'subject.bmi', 'celltypist.low', 'seurat.l1', 'seurat.l1.score', 'seurat.l2', 'seurat.l2.score', 'seurat.l2.5', 'seurat.l2.5.score', 'seurat.l3', 'seurat.l3.score', 'predicted_doublet', 'doublet_score', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', 'leiden', 'leiden_resolution_1', 'leiden_resolution_1.5', 'leiden_resolution_2' var: 'mito', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std' uns: 'celltypist.low_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'neighbors', 'pca', 'seurat.l2.5_colors', 'umap' obsm: 'X_pca', 'X_pca_harmony', 'X_umap' varm: 'PCs' obsp: 'connectivities', 'distances'
# HISE file UUID of the CSV that holds the per-cluster annotations.
anno_uuid = '03817547-e2e3-412b-b36e-538d2bc74c87'
# Load it as a DataFrame; read_csv_uuid() fetches the file from HISE first
# if it is not in the local cache yet.
anno = read_csv_uuid(anno_uuid)
downloading fileID: 03817547-e2e3-412b-b36e-538d2bc74c87 Files have been successfully downloaded!
# Preview the first rows of the annotation table.
anno.head()
leiden_resolution_1 | AIFI_L3 | AIFI_L1 | AIFI_L1_Final | AIFI_L2 | AIFI_L2_Final | AIFI_L3_Final | |
---|---|---|---|---|---|---|---|
0 | 0 | Platelet | Platelet | Yes | Platelet | Yes | Yes |
1 | 1 | T+Erythocytes doublet | T+Erythocytes doublet | Yes | T+Erythocytes doublet | Yes | Yes |
2 | 2 | Platelet | Platelet | Yes | Platelet | Yes | Yes |
3 | 3 | T+Erythocytes doublet | T+Erythocytes doublet | Yes | T+Erythocytes doublet | Yes | Yes |
4 | 4 | Monocytes+Erythocytes doublet | Monocytes+Erythocytes doublet | Yes | Monocytes+Erythocytes doublet | Yes | Yes |
# Column shared by the cell metadata and the annotation table.
join_col = 'leiden_resolution_1'

# Re-type the annotation key: to string first, then to categorical.
# NOTE(review): presumably so it matches the dtype of the cluster column in
# adata.obs -- confirm.
anno[join_col] = (
    anno[join_col]
    .astype('string')
    .astype('category')
)

obs = adata.obs

# Number of cells whose cluster label is present in the annotation table;
# this should equal the number of cells if every cluster has an annotation.
int(obs[join_col].isin(anno[join_col]).sum())
24603
# Attach the cluster-level annotations to every cell.
# validate = 'many_to_one' makes pandas raise a MergeError if a cluster
# appears more than once in `anno`; without it, such a duplicate would
# silently duplicate cells in the merged table.
obs_anno = obs.merge(anno, how = 'left', on = join_col, validate = 'many_to_one')

# A column-on-column merge ignores the original index, so restore the
# barcodes as the index (keeping the column as well) and hand the table back
# to AnnData in a single assignment.
adata.obs = obs_anno.set_index('barcodes', drop = False)

# Preview the annotated cell metadata.
adata.obs.head()
barcodes | batch_id | cell_name | cell_uuid | chip_id | hto_barcode | hto_category | n_genes | n_mito_umis | n_reads | ... | leiden | leiden_resolution_1 | leiden_resolution_1.5 | leiden_resolution_2 | AIFI_L3 | AIFI_L1 | AIFI_L1_Final | AIFI_L2 | AIFI_L2_Final | AIFI_L3_Final | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
barcodes | |||||||||||||||||||||
cf7341b848b611ea8957bafe6d70929e | cf7341b848b611ea8957bafe6d70929e | B001 | chalky_guileless_waterdogs | cf7341b848b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1482 | 176 | 17043 | ... | 22 | 3 | 2 | 13 | T+Erythocytes doublet | T+Erythocytes doublet | Yes | T+Erythocytes doublet | Yes | Yes |
cf7400bc48b611ea8957bafe6d70929e | cf7400bc48b611ea8957bafe6d70929e | B001 | illadvised_cogitative_bluejay | cf7400bc48b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1089 | 62 | 12523 | ... | 22 | 4 | 12 | 14 | Monocytes+Erythocytes doublet | Monocytes+Erythocytes doublet | Yes | Monocytes+Erythocytes doublet | Yes | Yes |
cf763b8448b611ea8957bafe6d70929e | cf763b8448b611ea8957bafe6d70929e | B001 | tricksome_sombrous_cats | cf763b8448b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1146 | 89 | 11185 | ... | 21 | 1 | 1 | 1 | T+Erythocytes doublet | T+Erythocytes doublet | Yes | T+Erythocytes doublet | Yes | Yes |
cf76501a48b611ea8957bafe6d70929e | cf76501a48b611ea8957bafe6d70929e | B001 | pensive_queasy_tadpole | cf76501a48b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1994 | 108 | 22387 | ... | 29 | 9 | 11 | 8 | CMP cell | Progenitor cell | Yes | Progenitor cell | Yes | Yes |
cf83c0ba48b611ea8957bafe6d70929e | cf83c0ba48b611ea8957bafe6d70929e | B001 | sodalite_foreign_puffin | cf83c0ba48b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 841 | 44 | 7236 | ... | 21 | 1 | 1 | 19 | T+Erythocytes doublet | T+Erythocytes doublet | Yes | T+Erythocytes doublet | Yes | Yes |
5 rows × 59 columns
# Plot the UMAP once per annotation level (AIFI_L1, AIFI_L2, AIFI_L3),
# arranged in a single column, to inspect the assigned labels.
sc.pl.umap(adata, color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'], ncols = 1)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# Plot the UMAP once per Leiden clustering column (resolutions 1, 1.5 and 2),
# arranged in a single column, for comparison with the labels.
sc.pl.umap(adata,
color = ['leiden_resolution_1',
'leiden_resolution_1.5',
'leiden_resolution_2'],
ncols = 1)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# Put the UMAP embedding stored on the AnnData object into a two-column table.
umap_mat = adata.obsm['X_umap']
umap_df = pd.DataFrame(umap_mat, columns = ['umap_1', 'umap_2'])

# Take the cell metadata with a plain 0..n-1 index so it lines up row-for-row
# with umap_df, then append both coordinate columns in one step.
obs = adata.obs.reset_index(drop = True).assign(
    umap_1 = umap_df['umap_1'],
    umap_2 = umap_df['umap_2'],
)

# Preview the metadata with the coordinates attached.
obs.head()
barcodes | batch_id | cell_name | cell_uuid | chip_id | hto_barcode | hto_category | n_genes | n_mito_umis | n_reads | ... | leiden_resolution_1.5 | leiden_resolution_2 | AIFI_L3 | AIFI_L1 | AIFI_L1_Final | AIFI_L2 | AIFI_L2_Final | AIFI_L3_Final | umap_1 | umap_2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | cf7341b848b611ea8957bafe6d70929e | B001 | chalky_guileless_waterdogs | cf7341b848b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1482 | 176 | 17043 | ... | 2 | 13 | T+Erythocytes doublet | T+Erythocytes doublet | Yes | T+Erythocytes doublet | Yes | Yes | -2.371720 | 1.574610 |
1 | cf7400bc48b611ea8957bafe6d70929e | B001 | illadvised_cogitative_bluejay | cf7400bc48b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1089 | 62 | 12523 | ... | 12 | 14 | Monocytes+Erythocytes doublet | Monocytes+Erythocytes doublet | Yes | Monocytes+Erythocytes doublet | Yes | Yes | 7.942333 | -5.333513 |
2 | cf763b8448b611ea8957bafe6d70929e | B001 | tricksome_sombrous_cats | cf763b8448b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1146 | 89 | 11185 | ... | 1 | 1 | T+Erythocytes doublet | T+Erythocytes doublet | Yes | T+Erythocytes doublet | Yes | Yes | -0.506895 | 2.742198 |
3 | cf76501a48b611ea8957bafe6d70929e | B001 | pensive_queasy_tadpole | cf76501a48b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1994 | 108 | 22387 | ... | 11 | 8 | CMP cell | Progenitor cell | Yes | Progenitor cell | Yes | Yes | 12.154108 | 9.127453 |
4 | cf83c0ba48b611ea8957bafe6d70929e | B001 | sodalite_foreign_puffin | cf83c0ba48b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 841 | 44 | 7236 | ... | 1 | 19 | T+Erythocytes doublet | T+Erythocytes doublet | Yes | T+Erythocytes doublet | Yes | Yes | -0.071208 | 2.975142 |
5 rows × 61 columns
# Create the output directory if needed; exist_ok avoids a separate
# existence check.
out_dir = 'output'
os.makedirs(out_dir, exist_ok = True)

# Compute the date stamp once so all four file names agree, even if the
# cell happens to run across midnight.
out_date = date.today()

# Full cell metadata with labels and UMAP coordinates, as CSV and Parquet.
obs_out_csv = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.csv'.format(p = out_dir, c = cell_class, d = out_date)
obs.to_csv(obs_out_csv, index = False)
obs_out_parquet = '{p}/ref_pbmc_{c}_labeled_meta_umap_{d}.parquet'.format(p = out_dir, c = cell_class, d = out_date)
obs.to_parquet(obs_out_parquet, index = False)

# Compact barcode-to-label table, as CSV and Parquet.
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
label_out_csv = '{p}/ref_pbmc_{c}_barcode_labels_{d}.csv'.format(p = out_dir, c = cell_class, d = out_date)
bc_anno.to_csv(label_out_csv, index = False)
label_out_parquet = '{p}/ref_pbmc_{c}_barcode_labels_{d}.parquet'.format(p = out_dir, c = cell_class, d = out_date)
bc_anno.to_parquet(label_out_parquet, index = False)
Finally, we'll use hisepy.upload.upload_files() to send a copy of our output to HISE for use in downstream analysis steps.
# UUID of the HISE study space, passed as study_space_id to the upload call.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
# Upload title, stamped with today's date.
title = 'Other barcode annotations {d}'.format(d = date.today())
# UUIDs of the HISE files this notebook read (the h5ad object and the
# annotation CSV), passed as input_file_ids to the upload call.
in_files = [h5ad_uuid, anno_uuid]
in_files
['1eb6ca8c-b8ed-4968-b515-c954497441dc', '03817547-e2e3-412b-b36e-538d2bc74c87']
# Paths of the four files written above: full metadata and barcode labels,
# each as CSV and Parquet.
out_files = [obs_out_csv, obs_out_parquet,
label_out_csv, label_out_parquet]
out_files
['output/ref_pbmc_other_labeled_meta_umap_2024-03-01.csv', 'output/ref_pbmc_other_labeled_meta_umap_2024-03-01.parquet', 'output/ref_pbmc_other_barcode_labels_2024-03-01.csv', 'output/ref_pbmc_other_barcode_labels_2024-03-01.parquet']
# Upload the output files to the study space, listing the input file UUIDs
# alongside them. The captured output shows the call prompting interactively
# (notebook selection and a confirmation) before it proceeds.
hisepy.upload.upload_files(
files = out_files,
study_space_id = study_space_uuid,
title = title,
input_file_ids = in_files
)
Cannot determine the current notebook. 1) /home/jupyter/scRNA-Reference-IH-A/05-Assembly/20-Python_assign_Other_cells.ipynb 2) /home/jupyter/scRNA-Reference-IH-A/05-Assembly/19-Python_assign_NK_cells.ipynb 3) /home/jupyter/scRNA-Reference-IH-A/05-Assembly/18-Python_assign_Myeloid_cells.ipynb Please select (1-3)
you are trying to upload file_ids... ['output/ref_pbmc_other_labeled_meta_umap_2024-03-01.csv', 'output/ref_pbmc_other_labeled_meta_umap_2024-03-01.parquet', 'output/ref_pbmc_other_barcode_labels_2024-03-01.csv', 'output/ref_pbmc_other_barcode_labels_2024-03-01.parquet']. Do you truly want to proceed?
{'trace_id': '9661fa36-3916-41ca-b413-5660c8461106', 'files': ['output/ref_pbmc_other_labeled_meta_umap_2024-03-01.csv', 'output/ref_pbmc_other_labeled_meta_umap_2024-03-01.parquet', 'output/ref_pbmc_other_barcode_labels_2024-03-01.csv', 'output/ref_pbmc_other_barcode_labels_2024-03-01.parquet']}
# Record the package versions and platform used for this run.
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 pandas 2.1.4 scanpy 1.9.6 session_info 1.0.0 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA cachetools 5.3.1 certifi 2023.11.17 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lz4 4.3.2 markupsafe 2.1.3 matplotlib 3.8.0 matplotlib_inline 0.1.6 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbformat 5.9.2 numba 0.58.0 numpy 1.24.0 opentelemetry NA overrides NA packaging 23.2 parso 0.8.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA scipy 1.11.4 send2trash NA shapely 1.8.5.post1 six 1.16.0 sklearn 1.3.2 sniffio 1.3.0 socks 1.7.1 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 sympy 1.12 termcolor NA texttable 1.7.0 threadpoolctl 3.2.0 torch 2.1.2+cu121 torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 1.26.18 wcwidth 0.2.12 webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 2023.12.0 yaml 6.0.1 zipp NA 
zmq 25.1.2 zoneinfo NA zstandard 0.22.0
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.0.10 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1052-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-03-01 01:56