Here, we'll gather all of the cell barcode labels generated for the major classes of PBMCs and combine them into a single, complete set of annotations. We can then join these to the original, full PBMC dataset to generate a final object with all cell type annotations.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
def read_adata_uuid(h5ad_uuid, cache_dir = '/home/jupyter/cache'):
    """Open a HISE-hosted .h5ad file as a backed, read-only AnnData object.

    The file is looked up under ``<cache_dir>/<h5ad_uuid>``. If that
    directory does not exist yet, the file is first requested through
    ``hisepy.reader.cache_files``.

    Parameters
    ----------
    h5ad_uuid : str
        HISE file UUID of the .h5ad file.
    cache_dir : str
        Root of the local HISE file cache. This function expects
        ``hisepy.reader.cache_files`` to place downloads under this
        directory, so only override the default if your environment
        caches somewhere else.

    Returns
    -------
    The object returned by ``sc.read_h5ad(..., backed = 'r')``.

    Raises
    ------
    FileNotFoundError
        If the cache directory for ``h5ad_uuid`` is missing or empty
        after the download step.
    """
    h5ad_path = os.path.join(cache_dir, h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        # Not cached locally yet: ask HISE to download it.
        hisepy.reader.cache_files([h5ad_uuid])

    # os.listdir() returns entries in arbitrary order; sort so that the
    # selected file is reproducible if the directory holds more than one.
    cached_files = sorted(os.listdir(h5ad_path))
    if not cached_files:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = h5ad_path)
        )

    h5ad_file = os.path.join(h5ad_path, cached_files[0])
    return sc.read_h5ad(h5ad_file, backed = 'r')
def read_parquet_uuid(pq_uuid, cache_dir = '/home/jupyter/cache'):
    """Read a HISE-hosted parquet file into a pandas DataFrame.

    The file is looked up under ``<cache_dir>/<pq_uuid>``. If that
    directory does not exist yet, the file is first requested through
    ``hisepy.reader.cache_files``.

    Parameters
    ----------
    pq_uuid : str
        HISE file UUID of the parquet file.
    cache_dir : str
        Root of the local HISE file cache. This function expects
        ``hisepy.reader.cache_files`` to place downloads under this
        directory, so only override the default if your environment
        caches somewhere else.

    Returns
    -------
    pandas.DataFrame
        Contents of the cached parquet file.

    Raises
    ------
    FileNotFoundError
        If the cache directory for ``pq_uuid`` is missing or empty after
        the download step.
    """
    pq_path = os.path.join(cache_dir, pq_uuid)
    if not os.path.isdir(pq_path):
        # Not cached locally yet: ask HISE to download it.
        hisepy.reader.cache_files([pq_uuid])

    # os.listdir() returns entries in arbitrary order; sort so that the
    # selected file is reproducible if the directory holds more than one.
    cached_files = sorted(os.listdir(pq_path))
    if not cached_files:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = pq_path)
        )

    pq_file = os.path.join(pq_path, cached_files[0])
    return pd.read_parquet(pq_file)
# HISE file UUIDs of the per-class barcode label tables, one per major
# PBMC class.
anno_uuids = [
    '6897e950-2719-4575-87d0-1d2307d5db63', # B cells
    'eff741c2-b0fe-4dff-9914-ff5dc89ee95e', # Myeloid cells
    'b57066f8-c0d5-4bfb-aa9b-79c743aa5886', # NK cells
    '09f10d41-3170-4317-ba84-650268fcfcd2', # Other cells
    '75cb6602-0179-4128-9171-83c269a2b340' # T cells
]

# Load each label table, in the same order as anno_uuids.
anno_list = [read_parquet_uuid(anno_uuid) for anno_uuid in anno_uuids]
downloading fileID: 6897e950-2719-4575-87d0-1d2307d5db63 Files have been successfully downloaded!
# Stack the per-class label tables row-wise into a single table.
anno = pd.concat(anno_list)
# Display (n_rows, n_columns) as a quick sanity check on the combined table.
anno.shape
(1952128, 4)
# HISE file UUID of the .h5ad to label (presumably the full PBMC reference
# dataset described in the introduction).
h5ad_uuid = '9db48bed-cd91-49ae-abd2-447ae478ca96'
# Opened in backed, read-only mode by read_adata_uuid().
adata = read_adata_uuid(h5ad_uuid)
# Display the dataset dimensions for comparison with anno.shape.
adata.shape
(1952128, 1236)
# Attach the cell type labels to the per-cell metadata by barcode.
obs = adata.obs
# Drop the existing index; 'barcodes' is used as the join key below and is
# restored as the index afterwards.
obs = obs.reset_index(drop = True)
# validate = 'many_to_one' makes pandas raise a MergeError if any barcode
# occurs more than once in anno. Without it, a duplicated label row would
# silently add rows to obs and leave it out of step with the cells in adata.
obs = obs.merge(anno, on = 'barcodes', how = 'left', validate = 'many_to_one')
# Keep 'barcodes' as a regular column as well as the index (drop = False).
obs = obs.set_index('barcodes', drop = False)
adata.obs = obs
# Plot the UMAP embedding once per label column (AIFI_L1, AIFI_L2,
# AIFI_L3), arranged in a single column of panels (ncols = 1).
sc.pl.umap(
adata,
color = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3'],
ncols = 1
)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# Build a flat metadata table that also carries the UMAP coordinates.
# reset_index() returns a new frame, so adata.obs itself is not modified.
umap_coords = pd.DataFrame(adata.obsm['X_umap'], columns = ['umap_1', 'umap_2'])
obs = adata.obs.reset_index(drop = True).assign(
    umap_1 = umap_coords['umap_1'],
    umap_2 = umap_coords['umap_2'],
)
# Preview the first rows of the combined table.
obs.head()
barcodes | batch_id | cell_name | cell_uuid | chip_id | hto_barcode | hto_category | n_genes | n_mito_umis | n_reads | ... | pct_counts_in_top_500_genes | total_counts_mito | log1p_total_counts_mito | pct_counts_mito | leiden | AIFI_L1 | AIFI_L2 | AIFI_L3 | umap_1 | umap_2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | cf71f47048b611ea8957bafe6d70929e | B001 | weathered_pernicious_polliwog | cf71f47048b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1081 | 115 | 9307 | ... | 79.628331 | 115 | 4.753590 | 4.032258 | 13 | T cell | MAIT | CD8 MAIT | 0.069207 | 6.428511 |
1 | cf71f54248b611ea8957bafe6d70929e | B001 | untidy_emulsive_hamadryad | cf71f54248b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1923 | 178 | 22729 | ... | 76.718846 | 178 | 5.187386 | 2.731737 | 2 | T cell | Naive CD4 T cell | Core naive CD4 T cell | -6.025837 | 10.903475 |
2 | cf71fa1048b611ea8957bafe6d70929e | B001 | impatient_familial_cuckoo | cf71fa1048b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1246 | 204 | 11107 | ... | 78.295025 | 204 | 5.323010 | 5.935409 | 3 | Monocyte | CD14 monocyte | Core CD14 monocyte | 13.830215 | 9.230867 |
3 | cf71fb7848b611ea8957bafe6d70929e | B001 | long_weakminded_roebuck | cf71fb7848b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1118 | 77 | 12990 | ... | 83.378160 | 77 | 4.356709 | 2.071006 | 1 | T cell | Memory CD4 T cell | CM CD4 T cell | -2.623835 | 10.297518 |
4 | cf71ffba48b611ea8957bafe6d70929e | B001 | dastardly_wintery_airedale | cf71ffba48b611ea8957bafe6d70929e | B001-P1C1 | TGATGGCCTATTGGG | singlet | 1965 | 363 | 15979 | ... | 69.089532 | 363 | 5.897154 | 6.871096 | 9 | Monocyte | CD14 monocyte | Core CD14 monocyte | 12.289608 | 13.853219 |
5 rows × 55 columns
# Write all outputs under a local 'output' directory.
out_dir = 'output'
# exist_ok avoids a separate existence check before creating the directory.
os.makedirs(out_dir, exist_ok = True)

# Resolve the date stamp once so that every output file carries the same
# suffix, even if this cell happens to run across midnight.
out_date = date.today()

# Full cell metadata, including labels and UMAP coordinates.
obs_out_csv = '{p}/ref_pbmc_labeled_meta_umap_{d}.csv'.format(p = out_dir, d = out_date)
obs.to_csv(obs_out_csv, index = False)
obs_out_parquet = '{p}/ref_pbmc_labeled_meta_umap_{d}.parquet'.format(p = out_dir, d = out_date)
obs.to_parquet(obs_out_parquet, index = False)

# Barcode-to-label table only.
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]
label_out_csv = '{p}/ref_pbmc_barcode_labels_{d}.csv'.format(p = out_dir, d = out_date)
bc_anno.to_csv(label_out_csv, index = False)
label_out_parquet = '{p}/ref_pbmc_barcode_labels_{d}.parquet'.format(p = out_dir, d = out_date)
bc_anno.to_parquet(label_out_parquet, index = False)

# Labeled AnnData object.
out_h5ad = '{p}/ref_pbmc_labeled_{d}.h5ad'.format(p = out_dir, d = out_date)
adata.write_h5ad(out_h5ad)
Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE for use in downstream analysis steps.
# Destination study space and title for the upload.
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = f'Labeled 10x 3-prime PBMC Reference {date.today()}'

# Input files for this analysis: the source .h5ad followed by the
# per-class label tables.
in_files = [h5ad_uuid, *anno_uuids]
in_files
['9db48bed-cd91-49ae-abd2-447ae478ca96', '6897e950-2719-4575-87d0-1d2307d5db63', 'eff741c2-b0fe-4dff-9914-ff5dc89ee95e', 'b57066f8-c0d5-4bfb-aa9b-79c743aa5886', '09f10d41-3170-4317-ba84-650268fcfcd2', '75cb6602-0179-4128-9171-83c269a2b340']
# Every file written above, in the order it will be uploaded.
out_files = [
    out_h5ad,
    obs_out_csv,
    obs_out_parquet,
    label_out_csv,
    label_out_parquet,
]
out_files
['output/ref_pbmc_labeled_2024-03-09.h5ad', 'output/ref_pbmc_labeled_meta_umap_2024-03-09.csv', 'output/ref_pbmc_labeled_meta_umap_2024-03-09.parquet', 'output/ref_pbmc_barcode_labels_2024-03-09.csv', 'output/ref_pbmc_barcode_labels_2024-03-09.parquet']
# Upload the output files to the HISE study space, passing the input file
# UUIDs along with them. As the captured output below shows, hisepy lists
# the files and asks for confirmation before proceeding.
hisepy.upload.upload_files(
files = out_files,
study_space_id = study_space_uuid,
title = title,
input_file_ids = in_files
)
output/ref_pbmc_labeled_2024-03-09.h5ad output/ref_pbmc_labeled_meta_umap_2024-03-09.csv output/ref_pbmc_labeled_meta_umap_2024-03-09.parquet output/ref_pbmc_barcode_labels_2024-03-09.csv output/ref_pbmc_barcode_labels_2024-03-09.parquet you are trying to upload file_ids... ['output/ref_pbmc_labeled_2024-03-09.h5ad', 'output/ref_pbmc_labeled_meta_umap_2024-03-09.csv', 'output/ref_pbmc_labeled_meta_umap_2024-03-09.parquet', 'output/ref_pbmc_barcode_labels_2024-03-09.csv', 'output/ref_pbmc_barcode_labels_2024-03-09.parquet']. Do you truly want to proceed?
{'trace_id': '3db6a8db-b7e2-4d17-b5a9-608bab76bc2c', 'files': ['output/ref_pbmc_labeled_2024-03-09.h5ad', 'output/ref_pbmc_labeled_meta_umap_2024-03-09.csv', 'output/ref_pbmc_labeled_meta_umap_2024-03-09.parquet', 'output/ref_pbmc_barcode_labels_2024-03-09.csv', 'output/ref_pbmc_barcode_labels_2024-03-09.parquet']}
# Report the package and interpreter versions used in this session.
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 pandas 2.1.4 scanpy 1.9.6 session_info 1.0.0 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA cachetools 5.3.1 certifi 2024.02.02 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lz4 4.3.2 markupsafe 2.1.3 matplotlib 3.8.0 matplotlib_inline 0.1.6 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbformat 5.9.2 numba 0.58.0 numpy 1.24.0 opentelemetry NA overrides NA packaging 23.2 parso 0.8.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA scipy 1.11.4 send2trash NA shapely 1.8.5.post1 six 1.16.0 sklearn 1.3.2 sniffio 1.3.0 socks 1.7.1 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 sympy 1.12 termcolor NA texttable 1.7.0 threadpoolctl 3.2.0 torch 2.1.2+cu121 torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 1.26.18 wcwidth 0.2.12 webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 2023.12.0 yaml 6.0.1 zipp NA 
zmq 25.1.2 zoneinfo NA zstandard 0.22.0
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.1.2 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1052-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-03-09 05:21