In this notebook, we'll read the assembled PBMC reference dataset and remove cell types whose AIFI_L3 labels flag them as doublets, contamination, HBB+, or high mitochondrial content. We'll then plot the cleaned reference on its existing UMAP embedding and save it for use in visualization and for construction of reference models.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
from datetime import date
import hisepy
import os
import pandas as pd
import scanpy as sc
import scanpy.external as sce
def read_adata_uuid(h5ad_uuid):
    """Load the AnnData object stored under a HISE file UUID.

    The file is looked up in a cache directory named after the UUID under
    /home/jupyter/cache. When that directory does not exist yet, the file is
    first requested with hisepy.reader.cache_files(), which is presumably
    what populates the directory (confirm against the hisepy documentation).

    Parameters
    ----------
    h5ad_uuid : str
        HISE file UUID of the .h5ad file to read.

    Returns
    -------
    anndata.AnnData
        The result of scanpy.read_h5ad() on the first file (in sorted name
        order) found in the cache directory.

    Raises
    ------
    FileNotFoundError
        If the cache directory is missing or empty after the download
        attempt.
    """
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        # Only the caching side effect is needed; the return value is unused.
        hisepy.reader.cache_files([h5ad_uuid])

    # os.listdir() returns entries in arbitrary order, so sort to make the
    # choice deterministic if the directory ever holds more than one file.
    cached_files = sorted(os.listdir(h5ad_path))
    if not cached_files:
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = h5ad_path)
        )

    h5ad_file = os.path.join(h5ad_path, cached_files[0])
    return sc.read_h5ad(h5ad_file)
# Directory that receives every file written by this notebook.
out_dir = 'output'
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that another process could invalidate in between.
os.makedirs(out_dir, exist_ok = True)
def element_id(n = 3):
    """Build a random, human-readable identifier from chemical element names.

    Parameters
    ----------
    n : int, default 3
        Number of element names to draw; draws are independent, so a name
        can repeat.

    Returns
    -------
    str
        The drawn element names joined with '-'. Empty when n <= 0.
    """
    import periodictable
    from random import randrange

    # Atomic numbers run from 1 to 118. Index 0 of periodictable.elements is
    # the neutron entry rather than a chemical element, so the draw starts
    # at 1; randrange() excludes its upper bound, hence 119.
    names = [
        periodictable.elements[randrange(1, 119)].name
        for _ in range(n)
    ]
    return '-'.join(names)
# HISE file UUID of the assembled PBMC reference .h5ad file
h5ad_uuid = '157bd496-0f1e-4239-83bc-a9616696b63a'
# Read it through the local cache helper defined above
adata = read_adata_uuid(h5ad_uuid)
downloading fileID: 157bd496-0f1e-4239-83bc-a9616696b63a Files have been successfully downloaded!
# Dimensions of the full reference before any filtering
adata.shape
(1952128, 1236)
# Substrings that mark an AIFI_L3 label for removal. Matching is
# case-sensitive, so both capitalizations are listed where needed.
exclude_terms = [
    'Contamination', 'contamination',
    'Doublet', 'doublet', 'HBB+',
    'Mito', 'mito'
]

all_types = adata.obs['AIFI_L3'].unique().tolist()

# any() stops at the first matching term, so a label containing several
# terms is listed exactly once. Appending once per matching term would
# duplicate such a label, and its cells would be counted more than once
# whenever this list is used to index per-label counts.
exclude_types = [
    cell_type
    for cell_type in all_types
    if any(term in cell_type for term in exclude_terms)
]
exclude_types
['Contamination', 'Naive CD8 T cell Platelet Doublets', 'T_B doublet', 'C5_C12_Contamination', 'Doublet', 'T+Erythocytes doublet', 'NK+T Doublets', 'Monocytes+Erythocytes doublet', 'Proliferating Cells+Monocytes Doublets', 'NK_Mono doublet', 'B_mono doublet', 'B_platelet doublet', 'CD4 naive Platelet Doublets', 'Erythrocyte contamination', 'HBB+ MAIT', 'CD4 naive CD16 Monocytes Doublets', 'CD56dim Mito+ NK', 'NK+Platelet Doublet', 'B+Erythocytes doublet', 'NK+Erythocytes Doublet', 'NK+Erythocytes doublet']
# Labels that survive filtering: everything not flagged for exclusion,
# in the same order as all_types.
keep_types = [
    cell_type
    for cell_type in all_types
    if cell_type not in exclude_types
]

# Number of cells per AIFI_L3 label, and the total across all labels
all_counts = adata.obs['AIFI_L3'].value_counts()
n_all = sum(all_counts)
n_all
1952128
# Per-label cell counts for the labels being removed, selected by label
# and returned in exclude_types order
exclude_counts = all_counts.loc[exclude_types]
exclude_counts
AIFI_L3 Contamination 10233 Naive CD8 T cell Platelet Doublets 14199 T_B doublet 2803 C5_C12_Contamination 1937 Doublet 46150 T+Erythocytes doublet 9148 NK+T Doublets 5494 Monocytes+Erythocytes doublet 2491 Proliferating Cells+Monocytes Doublets 122 NK_Mono doublet 2114 B_mono doublet 2535 B_platelet doublet 3948 CD4 naive Platelet Doublets 5769 Erythrocyte contamination 5962 HBB+ MAIT 1941 CD4 naive CD16 Monocytes Doublets 1367 CD56dim Mito+ NK 2768 NK+Platelet Doublet 4214 B+Erythocytes doublet 953 NK+Erythocytes Doublet 5181 NK+Erythocytes doublet 1074 Name: count, dtype: int64
# Total number of cells carried by the excluded labels
n_exclude = sum(exclude_counts)
n_exclude
130403
Percent removed:
# Excluded cells as a percentage of all cells
n_exclude / n_all * 100
6.68004352173628
# Flag each cell by whether its AIFI_L3 label is retained. The flag is
# stored as a categorical (presumably so the plot treats it as discrete
# groups rather than a continuous value).
is_kept = adata.obs['AIFI_L3'].isin(keep_types)
adata.obs['keep'] = is_kept.astype('category')

# Show the labels and the keep flag on the UMAP, one panel per row
sc.pl.umap(adata, color = ['AIFI_L3', 'keep'], ncols = 1)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# Keep only the cells whose AIFI_L3 label is retained. Indexing an AnnData
# object returns a view of the parent; .copy() materializes an independent
# object up front, so later assignments to it (such as replacing .obs) do
# not depend on anndata's implicit view-to-copy conversion.
adata_subset = adata[adata.obs['AIFI_L3'].isin(keep_types)].copy()
adata_subset.shape
(1821725, 1236)
# Annotation columns whose category levels need tidying after the subset
label_columns = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']

# Work on a copy of the cell metadata and drop category levels that no
# longer occur in each annotation column, then store it back.
obs = adata_subset.obs.copy()
for column in label_columns:
    obs[column] = obs[column].cat.remove_unused_categories()
adata_subset.obs = obs

# Show the three annotation levels on the UMAP, one panel per row
sc.pl.umap(adata_subset, color = label_columns, ncols = 1)
/opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter( /opt/conda/lib/python3.10/site-packages/scanpy/plotting/_tools/scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored cax = scatter(
# Resolve the date stamp once so that all five output files share it, even
# if the run crosses midnight while the files are being written.
date_stamp = date.today()

# Cell metadata with the original index discarded
obs = adata_subset.obs
obs = obs.reset_index(drop = True)

# Full metadata table, as CSV and Parquet
obs_out_csv = '{p}/ref_clean_pbmc_labeled_meta_{d}.csv'.format(p = out_dir, d = date_stamp)
obs.to_csv(obs_out_csv, index = False)

obs_out_parquet = '{p}/ref_clean_pbmc_labeled_meta_{d}.parquet'.format(p = out_dir, d = date_stamp)
obs.to_parquet(obs_out_parquet, index = False)

# Barcode-to-label table restricted to the three annotation levels
bc_anno = obs[['barcodes', 'AIFI_L1', 'AIFI_L2', 'AIFI_L3']]

label_out_csv = '{p}/ref_clean_pbmc_barcode_labels_{d}.csv'.format(p = out_dir, d = date_stamp)
bc_anno.to_csv(label_out_csv, index = False)

label_out_parquet = '{p}/ref_clean_pbmc_barcode_labels_{d}.parquet'.format(p = out_dir, d = date_stamp)
bc_anno.to_parquet(label_out_parquet, index = False)

# Filtered AnnData object
out_h5ad = '{p}/ref_clean_pbmc_labeled_{d}.h5ad'.format(p = out_dir, d = date_stamp)
adata_subset.write_h5ad(out_h5ad)
Finally, we'll use hisepy.upload.upload_files() to send a copy of our output to HISE for use in downstream analysis steps.
# Study space passed to the upload call
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'

# Title carrying today's date
title = f'10x 3-prime PBMC Clean Reference {date.today()}'

# Random element-name identifier, used as the upload destination
search_id = element_id()
search_id
'copper-hafnium-hassium'
# UUIDs of the files this notebook read as input
in_files = [h5ad_uuid]
in_files
['157bd496-0f1e-4239-83bc-a9616696b63a']
# Paths of every file written above: the filtered .h5ad plus the metadata
# and barcode-label tables in both CSV and Parquet form
out_files = [out_h5ad,
obs_out_csv, obs_out_parquet,
label_out_csv, label_out_parquet]
out_files
['output/ref_clean_pbmc_labeled_2024-04-18.h5ad', 'output/ref_clean_pbmc_labeled_meta_2024-04-18.csv', 'output/ref_clean_pbmc_labeled_meta_2024-04-18.parquet', 'output/ref_clean_pbmc_barcode_labels_2024-04-18.csv', 'output/ref_clean_pbmc_barcode_labels_2024-04-18.parquet']
# Send the output files to the study space under the given title. The input
# UUIDs are passed as input_file_ids (presumably recorded as provenance --
# confirm in the hisepy documentation) and the random search_id is used as
# the destination.
hisepy.upload.upload_files(
files = out_files,
study_space_id = study_space_uuid,
title = title,
input_file_ids = in_files,
destination = search_id
)
you are trying to upload file_ids... ['output/ref_clean_pbmc_labeled_2024-04-18.h5ad', 'output/ref_clean_pbmc_labeled_meta_2024-04-18.csv', 'output/ref_clean_pbmc_labeled_meta_2024-04-18.parquet', 'output/ref_clean_pbmc_barcode_labels_2024-04-18.csv', 'output/ref_clean_pbmc_barcode_labels_2024-04-18.parquet']. Do you truly want to proceed?
{'trace_id': 'c2893b6e-a59a-43cd-bfd9-4f8049deb7ce', 'files': ['output/ref_clean_pbmc_labeled_2024-04-18.h5ad', 'output/ref_clean_pbmc_labeled_meta_2024-04-18.csv', 'output/ref_clean_pbmc_labeled_meta_2024-04-18.parquet', 'output/ref_clean_pbmc_barcode_labels_2024-04-18.csv', 'output/ref_clean_pbmc_barcode_labels_2024-04-18.parquet']}
# Print the versions of the loaded packages and the runtime environment
import session_info
session_info.show()
----- anndata 0.10.3 hisepy 0.3.0 pandas 2.1.4 scanpy 1.9.6 session_info 1.0.0 -----
PIL 10.0.1 anyio NA arrow 1.3.0 asttokens NA attr 23.2.0 attrs 23.2.0 babel 2.14.0 beatrix_jupyterlab NA brotli NA cachetools 5.3.1 certifi 2024.02.02 cffi 1.16.0 charset_normalizer 3.3.2 cloudpickle 2.2.1 colorama 0.4.6 comm 0.1.4 cryptography 41.0.7 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 db_dtypes 1.1.1 debugpy 1.8.0 decorator 5.1.1 defusedxml 0.7.1 deprecated 1.2.14 exceptiongroup 1.2.0 executing 2.0.1 fastjsonschema NA fqdn NA google NA greenlet 2.0.2 grpc 1.58.0 grpc_status NA h5py 3.10.0 idna 3.6 igraph 0.10.8 importlib_metadata NA ipykernel 6.28.0 ipython_genutils 0.2.0 ipywidgets 8.1.1 isoduration NA jedi 0.19.1 jinja2 3.1.2 joblib 1.3.2 json5 NA jsonpointer 2.4 jsonschema 4.20.0 jsonschema_specifications NA jupyter_events 0.9.0 jupyter_server 2.12.1 jupyterlab_server 2.25.2 jwt 2.8.0 kiwisolver 1.4.5 leidenalg 0.10.1 llvmlite 0.41.0 lz4 4.3.2 markupsafe 2.1.3 matplotlib 3.8.0 matplotlib_inline 0.1.6 mpl_toolkits NA mpmath 1.3.0 natsort 8.4.0 nbformat 5.9.2 numba 0.58.0 numpy 1.24.0 opentelemetry NA overrides NA packaging 23.2 parso 0.8.3 periodictable 1.5.2 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA platformdirs 4.1.0 plotly 5.18.0 prettytable 3.9.0 prometheus_client NA prompt_toolkit 3.0.42 proto NA psutil NA ptyprocess 0.7.0 pure_eval 0.2.2 pyarrow 13.0.0 pydev_ipython NA pydevconsole NA pydevd 2.9.5 pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.17.2 pynvml NA pyparsing 3.1.1 pyreadr 0.5.0 pythonjsonlogger NA pytz 2023.3.post1 referencing NA requests 2.31.0 rfc3339_validator 0.1.4 rfc3986_validator 0.1.1 rpds NA scipy 1.11.4 send2trash NA shapely 1.8.5.post1 six 1.16.0 sklearn 1.3.2 sniffio 1.3.0 socks 1.7.1 sql NA sqlalchemy 2.0.21 sqlparse 0.4.4 stack_data 0.6.2 sympy 1.12 termcolor NA texttable 1.7.0 threadpoolctl 3.2.0 torch 2.1.2+cu121 torchgen NA tornado 6.3.3 tqdm 4.66.1 traitlets 5.9.0 typing_extensions NA uri_template NA urllib3 1.26.18 wcwidth 0.2.12 webcolors 1.13 websocket 1.7.0 wrapt 1.15.0 xarray 
2023.12.0 yaml 6.0.1 zipp NA zmq 25.1.2 zoneinfo NA zstandard 0.22.0
----- IPython 8.19.0 jupyter_client 8.6.0 jupyter_core 5.6.1 jupyterlab 4.1.5 notebook 6.5.4 ----- Python 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0] Linux-5.15.0-1055-gcp-x86_64-with-glibc2.31 ----- Session information updated at 2024-04-18 21:38