#!/usr/bin/env python # coding: utf-8 # Downloaded from the Single Cell Portal: https://singlecell.broadinstitute.org/single_cell/study/SCP1064/multi-modal-pooled-perturb-cite-seq-screens-in-patient-models-define-novel-mechanisms-of-cancer-immune-evasion # In[1]: # %load block0_load.py author_year = 'Frangieh_2021' is_counts = False var_genes = None doi = '10.1038/s41588-021-00779-1' import numpy as np import pandas as pd import scanpy as sc sc.set_figure_params(dpi=100, frameon=False) sc.logging.print_header() # verify assert(doi in pd.read_csv('../personal.csv').DOI.values) adata = sc.read(f'{author_year}_raw.h5ad') adata # In[2]: # %load block1_init.py # metalabels adata.uns['preprocessing_nb_link'] = f'https://nbviewer.org/github/theislab/sc-pert/blob/main/datasets/{author_year}_curation.ipynb' adata.uns['doi'] = doi display(adata.obs.describe(include='all').T.head(20)) # use gene symbols as gene names if var_genes: adata.var[var_genes] = adata.var[var_genes].astype(str) adata.var = adata.var.reset_index().set_index(var_genes) print(adata.var_names) # filtering and processing sc.pp.filter_cells(adata, min_genes=200) sc.pp.filter_genes(adata, min_cells=20) adata.var_names_make_unique() if is_counts: adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt' sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True) sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True) sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt') sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts') # Add additional labels, normalize and add plotting variables. # In[3]: # %load block2_process.py if is_counts: adata.layers['counts'] = adata.X sc.pp.normalize_total(adata, target_sum=1e4) sc.pp.log1p(adata) sc.pl.highest_expr_genes(adata, n_top=20) sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=False) sc.pl.highly_variable_genes(adata) # pre-compute plots sc.tl.pca(adata, use_highly_variable=True) sc.pp.neighbors(adata) sc.tl.leiden(adata) sc.tl.umap(adata) # Save. # In[4]: sc.write(f'{author_year}.h5ad', adata, compression='gzip') adata # Standardized perturbation labels and save again. # In[ ]: # adata = sc.read(f'{author_year}.h5ad') # In[49]: # %load block3_standardize.py # the following fields are meant to serve as a template adata = adata[~pd.isnull(adata.obs['sgRNA'])] # remove cells without an assigned guide adata.obs['perturbation_name'] = [re.sub('[_123]+', '', s) for s in adata.obs['sgRNA']] adata.obs['perturbation_name'] = adata.obs['perturbation_name'].astype('category') adata.obs['perturbation_type'] = 'genetic' adata.obs['perturbation_value'] = adata.obs['MOI'].astype('int') adata.obs['perturbation_unit'] = 'million virons' # In[51]: sc.write(f'{author_year}.h5ad', adata, compression='gzip') adata # View. # In[9]: plot_cols = [c for c in adata.obs.columns if len(adata.obs[c].unique()) < 30 or not hasattr(adata.obs[c], 'cat')] for c in plot_cols: if adata.obs[c].dtype == 'bool': adata.obs[c] = adata.obs[c].astype('category') sc.pl.umap(adata, color=plot_cols)