#!/usr/bin/env python
# coding: utf-8

# Downloaded from the Single Cell Portal: https://singlecell.broadinstitute.org/single_cell/study/SCP1064/multi-modal-pooled-perturb-cite-seq-screens-in-patient-models-define-novel-mechanisms-of-cancer-immune-evasion

# In[1]:


# %load block0_load.py
author_year = 'Frangieh_2021'
is_counts = False
var_genes = None
doi = '10.1038/s41588-021-00779-1'

import numpy as np
import pandas as pd
import scanpy as sc
sc.set_figure_params(dpi=100, frameon=False)
sc.logging.print_header()

# verify
assert(doi in pd.read_csv('../personal.csv').DOI.values)

adata = sc.read(f'{author_year}_raw.h5ad')
adata


# In[2]:


# %load block1_init.py
# metalabels
adata.uns['preprocessing_nb_link'] = f'https://nbviewer.org/github/theislab/sc-pert/blob/main/datasets/{author_year}_curation.ipynb'
adata.uns['doi'] = doi
display(adata.obs.describe(include='all').T.head(20))

# use gene symbols as gene names
if var_genes:
    adata.var[var_genes] = adata.var[var_genes].astype(str)
    adata.var = adata.var.reset_index().set_index(var_genes)
    print(adata.var_names)

# filtering and processing
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=20)
adata.var_names_make_unique()

if is_counts:
    adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True)
    sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
    sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')


# Add additional labels, normalize and add plotting variables.

# In[3]:


# %load block2_process.py
if is_counts:
    adata.layers['counts'] = adata.X
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

sc.pl.highest_expr_genes(adata, n_top=20)
sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=False)
sc.pl.highly_variable_genes(adata)

# pre-compute plots
sc.tl.pca(adata, use_highly_variable=True)
sc.pp.neighbors(adata)
sc.tl.leiden(adata)
sc.tl.umap(adata)


# Save.

# In[4]:


sc.write(f'{author_year}.h5ad', adata, compression='gzip')
adata


# Standardized perturbation labels and save again.

# In[ ]:


# adata = sc.read(f'{author_year}.h5ad')


# In[49]:


# %load block3_standardize.py
# the following fields are meant to serve as a template
adata = adata[~pd.isnull(adata.obs['sgRNA'])]  # remove cells without an assigned guide

adata.obs['perturbation_name'] = [re.sub('[_123]+', '', s) for s in adata.obs['sgRNA']]
adata.obs['perturbation_name'] = adata.obs['perturbation_name'].astype('category')
adata.obs['perturbation_type'] = 'genetic'
adata.obs['perturbation_value'] = adata.obs['MOI'].astype('int')
adata.obs['perturbation_unit'] = 'million virons'


# In[51]:


sc.write(f'{author_year}.h5ad', adata, compression='gzip')
adata


# View.

# In[9]:


plot_cols = [c for c in adata.obs.columns if len(adata.obs[c].unique()) < 30 or not hasattr(adata.obs[c], 'cat')]
for c in plot_cols:
    if adata.obs[c].dtype == 'bool':
        adata.obs[c] = adata.obs[c].astype('category')
sc.pl.umap(adata, color=plot_cols)