#!/usr/bin/env python
# coding: utf-8

# # Clonality analysis in 10X data
# 
# September 2022 draft

# # Loading packages

# In[1]:


import pandas as pd                  # Pandas for data analysis.
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt      # For basic plotting.
import seaborn as sns                # For pretty visualization in Seaborn. See https://seaborn.pydata.org/
from IPython.display import display  # Pretty display of data frames.
from sklearn import base
from sklearn.feature_selection import chi2, f_classif

import scanpy as sc

sc.settings.verbosity = 1  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

# Put plots inline rather than in a pop-up.
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:


def hrule(repchar='=', length=80):
    '''
    A quick function to print a horizontal line.
    '''
    if len(repchar) == 1:
        print(repchar*length)


# # Loading data
# 
# ## Loading a big Loom file

# In[3]:


filename = 'Raw/JM_10X_merged.loom'


# In[4]:


JM_all = sc.read_loom(filename)


# In[5]:


JM_all


# In[6]:


JM_all.obs  # Examine cell metadata.


# In[7]:


JM_all.var  # Examine gene metadata... pretty blank!


# In[8]:


JM_all.obs['orig.ident'].value_counts()  # Cell counts in each experiment.


# In[9]:


JM_all.layers['counts']


# In[10]:


JM_all.obs['hash.ID'].value_counts()  # Sample counts for each experiment.


# The data indicate a possible problem: some cells may have mis-assigned clone/well hashtags. In Experiment 1, each clone should be found in only one well (wells = Samples 1-6), but a few cells seem to violate this. See the tables below.

# In[11]:


test_df = JM_all.obs[JM_all.obs['orig.ident'] == 'exp1']
for c_id in test_df.clone_id.unique():
    print('Clone {}'.format(c_id))
    display(test_df[test_df['clone_id'] == c_id]['hash.ID'].value_counts())


# So now we reassign the cells that appear to be assigned to the wrong sample: every cell in such a clone is given the clone's majority-vote sample.

# In[12]:


test_df = JM_all.obs[JM_all.obs['orig.ident'] == 'exp1']
for c_id in test_df.clone_id.unique():
    vc = test_df[test_df['clone_id'] == c_id]['hash.ID'].value_counts()
    if c_id != 'exp1_non':
        if len(vc) > 1:
            print('----')
            print('Alert!')
            print(vc)
            right_sample = vc.idxmax()
            cells = test_df[test_df['clone_id'] == c_id].index
            JM_all.obs.loc[cells, 'hash.ID'] = right_sample
            num_corr = len(cells) - vc[right_sample]
            print('Sample number (hashID) changed to {} for {} cells'.format(right_sample, num_corr))
            print('----')
        else:
            print('Clone {} ok.'.format(c_id))
    else:
        print('Clone non.')


# In[13]:


test_df = JM_all.obs[JM_all.obs['orig.ident'] == 'exp1']
for c_id in test_df.clone_id.unique():
    print('Clone {}'.format(c_id))
    display(test_df[test_df['clone_id'] == c_id]['hash.ID'].value_counts())


# Now we do some string processing (removing spaces from sample names and dots from column names) to avoid problems downstream.

# In[14]:


JM_all.obs['hash.ID'] = JM_all.obs['hash.ID'].str.replace(' ', '')
JM_all.obs['hash.ID'].unique()


# In[15]:


JM_all.obs.columns = [col.replace('.', '_') for col in JM_all.obs.columns]
JM_all.obs.columns = [col.replace('clone_id', 'clone_ID') for col in JM_all.obs.columns]


# We extract the data from clone_ID, hash_ID, and orig_ident into new metadata attributes called simply "clone", "well", and "experiment".

# In[16]:


JM_all.obs['clone'] = JM_all.obs.clone_ID.apply(lambda x: x.split('_')[1])
JM_all.obs['well'] = JM_all.obs.hash_ID.apply(lambda x: x.split('_')[1])
JM_all.obs['experiment'] = JM_all.obs.orig_ident
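# As an optional sanity check (a minimal sketch using only the columns defined above), we can confirm that after the correction each Experiment 1 clone now maps to a single well.

# In[ ]:


# Optional sanity check (sketch): every clone in Experiment 1, other than the
# unassigned 'non' group, should now map to exactly one well.
check_df = JM_all.obs[JM_all.obs['experiment'] == 'exp1']
wells_per_clone = check_df.groupby('clone')['well'].nunique()
wells_per_clone = wells_per_clone.drop('non', errors='ignore')
assert (wells_per_clone == 1).all(), 'Some clones still span multiple wells.'
wells_per_clone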
# # CD4 vs CD8 scoring

# It is pretty easy to distinguish CD4 from CD8 cells. We use three genes -- CD4, CD8A, CD8B -- essentially giving each of them a "vote". A clone consists of all CD4 or all CD8 cells, so we classify the entire clone as CD4 or CD8 based on the clonal mean of each gene's expression.

# In[17]:


def CD_score(adata):
    CD_df = pd.DataFrame(index=adata.obs.index)
    for gene in ['CD4', 'CD8A', 'CD8B']:
        if gene in adata.var_names:
            CD_df[gene] = adata.to_df()[gene]
        else:
            CD_df[gene] = 0.0
    CD_means = CD_df.groupby(adata.obs.clone).mean()
    CD_scores = ((CD_means['CD8A'] > 1).astype(int)
                 + (CD_means['CD8B'] > 1).astype(int)
                 - (CD_means['CD4'] > 0.4).astype(int))  # Manually found cutoffs.
    CD_scores = CD_scores.map({-1: 'CD4', 2: 'CD8'})
    CD_scores['non'] = 'unknown'
    return CD_scores


# In[18]:


CD_scores = CD_score(JM_all)
JM_all.obs['CD_type'] = JM_all.obs.clone.map(CD_scores)


# In[19]:


meta_df = JM_all.obs
meta_df.head()


# In[20]:


meta_df.CD_type.value_counts()


# Note that this CD4 vs CD8 voting yields *unanimous* results for each identified clone.

# In[21]:


meta_df.to_csv('Processed/Allcells_metadata.csv')


# ## Breaking into smaller experiment files
# 
# Now we create smaller files: one for each experiment, and one for each well within each experiment. This is important for appropriate filtering along the way.

# In[22]:


JM = {}
datasets = JM_all.obs['hash_ID'].unique()
for ds in datasets:
    ad_temp = JM_all[JM_all.obs['hash_ID'] == ds]
    JM[ds] = sc.AnnData(ad_temp.layers['counts'])
    JM[ds].obs = ad_temp.obs
    JM[ds].var = ad_temp.var


# In[25]:


experiments = ['exp1', 'exp2', 'exp3']
for ds in experiments:
    ad_temp = JM_all[JM_all.obs['experiment'] == ds]
    JM[ds] = sc.AnnData(ad_temp.layers['counts'])
    JM[ds].obs = ad_temp.obs
    JM[ds].var = ad_temp.var


# In[26]:


JM['exp1_Sample1'].obs


# In[27]:


JM['exp1'].obs


# In[29]:


import sys
sys.getsizeof(JM['exp1'])


# In[39]:


datasets = list(datasets) + experiments


# In[40]:


for ds in datasets:
    fn = 'Raw/JM_10X_'+ds+'.loom'
    JM[ds].write_loom(fn)


# In[41]:


len(datasets)


# # Preprocessing

# We start by loading the data from the separate loom files.

# In[42]:


JM = {}
for ds in datasets:
    fn = 'Raw/JM_10X_'+ds+'.loom'
    JM[ds] = sc.read_loom(fn)


# ## Quality control

# In[43]:


for ds in datasets:
    sc.pl.highest_expr_genes(JM[ds], n_top=10, log=False, show=False)


# In[46]:


for ds in datasets:
    JM[ds].var['mt'] = JM[ds].var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
    sc.pp.calculate_qc_metrics(JM[ds], qc_vars=['mt'], percent_top=None, inplace=True)


# In[47]:


len(datasets)


# In[48]:


fig, ax = plt.subplots(7, 3, figsize=(12, 21))
for j, ds in enumerate(datasets):
    curr_ax = ax[j % 7, j // 7]
    sns.scatterplot(x='n_genes_by_counts', y='total_counts', data=JM[ds].obs, ax=curr_ax)
    curr_ax.set_xlim(0, 7000)
    curr_ax.set_ylim(0, 25000)
plt.show()


# ## Note: we might want to filter out some cells here based on total counts?

# ## Normalization and filtering

# Normalize each cell to a 10000-count total, excluding highly expressed genes from the size-factor computation; we also flag the top 1000 highly variable genes (Seurat v3 flavor) beforehand, since that flavor expects raw counts.

# In[49]:


for ds in datasets:
    sc.pp.highly_variable_genes(JM[ds], n_top_genes=1000, flavor='seurat_v3')
    sc.pp.normalize_total(JM[ds], target_sum=10000, exclude_highly_expressed=True)  # Normalize


# Filter out never-expressed genes.

# In[50]:


for ds in datasets:
    genes_before = JM[ds].n_vars
    sc.pp.filter_genes(JM[ds], min_counts=1)  # At least 1 count.
    genes_after = JM[ds].n_vars
    print('Dataset {} has {} genes before and {} after filtering.'.format(ds, genes_before, genes_after))


# Pseudo-log normalize.

# In[51]:


for ds in datasets:
    sc.pp.log1p(JM[ds])  # pseudo-log transform
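# To make these two steps concrete, here is a tiny toy example (made-up counts for a single cell, just a sketch) of what the 10000-count normalization followed by log1p does; it ignores the exclude_highly_expressed refinement, which only changes how each cell's size factor is computed.

# In[ ]:


# Toy illustration on made-up counts (not real data): rescale one cell to a
# 10000-count total, then apply the log1p pseudo-log transform.
toy_counts = np.array([0., 5., 50., 500.])
toy_norm = toy_counts / toy_counts.sum() * 10000   # cell now sums to 10000
toy_logged = np.log1p(toy_norm)                    # log(1 + x), so zeros stay zero
print('raw counts :', toy_counts)
print('normalized :', toy_norm.round(1))
print('log1p      :', toy_logged.round(2))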
# Exclude T-cell receptor genes.

# In[52]:


def drop_TRs(ad, filename='Raw/TR_gene_data.csv'):
    ad_out = ad
    TR_gene_df = pd.read_csv(filename, index_col=0)
    TR_genes = TR_gene_df['gene_name'].values
    bad_genes = [gene for gene in ad_out.var_names if gene in TR_genes]
    print('{} genes removed.'.format(len(bad_genes)))
    keep_genes = [gene for gene in ad_out.var_names if gene not in bad_genes]
    ad_out = ad_out[:, keep_genes]
    return ad_out


# In[53]:


for ds in datasets:
    JM[ds] = drop_TRs(JM[ds])


# In[54]:


for ds in datasets:
    print('Summary of AnnData object (shape = #samples x #genes) for {}'.format(ds))
    print(JM[ds])
    hrule()


# # Make clonal analysis datasets

# In[55]:


ad = {}
for ds in datasets:
    adata = JM[ds].copy()
    print('Dataset {} has {} cells'.format(ds, len(adata.obs)))
    e_ds = ds.split('_')[0]
    has_clone = adata.obs[adata.obs.clone_ID != e_ds+'_non'].index
    print('{} cells have an assigned clone.'.format(len(has_clone)))
    vcs = adata.obs.loc[has_clone].clone_ID.value_counts()
    allclones = vcs.index
    bigclones = vcs[vcs >= 10].index
    print('{} clones with at least 10 cells each'.format(len(bigclones)))
    bc_cells = adata.obs[adata.obs.clone_ID.isin(bigclones)].index
    print('{} cells belong to these big-enough clones.'.format(len(bc_cells)))
    print('New AnnData object created for {}.'.format(ds))
    goodcells = adata.obs[adata.obs.clone_ID.isin(allclones)].index
    ad[ds] = adata[goodcells]
    #print(ad[ds])
    hrule()


# In[56]:


display(ad[ds].obs)


# In[57]:


for ds in datasets:
    print('CD4 and CD8 types in {}'.format(ds))
    print(ad[ds].obs.CD_type.value_counts())
    hrule()


# In[58]:


ads_8 = {}
ads_4 = {}
for ds in datasets:
    print('Filtering non-expressed genes.')
    ads_4[ds] = ad[ds][ad[ds].obs.CD_type == 'CD4']
    ads_8[ds] = ad[ds][ad[ds].obs.CD_type == 'CD8']
    sc.pp.filter_genes(ads_4[ds], min_counts=1)  # At least 1 count.
    sc.pp.filter_genes(ads_8[ds], min_counts=1)  # At least 1 count.
    hrule()


# Now we create our loom files for CD4 and CD8 T-cells within each dataset.

# In[59]:


datasets


# In[60]:


for ds in datasets:
    fn = 'Processed/JM_10X_'+ds
    if ads_4[ds].n_obs > 0:
        ads_4[ds].write_loom(fn+'_CD4.loom')
    if ads_8[ds].n_obs > 0:
        ads_8[ds].write_loom(fn+'_CD8.loom')
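# Downstream clonality analysis can start from these files. A minimal sketch of reloading one of them is below; the exp1/CD8 file is just one example of the outputs written above, and we assume the obs columns (e.g. clone_ID) survive the loom round trip.

# In[ ]:


# Sketch: reload one of the processed per-dataset files written above.
# 'exp1' and the CD8 suffix are only examples; substitute any dataset name
# for which a file was written in the previous cell.
adata_cd8 = sc.read_loom('Processed/JM_10X_exp1_CD8.loom')
print(adata_cd8)
adata_cd8.obs.clone_ID.value_counts().head()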