#!/usr/bin/env python
# coding: utf-8

# # Pseudobulk data with fraction and median non-zero
# 
# For use in plotting, we'll compute "pseudobulk" summary statistics for gene expression based on groupings defined by cell metadata.
# 
# Note that this approach doesn't use the more careful pseudobulk aggregation implemented in packages like `scran` - instead, we take the fraction of cells in which each gene was detected (`pct_exp`) and the median expression across those non-zero cells (`median_expression`).
# 
# For use in our visualization tools, we'll group cells by each of our cell type labeling levels (AIFI_L1, AIFI_L2, and AIFI_L3), as well as by both cell type and the age-related cohort from which the cells originate.

# ## Load packages

# In[1]:


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

from datetime import date
import hisepy
import numpy as np
import os
import pandas as pd
import pickle
import re
import scanpy as sc
import scipy.sparse as scs


# In[2]:


if not os.path.exists('output'):
    os.mkdir('output')


# In[3]:


out_files = []


# ## Helper functions

# In[4]:


def read_adata_uuid(h5ad_uuid):
    # cache the file from HISE if it isn't already available locally
    h5ad_path = '/home/jupyter/cache/{u}'.format(u = h5ad_uuid)
    if not os.path.isdir(h5ad_path):
        hise_res = hisepy.cache_files([h5ad_uuid])
    
    # read the cached .h5ad file with scanpy
    h5ad_filename = os.listdir(h5ad_path)[0]
    h5ad_file = '{p}/{f}'.format(p = h5ad_path, f = h5ad_filename)
    adata = sc.read_h5ad(h5ad_file)
    return adata


# In[5]:


def sparse_nz_median(sparse_mat, feat_names):
    # transpose the cell x gene matrix so values for each gene are together in memory
    sparse_mat = sparse_mat.transpose().tocsr()
    
    # get dimensions for calculations
    n_cells = sparse_mat.shape[1]
    
    # compute the fraction of non-zero cells per gene (rows are genes after transposing)
    pct_exp = sparse_mat.getnnz(axis = 1) / n_cells
    
    # split the non-zero values from the matrix directly, one array per gene
    split_values = np.split(sparse_mat.data, sparse_mat.indptr)
    
    # drop the first and last entries - indptr starts at 0 and ends at the length of data,
    # so np.split yields an empty leading and trailing segment that don't correspond to genes
    split_values = split_values[1:-1].copy()
    
    # compute the median of the non-zero values for each gene (genes with no values yield NaN here)
    median_expression = [np.median(x) for x in split_values]
    
    # assemble a DataFrame with results
    res_df = pd.DataFrame({
        'gene': feat_names,
        'pct_exp': pct_exp,
        'median_expression': median_expression
    })
    
    # replace missing values for genes without any non-zero cells
    # these were not detected, so median expression is 0
    res_df['median_expression'] = res_df['median_expression'].fillna(0)
    
    return res_df
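# As a small, illustrative check (not part of the original workflow), the cell below runs
# `sparse_nz_median` on a made-up cells x genes matrix where the expected values are easy to
# verify by eye. The matrix `toy_mat` and the gene names are hypothetical, for demonstration only.

# In[ ]:


# toy matrix with cells as rows and genes as columns, matching the layout of adata.X:
# geneA is detected in 2 of 4 cells (non-zero values 1 and 3, median 2.0),
# geneB in all 4 cells (median 5.0), and geneC in none (pct_exp 0, median filled with 0)
toy_mat = scs.csr_matrix(np.array([
    [1., 2., 0.],
    [0., 4., 0.],
    [3., 6., 0.],
    [0., 8., 0.]
]))

sparse_nz_median(toy_mat, ['geneA', 'geneB', 'geneC'])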
# ## Retrieve atlas data from HISE

# In[6]:


h5ad_uuid = '5b3a0cc8-1811-4126-90c7-e9cdd41459fd'


# In[7]:


adata = read_adata_uuid(h5ad_uuid)


# In[8]:


adata.shape


# In[9]:


obs = adata.obs


# ### Summarized expression by cell type
# 
# For our UMAP viewer, we plot expression at each level of cell type resolution.

# In[10]:


levels = ['AIFI_L1', 'AIFI_L2', 'AIFI_L3']


# In[11]:


get_ipython().run_cell_magic('time', '', "level_results = {}\n\nfor level in levels:\n    print('>>> {l}'.format(l = level))\n    \n    level_obs = obs.groupby(level)\n    \n    level_dict = {}\n    for cell_type, type_obs in level_obs:\n        type_bc = type_obs['barcodes']\n        type_adata = adata[adata.obs['barcodes'].isin(type_bc)]\n        \n        type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())\n        \n        level_dict[cell_type] = type_res\n    \n    level_results[level] = level_dict\n")


# ### Save assembled results for each level

# In[12]:


level_combined = {}
for level in levels:
    all_level = pd.concat(level_results[level])
    all_level = all_level.reset_index(drop = False, names = [level, 'row_num'])
    all_level = all_level.drop('row_num', axis = 1)
    level_combined[level] = all_level


# In[13]:


for level, df in level_combined.items():
    out_csv = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}.csv'.format(l = level, d = date.today())
    df.to_csv(out_csv)
    out_files.append(out_csv)
    
    out_parquet = 'output/pbmc-ref_{l}_nz-pct-median_pseudobulk_{d}.parquet'.format(l = level, d = date.today())
    df.to_parquet(out_parquet)
    out_files.append(out_parquet)


# ### Restructure for use in visualization apps

# In[14]:


gene_results = {}
for level in levels:
    all_res = pd.concat(level_results[level])
    
    split_res = all_res.groupby('gene')
    
    split_dict = {}
    for gene, df in split_res:
        df = df.reset_index(drop = False)
        df = df.drop('level_1', axis = 1)
        df = df.rename({'level_0': 'obs_group'}, axis = 1)
        split_dict[gene] = df
    
    gene_results[level] = split_dict


# Check that a gene or two look as expected

# In[15]:


gene_results['AIFI_L1']['CD79A']


# In[16]:


gene_results['AIFI_L2']['FCN1']


# In[17]:


type_pkl = 'output/pbmc-ref_type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())
with open(type_pkl, 'wb') as out_file:
    pickle.dump(gene_results, out_file)
out_files.append(type_pkl)


# ### Summarized expression by cell type and cohort
# 
# For our Gene Expression Summary viewer, we plot expression at each level of cell type resolution, partitioned by each cohort in the reference.

# In[18]:


get_ipython().run_cell_magic('time', '', "level_results = {}\n\nfor level in levels:\n    print('>>> {l}'.format(l = level))\n    \n    level_obs = obs.groupby(['cohort.cohortGuid', level])\n    \n    level_dict = {}\n    for group_tuple, type_obs in level_obs:\n        cohort = group_tuple[0]\n        cell_type = group_tuple[1]\n        c_t = '{c}_{t}'.format(c = cohort, t = cell_type)\n        \n        type_bc = type_obs['barcodes']\n        type_adata = adata[adata.obs['barcodes'].isin(type_bc)]\n        \n        type_res = sparse_nz_median(type_adata.X, type_adata.var_names.tolist())\n        \n        type_res['cohort'] = [cohort] * type_res.shape[0]\n        level_dict[c_t] = type_res\n    \n    level_results[level] = level_dict\n")


# ### Save assembled results for each level

# In[19]:


level_combined = {}
for level in levels:
    all_level = pd.concat(level_results[level])
    all_level = all_level.reset_index(drop = False, names = ['group', 'row_num'])
    all_level[level] = [re.sub('.+_', '', x) for x in all_level['group']]
    all_level = all_level.drop(['group', 'row_num'], axis = 1)
    all_level = all_level[['cohort', level, 'gene', 'pct_exp', 'median_expression']]
    level_combined[level] = all_level


# In[20]:


for level, df in level_combined.items():
    out_csv = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}.csv'.format(l = level, d = date.today())
    df.to_csv(out_csv)
    out_files.append(out_csv)
    
    out_parquet = 'output/pbmc-ref_cohort-{l}_nz-pct-median_pseudobulk_{d}.parquet'.format(l = level, d = date.today())
    df.to_parquet(out_parquet)
    out_files.append(out_parquet)


# ### Restructure for use in visualization apps

# In[21]:


gene_results = {}
for level in levels:
    all_res = pd.concat(level_results[level])
    
    split_res = all_res.groupby('gene')
    
    split_dict = {}
    for gene, df in split_res:
        df = df.reset_index(drop = False)
        df = df.drop('level_1', axis = 1)
        df['obs_group'] = [re.sub('.+_', '', x) for x in df['level_0']]
        df = df.drop('level_0', axis = 1)
        df = df[['obs_group', 'gene', 'pct_exp', 'median_expression', 'cohort']]
        split_dict[gene] = df
    
    gene_results[level] = split_dict


# Check that a gene or two look as expected

# In[22]:


gene_results['AIFI_L1']['CD79A']


# In[23]:


cohort_type_pkl = 'output/pbmc-ref_cohort-type_nz-pct-median_pseudobulk_{d}.pkl'.format(d = date.today())
with open(cohort_type_pkl, 'wb') as out_file:
    pickle.dump(gene_results, out_file)
out_files.append(cohort_type_pkl)
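# As a quick read-back check (not part of the original workflow), we can reload the pickle we
# just wrote and pull a single gene's table at one labeling level, the way a viewer would
# consume it. The level and gene below are examples, and `reloaded_results` is a hypothetical name.

# In[ ]:


# reload the cohort-partitioned results and inspect one gene at the AIFI_L3 level
with open(cohort_type_pkl, 'rb') as in_file:
    reloaded_results = pickle.load(in_file)

reloaded_results['AIFI_L3']['CD79A'].head()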
# ## Upload Results to HISE
# 
# Finally, we'll use `hisepy.upload.upload_files()` to send a copy of our output to HISE for use in downstream analysis steps.

# In[24]:


study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'PBMC Reference Pseudobulk Frac Medians {d}'.format(d = date.today())


# In[25]:


in_files = [h5ad_uuid]


# In[26]:


in_files


# In[27]:


out_files


# In[28]:


hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files
)


# In[29]:


import session_info
session_info.show()


# In[ ]: