#!/usr/bin/env python
# coding: utf-8

# # Test `alignparse.ccs.Summaries`
# Tests this class and makes sure it works with and without reports and `np` tags giving number of passes.
# This Jupyter notebook is designed to be run with `nbval` for the testing.
# 
# First, import Python modules:

# In[1]:


import contextlib
import os
import tempfile
import warnings

import Bio.SeqIO

import pandas as pd

import alignparse.ccs


# Hide warnings that clutter output:

# In[2]:


warnings.simplefilter('ignore')


# Data frame giving the `ccs` report file and the CCS FASTQ file for each run:

# In[3]:


run_names = ['recA_lib-1', 'recA_lib-2']
ccs_dir = '../notebooks/input_files'

ccs_df = pd.DataFrame(
            {'name': run_names,
             'report': [f"{ccs_dir}/{name}_report.txt" for name in run_names],
             'fastq': [f"{ccs_dir}/{name}_ccs.fastq" for name in run_names]
             })


# Create an `alignparse.ccs.Summaries` object:

# In[4]:


summaries = alignparse.ccs.Summaries(ccs_df)

for summary in summaries.summaries:
    print(summary.name)


# Confirm ZMW stats exist:

# In[5]:


summaries.has_zmw_stats()


# Get and plot the ZMW stats:

# In[6]:


summaries.zmw_stats()


# Plot these stats:

# In[7]:


p = summaries.plot_zmw_stats()
_ = p.draw()


# Now do the same with a `Summaries` object with no reports defined:

# In[8]:


summaries_no_zmw = alignparse.ccs.Summaries(ccs_df, report_col=None, ncpus=1)


# No ZMW stats defined:

# In[9]:


summaries_no_zmw.has_zmw_stats()


# So trying to get stats raises an error:

# In[10]:


try:
    summaries_no_zmw.plot_zmw_stats()
except ValueError:
    print('cannot plot ZMW stats')


# Now get information on CCS statistics:

# In[11]:


for stat in ['passes', 'length', 'accuracy']:
    if summaries.has_stat(stat):
        print(summaries.ccs_stats(stat).head(n=5))
        print()
    else:
        print(f"no {stat} stat\n")


# Plot these stats:

# In[12]:


for stat in ['length', 'passes', 'accuracy']:
    try:
        p = summaries.plot_ccs_stats(stat)
        _ = p.draw()
    except ValueError:
        print(f"Cannot plot {stat}")


# Now do the same for a `Summaries` defined using FASTQ files without the `np` tag giving the number of passes:

# In[13]:


with contextlib.ExitStack() as stack:
    tempfiles = [stack.enter_context(tempfile.NamedTemporaryFile('wt', suffix='.fastq'))
                 for _ in range(len(ccs_df))]
    ccs_df = ccs_df.assign(fastq_nopasstag=[f.name for f in tempfiles])
    for fout, tup in zip(tempfiles, ccs_df.itertuples()):
        seqs = []
        for iseq, seq in enumerate(Bio.SeqIO.parse(tup.fastq, 'fastq')):
            if iseq == 0:  # drop np tag from just first
                seq.description = seq.description.split()[0]
            seqs.append(seq)
        Bio.SeqIO.write(seqs, fout, 'fastq')
        fout.flush()
    summaries_nopasstag = alignparse.ccs.Summaries(ccs_df, fastq_col='fastq_nopasstag')


# Now print and plot information without `np` tag:

# In[14]:


for stat in ['passes', 'length', 'accuracy']:
    if summaries_nopasstag.has_stat(stat):
        print(summaries_nopasstag.ccs_stats(stat).head(n=5))
        print()
    else:
        print(f"no {stat} stat\n")


# In[15]:


for stat in ['length', 'passes', 'accuracy']:
    try:
        p = summaries_nopasstag.plot_ccs_stats(stat)
        _ = p.draw()
    except ValueError:
        print(f"Cannot plot {stat}")