#!/usr/bin/env python
# coding: utf-8

# # Test consistency of alignment parsing by `Targets`
# This Jupyter notebook is designed to test parsing of alignments by `Targets`.
# It does not test correctness by looking at actual alignments, but does test that results are internally consistent from the different parsing methods.

# ## Set up
# Import Python modules:

# In[1]:


import contextlib
import os
import re
import tempfile

import pandas as pd
from pandas.testing import assert_frame_equal

import alignparse.minimap2
import alignparse.targets


# Set up targets with the feature parsing used for the examples and also returning **all** `cs` tags and feature clipping:

# In[2]:


targetfile = '../notebooks/input_files/recA_amplicon.gb'
feature_parse_specs_file = '../notebooks/input_files/recA_feature_parse_specs.yaml'

targets = alignparse.targets.Targets(seqsfile=targetfile,
                                     feature_parse_specs=feature_parse_specs_file)

# now `Targets` object that returns `cs` and clipping for all features with no filter
parse_d = {}
for target in targets.targets:
    parse_d[target.name] = {'query_clip5': None, 'query_clip3': None}
    for featurename in target.feature_names:
        parse_d[target.name][featurename] = {}
        parse_d[target.name][featurename]['return'] = ['cs', 'clip5', 'clip3']
        parse_d[target.name][featurename]['filter'] = {'mutation_nt_count': None,
                                                       'mutation_op_count': None,
                                                       'clip5': None,
                                                       'clip3': None}
targets_all = alignparse.targets.Targets(seqsfile=targetfile,
                                         feature_parse_specs=parse_d)


# ## Check parsing consistency
# Confirm that `targets_all.parse_alignment` returns `cs`, `clip5`, and `clip3` for all features, just like the private `targets._parse_cs_alignment` method.
# Do this for both returned data frames and ones written to CSV files.
# 
# First, define a function to assert that data frames are **not** equal as [here](https://stackoverflow.com/a/38778401):

# In[3]:


def assert_frame_not_equal(*args, **kwargs):
    try:
        assert_frame_equal(*args, **kwargs)
    except AssertionError:
        # frames are not equal
        pass
    else:
        # frames are equal
        raise AssertionError('frames unexpectedly equal')


# Now do the tests:

# In[4]:


queryfile = '../notebooks/input_files/recA_lib-1_ccs.fastq'

mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_CODON_DMS)

with contextlib.ExitStack() as stack:
    # make alignment SAM files
    alignmentfile = stack.enter_context(
            tempfile.NamedTemporaryFile('r+', suffix='.sam'))
    alignmentfile_all = stack.enter_context(
            tempfile.NamedTemporaryFile('r+', suffix='.sam'))
    targets.align(queryfile, alignmentfile.name, mapper)
    targets_all.align(queryfile, alignmentfile_all.name, mapper)
    
    # directly get data frames from alignment SAM files
    alignments_cs = targets._parse_alignment_cs(alignmentfile.name)
    alignments_all_cs = targets_all._parse_alignment_cs(alignmentfile_all.name)
    alignments = targets.parse_alignment(alignmentfile.name)
    alignments_all = targets_all.parse_alignment(alignmentfile_all.name)
    
    # make sure the expected data frames are identical
    for targetname in targets.target_names:
        assert_frame_equal(alignments_cs[targetname],
                           alignments_all_cs[targetname])
        assert_frame_equal(alignments_cs[targetname],
                           alignments_all[1][targetname])
        assert_frame_not_equal(alignments[1][targetname],
                               alignments_all[1][targetname])
        
    # make sure the filtering is as expected
    for targetname in targets.target_names:
        assert len(alignments_all[2][targetname]) == 0
        assert len(alignments[2][targetname]) > 0
        
    # make sure the read stats are as expected
    for a_tup in [alignments_all, alignments]:
        read_stats, aligned, filtered = a_tup
        for targetname in targets.target_names:
            aligned_df = aligned[targetname]
            filtered_df = filtered[targetname]
            assert len(filtered_df) == (read_stats
                                        .set_index('category')
                                        .at[f"filtered {targetname}", 'count']
                                        )
            assert len(aligned_df) == (read_stats
                                       .set_index('category')
                                       .at[f"aligned {targetname}", 'count']
                                       )
    
    # now get the alignments into CSV files
    csv_dir = stack.enter_context(tempfile.TemporaryDirectory())
    csv_dir_all = stack.enter_context(tempfile.TemporaryDirectory())
    alignments_csv = targets.parse_alignment(alignmentfile.name,
                                             to_csv=True,
                                             csv_dir=csv_dir)
    alignments_all_csv = targets_all.parse_alignment(alignmentfile.name,
                                                     to_csv=True,
                                                     csv_dir=csv_dir_all)
    
    # make sure the CSV files match those returned directly as data frames
    for a, a_csv in [(alignments, alignments_csv), (alignments_all, alignments_all_csv)]:
        read_stats, aligned, filtered = a
        read_stats_csv, aligned_csv, filtered_csv = a_csv
        assert_frame_equal(read_stats, read_stats_csv)
        for targetname in targets.target_names:
            assert_frame_equal(aligned[targetname],
                               pd.read_csv(aligned_csv[targetname]).fillna(''))
            assert_frame_equal(filtered[targetname],
                               pd.read_csv(filtered_csv[targetname]).fillna(''))