#!/usr/bin/env python # coding: utf-8 # # Test consistency of alignment parsing by `Targets` # This Jupyter notebook is designed to test parsing of alignments by `Targets`. # It does not test correctness by looking at actual alignments, but does test that results are internally consistent from the different parsing methods. # ## Set up # Import Python modules: # In[1]: import contextlib import os import re import tempfile import pandas as pd from pandas.testing import assert_frame_equal import alignparse.minimap2 import alignparse.targets # Set up targets with the feature parsing used for the examples and also returning **all** `cs` tags and feature clipping: # In[2]: targetfile = '../notebooks/input_files/recA_amplicon.gb' feature_parse_specs_file = '../notebooks/input_files/recA_feature_parse_specs.yaml' targets = alignparse.targets.Targets(seqsfile=targetfile, feature_parse_specs=feature_parse_specs_file) # now `Targets` object that returns `cs` and clipping for all features with no filter parse_d = {} for target in targets.targets: parse_d[target.name] = {'query_clip5': None, 'query_clip3': None} for featurename in target.feature_names: parse_d[target.name][featurename] = {} parse_d[target.name][featurename]['return'] = ['cs', 'clip5', 'clip3'] parse_d[target.name][featurename]['filter'] = {'mutation_nt_count': None, 'mutation_op_count': None, 'clip5': None, 'clip3': None} targets_all = alignparse.targets.Targets(seqsfile=targetfile, feature_parse_specs=parse_d) # ## Check parsing consistency # Confirm that `targets_all.parse_alignment` returns `cs`, `clip5`, and `clip3` for all features, just like the private `targets._parse_cs_alignment` method. # Do this for both returned data frames and ones written to CSV files. # # First, define a function to assert that data frames are **not** equal as [here](https://stackoverflow.com/a/38778401): # In[3]: def assert_frame_not_equal(*args, **kwargs): try: assert_frame_equal(*args, **kwargs) except AssertionError: # frames are not equal pass else: # frames are equal raise AssertionError('frames unexpectedly equal') # Now do the tests: # In[4]: queryfile = '../notebooks/input_files/recA_lib-1_ccs.fastq' mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_CODON_DMS) with contextlib.ExitStack() as stack: # make alignment SAM files alignmentfile = stack.enter_context( tempfile.NamedTemporaryFile('r+', suffix='.sam')) alignmentfile_all = stack.enter_context( tempfile.NamedTemporaryFile('r+', suffix='.sam')) targets.align(queryfile, alignmentfile.name, mapper) targets_all.align(queryfile, alignmentfile_all.name, mapper) # directly get data frames from alignment SAM files alignments_cs = targets._parse_alignment_cs(alignmentfile.name) alignments_all_cs = targets_all._parse_alignment_cs(alignmentfile_all.name) alignments = targets.parse_alignment(alignmentfile.name) alignments_all = targets_all.parse_alignment(alignmentfile_all.name) # make sure the expected data frames are identical for targetname in targets.target_names: assert_frame_equal(alignments_cs[targetname], alignments_all_cs[targetname]) assert_frame_equal(alignments_cs[targetname], alignments_all[1][targetname]) assert_frame_not_equal(alignments[1][targetname], alignments_all[1][targetname]) # make sure the filtering is as expected for targetname in targets.target_names: assert len(alignments_all[2][targetname]) == 0 assert len(alignments[2][targetname]) > 0 # make sure the read stats are as expected for a_tup in [alignments_all, alignments]: read_stats, aligned, filtered = a_tup for targetname in targets.target_names: aligned_df = aligned[targetname] filtered_df = filtered[targetname] assert len(filtered_df) == (read_stats .set_index('category') .at[f"filtered {targetname}", 'count'] ) assert len(aligned_df) == (read_stats .set_index('category') .at[f"aligned {targetname}", 'count'] ) # now get the alignments into CSV files csv_dir = stack.enter_context(tempfile.TemporaryDirectory()) csv_dir_all = stack.enter_context(tempfile.TemporaryDirectory()) alignments_csv = targets.parse_alignment(alignmentfile.name, to_csv=True, csv_dir=csv_dir) alignments_all_csv = targets_all.parse_alignment(alignmentfile.name, to_csv=True, csv_dir=csv_dir_all) # make sure the CSV files match those returned directly as data frames for a, a_csv in [(alignments, alignments_csv), (alignments_all, alignments_all_csv)]: read_stats, aligned, filtered = a read_stats_csv, aligned_csv, filtered_csv = a_csv assert_frame_equal(read_stats, read_stats_csv) for targetname in targets.target_names: assert_frame_equal(aligned[targetname], pd.read_csv(aligned_csv[targetname]).fillna('')) assert_frame_equal(filtered[targetname], pd.read_csv(filtered_csv[targetname]).fillna(''))