Targets

This Jupyter notebook is designed to test parsing of alignments by `Targets`.
It does not test correctness by looking at actual alignments, but does test that results are internally consistent from the different parsing methods.
Import Python modules:
import contextlib
import os
import re
import tempfile
import pandas as pd
from pandas.testing import assert_frame_equal
import alignparse.minimap2
import alignparse.targets
Set up targets with the feature parsing used for the examples, and also targets that return all `cs` tags and feature clipping:
targetfile = '../notebooks/input_files/recA_amplicon.gb'
feature_parse_specs_file = '../notebooks/input_files/recA_feature_parse_specs.yaml'

# `Targets` object using the feature parsing specs from the YAML file
targets = alignparse.targets.Targets(
    seqsfile=targetfile,
    feature_parse_specs=feature_parse_specs_file,
)

# Build parsing specs for a second `Targets` object that returns `cs`,
# `clip5`, and `clip3` for every feature and sets every filter to `None`.
parse_d = {}
for target in targets.targets:
    target_specs = {'query_clip5': None, 'query_clip3': None}
    for featurename in target.feature_names:
        target_specs[featurename] = {
            'return': ['cs', 'clip5', 'clip3'],
            # every filter is set to `None`
            'filter': dict.fromkeys(['mutation_nt_count',
                                     'mutation_op_count',
                                     'clip5',
                                     'clip3']),
        }
    parse_d[target.name] = target_specs

targets_all = alignparse.targets.Targets(
    seqsfile=targetfile,
    feature_parse_specs=parse_d,
)
Confirm that `targets_all.parse_alignment` returns `cs`, `clip5`, and `clip3` for all features, just like the private `targets._parse_alignment_cs` method.
Do this for both returned data frames and ones written to CSV files.
First, define a function to assert that two data frames are not equal:
def assert_frame_not_equal(*args, **kwargs):
    """Assert that two data frames are NOT equal.

    Takes the same arguments as `pandas.testing.assert_frame_equal` and
    inverts its outcome.

    Raises
    ------
    TypeError
        If either compared object is not a `pandas.DataFrame`.
        `assert_frame_equal` reports a wrong input type as an
        `AssertionError`, which would otherwise be mistaken for
        "frames differ" and let this check pass vacuously.
    AssertionError
        If `assert_frame_equal` considers the frames equal.

    """
    # the compared frames may be passed positionally or as `left` / `right`
    compared = list(args[:2])
    compared += [kwargs[key] for key in ('left', 'right') if key in kwargs]
    for obj in compared:
        if not isinstance(obj, pd.DataFrame):
            raise TypeError('expected pandas.DataFrame, got '
                            f"{type(obj).__name__}")

    try:
        assert_frame_equal(*args, **kwargs)
    except AssertionError:
        # frames are not equal, which is what we want
        return
    # frames are equal
    raise AssertionError('frames unexpectedly equal')
Now do the tests:
queryfile = '../notebooks/input_files/recA_lib-1_ccs.fastq'
mapper = alignparse.minimap2.Mapper(alignparse.minimap2.OPTIONS_CODON_DMS)

# All temporary files and directories are registered on a single `ExitStack`
# so they are cleaned up together when the block exits.
with contextlib.ExitStack() as stack:

    # make alignment SAM files
    alignmentfile = stack.enter_context(
            tempfile.NamedTemporaryFile('r+', suffix='.sam'))
    alignmentfile_all = stack.enter_context(
            tempfile.NamedTemporaryFile('r+', suffix='.sam'))
    targets.align(queryfile, alignmentfile.name, mapper)
    targets_all.align(queryfile, alignmentfile_all.name, mapper)

    # directly get data frames from alignment SAM files
    alignments_cs = targets._parse_alignment_cs(alignmentfile.name)
    alignments_all_cs = targets_all._parse_alignment_cs(alignmentfile_all.name)
    # `parse_alignment` results are unpacked below as the 3-tuple
    # `(read_stats, aligned, filtered)`
    alignments = targets.parse_alignment(alignmentfile.name)
    alignments_all = targets_all.parse_alignment(alignmentfile_all.name)

    # make sure the expected data frames are identical
    for targetname in targets.target_names:
        assert_frame_equal(alignments_cs[targetname],
                           alignments_all_cs[targetname])
        assert_frame_equal(alignments_cs[targetname],
                           alignments_all[1][targetname])
        assert_frame_not_equal(alignments[1][targetname],
                               alignments_all[1][targetname])

    # make sure the filtering is as expected: nothing is filtered when all
    # filters are `None`, but something is filtered by the YAML specs
    for targetname in targets.target_names:
        assert len(alignments_all[2][targetname]) == 0
        assert len(alignments[2][targetname]) > 0

    # make sure the read stats are as expected
    for a_tup in [alignments_all, alignments]:
        read_stats, aligned, filtered = a_tup
        # index once per result rather than once per assertion
        stats_by_category = read_stats.set_index('category')
        for targetname in targets.target_names:
            aligned_df = aligned[targetname]
            filtered_df = filtered[targetname]
            assert len(filtered_df) == (
                    stats_by_category.at[f"filtered {targetname}", 'count'])
            assert len(aligned_df) == (
                    stats_by_category.at[f"aligned {targetname}", 'count'])

    # now get the alignments into CSV files
    csv_dir = stack.enter_context(tempfile.TemporaryDirectory())
    csv_dir_all = stack.enter_context(tempfile.TemporaryDirectory())
    alignments_csv = targets.parse_alignment(alignmentfile.name,
                                             to_csv=True,
                                             csv_dir=csv_dir)
    # parse the SAM file produced by `targets_all` itself, so the CSV output
    # is compared against `alignments_all` parsed from the same file
    alignments_all_csv = targets_all.parse_alignment(alignmentfile_all.name,
                                                     to_csv=True,
                                                     csv_dir=csv_dir_all)

    # make sure the CSV files match those returned directly as data frames
    for a, a_csv in [(alignments, alignments_csv),
                     (alignments_all, alignments_all_csv)]:
        read_stats, aligned, filtered = a
        read_stats_csv, aligned_csv, filtered_csv = a_csv
        assert_frame_equal(read_stats, read_stats_csv)
        for targetname in targets.target_names:
            # `read_csv` reads empty fields as NaN, so convert them back to
            # empty strings before comparing to the in-memory frames
            assert_frame_equal(aligned[targetname],
                               pd.read_csv(aligned_csv[targetname]).fillna(''))
            assert_frame_equal(filtered[targetname],
                               pd.read_csv(filtered_csv[targetname]).fillna(''))