from qiime.test import write_test_data

# A random 1% of the representative sequences from a de novo OTU
# picking run on the Moving Pictures of the Human Microbiome dataset
# (this is biased toward human microbiome OTUs)
input_seqs_fp = "moving-picture-rep-seqs.01.fna"

# sequences derived from my taxonomic assigner comparison:
# https://github.com/gregcaporaso/short-read-tax-assignment
# this is a randomly selected subset of GG 13_8 97% OTUs, sliced to the 
# 515F/806R amplicon region. (this is limited in practical applicability 
# because they are known and trusted reference sequences)
input_seqs_fp = "/Users/caporaso/code/short-read-tax-assignment/data/simulated-community/B1/rep_set.fna"
!rm -r core-set-aligned gg-aligned
!cp $input_seqs_fp seqs


!align_seqs.py -t /Users/caporaso/data/greengenes_core_sets/core_set_aligned.fasta.imputed -m pynast -o core-set-aligned -i seqs

!align_seqs.py -t /Users/caporaso/data/gg_13_8_otus/rep_set_aligned/85_otus.fasta -o gg-aligned -i seqs

!md5 core-set-aligned/seqs_aligned.fasta gg-aligned/seqs_aligned.fasta

from skbio import Alignment
a1 = Alignment.read('core-set-aligned/seqs_aligned.fasta')
a2 = Alignment.read('gg-aligned/seqs_aligned.fasta')

a1_ids = set(a1.ids())
a2_ids = set(a2.ids())
common_ids = a1_ids & a2_ids

print "Number of sequences retained in the core-set-aligned data, but dropped in the GG-aligned data:"
print len(a1_ids - a2_ids), a1_ids - a2_ids
print "Number of sequences retained in the GG-aligned data, but dropped in the core-set-aligneddata:"
print len(a2_ids - a1_ids), a2_ids - a1_ids
print "Number of sequences that are in both alignments:"
print len(common_ids)

a1 = a1.subalignment(seqs_to_keep=common_ids)
a2 = a2.subalignment(seqs_to_keep=common_ids)
a1.write('core-set-aligned/seqs_aligned.fasta')
a2.write('gg-aligned/seqs_aligned.fasta')

!filter_alignment.py -i core-set-aligned/seqs_aligned.fasta -o core-set-aligned/

!filter_alignment.py -i gg-aligned/seqs_aligned.fasta -o gg-aligned/

!md5 core-set-aligned/seqs_aligned_pfiltered.fasta  gg-aligned/seqs_aligned_pfiltered.fasta 

!make_phylogeny.py -i core-set-aligned/seqs_aligned_pfiltered.fasta -o core-set-aligned/seqs_aligned_pfiltered.tre

!make_phylogeny.py -i gg-aligned/seqs_aligned_pfiltered.fasta -o gg-aligned/seqs_aligned_pfiltered.tre

from skbio import TreeNode
from skbio.stats.distance import mantel

t1 = TreeNode.read('core-set-aligned/seqs_aligned_pfiltered.tre')
t2 = TreeNode.read('gg-aligned/seqs_aligned_pfiltered.tre')

mantel(t1.tip_tip_distances(), t2.tip_tip_distances(), strict=False)