from qiime.test import write_test_data # A random 1% of the representative sequences from a de novo OTU # picking run on the Moving Pictures of the Human Microbiome dataset # (this is biased toward human microbiome OTUs) input_seqs_fp = "moving-picture-rep-seqs.01.fna" # sequences derived from my taxonomic assigner comparison: # https://github.com/gregcaporaso/short-read-tax-assignment # this is a randomly selected subset of GG 13_8 97% OTUs, sliced to the # 515F/806R amplicon region. (this is limited in practical applicability # because they are known and trusted reference sequences) input_seqs_fp = "/Users/caporaso/code/short-read-tax-assignment/data/simulated-community/B1/rep_set.fna" !rm -r core-set-aligned gg-aligned !cp $input_seqs_fp seqs !align_seqs.py -t /Users/caporaso/data/greengenes_core_sets/core_set_aligned.fasta.imputed -m pynast -o core-set-aligned -i seqs !align_seqs.py -t /Users/caporaso/data/gg_13_8_otus/rep_set_aligned/85_otus.fasta -o gg-aligned -i seqs !md5 core-set-aligned/seqs_aligned.fasta gg-aligned/seqs_aligned.fasta from skbio import Alignment a1 = Alignment.read('core-set-aligned/seqs_aligned.fasta') a2 = Alignment.read('gg-aligned/seqs_aligned.fasta') a1_ids = set(a1.ids()) a2_ids = set(a2.ids()) common_ids = a1_ids & a2_ids print "Number of sequences retained in the core-set-aligned data, but dropped in the GG-aligned data:" print len(a1_ids - a2_ids), a1_ids - a2_ids print "Number of sequences retained in the GG-aligned data, but dropped in the core-set-aligneddata:" print len(a2_ids - a1_ids), a2_ids - a1_ids print "Number of sequences that are in both alignments:" print len(common_ids) a1 = a1.subalignment(seqs_to_keep=common_ids) a2 = a2.subalignment(seqs_to_keep=common_ids) a1.write('core-set-aligned/seqs_aligned.fasta') a2.write('gg-aligned/seqs_aligned.fasta') !filter_alignment.py -i core-set-aligned/seqs_aligned.fasta -o core-set-aligned/ !filter_alignment.py -i gg-aligned/seqs_aligned.fasta -o gg-aligned/ !md5 core-set-aligned/seqs_aligned_pfiltered.fasta gg-aligned/seqs_aligned_pfiltered.fasta !make_phylogeny.py -i core-set-aligned/seqs_aligned_pfiltered.fasta -o core-set-aligned/seqs_aligned_pfiltered.tre !make_phylogeny.py -i gg-aligned/seqs_aligned_pfiltered.fasta -o gg-aligned/seqs_aligned_pfiltered.tre from skbio import TreeNode from skbio.stats.distance import mantel t1 = TreeNode.read('core-set-aligned/seqs_aligned_pfiltered.tre') t2 = TreeNode.read('gg-aligned/seqs_aligned_pfiltered.tre') mantel(t1.tip_tip_distances(), t2.tip_tip_distances(), strict=False)