#!/usr/bin/env python # coding: utf-8 # # ipyrad-analysis toolkit: mrBayes # # This notebook shows advanced features for implementing mrbayes through ipa by using fossil (or secondary information derived) calibrations to node ages. # ### Required software # In[1]: # conda install ipyrad -c conda-forge -c bioconda # conda install toytree -c conda-forge # conda install mrbayes -c conda-forge -c bioconda # In[2]: import ipyrad.analysis as ipa import toytree # In[3]: # check mrbayes version get_ipython().system(' mb -v') # ### Sequence database file # In[4]: # the seqs.hdf5 file produced by ipyrad SEQSFILE = "/tmp/oaks.seqs.hdf5" # In[5]: # download example seqs file if not already present (~500Mb, takes ~5 minutes) URL = "https://www.dropbox.com/s/c1u89nwuuv8e6ie/virentes_ref.seqs.hdf5?raw=1" ipa.download(URL, path=SEQSFILE); # ### Write a nexus alignment (using window extracter) # This tool will extract RAD loci mapping to the first scaffold and with no missing data across the selected 8 samples and write the sequence data to a nexus file. See the window_extracter docs for more details. In this example, the filtered alignment contains 219K sites with 2525 SNPs across the 8 samples. This alignment was written to the nexus filt path shown below. # In[22]: # samples in include IMAP = { 'include': [ "LALC2", "FLSF47", "FLCK18", "CUSV6", "CRL0030", "MXED8", "BJSB3", "AR" ], } # init wex tool to select data wex = ipa.window_extracter( data=SEQSFILE, name="oaks-chr1-aln", workdir="/tmp", scaffold_idxs=0, mincov=1.0, imap=IMAP, ) # show stats for this alignment display(wex.stats) # write the alignment file wex.run(nexus=True, force=True) # ### Setting mb param settings # # The ipa.mrbayes wrapper tool will create a NEXUS file that contains both the sequence data from the nex file we just created, as well as a block with instructions for how to run mrbayes. The tool can then also execute this nexus file in mrbayes to run the tree inference and summarize the results. # # The purpose of the ipa.mrbayes tool is merely for convenience. You should still take care to understand the priors and parameters that you are selecting by reading the mrbayes documentation. # # This wrapper currently only supports a two relatively simple models. Here we will implement a relaxed molecular clock model following these instructions: https://www.groundai.com/project/molecular-clock-dating-using-mrbayes/. # In[23]: # the path to your NEXUS formatted alignment file NEXFILE = "/tmp/oaks-chr1-aln.nex" # In[24]: # init mrbayes object with input data and (optional) parameter options mb = ipa.mrbayes( name="oaks-chr1-mb", data=NEXFILE, clock_model=2, ngen=int(5e6), samplefreq=int(5e3), nruns=2, ) # In[25]: # modify a param and show all params mb.params.nruns = 1 mb.params # In[26]: # print the mb nexus string; this can be modified by changing .params print(mb.nexus_string) # ### NOT YET UPDATED PAST THIS MARK ------ # # # ### Run mrbayes # # This takes about 20 minutes to run on my laptop for a data set with 13 samples and ~1.2M SNPs and about 14% missing data. For a publication quality analyses you will likely want to run it quite a bit longer and to test for convergence of your runs using a tool like `tracer` and `treeannotator`. # In[8]: # run the command, (options: block until finishes; overwrite existing) mb.run(block=True, force=True) # ### Accessing results # # You can access the results files of the mrbayes analysis from the mb analysis object from the `.trees` attribute, or of course you can also write out the full path to the result files which will be located in your working directory which can be set as a parameter option, otherwise is defaulted to `"./analysis-mb/".` # In[9]: # you can access the tree results from the mb analysis object mb.trees # In[10]: # for example to select the consensus tree path mb.trees.constre # ### Check the parameter values and convergence # # On this very large data set (in terms of number of sites) we can see that the posterior has not explored the parameter space very well by the end of this run. We would hope to see the ESS score for all parameters above 100. Here the TH, TL, net_speciation, and clockrate parameters have low convergence scores. # In[12]: # show the convergence statistics (from the .pstat mb output file) mb.convergence_stats.round(3) # ### Plotting mb consesus tree # You can plot either the consensus tree (`.nex.cons.tre`) or the posterior distribution of trees (`.nex.t`) produced by a mrbayes run by loading the single tree or list of trees with `toytree` as demonstrated below. # In[16]: # load from the .trees attribute or from the saved tree file tre = toytree.tree(mb.trees.constre, tree_format=10) # draw the tree with styling tre.draw( tip_labels_align=True, scalebar=True, #node_labels="support", ); # ### Plotting mb posterior distribution of trees # In[27]: # load from the .trees attribute or from the saved tree file mtre = toytree.mtree(mb.trees.posttre) # remove the burnin manually mtre.treelist = mtre.treelist[10:] # draw a few trees on a shared axis mtre.draw_tree_grid( nrows=1, ncols=4, start=20, shared_axis=True, height=400, ); # The command below plots a cloud tree diagram across the ~90 trees in the posterior distribution. Because we have no fossil calibrations for this data the root age is not of great interest so I use the `.mod` fuction below to scale the root height of each tree to 1.0. This will highlight variation among trees in relative node heights. Below you can see that in the posterior set of trees there some variation around node heights in the middle of the tree, and there does not appear to be any variation in the topology recovered, which is not uncommon for large concatenated data sets. # In[38]: # set root height of all trees to 1.0 mtre.treelist = [i.mod.node_scale_root_height(1.0) for i in mtre.treelist] # In[39]: # draw the posterior of trees overlapping mtre.draw_cloud_tree( width=400, height=300, html=True, edge_colors="red", edge_style={"stroke-opacity": 0.02}, tip_labels={i: i.rsplit("_", 1)[0] for i in mtre.treelist[0].get_tip_labels()}, ); # ### Save figures # In[14]: import toyplot.pdf canvas, axes = mtre.draw_cloud_tree( width=400, height=300, html=True, edge_colors="red", tip_labels={i: i.rsplit("_", 1)[0] for i in mtre.treelist[0].get_tip_labels()} ); toyplot.pdf.render(canvas, "./pedicularis-min10-5M-mb-tree.pdf") # ### What else? # # If you have reference mapped data then you should see the `.treeslider()` tool to infer trees in sliding windows along scaffolds; or the `.extractor()` tool to extract, filter, and concatenate RAD loci within a given window (e.g., near some known gene).