ipyrad-analysis toolkit: Popgen summary statistics

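Calculate population-genetic summary statistics: within-population nucleotide diversity (pi), Watterson's theta, and Tajima's D, plus between-population Dxy and Fst.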

In [15]:

import ipyrad
import ipyrad.analysis as ipa
import ipyparallel as ipp
from ipyrad.analysis.popgen import Popgen

# Connect to an ipyparallel cluster that is already running.
# The cluster must be started externally BEFORE this cell is executed.
# Run this on the command line:
# `ipcluster start --cluster-id=popgen --n=12 --daemonize`
#
# And remember to stop the cluster when you're done:
# `ipcluster stop --cluster-id=popgen`

# Uses the same cluster id ("popgen") as the `ipcluster start` command above.
ipyclient = ipp.Client(cluster_id="popgen")
# Sanity check: presumably the number of connected engines (12 if the
# cluster was started with --n=12) -- confirm against the ipyparallel docs.
print(len(ipyclient))

In [13]:

# Input for the popgen tool: the HDF5-formatted sequence file written to the
# assembly's output directory (here the `wat` test assembly under /tmp).
data = "/tmp/ipyrad-test/wat_outfiles/wat.seqs.hdf5"

# Alternative input: the popgen tools can also accept an ipyrad assembly
# object instead of a file path. Uncomment to use it:
#data = ipyrad.load_json("/tmp/ipyrad-test/wat.json")

In [11]:

# Population assignment: each key is a population name and each value is the
# list of sample names that belong to that population.
imap = {
    "pop1": ["1A_0", "1B_0", "1C_0", "1D_0"],
    "pop2": ["2E_0", "2F_0", "2G_0", "2H_0"],
    "pop3": ["3I_0", "3J_0", "3K_0", "3L_0"],
}

In [73]:

# Create the popgen object from the sequence data and the population map,
# then run it. The ipyparallel client is passed in so the calculation can use
# the running cluster (the output below reports the parallel connection and
# progress for the per-locus sumstats and the per-population collation).
popgen = Popgen(data=data, imap=imap)
popgen.run(ipyclient=ipyclient)

Parallel connection | bobolink: 12 cores
[locus filter] full data: 1000
[locus filter] post filter: 998
[####################] 100% 0:00:05 | Calculating sumstats for nloci 998 
[####################] 100% 0:00:01 | Collating sumstats for npops 3

In [74]:

# Check the results. As shown in the output below, they contain two entries:
#   between -- pairwise population matrices for Dxy, Fst and Fst_adj
#   within  -- per-population mean_pi, mean_Watterson and mean_TajimasD
popgen.results

Out[74]:

between   {'Dxy':       pop1   pop2   pop3
pop1   0.0  0.025  0.025
pop2   0.0  0.000  0.024
pop3   0.0  0.000  0.000, 'Fst':       pop1   pop2   pop3
pop1   0.0  0.372  0.416
pop2   0.0  0.000  0.416
pop3   0.0  0.000  0.000, 'Fst_adj':       pop1   pop2   pop3
pop1   0.0  0.299  0.343
pop2   0.0  0.000  0.344
pop3   0.0  0.000  0.000}
within         mean_pi mean_Watterson mean_TajimasD
pop1   0.006          0.005         0.312
pop2   0.006          0.005         0.279
pop3   0.006          0.005         0.293