#!/usr/bin/env python
# coding: utf-8
#
ipyrad-analysis toolkit: construct
#
# The program construct is a STRUCTURE-like tool that incorporates expectations of isolation by distance. It is available as an R package. This notebook demonstrates how to convert data to the proper format for use in construct using simulated data as an example. For details on running construct see [their documentation](https://github.com/gbradburd/conStruct).
# ### Required software
# In[1]:
# conda install ipyrad ipcoal -c conda-forge -c bioconda
# In[3]:
import ipyrad.analysis as ipa
import toytree
import ipcoal
# In[22]:
print('ipyrad', ipa.__version__)
print('toytree', toytree.__version__)
print('ipcoal', ipcoal.__version__)
# ### Simulate example data
# In[5]:
# network model
tree = toytree.rtree.unittree(7, treeheight=3e6, seed=123)
tree.draw(ts='o', admixture_edges=(3, 2));
# In[15]:
# simulation model with admixture and missing data
model = ipcoal.Model(tree, Ne=1e4, nsamples=4, admixture_edges=(3, 2, 0.5, 0.1))
model.sim_snps(250)
model.write_snps_to_hdf5(name="test-construct", outdir="/tmp", diploid=True)
# ### Input data file
# In[16]:
# the path to your HDF5 formatted snps file
SNPS = "/tmp/test-construct.snps.hdf5"
# ### Population assignments
# In[17]:
IMAP = {
"r0": ["r0-0", "r0-1"],
"r1": ["r1-0", "r1-1"],
"r2": ["r2-0", "r2-1"],
"r3": ["r3-0", "r3-1"],
"r4": ["r4-0", "r4-1"],
"r5": ["r5-0", "r5-1"],
"r6": ["r6-0", "r6-1"],
}
# ### Filter missing data and convert to genotype frequencies
# In[18]:
# apply filtering to the SNPs file
tool = ipa.snps_extracter(data=SNPS, imap=IMAP, minmap={i:2 for i in IMAP})
tool.parse_genos_from_hdf5();
# In[19]:
# convert SNP data to genotype frequencies
df = tool.get_population_geno_frequency()
df.head()
# ### Write data to file
# In[20]:
# write to a file
df.to_csv("/tmp/freqs.csv")