#!/usr/bin/env python
# coding: utf-8
#
# # ipyrad-analysis toolkit: distance
# Key features:
#
# 1. Calculate pairwise genetic distances between samples.
# 2. Filter SNPs to reduce missing data.
# 3. Impute missing data using population allele frequencies.
# ### required software
# In[1]:
# conda install ipyrad -c bioconda
# conda install toyplot -c eaton-lab (optional)
# In[2]:
import ipyrad.analysis as ipa
import toyplot
# ### Short tutorial
# #### Setup input files and params
# In[3]:
# path to the SNP input file (VCF or HDF5 format)
data = "/home/deren/Downloads/ref_pop2.snps.hdf5"
# In[4]:
# population assignments: group name -> list of sample names in that group
imap = dict(
    virg=["TXWV2", "LALC2", "SCCU3", "FLSF33", "FLBA140"],
    mini=["FLSF47", "FLMO62", "FLSA185", "FLCK216"],
    gemi=["FLCK18", "FLSF54", "FLWO6", "FLAB109"],
    bran=["BJSL25", "BJSB3", "BJVL19"],
    fusi=["MXED8", "MXGT4", "TXGR3", "TXMD3"],
    sagr=["CUVN10", "CUCA4", "CUSV6", "CUMM5"],
    oleo=["CRL0030", "CRL0001", "HNDA09", "BZBB1", "MXSA3017"],
)
# per-group minimum coverage required at each SNP; every group in imap gets
# the same threshold of 0.5.
# NOTE(review): a float here presumably means a proportion of the group's
# samples rather than an absolute count -- confirm against the ipyrad docs.
minmap = dict.fromkeys(imap, 0.5)
# #### calculate distances
# In[5]:
# load the snp data into the distance tool with arguments
from ipyrad.analysis.distance import Distance

# keyword arguments forwarded unchanged to the Distance constructor:
# the SNP file, the population map, and the per-group coverage thresholds
# defined above, plus the filtering / imputation settings for this run.
# NOTE(review): mincov=0.5 presumably is the minimum overall sample coverage
# per SNP, and impute_method="sample" presumably draws missing genotypes from
# population allele frequencies -- confirm both against the ipyrad docs.
distance_kwargs = dict(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
    subsample_snps=False,
)
dist = Distance(**distance_kwargs)

# run the analysis; results are read from `dist.dists` further below
dist.run()
# #### save results
# In[6]:
# write the distance table to "distances.csv" in the working directory
dist_table = dist.dists
dist_table.to_csv("distances.csv")
# In[7]:
# preview the first rows of the table (only displayed when run interactively,
# e.g. in a notebook; as a plain script this expression has no visible effect)
dist_table.head()
# ### Draw the matrix
# In[8]:
# row tick positions 0..n-1 paired with the alphabetically sorted sample names
# NOTE(review): labels are sorted(dist.names) while the rows follow the order
# of dist.dists -- confirm the two orders agree, otherwise rows are mislabeled.
row_positions = range(len(dist.names))
row_labels = sorted(dist.names)

# draw the distance matrix with bshow/tshow disabled and explicit row labels
toyplot.matrix(
    dist.dists,
    bshow=False,
    tshow=False,
    rlocator=toyplot.locator.Explicit(row_positions, row_labels),
)
# ### Draw matrix reordered to match groups in imap
# In[9]:
# flatten the per-group sample lists into one list of names, keeping the
# group order of dist.imap and the sample order within each group
ordered_names = [name for group in dist.imap.values() for name in group]
# reorder the matrix to match that name order: select by ordered_names,
# transpose, then select by ordered_names again so both axes are reordered
ordered_matrix = dist.dists[ordered_names].T[ordered_names]
# In[10]:
# explicit row locator: one tick per sample, labelled in the imap group order
group_row_locator = toyplot.locator.Explicit(
    range(len(ordered_names)),
    ordered_names,
)

# draw the reordered matrix with bshow/tshow disabled
toyplot.matrix(
    ordered_matrix,
    bshow=False,
    tshow=False,
    rlocator=group_row_locator,
)