#!/usr/bin/env python
# coding: utf-8

# # ipyrad-analysis toolkit: distance

# Key features:
#
# 1. Calculate pairwise genetic distances between samples.
# 2. Filter SNPs to reduce missing data.
# 3. Impute missing data using population allele frequencies.

# ### required software

# In[1]:

# conda install ipyrad -c bioconda
# conda install toyplot -c eaton-lab (optional)

# In[2]:

import ipyrad.analysis as ipa
import toyplot
from ipyrad.analysis.distance import Distance

# ### Short tutorial

# #### Setup input files and params

# In[3]:

# location of the SNP file to analyze (VCF or HDF5 format)
data = "/home/deren/Downloads/ref_pop2.snps.hdf5"

# In[4]:

# population assignment: group name -> sample names in that group
imap = {
    "virg": ["TXWV2", "LALC2", "SCCU3", "FLSF33", "FLBA140"],
    "mini": ["FLSF47", "FLMO62", "FLSA185", "FLCK216"],
    "gemi": ["FLCK18", "FLSF54", "FLWO6", "FLAB109"],
    "bran": ["BJSL25", "BJSB3", "BJVL19"],
    "fusi": ["MXED8", "MXGT4", "TXGR3", "TXMD3"],
    "sagr": ["CUVN10", "CUCA4", "CUSV6", "CUMM5"],
    "oleo": ["CRL0030", "CRL0001", "HNDA09", "BZBB1", "MXSA3017"],
}

# minimum samples that must be present in each SNP from each group;
# every group in imap gets the same threshold of 0.5
minmap = dict.fromkeys(imap, 0.5)

# #### calculate distances

# In[5]:

# configure the distance tool with the inputs above, then run it
dist = Distance(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
    subsample_snps=False,
)
dist.run()

# #### save results

# In[6]:

# write the distance table to a CSV file
dist.dists.to_csv("distances.csv")

# In[7]:

# show the upper corner (displays only in an interactive session)
dist.dists.head()

# ### Draw the matrix

# In[8]:

# label the rows with the alphabetically sorted sample names
row_locator = toyplot.locator.Explicit(
    range(len(dist.names)),
    sorted(dist.names),
)
toyplot.matrix(
    dist.dists,
    bshow=False,
    tshow=False,
    rlocator=row_locator,
)

# ### Draw matrix reordered to match groups in imap

# In[9]:

# flatten the imap groups into a single list of names, group by group
ordered_names = [name for members in dist.imap.values() for name in members]

# reorder matrix to match name order
ordered_matrix = dist.dists[ordered_names].T[ordered_names]

# In[10]:

# label the rows with the names in imap group order
ordered_locator = toyplot.locator.Explicit(
    range(len(ordered_names)),
    ordered_names,
)
toyplot.matrix(
    ordered_matrix,
    bshow=False,
    tshow=False,
    rlocator=ordered_locator,
)