ipyrad-analysis toolkit: distance

Key features:

Calculate pairwise genetic distances between samples.
Filter SNPs to reduce missing data.
Impute missing data using population allele frequencies.

required software¶

In [1]:

# conda install ipyrad -c bioconda
# conda install toyplot -c eaton-lab (optional)

In [2]:

import ipyrad.analysis as ipa
import toyplot

Short tutorial¶

Setup input files and params¶

In [3]:

# the path to your VCF or HDF5 formatted snps file
data = "/home/deren/Downloads/ref_pop2.snps.hdf5"

In [4]:

# group individuals into populations
imap = {
    "virg": ["TXWV2", "LALC2", "SCCU3", "FLSF33", "FLBA140"],
    "mini": ["FLSF47", "FLMO62", "FLSA185", "FLCK216"],
    "gemi": ["FLCK18", "FLSF54", "FLWO6", "FLAB109"],
    "bran": ["BJSL25", "BJSB3", "BJVL19"],
    "fusi": ["MXED8", "MXGT4", "TXGR3", "TXMD3"],
    "sagr": ["CUVN10", "CUCA4", "CUSV6", "CUMM5"],
    "oleo": ["CRL0030", "CRL0001", "HNDA09", "BZBB1", "MXSA3017"],
}

# minimum n samples that must be present in each SNP from each group
minmap = {i: 0.5 for i in imap}

calculate distances¶

In [5]:

# load the snp data into distance tool with arguments
from ipyrad.analysis.distance import Distance
dist = Distance(
    data=data, 
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
    subsample_snps=False,
)
dist.run()

Samples: 29
Sites before filtering: 349914
Filtered (indels): 0
Filtered (bi-allel): 13379
Filtered (mincov): 30459
Filtered (minmap): 111825
Filtered (combined): 120177
Sites after filtering: 229737
Sites containing missing values: 219551 (95.57%)
Missing values in SNP matrix: 814369 (12.22%)
Imputation: 'sampled'; (0, 1, 2) = 77.3%, 10.7%, 12.0%

save results¶

In [6]:

# save to a CSV file
dist.dists.to_csv("distances.csv")

In [7]:

# show the upper corner 
dist.dists.head()

Out[7]:

	BJSB3	BJSL25	BJVL19	BZBB1	CRL0001	CRL0030	CUCA4	CUMM5	CUSV6	CUVN10	...	FLWO6	HNDA09	LALC2	MXED8	MXGT4	MXSA3017	SCCU3	TXGR3	TXMD3	TXWV2
BJSB3	0.000000	0.250447	0.253472	0.592255	0.530145	0.572576	0.601853	0.597044	0.591990	0.579937	...	0.594005	0.582000	0.568137	0.464618	0.443942	0.579789	0.603638	0.487945	0.487936	0.590440
BJSL25	0.250447	0.000000	0.235900	0.558630	0.494291	0.537193	0.566156	0.559675	0.554665	0.542768	...	0.558769	0.548897	0.532239	0.435050	0.412694	0.547182	0.567323	0.453606	0.457105	0.554882
BJVL19	0.253472	0.235900	0.000000	0.567897	0.502775	0.547391	0.576355	0.569360	0.563000	0.554621	...	0.564728	0.555927	0.539913	0.441844	0.417060	0.556449	0.575336	0.464118	0.465476	0.562278
BZBB1	0.592255	0.558630	0.567897	0.000000	0.280691	0.280569	0.422670	0.422962	0.426266	0.394242	...	0.559152	0.285883	0.532918	0.525701	0.542381	0.317450	0.576455	0.552554	0.551579	0.563571
CRL0001	0.530145	0.494291	0.502775	0.280691	0.000000	0.239596	0.347859	0.322277	0.342836	0.213266	...	0.470064	0.262217	0.451717	0.466429	0.477764	0.299538	0.492224	0.487327	0.484123	0.482726

5 rows × 29 columns

Draw the matrix¶

In [8]:

toyplot.matrix(
    dist.dists, 
    bshow=False,
    tshow=False,
    rlocator=toyplot.locator.Explicit(
        range(len(dist.names)),
        sorted(dist.names),
));

Draw matrix reordered to match groups in imap¶

In [9]:

# get list of concatenated names from each group
ordered_names = []
for group in dist.imap.values():
    ordered_names += group

# reorder matrix to match name order    
ordered_matrix = dist.dists[ordered_names].T[ordered_names]

In [10]:

toyplot.matrix(
    ordered_matrix,
    bshow=False,
    tshow=False,
    rlocator=toyplot.locator.Explicit(
        range(len(ordered_names)),
        ordered_names,
));