Key features:
# conda install ipyrad -c bioconda
# conda install toyplot -c eaton-lab (optional)
import ipyrad.analysis as ipa
import toyplot
# the path to your VCF or HDF5 formatted snps file
data = "/home/deren/Downloads/ref_pop2.snps.hdf5"
# group individuals into populations
imap = {
"virg": ["TXWV2", "LALC2", "SCCU3", "FLSF33", "FLBA140"],
"mini": ["FLSF47", "FLMO62", "FLSA185", "FLCK216"],
"gemi": ["FLCK18", "FLSF54", "FLWO6", "FLAB109"],
"bran": ["BJSL25", "BJSB3", "BJVL19"],
"fusi": ["MXED8", "MXGT4", "TXGR3", "TXMD3"],
"sagr": ["CUVN10", "CUCA4", "CUSV6", "CUMM5"],
"oleo": ["CRL0030", "CRL0001", "HNDA09", "BZBB1", "MXSA3017"],
}
# minimum n samples that must be present in each SNP from each group
minmap = {i: 0.5 for i in imap}
# load the snp data into distance tool with arguments
from ipyrad.analysis.distance import Distance
dist = Distance(
data=data,
imap=imap,
minmap=minmap,
mincov=0.5,
impute_method="sample",
subsample_snps=False,
)
dist.run()
Samples: 29 Sites before filtering: 349914 Filtered (indels): 0 Filtered (bi-allel): 13379 Filtered (mincov): 30459 Filtered (minmap): 111825 Filtered (combined): 120177 Sites after filtering: 229737 Sites containing missing values: 219551 (95.57%) Missing values in SNP matrix: 814369 (12.22%) Imputation: 'sampled'; (0, 1, 2) = 77.3%, 10.7%, 12.0%
# save to a CSV file
dist.dists.to_csv("distances.csv")
# show the upper corner
dist.dists.head()
BJSB3 | BJSL25 | BJVL19 | BZBB1 | CRL0001 | CRL0030 | CUCA4 | CUMM5 | CUSV6 | CUVN10 | ... | FLWO6 | HNDA09 | LALC2 | MXED8 | MXGT4 | MXSA3017 | SCCU3 | TXGR3 | TXMD3 | TXWV2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
BJSB3 | 0.000000 | 0.250447 | 0.253472 | 0.592255 | 0.530145 | 0.572576 | 0.601853 | 0.597044 | 0.591990 | 0.579937 | ... | 0.594005 | 0.582000 | 0.568137 | 0.464618 | 0.443942 | 0.579789 | 0.603638 | 0.487945 | 0.487936 | 0.590440 |
BJSL25 | 0.250447 | 0.000000 | 0.235900 | 0.558630 | 0.494291 | 0.537193 | 0.566156 | 0.559675 | 0.554665 | 0.542768 | ... | 0.558769 | 0.548897 | 0.532239 | 0.435050 | 0.412694 | 0.547182 | 0.567323 | 0.453606 | 0.457105 | 0.554882 |
BJVL19 | 0.253472 | 0.235900 | 0.000000 | 0.567897 | 0.502775 | 0.547391 | 0.576355 | 0.569360 | 0.563000 | 0.554621 | ... | 0.564728 | 0.555927 | 0.539913 | 0.441844 | 0.417060 | 0.556449 | 0.575336 | 0.464118 | 0.465476 | 0.562278 |
BZBB1 | 0.592255 | 0.558630 | 0.567897 | 0.000000 | 0.280691 | 0.280569 | 0.422670 | 0.422962 | 0.426266 | 0.394242 | ... | 0.559152 | 0.285883 | 0.532918 | 0.525701 | 0.542381 | 0.317450 | 0.576455 | 0.552554 | 0.551579 | 0.563571 |
CRL0001 | 0.530145 | 0.494291 | 0.502775 | 0.280691 | 0.000000 | 0.239596 | 0.347859 | 0.322277 | 0.342836 | 0.213266 | ... | 0.470064 | 0.262217 | 0.451717 | 0.466429 | 0.477764 | 0.299538 | 0.492224 | 0.487327 | 0.484123 | 0.482726 |
5 rows × 29 columns
toyplot.matrix(
dist.dists,
bshow=False,
tshow=False,
rlocator=toyplot.locator.Explicit(
range(len(dist.names)),
sorted(dist.names),
));
# get list of concatenated names from each group
ordered_names = []
for group in dist.imap.values():
ordered_names += group
# reorder matrix to match name order
ordered_matrix = dist.dists[ordered_names].T[ordered_names]
toyplot.matrix(
ordered_matrix,
bshow=False,
tshow=False,
rlocator=toyplot.locator.Explicit(
range(len(ordered_names)),
ordered_names,
));