Genetic distance matrices are used in many contexts to study the evolutionary divergence of samples or populations. The ipa.distance
module provides a simple and convenient framework to implement several distance based metrics.
Key features:
# conda install ipyrad -c conda-forge -c bioconda
# conda install ipcoal -c conda-forge
import ipyrad.analysis as ipa
import ipcoal
import toyplot
import toytree
# generate and draw an imbalanced 5 tip tree
tree = toytree.rtree.imbtree(ntips=5, treeheight=500000)
tree.draw(ts='p');
The SNPs output is saved to an HDF5 database file.
# setup a model to simulate 8 haploid samples per species
model = ipcoal.Model(tree=tree, Ne=1e4, nsamples=8)
model.sim_loci(1000, 50)
model.write_snps_to_hdf5(name="test-dist", outdir="/tmp", diploid=True)
wrote 1044 SNPs to /tmp/test-dist.snps.hdf5
# the path to the HDF5 formatted snps file
SNPS = "/tmp/test-dist.snps.hdf5"
A dictionary mapping of population names to sample names.
from itertools import groupby
# load sample names from SNPs file
tool = ipa.snps_extracter(SNPS)
# group names by prefix before '-'
groups = groupby(tool.names, key=lambda x: x.split("-")[0])
# arrange into a dictionary
IMAP = {i[0]: list(i[1]) for i in groups}
# show the dict
IMAP
{'r0': ['r0-0', 'r0-1', 'r0-2', 'r0-3'], 'r1': ['r1-0', 'r1-1', 'r1-2', 'r1-3'], 'r2': ['r2-0', 'r2-1', 'r2-2', 'r2-3'], 'r3': ['r3-0', 'r3-1', 'r3-2', 'r3-3'], 'r4': ['r4-0', 'r4-1', 'r4-2', 'r4-3']}
The correction applies a model of sequence substitution where more complex models can apply a greater penalty for unobserved changes (e.g., HKY or GTR). This allows you to use either SNPs or SEQUENCES as input. Here we are using SNPs. More on this later... (TODO).
dist = ipa.distance(
data=SNPS,
imap=IMAP,
minmap={i: 1 for i in IMAP},
mincov=0.5,
impute_method=None,
)
# infer the distance matrix from sequence data
dist.run()
Samples: 20 Sites before filtering: 1044 Filtered (indels): 0 Filtered (bi-allel): 9 Filtered (mincov): 0 Filtered (minmap): 0 Filtered (subsample invariant): 0 Filtered (combined): 9 Sites after filtering: 1035 Sites containing missing values: 0 (0.00%) Missing values in SNP matrix: 0 (0.00%) Imputation: 'None'; (0, 1, 2) = 100.0%, 0.0%, 0.0%
# show the first few data cells
dist.dists.iloc[:5, :12]
r0-0 | r0-1 | r0-2 | r0-3 | r1-0 | r1-1 | r1-2 | r1-3 | r2-0 | r2-1 | r2-2 | r2-3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
r0-0 | 0.000 | 0.033 | 0.045 | 0.036 | 0.510 | 0.499 | 0.510 | 0.518 | 0.911 | 0.915 | 0.892 | 0.908 |
r0-1 | 0.033 | 0.000 | 0.032 | 0.024 | 0.508 | 0.497 | 0.508 | 0.516 | 0.909 | 0.913 | 0.890 | 0.906 |
r0-2 | 0.045 | 0.032 | 0.000 | 0.031 | 0.509 | 0.498 | 0.509 | 0.517 | 0.910 | 0.914 | 0.891 | 0.907 |
r0-3 | 0.036 | 0.024 | 0.031 | 0.000 | 0.505 | 0.494 | 0.505 | 0.513 | 0.906 | 0.910 | 0.887 | 0.903 |
r1-0 | 0.510 | 0.508 | 0.509 | 0.505 | 0.000 | 0.043 | 0.050 | 0.046 | 0.990 | 0.994 | 0.969 | 0.984 |
tool = ipa.neighbor_joining(matrix=dist.dists)
--------------------------------------------------------------------------- ToytreeError Traceback (most recent call last) <ipython-input-37-963bfc412bc1> in <module>() ----> 1 tool = ipa.neighbor_joining(matrix=dist.dists) /home/deren/Documents/ipyrad/ipyrad/analysis/neighbor_joining.py in __init__(self, matrix, steps) 69 # relabel tips 70 if isinstance(matrix, pd.DataFrame): ---> 71 tree = tree.set_node_values( 72 feature="name", 73 values={i: j for (i, j) in enumerate(matrix.index)} /home/deren/miniconda3/envs/py38/lib/python3.7/site-packages/toytree/Toytree.py in set_node_values(self, feature, values, default) 613 if nidx not in ndict: 614 raise ToytreeError( --> 615 "node idx or name {} not in tree".format(nidx)) 616 617 # or, set everyone to a null value ToytreeError: node idx or name 6 not in tree
# create a canvas
canvas = toyplot.Canvas(width=500, height=450);
# add tree
axes = canvas.cartesian(bounds=("10%", "35%", "10%", "90%"))
gtree.draw(axes=axes, tip_labels=True, tip_labels_align=True)
# add matrix
table = canvas.table(
rows=matrix.shape[0],
columns=matrix.shape[1],
margin=0,
bounds=("40%", "95%", "9%", "91%"),
)
colormap = toyplot.color.brewer.map("BlueRed")
# apply a color to each cell in the table
for ridx in range(matrix.shape[0]):
for cidx in range(matrix.shape[1]):
cell = table.cells.cell[ridx, cidx]
cell.style = {
"fill": colormap.colors(matrix.iloc[ridx, cidx], 0, 1),
}
dist.dists
r0-0 | r0-1 | r1-0 | r1-1 | r2-0 | r2-1 | r3-0 | r3-1 | r4-0 | r4-1 | r5-0 | r5-1 | r6-0 | r6-1 | r7-0 | r7-1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
r0-0 | 0.000 | 0.079 | 0.556 | 0.587 | 0.873 | 0.778 | 0.714 | 0.794 | 0.730 | 0.730 | 0.825 | 0.778 | 0.778 | 0.730 | 0.746 | 0.841 |
r0-1 | 0.079 | 0.000 | 0.508 | 0.540 | 0.825 | 0.730 | 0.667 | 0.746 | 0.683 | 0.683 | 0.778 | 0.730 | 0.730 | 0.683 | 0.698 | 0.794 |
r1-0 | 0.556 | 0.508 | 0.000 | 0.063 | 0.825 | 0.730 | 0.667 | 0.746 | 0.683 | 0.683 | 0.778 | 0.730 | 0.730 | 0.683 | 0.698 | 0.794 |
r1-1 | 0.587 | 0.540 | 0.063 | 0.000 | 0.857 | 0.762 | 0.698 | 0.778 | 0.714 | 0.714 | 0.810 | 0.762 | 0.762 | 0.714 | 0.730 | 0.825 |
r2-0 | 0.873 | 0.825 | 0.825 | 0.857 | 0.000 | 0.063 | 0.476 | 0.556 | 0.746 | 0.746 | 0.841 | 0.794 | 0.794 | 0.746 | 0.762 | 0.857 |
r2-1 | 0.778 | 0.730 | 0.730 | 0.762 | 0.063 | 0.000 | 0.381 | 0.460 | 0.651 | 0.651 | 0.746 | 0.698 | 0.698 | 0.651 | 0.667 | 0.762 |
r3-0 | 0.714 | 0.667 | 0.667 | 0.698 | 0.476 | 0.381 | 0.000 | 0.079 | 0.587 | 0.587 | 0.683 | 0.635 | 0.635 | 0.587 | 0.603 | 0.698 |
r3-1 | 0.794 | 0.746 | 0.746 | 0.778 | 0.556 | 0.460 | 0.079 | 0.000 | 0.667 | 0.667 | 0.762 | 0.714 | 0.714 | 0.667 | 0.683 | 0.778 |
r4-0 | 0.730 | 0.683 | 0.683 | 0.714 | 0.746 | 0.651 | 0.587 | 0.667 | 0.000 | 0.095 | 0.317 | 0.270 | 0.397 | 0.286 | 0.492 | 0.587 |
r4-1 | 0.730 | 0.683 | 0.683 | 0.714 | 0.746 | 0.651 | 0.587 | 0.667 | 0.095 | 0.000 | 0.254 | 0.206 | 0.397 | 0.286 | 0.492 | 0.587 |
r5-0 | 0.825 | 0.778 | 0.778 | 0.810 | 0.841 | 0.746 | 0.683 | 0.762 | 0.317 | 0.254 | 0.000 | 0.016 | 0.492 | 0.381 | 0.587 | 0.683 |
r5-1 | 0.778 | 0.730 | 0.730 | 0.762 | 0.794 | 0.698 | 0.635 | 0.714 | 0.270 | 0.206 | 0.016 | 0.000 | 0.444 | 0.333 | 0.540 | 0.635 |
r6-0 | 0.778 | 0.730 | 0.730 | 0.762 | 0.794 | 0.698 | 0.635 | 0.714 | 0.397 | 0.397 | 0.492 | 0.444 | 0.000 | 0.111 | 0.667 | 0.762 |
r6-1 | 0.730 | 0.683 | 0.683 | 0.714 | 0.746 | 0.651 | 0.587 | 0.667 | 0.286 | 0.286 | 0.381 | 0.333 | 0.111 | 0.000 | 0.556 | 0.651 |
r7-0 | 0.746 | 0.698 | 0.698 | 0.730 | 0.762 | 0.667 | 0.603 | 0.683 | 0.492 | 0.492 | 0.587 | 0.540 | 0.667 | 0.556 | 0.000 | 0.063 |
r7-1 | 0.841 | 0.794 | 0.794 | 0.825 | 0.857 | 0.762 | 0.698 | 0.778 | 0.587 | 0.587 | 0.683 | 0.635 | 0.762 | 0.651 | 0.063 | 0.000 |
# style the gaps between cells
table.body.gaps.columns[:] = 3
table.body.gaps.rows[:] = 3
# hide axes coordinates
axes.show = False
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /home/deren/miniconda3/envs/py38/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 2888 try: -> 2889 return self._engine.get_loc(casted_key) 2890 except KeyError as err: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: (0, 0) The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-39-fbc956fc29f0> in <module>() 21 cell = table.cells.cell[ridx, cidx] 22 cell.style = { ---> 23 "fill": colormap.colors(matrix[ridx, cidx], 0, 100), 24 } 25 /home/deren/miniconda3/envs/py38/lib/python3.7/site-packages/pandas/core/frame.py in __getitem__(self, key) 2897 if self.columns.nlevels > 1: 2898 return self._getitem_multilevel(key) -> 2899 indexer = self.columns.get_loc(key) 2900 if is_integer(indexer): 2901 indexer = [indexer] /home/deren/miniconda3/envs/py38/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 2889 return self._engine.get_loc(casted_key) 2890 except KeyError as err: -> 2891 raise KeyError(key) from err 2892 2893 if tolerance is not None: KeyError: (0, 0)
# load the snp data into distance tool with arguments
dist = Distance(
data=data,
imap=imap,
minmap=minmap,
mincov=0.5,
impute_method="sample",
subsample_snps=False,
)
dist.run()
Samples: 29 Sites before filtering: 349914 Filtered (indels): 0 Filtered (bi-allel): 13379 Filtered (mincov): 30459 Filtered (minmap): 111825 Filtered (combined): 120177 Sites after filtering: 229737 Sites containing missing values: 219551 (95.57%) Missing values in SNP matrix: 814369 (12.22%) Imputation: 'sampled'; (0, 1, 2) = 77.3%, 10.7%, 12.0%
# save to a CSV file
dist.dists.to_csv("distances.csv")
# show the upper corner
dist.dists.head()
BJSB3 | BJSL25 | BJVL19 | BZBB1 | CRL0001 | CRL0030 | CUCA4 | CUMM5 | CUSV6 | CUVN10 | ... | FLWO6 | HNDA09 | LALC2 | MXED8 | MXGT4 | MXSA3017 | SCCU3 | TXGR3 | TXMD3 | TXWV2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
BJSB3 | 0.000000 | 0.250447 | 0.253472 | 0.592255 | 0.530145 | 0.572576 | 0.601853 | 0.597044 | 0.591990 | 0.579937 | ... | 0.594005 | 0.582000 | 0.568137 | 0.464618 | 0.443942 | 0.579789 | 0.603638 | 0.487945 | 0.487936 | 0.590440 |
BJSL25 | 0.250447 | 0.000000 | 0.235900 | 0.558630 | 0.494291 | 0.537193 | 0.566156 | 0.559675 | 0.554665 | 0.542768 | ... | 0.558769 | 0.548897 | 0.532239 | 0.435050 | 0.412694 | 0.547182 | 0.567323 | 0.453606 | 0.457105 | 0.554882 |
BJVL19 | 0.253472 | 0.235900 | 0.000000 | 0.567897 | 0.502775 | 0.547391 | 0.576355 | 0.569360 | 0.563000 | 0.554621 | ... | 0.564728 | 0.555927 | 0.539913 | 0.441844 | 0.417060 | 0.556449 | 0.575336 | 0.464118 | 0.465476 | 0.562278 |
BZBB1 | 0.592255 | 0.558630 | 0.567897 | 0.000000 | 0.280691 | 0.280569 | 0.422670 | 0.422962 | 0.426266 | 0.394242 | ... | 0.559152 | 0.285883 | 0.532918 | 0.525701 | 0.542381 | 0.317450 | 0.576455 | 0.552554 | 0.551579 | 0.563571 |
CRL0001 | 0.530145 | 0.494291 | 0.502775 | 0.280691 | 0.000000 | 0.239596 | 0.347859 | 0.322277 | 0.342836 | 0.213266 | ... | 0.470064 | 0.262217 | 0.451717 | 0.466429 | 0.477764 | 0.299538 | 0.492224 | 0.487327 | 0.484123 | 0.482726 |
5 rows × 29 columns
toyplot.matrix(
dist.dists,
bshow=False,
tshow=False,
rlocator=toyplot.locator.Explicit(
range(len(dist.names)),
sorted(dist.names),
));
# get list of concatenated names from each group
ordered_names = []
for group in dist.imap.values():
ordered_names += group
# reorder matrix to match name order
ordered_matrix = dist.dists[ordered_names].T[ordered_names]
toyplot.matrix(
ordered_matrix,
bshow=False,
tshow=False,
rlocator=toyplot.locator.Explicit(
range(len(ordered_names)),
ordered_names,
));