#!/usr/bin/env python # coding: utf-8 #

# ipyrad-analysis toolkit: Dimensionality reduction

# ---------------------------------------------------------------------------
# The `pca` tool can be used to implement a number of dimensionality
# reduction methods on SNP data (PCA, t-SNE, UMAP) and to filter and/or
# impute missing data in genotype matrices to reduce the effects of
# missing data.
# ---------------------------------------------------------------------------

# --- Load libraries ---
# conda install ipyrad -c conda-forge -c bioconda
# conda install ipcoal -c conda-forge
# conda install scikit-learn -c conda-forge

import ipyrad.analysis as ipa
import toytree
import toyplot
import ipcoal

# report package versions for reproducibility
print(ipa.__version__)
print(toytree.__version__)
print(toyplot.__version__)
print(ipcoal.__version__)

# --- Simulate linked SNP data from this tree and write to a database (HDF5) ---
# Here we simulate data on a tree/network that includes an introgressive edge.

# make a random tree with root height in generations
tree = toytree.rtree.unittree(ntips=8, treeheight=5e5, seed=123)
tree.draw(ts='p', admixture_edges=(3, 4));

# create a coalescent simulator w/ 6 samples per species and high Ne
model = ipcoal.Model(tree, Ne=2e5, nsamples=6, admixture_edges=(3, 4, 0.5, 0.15))

# simulate many short loci on coalescent genealogies (a few minutes)
model.sim_loci(nloci=2000, nsites=50)

# write simulated SNPs to a database file (.snps.hdf5)
model.write_snps_to_hdf5(name='test-pca', outdir="/tmp", diploid=True)

# now randomly drop data from many loci (RAD-seq like)
model.apply_missing_mask(coverage=0.75)

# write a data file with missingness in it (.snps.hdf5)
model.write_snps_to_hdf5(name='test-pca-cov75', outdir="/tmp", diploid=True)

# --- The input data ---
# the simulated SNP database file
SNPS = "/tmp/test-pca.snps.hdf5"

# --- Make an IMAP dictionary (map popnames to list of samplenames) ---
from itertools import groupby

# load sample names from SNPs file
tool = ipa.snps_extracter(SNPS)

# group names by their prefix before '-' and arrange into a dictionary;
# names come back in sorted order, so consecutive grouping is safe here
IMAP = {
    pop: list(members)
    for pop, members in groupby(tool.names, key=lambda x: x.split("-")[0])
}

# show the imap dict
IMAP
# --- Run PCA ---
# Unlinked SNPs are automatically sampled from each locus. Setting
# `nreplicates=20` repeats the subsampling procedure 20X to show variation
# over the subsampled SNPs. The imap dictionary is used in the `.draw()`
# function to color points.

# run on dataset with no missing data and with minor allele freq filter
tool = ipa.pca(data=SNPS, minmaf=0.03)

# subsample unlinked SNPs and run PCA
tool.run(nreplicates=20)
tool.draw(imap=IMAP);

# a convenience function for plotting across three axes
tool.draw_panels(0, 1, 2, imap=IMAP);

# --- Run TSNE ---
# t-SNE is a manifold learning algorithm that can sometimes better project
# data into a 2-dimensional plane. The distances between points in this
# space are harder to interpret. You can see in this example that the gene
# flow between r3 and r4 is not readily apparent.

# run tsne on the same dataset (try several perplexity values)
tool.run_tsne(perplexity=4, seed=333)
tool.draw(imap=IMAP, size=8);

# --- Run UMAP ---
# UMAP is similar to t-SNE but the distances between clusters are more
# representative of the differences between groups. This requires another
# package that, if not yet installed, it will ask you to install.

# try several n_neighbors values
tool.run_umap(subsample=False, n_neighbors=13, seed=222)
tool.draw(imap=IMAP, size=8);
# --- Dealing with missing data (filtering and imputation) ---
# Missing data has large effects on dimensionality reduction methods, and it
# is best to (1) minimize the amount of missing data in your input data set
# by using filtering, and (2) impute missing data values. Below we use the
# simulated dataset with 25% missing data and apply the filtering and
# imputation methods in the ipyrad-analysis tool.
#
# The `pca` tool uses `impute_method="sample"` to sample alleles for missing
# values based on the frequency of alleles at that SNP in the population
# (defined by the IMAP dictionary). It also drops any sites that do not have
# data across at least 50% of samples (mincov=0.5).

# init a tool with the missing data dataset
tool2 = ipa.pca(
    data="/tmp/test-pca-cov75.snps.hdf5",
    imap=IMAP,
    minmap={i: 1 for i in IMAP},
    mincov=0.5,
    minmaf=0.03,
    impute_method="sample",
)

# The results below are nearly identical to the results above where there was
# 0% missing data, showing that filtering and imputation worked properly.
# BUG FIX: this originally ran `tool` (the no-missing-data tool), which would
# trivially reproduce the earlier result; `tool2` is the imputed tool being
# evaluated here.
tool2.run(nreplicates=10)
tool2.draw_panels(imap=IMAP)

# --- Imputing without a priori groups ---

# --- What happens if you don't impute ---
# Even a small amount of missing values (e.g., 5%) will cause significant
# bias if you arbitrarily set them to the ancestral state (0) instead of
# imputing. This example uses `impute_method=None` and you can see the
# difference from the results above. I do not recommend using this setting.

# init a tool with the missing data dataset and NO imputation
tool3 = ipa.pca(
    data="/tmp/test-pca-cov75.snps.hdf5",
    imap=IMAP,
    minmap={i: 1 for i in IMAP},
    mincov=0.5,
    minmaf=0.03,
    impute_method=None,
)

# BUG FIX: originally ran UMAP on `tool` — the non-imputed `tool3` just
# created above is the object this demonstration is about.
tool3.run_umap(n_neighbors=6)
tool3.draw();