import ipyrad.analysis as ipa
import numpy as np
import pandas as pd
import itertools
# get a sequence alignment from the HDF5
database = "/home/deren/Downloads/spp30.seqs.hdf5"
idx=0
wmin=0
wmax=100000
parse_phystring
# init a popgen analysis object
pop = ipa.popgen(
name="ped",
workdir="analysis-popgen",
data="./pedicularis/clust85_min12_outfiles/clust85_min12.loci",
)
pop._fst()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-11-0dc99510b3b6> in <module>() ----> 1 pop._fst() ~/Documents/ipyrad/ipyrad/analysis/popgen.py in _fst(self) 114 ) 115 d = self.npops --> 116 117 # iterate over pairs of pops and fill Fst values 118 pairs = itertools.combinations(self.popdict.key(), 2) AttributeError: 'dict' object has no attribute 'key'
loci_to_arr()
¶... Now I'm thinking maybe I really should make a HDF5 format as the default for downstream analyses..., it can have SNPs, mapfile, and whole sequences with locimap.
def _loci_to_arr(self):
"""
return a frequency array from a loci file for all loci with taxa from
taxdict and min coverage from mindict.
"""
#
loci = infile.read().strip().split("|\n")
## make the array (4 or 5) and a mask array to remove loci without cov
nloci = len(loci)
keep = np.zeros(nloci, dtype=np.bool_)
arr = np.zeros((nloci, 4, 300), dtype=np.float64)
## six rows b/c one for each p3, and for the fused p3 ancestor
if len(taxdict) == 5:
arr = np.zeros((nloci, 6, 300), dtype=np.float64)
## if not mindict, make one that requires 1 in each taxon
if isinstance(mindict, int):
mindict = {i: mindict for i in taxdict}
elif isinstance(mindict, dict):
mindict = {i: mindict[i] for i in taxdict}
else:
mindict = {i: 1 for i in taxdict}
## raise error if names are not 'p[int]'
allowed_names = ['p1', 'p2', 'p3', 'p4', 'p5']
if any([i not in allowed_names for i in taxdict]):
raise IPyradError(\
"keys in taxdict must be named 'p1' through 'p4' or 'p5'")
## parse key names
keys = sorted([i for i in taxdict.keys() if i[0] == 'p'])
outg = keys[-1]
## grab seqs just for the good guys
for loc in xrange(nloci):
## parse the locus
lines = loci[loc].split("\n")[:-1]
names = [i.split()[0] for i in lines]
seqs = np.array([list(i.split()[1]) for i in lines])
## check that names cover the taxdict (still need to check by site)
covs = [sum([j in names for j in taxdict[tax]]) >= mindict[tax] \
for tax in taxdict]
## keep locus
if all(covs):
keep[loc] = True
## get the refseq
refidx = np.where([i in taxdict[outg] for i in names])[0]
refseq = seqs[refidx].view(np.uint8)
ancestral = np.array([reftrick(refseq, GETCONS2)[:, 0]])
## freq of ref in outgroup
iseq = _reffreq2(ancestral, refseq, GETCONS2)
arr[loc, -1, :iseq.shape[1]] = iseq
## enter 4-taxon freqs
if len(taxdict) == 4:
for tidx, key in enumerate(keys[:-1]):
## get idx of names in test tax
nidx = np.where([i in taxdict[key] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
## get freq of sidx
iseq = _reffreq2(ancestral, sidx, GETCONS2)
## fill it in
arr[loc, tidx, :iseq.shape[1]] = iseq
else:
## entere p5; and fill it in
iseq = _reffreq2(ancestral, refseq, GETCONS2)
arr[loc, -1, :iseq.shape[1]] = iseq
## enter p1
nidx = np.where([i in taxdict['p1'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 0, :iseq.shape[1]] = iseq
## enter p2
nidx = np.where([i in taxdict['p2'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 1, :iseq.shape[1]] = iseq
## enter p3 with p4 masked, and p4 with p3 masked
nidx = np.where([i in taxdict['p3'] for i in names])[0]
nidy = np.where([i in taxdict['p4'] for i in names])[0]
sidx = seqs[nidx].view(np.uint8)
sidy = seqs[nidy].view(np.uint8)
xseq = _reffreq2(ancestral, sidx, GETCONS2)
yseq = _reffreq2(ancestral, sidy, GETCONS2)
mask3 = xseq != 0
mask4 = yseq != 0
xseq[mask4] = 0
yseq[mask3] = 0
arr[loc, 2, :xseq.shape[1]] = xseq
arr[loc, 3, :yseq.shape[1]] = yseq
## enter p34
nidx = nidx.tolist() + nidy.tolist()
sidx = seqs[nidx].view(np.uint8)
iseq = _reffreq2(ancestral, sidx, GETCONS2)
arr[loc, 4, :iseq.shape[1]] = iseq
## size-down array to the number of loci that have taxa for the test
arr = arr[keep, :, :]
## size-down sites to
arr = masknulls(arr)
return arr, keep
Fst can be calculated per SNP, or averaged over sliding windows of SNPs of some size if reference information is present. Or over loci if
pop.results
fst Empty DataFrame Columns: [] Index: [] pi Empty DataFrame Columns: [0] Index: [] theta Empty DataFrame Columns: [0] Index: []
popdict = {'i': [2, 3], 'j': [2, 4]}
mindict = {}
(mindict if mindict else {j:1 for j in popdict})
{'i': 1, 'j': 1}
testdata1 = np.array([
[0, 0, 0, 0, 0],
[1, 0, 0, 0, 0],
[1, 1, 1, 0, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 0],
[1, 1, 1, 1, 1],
])
testdata2 = np.random.binomial(1, 0.01, (6, 100))
class Self:
def __init__(self, data, popdict={}, mindict={}, mapfile=False):
self.data = data
self.popdict = popdict
self.mindict = mindict
self._parsedata()
self._checkdicts()
self._filterdata()
def _parsedata(self):
"""
parse data as an ndarray, vcfile, or locifile and store
in ... arrays, one with all sites, and mapfile...
"""
pass
def _checkdicts(self):
#assert len(popdict) > 2, "must enter at least two populations"
self.mindict = (mindict if mindict else {j:1 for j in popdict})
def _filterdata(self):
pass
def fst_H(self, pop1idx, pop2idx):
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]
diff = [self.data[i] != self.data[j] for (i, j) in between]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]
return abs(1 - (a / b))
def theta_W(self):
"return Watternson's theta for each population"
df = pd.DataFrame(
data=np.zeros(len(self.popdict)),
columns=["theta"],
index=list(self.popdict.keys())
)
for row in self.popdict:
idat = self.data[self.popdict[row], :]
segregating = np.invert(np.all(idat == idat[0], axis=0)).sum()
denom = np.sum((1 / i for i in range(1, idat.shape[1] - 1)))
theta = segregating / denom
df.loc[row] = theta
return df
def pi(self, pop1idx, pop2idx):
pass
popdict = {'A': [0, 1], 'B': [2, 3, 4, 5]}
mindict = {'A': 1, 'B': 2}
self = Self(testdata2, popdict, mindict)
self.theta_W()
theta | |
---|---|
A | 0.387 |
B | 0.581 |
print(self.data)
print("\nFst per SNP:", self.fst_H([0, 1], [2, 3, 4, 5]))
print("\nFst mean:", np.mean(self.fst_H([0, 1], [2, 3, 4, 5])))
print("\ntheta per individual:", self.theta_W([0, 1], [2, 3, 4, 5]))
print("\ntheta per population:", self.theta_W([0, 1], [2, 3, 4, 5]))
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] Fst per SNP: [ nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.75 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.5 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.5 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.75 nan nan nan nan nan nan nan nan nan nan nan nan nan 0.5 nan nan] Fst mean: nan
/home/deren/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:40: RuntimeWarning: invalid value encountered in true_divide
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-537-e9d9e53cd653> in <module>() 2 print("\nFst per SNP:", self.fst_H([0, 1], [2, 3, 4, 5])) 3 print("\nFst mean:", np.mean(self.fst_H([0, 1], [2, 3, 4, 5]))) ----> 4 print("\ntheta per individual:", self.theta_W([0, 1], [2, 3, 4, 5])) 5 print("\ntheta per population:", self.theta_W([0, 1], [2, 3, 4, 5])) TypeError: theta_W() takes 1 positional argument but 3 were given
pd.DataFrame(
data=np.zeros((2, 2)),
index=['aaaaaa', 'bbbbbbbbbbb'],
)
0 | 1 | |
---|---|---|
aaaaaa | 0.0 | 0.0 |
bbbbbbbbbbb | 0.0 | 0.0 |
self.fst([0, 1], [2, 3, 4, 5])
(array([0.2, 0. , 0. , 0.6, 0.6]), array([0.2, 0. , 0. , 0.6, 0.6]))
pop1idx = [0, 1]
pop2idx = [2, 3, 4, 5]
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
ilist = itertools.combinations(range(4), 2)
list(ilist)
[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
ilist = itertools.combinations(range(pop2.shape[0]), 2)
list(ilist)
[(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
def diffs(pop):
ilist = itertools.combinations(range(pop.shape[0]), 2)
diff = [pop[i] == pop[j] for (i, j) in ilist]
sums = np.sum(diff, axis=0)
return sums/sums.shape[0]
def fst(pop1idx, pop2idx):
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]
return abs(1 - (a / b))
#fst([0, 1], [2, 3, 4, 5])
ii = list(itertools.combinations(pop1idx, 2))
jj = list(itertools.combinations(pop2idx, 2))
within = ii + jj
kk = itertools.combinations(pop1idx + pop2idx, 2)
between = itertools.filterfalse(lambda x: bool(x in ii + jj), kk)
ii + jj
[(0, 1), (2, 3), (2, 4), (2, 5), (3, 4), (3, 5), (4, 5)]
between
<itertools.filterfalse at 0x7faa876c4eb8>
self.data
array([[0, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 1, 1, 0, 0], [1, 1, 1, 1, 0], [1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
diff = [self.data[i] != self.data[j] for (i, j) in ii + jj]
sums = np.sum(diff, axis=0)
a = sums / sums.shape[0]
a
array([0.2, 0. , 0. , 0.6, 0.6])
abs(1 - (a / b))
array([0.75, 1. , 1. , 0.5 , 0.5 ])
diff = [self.data[i] != self.data[j] for (i, j) in between]
sums = np.sum(diff, axis=0)
b = sums / sums.shape[0]
1 - (a / b)
array([ 0.75, 1. , 1. , 0.5 , -0.5 ])
self.data
array([[0, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 1, 1, 0, 0], [1, 1, 1, 1, 0], [1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
between
[(0, 2), (0, 3), (0, 4), (0, 5), (1, 2), (1, 3), (1, 4), (1, 5)]
diffs(pop2) / diffs(popa)
array([0.6 , 0.85714286, 0.85714286, 0.5 , 0.3 ])
for i in range(pop1.shape[0]):
for j in range(pop1.shape[0]):
print(i,j)
0 0 0 1 1 0 1 1
def _fstH(self, pop1idx, pop2idx):
"Hudson's Fst estimator"
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
a = (pop1freq - pop2freq) ** 2
b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)
c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)
return pd.Series(a - b - c).round(5)
def _fstWC(self, pop1idx, pop2idx):
"Wier and Cockerham (1984) Fst estimator"
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)
nbar = pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.
s2a = (
(1. / ((popa.shape[0] - 1) * nbar))
* ((pop1.shape[0] * 1))
)
nc = (
(1. / (npops - 1.))
* (popa.shape[0]
- ((pop1.shape[0] ** 2 + pop2.shape[0] ** 2) / popa.shape[0])
)
)
#t1 =
#t2 =
return t1, t2
def fstWC(self, pop1idx, pop2idx):
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)
# frequecy of allele A in the sample of size ni from population i
p = ...
# observed proportion of individuals heterozygous for allele A
hbar =
# the average sample size
nbar =
nc =
pop1.sum(axis=0) / 2. + pop2.sum(axis=0) / 2.
array([2. , 1.5, 1.5, 1. , 0.5])
pop1.shape[0] / 2. + pop2.shape[0] / 2.
2.5
npops= 2.0
nsamples = float(NpopA + NpopB)
n_bar= (NpopA / npops) + (NpopB / npops)
samplefreq = ( (popAcount+popBcount) / (NpopA + NpopB) )
pop1freq = popAcount / float(NpopA )
pop2freq = popBcount / float(NpopB )
Npop1 = NpopA
Npop2 = NpopB
S2A= (1/ ( (npops-1.0) * n_bar) ) *
( ( (Npop1)* ((pop1freq-samplefreq)**2) ) +
( (Npop2)*((pop2freq-samplefreq)**2) ) )
nc = 1.0/(npops-1.0) * ( (Npop1+Npop2) - (((Npop1**2)+(Npop2**2)) / (Npop1+Npop2)) )
T_1 = S2A -( ( 1/(n_bar-1) ) * ( (samplefreq * (1-samplefreq)) - ((npops-1)/npops)* S2A ) )
T_2 = (( (nc-1) / (n_bar-1) ) * samplefreq *(1-samplefreq) ) + (1.0 + (((npops-1)*(n_bar-nc)) / (n_bar-1))) * (S2A/npops)
pop1idx = [0, 1]
pop2idx = [2, 3, 4]
_fst(self, pop1idx, pop2idx)
0 0.000 1 1.000 2 1.000 3 0.333 4 -0.000 dtype: float64
pop1idx = [0, 4]
pop2idx = [1, 2, 3]
_fst(self, pop1idx, pop2idx)
0 0.000 1 -0.333 2 -0.333 3 -0.333 4 0.000 dtype: float64
pop1 = self.data[pop1idx, :]
pop2 = self.data[pop2idx, :]
popa = self.data[pop1idx + pop2idx, :]
print(pop1)
print(pop2)
[[0 0 0 0 0] [1 0 0 0 0]] [[1 1 1 0 0] [1 1 1 1 0] [1 1 1 1 1]]
pop1freq = np.mean(pop1, axis=0)
pop2freq = np.mean(pop2, axis=0)
popafreq = np.mean(popa, axis=0)
a = (pop1freq - pop2freq) ** 2
a
array([0.25 , 1. , 1. , 0.44444444, 0.11111111])
b = ((pop1freq * (1.0 - pop1freq))) / (pop1.shape[0] - 1)
b
array([0.25, 0. , 0. , 0. , 0. ])
c = ((pop2freq * (1.0 - pop2freq))) / (pop2.shape[0] - 1)
c
array([0. , 0. , 0. , 0.11111111, 0.11111111])
pd.Series(a - b - c).round(3)
0 0.000 1 1.000 2 1.000 3 0.333 4 -0.000 dtype: float64
pd.Series(a - b - c).round(3)
0 0.000 1 1.000 2 1.000 3 0.333 4 -0.000 dtype: float64
d = (pop1freq * (1.0 - pop2freq)) + (pop1freq * (1.0 - pop2freq))
d
array([0., 0., 0., 0., 0.])
(np.mean(popa, axis=0) - np.mean(pop2, axis=0)) / (np.mean(popa, axis=0))
array([-0.25 , -0.66666667, -0.66666667, -0.66666667, -0.66666667])
popa
array([[0, 0, 0, 0, 0], [1, 0, 0, 0, 0], [1, 1, 1, 0, 0], [1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
np.count_nonzero(pop2, axis=0)
array([3, 3, 3, 2, 1])
a = np.mean(popa, axis=0)
mask = a > 0.5
a[a > 0.5] = abs(a[a > 0.5] - 1)
a
array([0.2, 0.4, 0.4, 0.4, 0.2])
b = np.mean(pop2, axis=0)
mask = b > 0.5
b[b > 0.5] = abs(b[b > 0.5] - 1)
b
array([0. , 0. , 0. , 0.33333333, 0.33333333])
(a - b) / a
array([ 1. , 1. , 1. , 0.16666667, -0.66666667])