# activiate inline plotting
%matplotlib inline
from setup import *
This tutorial demonstrates how genotype data can be used to generate simulated phenotpye datasetes.
# load 1000g data:
#load 1000 genomes data
#the data used in this study have been pre-converted into an hdf5 file.
#to preprocess your own data, please use limix command line brinary
genotype_file = tutorial_data.get_file('1000g')
annotation_file = tutorial_data.get_file('1000g_annotation')
geno_reader_1000G = gr.genotype_reader_tables(genotype_file)
#read annotation
fa = h5py.File(annotation_file,'r')
population_1000G = fa['Population'][:]
pos_1000G = geno_reader_1000G.getPos()
#select YRI and CEU populations
Ipop = (population_1000G=='CEU') | (population_1000G=='YRI') | (population_1000G=='GBR') | (population_1000G=='FIN')
population_1000G = population_1000G[Ipop]
M1000G = geno_reader_1000G.getGenotypes()[Ipop]
# atwell 107
file_name = tutorial_data.get_file('arab107')
geno_reader_atwell = gr.genotype_reader_tables(file_name)
Matwell = geno_reader_atwell.getGenotypes()
posatwell = geno_reader_atwell.getPos()
Matwell = Matwell[:,::10]
Matwell = Matwell[0:200,:]
posatwell = posatwell[::10]
# ByXRM
file_name = tutorial_data.get_file('BYxRM')
geno_reader_BYxRM = gr.genotype_reader_tables(file_name)
# M is a binary matrix [samples,genotypes]
MBYxRM = geno_reader_BYxRM.getGenotypes()
posBYxRM = geno_reader_BYxRM.getPos()
if 0:
M = M1000G
POS = pos_1000G
minor_allele=2
if 1:
M = Matwell
POS = posatwell
minor_allele=1
if 0:
M= MBYxRM
POS = posBYxRM
minor_allele=1
#M1000G = geno_reader_1000G.getGenotypes()
#note: the 1000g samples have different populational background.
#a subset of the samples are YRI and the remaining CEU
#you can query this information using the array "popuatlion_1000G"
#filter by allele frequency
af = calc_AF(M,minor=minor_allele)
Iok = (af['af']>=0.05) & ((af['af']<0.95))
M = M[:,Iok]
POS = POS[Iok]
N = M.shape[0]
#set random seed to make this reporducible
sp.random.seed(5)
snps = sp.array(M,dtype='float')
#specifiy number of causal variants and total variance attributable to genetic effects
#10% are causal genetic effects
Vfg = 0.2
#variance due to non-causal population structure
Vbg = 0.3
#noise
Vresidual = 1.0-Vbg-Vfg
#number of causal loci and polygenic signal of population structure
Nfg = 5
#background:
Nbg = 10
#random selection of SNPs
Irandom = sp.random.permutation(M.shape[1])
i_snp =0
#simulate population structure effects
W_bg = sp.randn(Nbg)
#and causal loci
W_fg = sp.randn(Nfg)
#simulate random noise
y_residual = sp.randn(N)
#background snps (weak)
I_bg = Irandom[i_snp:i_snp+Nbg]
snps_bg = snps[:,I_bg]
snps_bg -= snps_bg.mean(axis=0)
snps_bg /= snps_bg.std(axis=0)
y_bg = sp.dot(snps_bg,W_bg)
i_snp+=Nbg
#foreground SNPs (strong)
I_fg = Irandom[i_snp:i_snp+Nfg]
snps_fg = snps[:,I_fg]
snps_fg -= snps_fg.mean(axis=0)
snps_fg /= snps_fg.std(axis=0)
y_fg = sp.dot(snps_fg,W_fg)
i_snp+=Nfg
#
I_causal = sp.concatenate([I_fg,I_bg])
#normalize and add up
y = y_fg/y_fg.std()*sp.sqrt(Vfg)
y += y_bg/y_bg.std()*sp.sqrt(Vbg)
y += y_residual/y_residual.std()*sp.sqrt(Vresidual)
y = y[:,sp.newaxis]
#run gwas:
covs = None
test = 'lrt'
lm = qtl.test_lm(snps=snps,pheno=y,
covs=covs, test=test)
K = sp.dot(snps,snps.T)
K /= K.diagonal().mean()
lmm = qtl.test_lmm(snps=snps,pheno=y,K=K,
covs=covs, test=test)
pos = POS['pos'].values
pv_lmm = lmm.getPv().ravel()
pv = lm.getPv().ravel()
pl.figure(figsize=[17,5])
plt.subplot(2,2,1)
# Genome-wide manhatton plots for one phenotype:
pos = POS['pos'].values
plot_manhattan(pos,pv)
pl.title('LM')
plt.subplot(2,2,2)
pos = POS['pos'].values
plot_manhattan(pos,pv_lmm)
pl.title('LMM')
<matplotlib.text.Text at 0x126d72910>
What are the loci that are being discovered. Are all associations found by LM background signals or SNPs with a large effect?
pl.figure(figsize=[17,5])
plt.subplot(2,2,1)
# Genome-wide manhatton plots for one phenotype:
pos = POS['pos'].values
plot_manhattan(pos,pv)
#plot causal loci and background loci
pl.plot(pos[I_bg],0.1*sp.ones(len(I_bg)),'g.',markersize=15,label='background')
pl.plot(pos[I_fg],0.1*sp.ones(len(I_fg)),'r.',markersize=15,label='large effect')
pl.legend(loc='upper left')
pl.title('LM')
plt.subplot(2,2,2)
plot_manhattan(pos,pv_lmm)
#plot causal loci and background loci
pl.plot(pos[I_bg],0.1*sp.ones(len(I_bg)),'g.',markersize=15,label='background')
pl.plot(pos[I_fg],0.1*sp.ones(len(I_fg)),'r.',markersize=15,label='large effect')
pl.legend(loc='upper left')
pl.title('LMM')
<matplotlib.text.Text at 0x1277f5350>
pl.figure(figsize=[8,5])
pl.plot(-sp.log10(pv[I_causal]),-sp.log10(pv_lmm[I_causal]),'k.',markersize=10)
maxx = max(pl.xlim()[1],pl.ylim()[1])
pl.plot([0,maxx],[0,maxx],'k--')
pl.xlabel('LM $-log_{10}(pv)$')
pl.ylabel('LMN $-log_{10}(pv)$')
<matplotlib.text.Text at 0x12782a2d0>
alpha = 0.05/snps.shape[1]
lmm,RV = qtl.forward_lmm(snps=snps,pheno=y,K=K,
covs=covs, test=test,threshold=alpha,maxiter=10)
pl.figure(figsize=[15,10])
Nadded = RV['pvall'].shape[0]
for i in xrange(Nadded):
plt.subplot(Nadded,1,1+i)
pv = RV['pvall'][i]
plot_manhattan(pos,pv)
pl.plot(pos[I_bg],0.1*sp.ones(len(I_bg)),'g.',markersize=15,label='background')
pl.plot(pos[I_fg],0.1*sp.ones(len(I_fg)),'r.',markersize=15,label='large effect')
#mark the most associated SNP
if i<(Nadded-1):
pl.plot(pos[RV['iadded'][i]],-sp.log10(pv)[RV['iadded'][i]],'dy',markersize=10,label='lead SNP')
pl.legend(loc='upper right')
# variance component model
vc = var.VarianceDecomposition(y)
vc.addFixedEffect()
vc.addRandomEffect(K=K)
vc.addRandomEffect(is_noise=True)
vc.optimize()
_var = vc.getVarianceComps()
_var /= _var.sum()
_var = _var.ravel()
print 'genetic: %.2f, noise: %.2f' % (_var[0],_var[1])
genetic: 0.63, noise: 0.37
# variance component model
I_non_causal = sp.arange(snps.shape[1])
I_non_causal = sp.setdiff1d(I_non_causal,I_causal)
K_causal = sp.dot(snps[:,I_causal],snps[:,I_causal].T)
K_noncausal = sp.dot(snps[:,I_non_causal],snps[:,I_non_causal].T)
vc = var.VarianceDecomposition(y)
vc.addFixedEffect()
vc.addRandomEffect(K=K_causal)
vc.addRandomEffect(K=K_noncausal)
vc.addRandomEffect(is_noise=True)
vc.optimize()
_var = vc.getVarianceComps()
_var /= _var.sum()
_var = _var.ravel()
print 'genetic: %.2f, non causal: %.2f, noise: %.2f' % (_var[0],_var[1],_var[2])
genetic: 0.67, non causal: 0.00, noise: 0.33
pl.imshow(K_causal,aspect='auto',interpolation='none')
pl.imshow(K_noncausal,aspect='auto',interpolation='none')
<matplotlib.image.AxesImage at 0x126f46650>