#!/usr/bin/env python
# coding: utf-8

# In[ ]:


# # Genotype quality control

# This tutorial explores typical genotype quality control metrics, including allele frequency distribution, linkage disequilibrium and methods to test for population structure.

# ## Allele frequency distribution

# Allele frequency has profound implications for the gentic effects that can be identifeid. Here, we explore allele frequency distributions in multiple datasets, which differ in terms of their underlying genetic designs.
# 
# The example below shows how some basic statistics can be obtained for genotype data in a yeast cross (Bloom et al., Nature 2013).

# The task:
# 
# Explore allele frequency distributions in other datasets, including data from 1000 genomes and A. thaliana. 
# What are the implications for genotype to phenotype maping ?
# 
# * What are the advantages and disadvantages of partiular genetic designs ?  
# * Both to detect associations with common and rare variants ? 
# * Why do some datasets have allele frequencies [0..0.5] and others [0..1] ?

# In[1]:


# activiate inline plotting
get_ipython().run_line_magic('matplotlib', 'inline')
from setup import *


# In[2]:


#import data: BYxRM cross
#import data
#the data used in this study have been pre-converted into an hdf5 file. 
#to preprocess your own data, please use limix command line tool
file_name = tutorial_data.get_file('BYxRM')
geno_reader_BYxRM  = gr.genotype_reader_tables(file_name)
# M is a binary matrix [samples,genotypes]
MBYxRM = geno_reader_BYxRM.getGenotypes()
posBYxRM = geno_reader_BYxRM.getPos()['pos']

#import data: 1000 genomes
#the data used in this study have been pre-converted into an hdf5 file. 
#to preprocess your own data, please use limix command line brinary
genotype_file = tutorial_data.get_file('1000g')
annotation_file = tutorial_data.get_file('1000g_annotation')
geno_reader_1000G  = gr.genotype_reader_tables(genotype_file)
#read annotation
fa = h5py.File(annotation_file,'r')
population_1000G = fa['Population'][:]
#select YRI and CEU populations
Ipop = (population_1000G=='CEU')


population_1000G = population_1000G[Ipop]
M1000G = geno_reader_1000G.getGenotypes()[Ipop]
pos1000G = geno_reader_1000G.getPos()['pos']
#note: the 1000g samples have different populational background.
#a subset of the samples are Yerubuan (YRI) and the remaining are Central European (EU)
#you can query this information using the array "popuatlion_1000G"

#import data: Arabidopsis
#import data
#the data used in this study have been pre-converted into an hdf5 file. 
#to preprocess your own data, please use limix command line brinary
file_name = tutorial_data.get_file('arab107')
geno_reader_atwell  = gr.genotype_reader_tables(file_name)
Matwell = geno_reader_atwell.getGenotypes()
posatwell = geno_reader_atwell.getPos()['pos']


# In[3]:


K = sp.dot(M1000G,M1000G.T)
K2 = sp.dot(M1000G[:,0:2],M1000G[:,0:2].T)


# In[4]:


pl.subplot(121)
pl.imshow(K,aspect='auto')
pl.subplot(122)
pl.imshow(K2,aspect='auto')


# In[5]:


#strains in the yeast cross are homozygous, which is encoded [0,1] - [ref/ref,alt/alt]
#for heterozygous systems, we need to set minor=2 as the encoding is [0,1,2] - [ref/ref,ref/alt,alt/alt]
afBYxRM = calc_AF(MBYxRM,minor=1)


# In[6]:


##cut
af1000G = calc_AF(M1000G,minor=2)
afatwell = calc_AF(Matwell,minor=1)


# In[7]:


pl.figure(figsize=[15,5])

plt.subplot(1,3,1)
plot=pl.hist(afBYxRM['af'],50,normed=True)
pl.ylabel('density')
pl.title('Yeast cross')
pl.xlabel('allele frequency')

##cut
plt.subplot(1,3,2)
plot=pl.hist(af1000G['af'],50,normed=True)
pl.ylabel('density')
pl.title('1000 genomes')
pl.xlabel('allele frequency')

plt.subplot(1,3,3)
plot=pl.hist(afatwell['af'],50,normed=True)
pl.ylabel('density')
pl.title('A. thaliana')
pl.xlabel('allele frequency')


# ## Filtering by allele frequency

# Allele frequency is an important filter for QTL mapping. We here exlore the implications of alleele frequency filters on the number of variants available for analysis. 

# In[8]:


pl.figure(figsize=[18,5])
maf = 10.**(-sp.arange(6))

plt.subplot(1,3,1)

N   = [(afBYxRM['af']>=m).sum() for m in maf]
pl.plot(maf,N,'k-')
pl.xscale('log')
pl.xlabel('maf cutoff')
pl.ylabel('number of variants')
pl.title('Yeast cross')


##cut
plt.subplot(1,3,2)

N   = [(af1000G['af']>=m).sum() for m in maf]
pl.plot(maf,N,'k-')
pl.xscale('log')
pl.xlabel('maf cutoff')
pl.ylabel('number of variants')
pl.title('1000 genomes')

plt.subplot(1,3,3)

N   = [(afatwell['af']>=m).sum() for m in maf]
pl.plot(maf,N,'k-')
pl.xscale('log')
pl.xlabel('maf cutoff')
pl.ylabel('number of variants')
pl.title('A. thaliana')


# In[ ]:


# ## Linkage disequilibrium

# The resolution of genotype to phenotype mapping is affect by linkage disequilibrium. We here explore the correlation between nearby genetic markers as a function of distance.
# 
# Tasks:
# 
# * How does the minor allele frequency affect LD?
# * How to global and local correlations compare and are related to differences in the genetic designs ? 
# * Is there long-range LD and if yes, what might be causes for this effect ? 

# In[9]:


#Local LD plots

pl.figure(figsize=[15,5])

plt.subplot(1,3,1)
LDByxRM = calc_LD(MBYxRM,posBYxRM.values)
pl.plot(LDByxRM[0],LDByxRM[1])
pl.ylabel('r^2')
pl.xlabel('dist (bp)')
pl.title('Yeast cross')

#the calc_LD function has an optional parameter i_start, determining from how many positions the sampling started.
#you can use this to improve the resolution. try i_start=sp.arange(10) or i_start=sp.arange(100)

## CUT
maf = 0.01
plt.subplot(1,3,2)
Iaf = af1000G['af']>=maf
LDp = calc_LD(M1000G[:,Iaf],pos1000G.values[Iaf],i_start=sp.arange(100))
pl.plot(LDp[0],LDp[1])
pl.ylabel('r^2')
pl.xlabel('dist (bp)')
pl.title('1000 genomes')


plt.subplot(1,3,3)
Iaf = afatwell['af']>=maf
LDp = calc_LD(Matwell[:,Iaf],posatwell.values[Iaf],i_start=sp.arange(10))
pl.plot(LDp[0],LDp[1])
pl.ylabel('r^2')
pl.xlabel('dist (bp)')
pl.title('A. thaliana')


# ## Population structure

# Principal component analysis can be used to reveal particular relationship between samples. We here apply PCA to multiple different datasets to explore the extent of population structure.
# 
# Tasks:
# * Apply PCA to the yeast cross, the Arabidopsis data and 1000 genomes data.
# * Do you see clusters witin the data? 
# * If yes, what do these clusters corresond to ? 

# In[11]:


# PCA

import limix.deprecated.stats.pca as pca

pl.figure(figsize=[18,5])

plt.subplot(1,3,1)
[snpsX,snpsW] = pca.PCA(MBYxRM[:,0:5000],components=10)
pl.plot(snpsX[:,0],snpsX[:,1],'k.')
pl.xlabel('PC1')
pl.ylabel('PC2')
pl.title('Yeast cross')

## cut
plt.subplot(1,3,2)
[snpsX,snpsW] = pca.PCA(M1000G[:,0:5000],components=10)
pl.plot(snpsX[:,0],snpsX[:,1],'k.')
ICEU = (population_1000G=='CEU')
IYRI = (population_1000G=='YRI')

pl.plot(snpsX[:,0][ICEU],snpsX[:,1][ICEU],'k.',label='CEU')
pl.plot(snpsX[:,0][IYRI],snpsX[:,1][IYRI],'r.',label='YRI')
pl.xlabel('PC1')
pl.ylabel('PC2')
pl.legend()
pl.title('1000 genomes')


plt.subplot(1,3,3)
[snpsX,snpsW] = pca.PCA(Matwell[:,0:5000],components=10)
pl.plot(snpsX[:,0],snpsX[:,1],'k.')
pl.xlabel('PC1')
pl.ylabel('PC2')
pl.title('A. thaliana')


# In[10]:


# In[10]:


# In[10]:


# In[10]:


# In[10]:


# In[10]: