Differentially Gene Expression and Gaussian Processing Regression for ALS disease

The aim of project is specifying the significantly differences genes that infect in speed of ALS progression by build Gaussian Process model to distinguish between two types for mutations for two mouse strains, where we use coregionalization principle apply Gaussian Process regression using GPy package in Ipython , In the begining , We use PUMA package to getting on gene expression values by using mmgmos method, then where we test four cases for the corelation between two strains and two mutations,then finding the best Model from these cases

In [1]:
import numpy as np
import pylab as pb
import GPy
import Bio.Affy.CelFile as ss
from xlrd import open_workbook
import pickle 
%pylab inline
Populating the interactive namespace from numpy and matplotlib

Read all Cases's Models , compute Probabilty of each model and then compute ODDs and Logit , then compare and select best model according these values, then Ranking according Probabilty value.

In [2]:
num_genes=10000
from xlrd import open_workbook
wb = open_workbook('ann1.xlsx')
ann=np.repeat("ccccccccccccc",num_genes*2).flatten()
ann=ann.reshape(num_genes,2)
for ss in wb.sheets():
    for row in range(1,num_genes+1):
        ann[row-1,0] = ss.cell(row,0).value
        ann[row-1,1] = ss.cell(row,1).value
        #print ann[row,1]
#print ann[36,0], ann[36,1]
In [3]:
def maxx(odds):
    if odds[0]<0.5 :
        if odds[3]<0.5:
            if odds[5]<0.5:
                return 4
            else :
                return 3
        elif odds[4]<0.5:
            return 4
        else : return 2
    elif odds[1]<0.5:
        if odds[5]<0.5: 
            return 4
        else :
            return 3
    elif odds[2]<0.5:
        return 4
    else : return 1

max1=np.zeros(num_genes).flatten()           
num_genes=1000
import pickle 
kk=np.repeat('cccccccccccccc',(num_genes+1)*25).reshape(num_genes+1,25)
odds=np.zeros(6)

kk[0,:]=['Prob_ID','Gene symbol', 'likelihood value C1','likelihood value C2','likelihood value C3','likelihood value C4','propab. C1','propab.C2','propab. C3' 
         ,'propab. C4','ODDS C1/c2','ODDS C1/c3','ODDS  C1/c4' ,'ODDS . C2/c3',' ODDS . C2/c4','ODDS . C3/c4', ' logit C1/c2',' logit C1/c3',' logit C1/c4'
         ,' logit C2/c3',' logit C2/c4',' logit C3/c4','best model','Gene No.','model 1 or 2' ]
new=[]
m=[0,1,2,3]
for j in range(0,4):
    new.append(numpy.load("Case{}_models.npy".format(j+1)))

norm_case=np.zeros(4*num_genes).reshape(4,num_genes)
for i in range(0,num_genes):
    for j in range(0,4):
        f= open('Case{}_models{}/model_{}.pickle'.format(j+1,int (new[j][i,0]),int (new[j][i,1])),'r' )
        m[j]=pickle.load(f).log_likelihood()
   
    m1=np.exp(m)
    
    prop=m1/np.sum(m1)
    l=0

    for k in range(0,3):
      
        for j in range(k+1,4):
            odds[l]=(prop[k]/prop[j])/(1+(prop[k]/prop[j]))
            l=l+1 
    logit=np.log(odds)
    max1[i]=int(maxx(odds)) # find the best model by it's Odds value     
    
    kk[i+1,:]=[ann[i,0],ann[i,1],m1[0],m1[1],m1[2],m1[3],prop[0],prop[1],prop[2],prop[3],
               odds[0],odds[1],odds[2],odds[3],odds[4],odds[5],logit[0],logit[1],logit[2],logit[3]
               ,logit[4],logit[5] ,int(max1[i]),i ,int(new[int(max1[i])-1][i,0])]
    
/Users/suraAlrashid/anaconda/lib/python2.7/site-packages/GPy/kern/parts/prod.py:122: RuntimeWarning: invalid value encountered in equal
  self._params == self._get_params().copy()

Below table shows all these values,

In [4]:
for i in range(0,25):
    print kk[0,i], '\t:\t', kk[1,i]
Prob_ID 	:	1415670_at
Gene symbol 	:	Copg
likelihood val 	:	1.57926678831e
likelihood val 	:	1.5148095285e-
likelihood val 	:	8.09685974376e
likelihood val 	:	3.91435140224e
propab. C1 	:	0.154541559933
propab.C2 	:	0.014823399647
propab. C3 	:	0.792330557839
propab. C4 	:	0.038304482580
ODDS C1/c2 	:	0.912476584977
ODDS C1/c3 	:	0.163212705319
ODDS  C1/c4 	:	0.801372731941
ODDS . C2/c3 	:	0.018365021332
 ODDS . C2/c4 	:	0.279013561732
ODDS . C3/c4 	:	0.953885303754
 logit C1/c2 	:	-0.09159285415
 logit C1/c3 	:	-1.81270098826
 logit C1/c4 	:	-0.22142910688
 logit C2/c3 	:	-3.99730743813
 logit C2/c4 	:	-1.27649488998
 logit C3/c4 	:	-0.04721184143
best model 	:	3
Gene No. 	:	0
model 1 or 2 	:	1

Below cells save and read the table

In [10]:
numpy.save("All_Prop_models.npy", kk)
In [11]:
import csv
with open('All_Prop_models.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ')
    for rowx in kk:
        spamwriter.writerow(rowx)
        

        
In [12]:
kk=numpy.load("All_Prop_models.npy")

Ranking the the models according of probabilty of each case

In [13]:
s=np.repeat('ccccccccc',25)
for i in range(1,500):
    
    for j in range(i+1,num_genes+1):
         
        if kk[i,int(kk[i,22])+5]<kk[j,int(kk[j,22])+5]:
            #print i , j
            s[:]=kk[i,:]
            kk[i,:]=kk[j,:]
            kk[j,:]=s[:]
            #print kk[i,:],kk[j,:]

Saving the Ranking result

In [17]:
import csv
with open('After_Ranking_Prop_models.csv', 'wb') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=' ')
    for rowx in kk:
        spamwriter.writerow(rowx)
# another way to save file 
numpy.save("After_Ranking_Prop_models.npy", kk)

Reading the ranking result from the disk and all mmultiple cases for best model

In [18]:
kk=numpy.load("After_Ranking_Prop_models.npy")
In [19]:
list_Ntg_Ntg_down=[22,71,0,89,74,39,6,52,56,75,18,50,8,29,57,63,4,36,12,190,697,513,264,600,105,808,836,412,11,259,357,925,245,830,427,162,617,339,792,900,612,665,608,881,463,129,338,787,968,649]
list_Ntg_Ntg_up=[269,402,396,651,519,712,222,761,959,639,140,701,236,466]
list_Ntg_SOD1=[484,994,920,858,737,390]
list_same_behaver =[430,318,242,352,967,195]
In [44]:
x=0
for i in range(0,1000):
    if int(kk[i+1,23]) in list_Ntg_Ntg_down :
        
        f=open('Case{}_models{}/model_{}.pickle'.format(int(kk[i+1,22]),int(kk[i+1,24]),int(kk[i+1,23])),'r')
        
        mm= pickle.load(f)
        mm.plot(fixed_inputs=[(1, 0), (2, 0)],which_data_rows=range(16),linecol='red')#,fillcol='red')
        mm.plot(fixed_inputs=[(1, 0), (2, 1)],which_data_rows=range(16,32), linecol='blue',ax=pb.gca())
        mm.plot(fixed_inputs=[(1, 1), (2, 0)], which_data_rows=range(32,48),linecol='green',resolution=8, ax=pb.gca())
        mm.plot(fixed_inputs=[(1, 1), (2, 1)], which_data_rows=range(48,64),linecol='yellow',ax=pb.gca())
        
        pb.text(140, -.1,'Case{}_models{}/model_{}'.format(int(kk[i+1,22]),int(kk[i+1,24]),int(kk[i+1,23])), fontsize = 12, color = 'b')
        pb.xlabel('The time series ')
        pb.ylabel('Gene expression {} \n for each mutation for each strain   '.format(kk[i+1,23]))
        pb.title( ' The gene  "{}" and the probe_ID "{}"'.format( kk[i+1,1] ,kk[i+1,0]))
        #savefig('only_g93A/fig{}.jpg'.format(i))