Notebook

In [92]:

from skbio.diversity.beta import pw_distances
from skbio import DistanceMatrix
from pandas import DataFrame, read_csv,Series
from skbio.stats.distance import mantel
import matplotlib.pyplot as plt

DIR = "/Users/qingpeng/2013-diversity-Iterated/New_Test/"

dm_real = read_csv(DIR+'real_matrix.txt',sep="\t", header = 0,index_col=0)

In [93]:

mantel_error={}
mantel_noerror={}

for cov in [0.1, 1.0,10.0]:
    mantel_error[cov] = []
    mantel_noerror[cov] = []
    for e_v in [0,0.005,0.01,0.015,0.02,0.025,0.03]:
        
        file_string = str(cov)+'_'+str(e_v)
        #print file_string
        dm_error = read_csv(DIR+'Check_Error/'+file_string+'_error/matrix.txt',sep="\t", header = 0,index_col=0)
        dm_noerror = read_csv(DIR+'Check_Error/'+file_string+'_noerror/matrix.txt',sep="\t", header = 0,index_col=0)

        mantel_error[cov].append(mantel(dm_real,dm_error,method='spearman', strict=False)[0])
        mantel_noerror[cov].append(mantel(dm_real,dm_noerror,method='spearman', strict=False)[0])
        
        

In [94]:

x = [0,0.005,0.01,0.015,0.02,0.025,0.03]
x_list = [i*100 for i in x]

In [95]:

plt.figure(figsize=(10, 8))
plt.plot(x_list,mantel_noerror[0.1],'r*--',label = '0.1x,original')
plt.plot(x_list,mantel_noerror[1.0],'b*--',label = '1x,original')
plt.plot(x_list,mantel_noerror[10.0],'g*--',label = '10x,original')
plt.plot(x_list,mantel_error[0.1],'rs-',label = '0.1x,error_corrected')
plt.plot(x_list,mantel_error[1.0],'bs-',label = '1x,error_corrected')
plt.plot(x_list,mantel_error[10.0],'gs-',label = '10x,error_corrected')
plt.legend(loc=3)
plt.ylim(0,1.0)
plt.xlim(0,3.2)
plt.xlabel('error rate (%)')
plt.ylabel('correlation of calcuated matrix with golden standard matrix')
plt.title('beta diversity of data with different error rate')

Out[95]:

<matplotlib.text.Text at 0x118e6e750>

In [96]:

alpha_list_error={}
alpha_list_noerror={}

for cov in [0.1, 1.0,10.0]:
    alpha_list_error[cov] = []
    alpha_list_noerror[cov] = []
    for e_v in [0,0.005,0.01,0.015,0.02,0.025,0.03]:
        
        file_string = str(cov)+'_'+str(e_v)
#        print file_string
        alpha_error = read_csv(DIR+'Check_Error/'+file_string+'_error/alpha.txt',sep=",", header = 0,index_col=0)
        alpha_noerror = read_csv(DIR+'Check_Error/'+file_string+'_noerror/alpha.txt',sep=",", header = 0,index_col=0)

        alpha_list_error[cov].append(alpha_error)
        alpha_list_noerror[cov].append(alpha_noerror)

In [97]:

x = [0,0.005,0.01,0.015,0.02,0.025,0.03]
x_list = [i*100 for i in x]

In [98]:

alpha = {}
for cov in alpha_list_error.keys():
    alpha[cov] = []
#    print cov
    for df in alpha_list_error[cov]:
        alpha[cov].append(df['estimated_genome_size'])
        #print df
        #print df['estimated_genome_size']
        
#print alpha

alpha_ne = {}
for cov in alpha_list_noerror.keys():
    alpha_ne[cov] = []
#    print cov
    for df in alpha_list_noerror[cov]:
        alpha_ne[cov].append(df['estimated_genome_size'])
        #print df
        #print df['estimated_genome_size']
        
#print alpha_ne

In [109]:

number = {}
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}
for df in alpha[0.1]:
    for c in color:
        try:
            number[c].append(df[c])
        except:
            number[c] = [df[c]]
print number
plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list,number[c],color[c],label = c)
    
#plt.plot(x_list,alpha[0.1],'r*--',label = '0.1x,original')
plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,1000000)
plt.xlim(0,2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (0.1X, with error correction)')

{'sample5': [364976.47058823548, 332161.56763190962, 420565.43893766392, 363113.4705882353, 411571.8000000001, 444386.25, 446657.14285714319], 'sample4': [306222.52500000002, 303079.72499999986, 482261.5384615389, 415800.00000000029, 307800.00000000012, 343800.00000000012, 1266953.3999999999], 'sample6': [252492.1875, 323190.0, 444386.25, 444386.25, 363113.4705882353, 521143.875, 422762.54528099619], 'sample1': [125769.60000000001, 107212.06730769233, 123655.3043478261, 138085.71428571432, 184361.0625, 157743.61643835614, 253800.0], 'sample3': [307800.00000000012, 366954.40233236132, 343902.98170554201, 307800.00000000012, 384841.125, 454599.49931584136, 628604.55000000005], 'sample2': [182935.38461538465, 207248.2758620689, 170494.43478260865, 307800.00000000012, 306222.52500000002, 304999.84881895379, 479812.84615384619]}

Out[109]:

<matplotlib.text.Text at 0x119ed40d0>

In [108]:

number = {}
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'y-*'}
for df in alpha[1.0]:
    for c in color:
        try:
            number[c].append(df[c])
        except:
            number[c] = [df[c]]
print number
plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list,number[c],color[c],label = c)
    
#plt.plot(x_list,alpha[0.1],'r*--',label = '0.1x,original')
plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,1000000)
plt.xlim(0,2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (1X, with error correction)')

{'sample5': [282891.4705882353, 297664.3246463142, 320616.48364485981, 353980.93869096937, 385007.22300263389, 448652.60939012578, 504253.1368421053], 'sample4': [279138.25772825308, 307836.01365705609, 328594.07185155939, 349788.74938373052, 401018.02691790048, 442686.13551177323, 491984.51010886469], 'sample6': [270357.94285714283, 300880.45900411834, 311165.13940092165, 356814.02244389022, 368722.19387755101, 452490.29211295024, 504517.03415659477], 'sample1': [138469.8416613641, 145481.8979148271, 149154.15421570119, 160042.64673396861, 169148.484478158, 186313.34515746986, 204518.97315317465], 'sample3': [276131.0264474625, 306712.85595147841, 324241.37681728881, 343404.85401459853, 381244.01778741868, 419352.70408163272, 478932.43053435115], 'sample2': [199943.91367574254, 217339.75056107726, 232507.63013245034, 250315.36813186808, 263174.48764996469, 289509.65698324022, 345356.75025961909]}

Out[108]:

<matplotlib.text.Text at 0x119e8b790>

In [101]:

number = {}
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}
for df in alpha[10.0]:
    for c in color:
        try:
            number[c].append(df[c])
        except:
            number[c] = [df[c]]
print number
plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list,number[c],color[c],label = c)
    
#plt.plot(x_list,alpha[0.1],'r*--',label = '0.1x,original')
plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,1000000)
plt.xlim(0,3.2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (10X, with error correction)')

{'sample5': [344412.0, 354385.28794985852, 364714.03902051318, 380617.8939992814, 400745.6502437506, 432976.65683168313, 481547.30406706117], 'sample4': [345465.0, 354314.47865780408, 364612.30908050359, 380283.62937441369, 402037.74837295327, 430836.71580636711, 477304.52576179977], 'sample6': [345870.0, 354162.84806943539, 365414.47652608849, 380013.22538860107, 402574.92756697576, 434517.20658449561, 478679.58957747975], 'sample1': [177147.0, 181278.0, 186563.88861270781, 194033.56596287372, 206715.54098360657, 226216.65405172479, 252547.08406121729], 'sample3': [344655.0, 353585.36785458022, 364682.92244053772, 381515.3116659493, 401183.55185694643, 431095.06070908211, 474884.83542247437], 'sample2': [262440.0, 269274.55665722379, 276930.75195822463, 287943.57096909522, 304440.90247648297, 330525.37683819851, 366400.3106854111]}

Out[101]:

<matplotlib.text.Text at 0x119466890>

In [102]:

number = {}
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}
for df in alpha_ne[0.1]:
    for c in color:
        try:
            number[c].append(df[c])
        except:
            number[c] = [df[c]]
print number
plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list,number[c],color[c],label = c)
    
#plt.plot(x_list,alpha[0.1],'r*--',label = '0.1x,original')
plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,7000000)
plt.xlim(0,3.2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (0.1X, no error correction)')

{'sample5': [482261.5384615389, 540697.4546653257, 1273368.6000000001, 2143799.9999999981, 1273368.6000000001, 3223799.9999999967, 3207660.75], 'sample4': [364976.47058823548, 415800.00000000029, 1063799.9999999991, 1058447.25, 3223799.9999999967, 2143799.9999999981, 6463799.9999999935], 'sample6': [264172.69565217395, 572890.90909090859, 909514.28571428487, 1058447.25, 1595750.625, 6431481.0, 2133054.0], 'sample1': [156439.4594594595, 189834.8534057037, 226842.79245283012, 482261.5384615389, 700245.0, 523799.99999999953, 3223799.9999999967], 'sample3': [343800.00000000012, 631799.99999999942, 703799.99999999942, 909514.28571428487, 1603799.9999999988, 2143799.9999999981, 3223799.9999999967], 'sample2': [239248.0800000001, 307800.00000000012, 343902.98170554201, 1603799.9999999988, 1595750.625, 909514.28571428487, 6463799.9999999935]}

Out[102]:

<matplotlib.text.Text at 0x1194b1890>

In [103]:

number = {}
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}
for df in alpha_ne[1.0]:
    for c in color:
        try:
            number[c].append(df[c])
        except:
            number[c] = [df[c]]
print number
plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list,number[c],color[c],label = c)
    
#plt.plot(x_list,alpha[0.1],'r*--',label = '0.1x,original')
plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,7000000)
plt.xlim(0,3.2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (1X, no error correction)')

{'sample5': [344842.46941272443, 426364.08072795148, 561398.6689303905, 752538.46153846162, 1025729.2014787431, 1492602.7846153844, 2182932.3913043467], 'sample4': [341764.18100649357, 436665.23624288425, 556208.24450704223, 756825.50322580652, 1053618.0735155514, 1518875.6132812502, 2364861.7617187491], 'sample6': [326412.11356466875, 427683.21428571432, 546689.39665738156, 776638.41344046732, 993981.68402154406, 1676688.4779516363, 2538646.5344467643], 'sample1': [159136.29959668813, 190994.28482585165, 234655.7760613797, 309365.88149565051, 412627.48922687146, 578218.93262817385, 814074.09566041443], 'sample3': [336381.77259943751, 439737.53336510959, 555961.80508474598, 752514.28265524621, 1021175.1074380165, 1555455.9255319154, 2111302.0774647887], 'sample2': [238053.37914850819, 293171.27932960889, 383303.02282704116, 497589.49868904037, 695386.88931819075, 979566.80994671409, 1501830.539659359]}

Out[103]:

<matplotlib.text.Text at 0x11975a850>

In [107]:

number = {}
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}
for df in alpha_ne[10.0]:
    for c in color:
        try:
            number[c].append(df[c])
        except:
            number[c] = [df[c]]
print number
plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list,number[c],color[c],label = c)
    
#plt.plot(x_list,alpha[0.1],'r*--',label = '0.1x,original')
plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,7000000)
plt.xlim(0,3.2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (10X, no error correction)')

{'sample5': [373491.0, 453178.20010768116, 632545.2973025518, 1008886.3725003281, 1597952.7547767619, 2474716.7666663551, 3605088.352023589], 'sample4': [374463.0, 452594.59324754064, 633498.00581736385, 1003523.5199210072, 1602030.9864894012, 2474840.8640974401, 3673353.5428526057], 'sample6': [375192.0, 453907.81542076921, 629459.20512744901, 997507.34925793088, 1593101.6354836237, 2452288.7624425939, 3678954.1486408771], 'sample1': [189135.0, 239772.48867073769, 440960.36865750654, 931097.81254443049, 1701427.829848943, 2954130.0446187281, 4506363.8239552053], 'sample3': [374058.0, 453385.86497716221, 634160.48159188498, 1006758.7734683269, 1570928.7934844692, 2445115.297676397, 3652899.6069623111], 'sample2': [283014.0, 348618.21949731203, 530052.35363184963, 953005.44956724765, 1576004.42481396, 2673829.8771124957, 4065840.10544826]}

Out[107]:

<matplotlib.text.Text at 0x119ae47d0>

%error correction, over correction!!, fewer k-mers .... , last figure is confusing.

% alpha, simulation, multiple times?????

In [104]: