old title: influence of error rate-k19-v2-low_fp-new_reads-adjusted-high_fp_new_reads_adjusted_error_only

No error correction; error rate 0-1.5%; coverage 0.1x, 1x and 10x

New reads data set (minor modification of make_reads.py).
k = 19.
Modified calculation of IGS: based on sequencing depth instead of median k-mer count.
Smaller hash table, with a false positive rate of 0.1.
Estimation is adjusted according to sequencing error rate only (the hash table false positive rate is not considered).

In [2]:

from skbio.diversity.beta import pw_distances
from skbio import DistanceMatrix
from pandas import DataFrame, read_csv,Series
from skbio.stats.distance import mantel
import matplotlib.pyplot as plt

# Directory that holds the result files read by this notebook.
DIR = "/Users/qingpeng/2013-diversity-Iterated/New_Reads/"

# Reference ("real") distance matrix: tab-separated, with the first row used
# as the header and the first column used as the row index.
real_matrix_path = DIR + 'real_matrix.txt'
dm_real = read_csv(real_matrix_path, sep="\t", header=0, index_col=0)

In [2]:

# Spearman Mantel correlation between the reference matrix and the matrix
# computed for every (coverage, error rate) run.
# mantel_noerror maps coverage -> list of correlations, one per error rate,
# in the order the error rates are listed below.
mantel_noerror = {}
results_root = DIR + 'High_fp_0.1_adjusted_error_only/'

for cov in [0.1, 1.0, 10.0]:
    mantel_noerror[cov] = []
    for e_v in [0, 0.005, 0.01, 0.015]:
        # Run directories are named "<coverage>_<error rate>_noerror".
        matrix_path = results_root + '%s_%s_noerror/matrix.txt' % (cov, e_v)
        dm_noerror = read_csv(matrix_path, sep="\t", header=0, index_col=0)

        # mantel() returns a tuple; only its first element is kept.
        result = mantel(dm_real, dm_noerror, method='spearman', strict=False)
        mantel_noerror[cov].append(result[0])

In [6]:

# Load matrix2.txt and matrix.txt from the same run directory
# (10.0_0.01_noerror) so the two can be compared against the reference matrix.
# The paths are built from DIR, like the other loaders in this notebook,
# instead of repeating the absolute prefix.
run_dir = DIR + 'High_fp_0.1_adjusted_error_only/10.0_0.01_noerror/'
dm_noerror2 = read_csv(run_dir + 'matrix2.txt', sep="\t", header=0, index_col=0)
dm_noerror1 = read_csv(run_dir + 'matrix.txt', sep="\t", header=0, index_col=0)

In [8]:

# Spearman Mantel test of the reference matrix against dm_noerror2
# (matrix2.txt of the 10.0_0.01_noerror run); the full result tuple is displayed.
mantel(dm_real,dm_noerror2,method='spearman', strict=False)

Out[8]:

(0.95830761082556037, 0.0089999999999999993, 6)

In [9]:

# Spearman Mantel test of the reference matrix against dm_noerror1
# (matrix.txt of the 10.0_0.01_noerror run); the full result tuple is displayed.
mantel(dm_real,dm_noerror1,method='spearman', strict=False)

Out[9]:

(0.95830761082556037, 0.01, 6)

In [3]:

# Sequencing error rates as fractions (x) and as percentages (x_list);
# x_list is what the plots use on the x axis.
x = [0, 0.005, 0.01, 0.015]
x_list = [100 * rate for rate in x]

In [4]:

# Display the collected correlations: coverage -> one value per error rate.
mantel_noerror

Out[4]:

{0.1: [0.90590016335853751,
  0.89454774397402703,
  0.89735784159907639,
  0.89442601168815072],
 1.0: [0.93210388709204894,
  0.93210388709204894,
  0.93210388709204894,
  0.9198386225994708],
 10.0: [0.97140947269231603,
  0.97140947269231603,
  0.95830761082556037,
  0.95830761082556037]}

In [5]:

# Mantel correlation with the reference matrix versus sequencing error rate,
# drawn as one dashed line per coverage level.
plt.figure(figsize=(10, 8))
for cov, fmt, label in [(0.1, 'r*--', '0.1x,original'),
                        (1.0, 'b*--', '1x,original'),
                        (10.0, 'g*--', '10x,original')]:
    plt.plot(x_list, mantel_noerror[cov], fmt, label=label)
plt.legend(loc=3)
plt.ylim(0, 1.0)
plt.xlim(0, 2.0)
plt.xlabel('error rate (%)')
# Axis label spelling fixed ("calcuated" -> "calculated").
plt.ylabel('correlation of calculated matrix with golden standard matrix')
plt.title('beta diversity of data with different error rate')

Out[5]:

<matplotlib.text.Text at 0x10fce0e90>

In [6]:

# Load the alpha.txt table of every (coverage, error rate) run.
# alpha_list_noerror maps coverage -> list of DataFrames, one per error rate,
# in the order the error rates are listed below.
alpha_list_noerror = {}

for cov in [0.1, 1.0, 10.0]:
    alpha_list_noerror[cov] = []
    for e_v in [0, 0.005, 0.01, 0.015]:
        # Run directories are named "<coverage>_<error rate>_noerror".
        run_name = '%s_%s_noerror' % (cov, e_v)
        alpha_path = DIR + 'High_fp_0.1_adjusted_error_only/' + run_name + '/alpha.txt'
        # alpha.txt is comma-separated; the first column becomes the index.
        alpha_noerror = read_csv(alpha_path, sep=",", header=0, index_col=0)
        alpha_list_noerror[cov].append(alpha_noerror)

In [7]:

# Error rates as fractions, then converted to percentages for the x axis
# of the richness plots.
x = [0, 0.005, 0.01, 0.015]
x_list = [100 * rate for rate in x]

In [8]:

# For every coverage, keep only the 'estimated_genome_size' column of each
# loaded alpha table (same order as the tables in alpha_list_noerror).
alpha_ne = {
    cov: [table['estimated_genome_size'] for table in tables]
    for cov, tables in alpha_list_noerror.items()
}

In [9]:

# Richness estimates at 0.1x coverage: one curve per sample across error
# rates, plus dashed horizontal reference lines at the sizes named in their
# legend labels.
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}

# number maps sample name -> list of estimated sizes, one per error rate.
# setdefault replaces the former bare `except:`, which would also have
# swallowed unrelated errors (e.g. KeyboardInterrupt) on the first attempt.
number = {}
for sizes in alpha_ne[0.1]:
    for c in color:
        number.setdefault(c, []).append(sizes[c])
# Parenthesised form works as a statement in Python 2 and a call in Python 3.
print(number)

plt.figure(figsize=(10, 8))
# Sorted so curve and legend order do not depend on dict iteration order.
for c in sorted(color):
    plt.plot(x_list, number[c], color[c], label=c)

plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,3000000)
plt.xlim(0,2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (0.1X, no error correction)')

{'sample5': [592361.5384615385, 660631.09818181861, 1028628.5714285723, 1152943.9319943306], 'sample4': [448288.23529411771, 445994.11764705862, 1440140.0000000016, 1032458.3333333349], 'sample6': [326139.13043478265, 700118.18181818153, 892924.99999999907, 510266.66666666616], 'sample1': [194159.45945945947, 200788.7323943662, 247209.0909090908, 434828.57142857148], 'sample3': [424444.44444444461, 696554.54545454599, 791588.88888888911, 887457.14285714307], 'sample2': [283244.23076923075, 395053.14811778651, 364375.83051932656, 887457.14285714307]}

Out[9]:

<matplotlib.text.Text at 0x110688790>

In [10]:

# Richness estimates at 1x coverage: one curve per sample across error
# rates, plus dashed horizontal reference lines at the sizes named in their
# legend labels.
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}

# number maps sample name -> list of estimated sizes, one per error rate.
# setdefault replaces the former bare `except:`, which would also have
# swallowed unrelated errors (e.g. KeyboardInterrupt) on the first attempt.
number = {}
for sizes in alpha_ne[1.0]:
    for c in color:
        number.setdefault(c, []).append(sizes[c])
# Parenthesised form works as a statement in Python 2 and a call in Python 3.
print(number)

plt.figure(figsize=(10, 8))
# Sorted so curve and legend order do not depend on dict iteration order.
for c in sorted(color):
    plt.plot(x_list, number[c], color[c], label=c)

plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,3000000)
plt.xlim(0,2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (1X, no error correction)')

{'sample5': [416282.85024154594, 493807.48759585025, 597278.14387699065, 683592.75362318847], 'sample4': [410298.68263473047, 500979.60766423366, 560172.52228631359, 661225.80645161273], 'sample6': [394981.25972006225, 504658.23772372643, 569039.80891719752, 682504.49826989614], 'sample1': [193851.41962838237, 217076.00682695347, 229251.26561999359, 240569.36416184969], 'sample3': [409853.86762360454, 495043.7528191249, 573456.006406834, 662103.57142857148], 'sample2': [288819.52283956786, 344389.23076923075, 378687.96068796073, 433962.79999999999]}

Out[10]:

<matplotlib.text.Text at 0x110affad0>

In [11]:

# Richness estimates at 10x coverage: one curve per sample across error
# rates, plus dashed horizontal reference lines at the sizes named in their
# legend labels.
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}

# number maps sample name -> list of estimated sizes, one per error rate.
# setdefault replaces the former bare `except:`, which would also have
# swallowed unrelated errors (e.g. KeyboardInterrupt) on the first attempt.
number = {}
for sizes in alpha_ne[10.0]:
    for c in color:
        number.setdefault(c, []).append(sizes[c])
# Parenthesised form works as a statement in Python 2 and a call in Python 3.
print(number)

plt.figure(figsize=(10, 8))
# Sorted so curve and legend order do not depend on dict iteration order.
for c in sorted(color):
    plt.plot(x_list, number[c], color[c], label=c)

plt.plot([0,3],[200000,200000],'r--',label='size of sample 1,200K')
plt.plot([0,3],[300000,300000],'b--',label='size of sample 2,300K')
plt.plot([0,3],[400000,400000],'g--',label='size of sample 3-6,400K')
plt.legend(loc=2)
plt.ylim(0,3000000)
plt.xlim(0,2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (10X, no error correction)')

{'sample5': [394600.0, 458100.0, 535100.0, 618100.0], 'sample4': [395200.0, 458600.0, 535800.0, 620000.0], 'sample6': [396300.0, 458900.0, 534800.0, 624200.0], 'sample1': [195300.0, 228700.0, 281300.0, 353700.0], 'sample3': [395000.0, 458200.0, 537300.0, 622300.0], 'sample2': [295000.0, 342600.0, 403900.0, 487500.0]}

Out[11]:

<matplotlib.text.Text at 0x110b53d10>

In [11]: