old title: influence of error rate-k19-v2-low_fp-new_reads-adjusted-high_fp_new_reads_adjusted_error_only
No error correction; error rate 0-1.5%; coverage 0.1x, 1x, and 10x
from skbio.diversity.beta import pw_distances
from skbio import DistanceMatrix
from pandas import DataFrame, read_csv,Series
from skbio.stats.distance import mantel
import matplotlib.pyplot as plt
# Base directory for every input file read in this notebook.
DIR = "/Users/qingpeng/2013-diversity-Iterated/New_Reads/"

# Reference ("real") distance matrix that the computed matrices are compared
# against. Tab-separated; the first row is used as the header and the first
# column as the row index.
dm_real = read_csv(
    DIR + 'real_matrix.txt',
    sep="\t",
    header=0,
    index_col=0,
)
# mantel_noerror[coverage] is a list with one Spearman Mantel statistic per
# error rate (in the order 0, 0.005, 0.01, 0.015), comparing the matrix
# computed for that run against the reference matrix dm_real.
mantel_noerror = {}
for coverage in (0.1, 1.0, 10.0):
    per_rate = mantel_noerror[coverage] = []
    for error_rate in (0, 0.005, 0.01, 0.015):
        # Run folders are named "<coverage>_<error rate>_noerror".
        run_name = str(coverage) + '_' + str(error_rate)
        dm_noerror = read_csv(
            DIR + 'High_fp_0.1_adjusted_error_only/' + run_name + '_noerror/matrix.txt',
            sep="\t",
            header=0,
            index_col=0,
        )
        # Keep only the first element of mantel()'s result tuple
        # (the correlation coefficient, per scikit-bio's documentation).
        result = mantel(dm_real, dm_noerror, method='spearman', strict=False)
        per_rate.append(result[0])
# Spot check on a single run (folder 10.0_0.01_noerror): load both matrix
# files found there and compare each with the reference matrix.
# The paths are built from DIR, like every other read in this notebook,
# instead of repeating the absolute directory; the resulting paths are the
# same as before.
# NOTE(review): what distinguishes matrix2.txt from matrix.txt is not visible
# here -- confirm with whoever produced the files.
RUN_10X_1PCT = DIR + 'High_fp_0.1_adjusted_error_only/10.0_0.01_noerror/'
dm_noerror2 = read_csv(RUN_10X_1PCT + 'matrix2.txt', sep="\t", header=0, index_col=0)
dm_noerror1 = read_csv(RUN_10X_1PCT + 'matrix.txt', sep="\t", header=0, index_col=0)
# Left as the last expression so a notebook cell displays the result tuple.
mantel(dm_real, dm_noerror2, method='spearman', strict=False)
(0.95830761082556037, 0.0089999999999999993, 6)
# Same Spearman Mantel comparison against the reference matrix, this time for
# dm_noerror1; the bare expression lets a notebook cell display the result.
mantel(
    dm_real,
    dm_noerror1,
    method='spearman',
    strict=False,
)
(0.95830761082556037, 0.01, 6)
# Error rates as fractions, in the same order used when the runs were loaded.
x = [0, 0.005, 0.01, 0.015]
# The same rates expressed in percent, used as x coordinates for plotting.
x_list = [rate * 100 for rate in x]
# Bare expression: in a notebook cell this displays the coverage -> list of
# Mantel statistics mapping; it has no effect when run as a plain script.
mantel_noerror
{0.1: [0.90590016335853751, 0.89454774397402703, 0.89735784159907639, 0.89442601168815072], 1.0: [0.93210388709204894, 0.93210388709204894, 0.93210388709204894, 0.9198386225994708], 10.0: [0.97140947269231603, 0.97140947269231603, 0.95830761082556037, 0.95830761082556037]}
# Beta-diversity figure: Mantel statistic (computed vs. reference matrix)
# against error rate, one dashed, star-marked line per coverage level.
plt.figure(figsize=(10, 8))
# (key into mantel_noerror, matplotlib format string, legend label)
for cov_key, fmt, legend_label in [
    (0.1, 'r*--', '0.1x,original'),
    (1.0, 'b*--', '1x,original'),
    (10.0, 'g*--', '10x,original'),
]:
    plt.plot(x_list, mantel_noerror[cov_key], fmt, label=legend_label)
plt.legend(loc=3)  # matplotlib location code 3 = lower left
plt.ylim(0, 1.0)
plt.xlim(0, 2.0)
plt.xlabel('error rate (%)')
# Spelling fixed in the axis label ("calcuated" -> "calculated").
plt.ylabel('correlation of calculated matrix with golden standard matrix')
plt.title('beta diversity of data with different error rate')
<matplotlib.text.Text at 0x10fce0e90>
# alpha_list_noerror[coverage] is a list of the alpha.txt tables, one per
# error rate (0, 0.005, 0.01, 0.015). Each file is comma-separated, with its
# first row used as the header and its first column as the row index.
alpha_list_noerror = {}
for coverage in (0.1, 1.0, 10.0):
    alpha_list_noerror[coverage] = [
        read_csv(
            # Run folders are named "<coverage>_<error rate>_noerror".
            DIR + 'High_fp_0.1_adjusted_error_only/'
            + str(coverage) + '_' + str(error_rate)
            + '_noerror/alpha.txt',
            sep=",",
            header=0,
            index_col=0,
        )
        for error_rate in (0, 0.005, 0.01, 0.015)
    ]
# Error rates (fractions) matching the order of the tables loaded per coverage.
x = [0, 0.005, 0.01, 0.015]
# Percent values of those rates; shared x coordinates for the richness plots.
x_list = [fraction * 100 for fraction in x]
# alpha_ne[coverage] holds, for each loaded alpha table of that coverage (same
# order as alpha_list_noerror[coverage]), only its 'estimated_genome_size'
# column.
alpha_ne = {
    coverage: [table['estimated_genome_size'] for table in tables]
    for coverage, tables in alpha_list_noerror.items()
}
# Richness figure for 0.1x coverage: estimated genome size of every sample
# against error rate, with dashed reference lines at 200K / 300K / 400K.

# Sample name -> matplotlib format string (colour, line style, marker).
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}

# number[sample] lists that sample's estimate, one value per entry of
# alpha_ne[0.1] (i.e. one per error rate, in x_list order). Built directly
# rather than through a bare `except:` so that unrelated errors are not
# silently swallowed; a sample missing from the data still raises.
number = {c: [df[c] for df in alpha_ne[0.1]] for c in color}
# Parenthesised form prints the same text under Python 2 and also runs on 3.
print(number)

plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list, number[c], color[c], label=c)

# Horizontal dashed reference lines: (y value, format string, legend label).
for size, fmt, size_label in [
    (200000, 'r--', 'size of sample 1,200K'),
    (300000, 'b--', 'size of sample 2,300K'),
    (400000, 'g--', 'size of sample 3-6,400K'),
]:
    plt.plot([0, 3], [size, size], fmt, label=size_label)

plt.legend(loc=2)  # matplotlib location code 2 = upper left
plt.ylim(0, 3000000)
plt.xlim(0, 2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (0.1X, no error correction)')
{'sample5': [592361.5384615385, 660631.09818181861, 1028628.5714285723, 1152943.9319943306], 'sample4': [448288.23529411771, 445994.11764705862, 1440140.0000000016, 1032458.3333333349], 'sample6': [326139.13043478265, 700118.18181818153, 892924.99999999907, 510266.66666666616], 'sample1': [194159.45945945947, 200788.7323943662, 247209.0909090908, 434828.57142857148], 'sample3': [424444.44444444461, 696554.54545454599, 791588.88888888911, 887457.14285714307], 'sample2': [283244.23076923075, 395053.14811778651, 364375.83051932656, 887457.14285714307]}
<matplotlib.text.Text at 0x110688790>
# Richness figure for 1x coverage: estimated genome size of every sample
# against error rate, with dashed reference lines at 200K / 300K / 400K.

# Sample name -> matplotlib format string (colour, line style, marker).
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}

# number[sample] lists that sample's estimate, one value per entry of
# alpha_ne[1.0] (i.e. one per error rate, in x_list order). Built directly
# rather than through a bare `except:` so that unrelated errors are not
# silently swallowed; a sample missing from the data still raises.
number = {c: [df[c] for df in alpha_ne[1.0]] for c in color}
# Parenthesised form prints the same text under Python 2 and also runs on 3.
print(number)

plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list, number[c], color[c], label=c)

# Horizontal dashed reference lines: (y value, format string, legend label).
for size, fmt, size_label in [
    (200000, 'r--', 'size of sample 1,200K'),
    (300000, 'b--', 'size of sample 2,300K'),
    (400000, 'g--', 'size of sample 3-6,400K'),
]:
    plt.plot([0, 3], [size, size], fmt, label=size_label)

plt.legend(loc=2)  # matplotlib location code 2 = upper left
plt.ylim(0, 3000000)
plt.xlim(0, 2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (1X, no error correction)')
{'sample5': [416282.85024154594, 493807.48759585025, 597278.14387699065, 683592.75362318847], 'sample4': [410298.68263473047, 500979.60766423366, 560172.52228631359, 661225.80645161273], 'sample6': [394981.25972006225, 504658.23772372643, 569039.80891719752, 682504.49826989614], 'sample1': [193851.41962838237, 217076.00682695347, 229251.26561999359, 240569.36416184969], 'sample3': [409853.86762360454, 495043.7528191249, 573456.006406834, 662103.57142857148], 'sample2': [288819.52283956786, 344389.23076923075, 378687.96068796073, 433962.79999999999]}
<matplotlib.text.Text at 0x110affad0>
# Richness figure for 10x coverage: estimated genome size of every sample
# against error rate, with dashed reference lines at 200K / 300K / 400K.

# Sample name -> matplotlib format string (colour, line style, marker).
color = {'sample1':'r-o','sample2':'b-o','sample3':'g-o','sample4':'g-s','sample5':'g-p','sample6':'g-*'}

# number[sample] lists that sample's estimate, one value per entry of
# alpha_ne[10.0] (i.e. one per error rate, in x_list order). Built directly
# rather than through a bare `except:` so that unrelated errors are not
# silently swallowed; a sample missing from the data still raises.
number = {c: [df[c] for df in alpha_ne[10.0]] for c in color}
# Parenthesised form prints the same text under Python 2 and also runs on 3.
print(number)

plt.figure(figsize=(10, 8))
for c in color:
    plt.plot(x_list, number[c], color[c], label=c)

# Horizontal dashed reference lines: (y value, format string, legend label).
for size, fmt, size_label in [
    (200000, 'r--', 'size of sample 1,200K'),
    (300000, 'b--', 'size of sample 2,300K'),
    (400000, 'g--', 'size of sample 3-6,400K'),
]:
    plt.plot([0, 3], [size, size], fmt, label=size_label)

plt.legend(loc=2)  # matplotlib location code 2 = upper left
plt.ylim(0, 3000000)
plt.xlim(0, 2)
plt.xlabel('error rate (%)')
plt.ylabel('estimated size of metagenome')
plt.title('richness estimation (10X, no error correction)')
{'sample5': [394600.0, 458100.0, 535100.0, 618100.0], 'sample4': [395200.0, 458600.0, 535800.0, 620000.0], 'sample6': [396300.0, 458900.0, 534800.0, 624200.0], 'sample1': [195300.0, 228700.0, 281300.0, 353700.0], 'sample3': [395000.0, 458200.0, 537300.0, 622300.0], 'sample2': [295000.0, 342600.0, 403900.0, 487500.0]}
<matplotlib.text.Text at 0x110b53d10>