source('./scripts/qpcr_data_cleaning_functions.r') ## Set data directory data_dir = "/Users/mooneymi/Documents/SIG/WNV/qPCR" ## Read in data (byLine) qpcr_data = read.xls(file.path(data_dir, "16-May-2016/Gale_qPCR_byLine_5-16-16 %282%29.xlsx"), sheet=1) head(qpcr_data) dim(qpcr_data) ## Replace line 82 with fixed data (special case for May 16, 2016 data) qpcr_data = qpcr_data[qpcr_data$UW_Line != 82, ] line_82 = read.xls(file.path(data_dir, "18-May-2016/Gale_qPCR_byLine_5-18-16.xlsx"), sheet=1) qpcr_data = rbind(qpcr_data, line_82) dim(qpcr_data) ## Replace lines 54 and 58 with fixed data (special case for May 16, 2016 data) qpcr_data = qpcr_data[!qpcr_data$UW_Line %in% c(54, 58), ] lines_54_58 = read.xls(file.path(data_dir, "23-May-2016/Gale_qPCR_byLine_5-23-16.xlsx"), sheet=1) qpcr_data = rbind(qpcr_data, lines_54_58) dim(qpcr_data) ## Check for duplicates sum(duplicated(qpcr_data)) ## Check that each line has a single mating line_matings = sapply(unique(qpcr_data$UW_Line), function(x){unique(qpcr_data$Mating[qpcr_data$UW_Line==x])}) names(line_matings) = unique(qpcr_data$UW_Line) line_matings ## Add Data_Altered and Notes columns qpcr_data$Data_Altered = NA qpcr_data$Notes = NA ## Annotate Virus qpcr_data$Virus = NA qpcr_data[grep("M",qpcr_data[,"Timepoint"]),"Virus"] <- "Mock" qpcr_data[-grep("M",qpcr_data[,"Timepoint"]),"Virus"] <- "WNV" ## Remove 'M' from time points and convert to numeric qpcr_data[,"Timepoint"] <- as.numeric(as.character(gsub("M","",qpcr_data[,"Timepoint"]))) ## Check experiment names sum(names(summary(qpcr_data[,"Experiment"])) == c("IFIT1","IFITM1", "IFNb1", "IL12b", "WNV")) == 5 summary(qpcr_data[,"Experiment"]) ## Add Group column: UW Line, Timepoint, Virus, Tissue, Experiment separated by "_" qpcr_data$Group = paste(qpcr_data$UW_Line, qpcr_data$Timepoint, qpcr_data$Virus, qpcr_data$Tissue, qpcr_data$Experiment, sep="_") ## Add Lab column qpcr_data$Lab = "Gale" head(qpcr_data) ## Read in data (byMouse) qpcr_data_mouse = read.xls(file.path(data_dir, "16-May-2016/Gale_qPCR_byMouse_5-16-16 %281%29.xlsx"), sheet=1) head(qpcr_data_mouse) dim(qpcr_data_mouse) ## Replace line 82 with fixed data (special case for May 16, 2016 data) qpcr_data_mouse = qpcr_data_mouse[qpcr_data_mouse$UW_Line != 82, ] line_82_mouse = read.xls(file.path(data_dir, "18-May-2016/Gale_qPCR_byMouse_5-18-16.xlsx"), sheet=1) qpcr_data_mouse = rbind(qpcr_data_mouse, line_82_mouse) dim(qpcr_data_mouse) ## Replace lines 54 and 58 with fixed data (special case for May 16, 2016 data) qpcr_data_mouse = qpcr_data_mouse[!qpcr_data_mouse$UW_Line %in% c(54, 58), ] lines_54_58_mouse = read.xls(file.path(data_dir, "23-May-2016/Gale_qPCR_byMouse_5-23-16.xlsx"), sheet=1) qpcr_data_mouse = rbind(qpcr_data_mouse, lines_54_58_mouse) dim(qpcr_data_mouse) ## Check if any CT < 15 length(which(qpcr_data_mouse[,"Ct"] < 15)) ## Check if any reference CT < 15 length(which(qpcr_data_mouse[,"ref.Ct"] < 15)) ## Check if an reference CT == 40 length(which(qpcr_data_mouse[,"ref.Ct"] == 40)) ## Check that each line has a single mating mouse_line_matings = sapply(unique(qpcr_data_mouse$UW_Line), function(x){unique(qpcr_data_mouse$Mating[qpcr_data_mouse$UW_Line==x])}) names(mouse_line_matings) = unique(qpcr_data_mouse$UW_Line) mouse_line_matings ## Remove 'M' from time points and convert to numeric qpcr_data_mouse[,"Timepoint"] <- as.numeric(as.character(gsub("M","",qpcr_data_mouse[,"Timepoint"]))) ## Change Condition column name to Virus names(qpcr_data_mouse)[which(names(qpcr_data_mouse) == "Condition")] <- "Virus" # Check experiment names sum(names(summary(qpcr_data_mouse[,"Experiment"])) == c("IFIT1","IFITM1", "IFNb1", "IL12b", "WNV")) == 5 summary(qpcr_data_mouse[,"Experiment"]) ## Add Group column: UW Line, Timepoint, Virus, Tissue, Experiment separated by "_" qpcr_data_mouse$Group <- paste(qpcr_data_mouse$UW_Line, qpcr_data_mouse$Timepoint, qpcr_data_mouse$Virus, qpcr_data_mouse$Tissue, qpcr_data_mouse$Experiment, sep="_") ## Check that ByLine file contains data from only those animals in ByMouse file length(setdiff(qpcr_data$Group, qpcr_data_mouse$Group)) length(setdiff(qpcr_data_mouse$Group, qpcr_data$Group)) ## Add Data_Altered and Notes columns qpcr_data_mouse$Data_Altered = NA qpcr_data_mouse$Notes = NA ## Add Lab column qpcr_data_mouse$Lab = "Gale" head(qpcr_data_mouse) ## Calculate dCt mean from byMouse data dct_mean = aggregate(formula=qpcr_data_mouse[,"dCt"]~qpcr_data_mouse[,"Group"], data=qpcr_data_mouse, FUN=mean) names(dct_mean) <- c("Group","dCt.mean.V2") dct_mean[order(dct_mean[,1]),] -> dct_mean_order ## Get dCt mean from byLine data byline_dct_mean = unique(qpcr_data[,c("Group","dCt.mean")]) byline_dct_mean[order(byline_dct_mean[,1]),] -> byline_dct_mean_order ## Check that the dCt mean calculations are the same check_dct_mean_v2 = sapply(1:dim(dct_mean_order)[1], function(x){isTRUE(all.equal(dct_mean_order[x,2], byline_dct_mean_order[x,2]))}) ## Print discrepancies, if they exist if(sum(check_dct_mean_v2) == nrow(dct_mean_order)){ print("All dCt mean correct") } else { print("Need to clean dCt mean") dct_mean_errs = cbind(byline_dct_mean_order[which(check_dct_mean_v2==F),], dct_mean_order[which(check_dct_mean_v2==F),]) dct_mean_errs } if (dim(dct_mean_errs)[1] > 0) { for (i in 1:dim(dct_mean_errs)[1]) { qpcr_data$dCt.mean[qpcr_data$Group==dct_mean_errs[i,3]] = dct_mean_errs[i,4] } } ## Calculate the N for each group from the byMouse data data.frame(summary(as.factor(qpcr_data_mouse[,"Group"]),maxsum=8000)) -> bymouse_n names(bymouse_n) <- c("N.V2") bymouse_n[,2] <- row.names(bymouse_n) names(bymouse_n)[2] <- "Group" bymouse_n = bymouse_n[,c(2,1)] bymouse_n[order(bymouse_n[,"Group"]),] -> bymouse_n_order ## Get the N for each group from the byLine data qpcr_data[,c("Group","N")] -> byline_n byline_n[order(byline_n[,"Group"]),] -> byline_n_order ## Print discrepancies, if they exist if(sum(bymouse_n_order[,2] == byline_n_order[,2]) == nrow(bymouse_n_order)){ print("All N correct") } else { print("Need to clean N") n_errs = cbind(byline_n_order[bymouse_n_order[,2] != byline_n_order[,2],], bymouse_n_order[bymouse_n_order[,2] != byline_n_order[,2],]) n_errs } if (dim(n_errs)[1] > 0) { for (i in 1:dim(n_errs)[1]) { qpcr_data$N[qpcr_data$Group==n_errs[i,3]] = n_errs[i,4] } } ## Calculate dCt SD from the byMouse data dct_sd = aggregate(formula=qpcr_data_mouse[,"dCt"]~qpcr_data_mouse[,"Group"], data=qpcr_data_mouse, FUN=sd) names(dct_sd) <- c("Group","dCt.sd.V2") dct_sd[order(dct_sd[,1]),] -> dct_sd_order ## Get dCt SD from the byLine data byline_dct_sd = qpcr_data[,c("Group","dCt.sd")] byline_dct_sd[order(byline_dct_sd[,1]),] -> byline_dct_sd_order check_dct_sd = sapply(1:dim(dct_sd_order)[1],function(x){isTRUE(all.equal(dct_sd_order[x,2],byline_dct_sd_order[x,2]))}) ## Print discrepancies, if they exist if(sum(check_dct_sd) == nrow(dct_sd_order)) { print("All dCt SD correct") } else { print("Need to clean dCt SD") dct_sd_errs = cbind(byline_dct_sd_order[which(check_dct_sd==F),], dct_sd_order[which(check_dct_sd==F),]) dct_sd_errs } if (dim(dct_sd_errs)[1] > 0) { for (i in 1:dim(dct_sd_errs)[1]) { qpcr_data$dCt.sd[qpcr_data$Group==dct_sd_errs[i,3]] = dct_sd_errs[i,4] } } ## Check baseline.dct ## Add new group column to annotate baseline qpcr_data$Group_g <- paste(qpcr_data[,"UW_Line"],qpcr_data[,"Tissue"], qpcr_data[,"Experiment"],sep="_") ## Calculate baseline, baseline is 12 for this data qpcr_data$baseline.dCt.V2 = NA baseline = 12 for(i in unique(qpcr_data[,"Group_g"])){ qpcr_data[which(qpcr_data[,"Group_g"] == i),"baseline.dCt.V2"] <- qpcr_data[which(qpcr_data[,"Group_g"] == i & qpcr_data[,"Virus"] == "Mock" & qpcr_data[,"Timepoint"] == baseline),"dCt.mean"] } # Print discrepancies, if they exist if(sum(qpcr_data[,"baseline.dCt"] == qpcr_data[,"baseline.dCt.V2"]) == nrow(qpcr_data)){ print("All baseline dCt correct") } else { print("Need to clean baseline dCt") baseline_dct_errs = qpcr_data[qpcr_data[,"baseline.dCt"] != qpcr_data[,"baseline.dCt.V2"], c("Group", "Group_g", "baseline.dCt", "baseline.dCt.V2")] baseline_dct_errs } if (dim(baseline_dct_errs)[1] > 0) { for (i in 1:dim(baseline_dct_errs)[1]) { qpcr_data$baseline.dCt[qpcr_data$Group==baseline_dct_errs[i,1]] = baseline_dct_errs[i,4] } } ## Calculate ddCt mean qpcr_data$ddCt.mean.V2 <- as.numeric(as.character(qpcr_data[,"dCt.mean"])) - as.numeric(as.character(qpcr_data[,"baseline.dCt"])) check_ddct_mean = sapply(1:dim(qpcr_data)[1],function(x)isTRUE(all.equal(qpcr_data[x,"ddCt.mean"], qpcr_data[x,"ddCt.mean.V2"], tolerance=5.5e-8))) ## Print discrepancies, if they exist if(sum(check_ddct_mean) == dim(qpcr_data)[1]){ print("All ddCt mean correct") } else { print("Need to clean ddCt mean") ddct_errs = qpcr_data[which(check_ddct_mean==F), c("Group", "Group_g", "ddCt.mean", "ddCt.mean.V2")] ddct_errs } if (dim(ddct_errs)[1] > 0) { for (i in 1:dim(ddct_errs)[1]) { qpcr_data$ddCt.mean[qpcr_data$Group==ddct_errs[i,1]] = ddct_errs[i,4] } } # check fc mean correct qpcr_data$fc.mean.V2 <- 2^-qpcr_data[,"ddCt.mean"] check_fc_mean = sapply(1:dim(qpcr_data)[1],function(x)isTRUE(all.equal(qpcr_data[x,"fc.mean"], qpcr_data[x,"fc.mean.V2"]))) if(sum(check_fc_mean) == dim(qpcr_data)[1]){ print("All FC mean correct") } else { print("Need to clean FC mean") fc_errs = qpcr_data[which(check_fc_mean==F), c("Group", "Group_g", "fc.mean", "fc.mean.V2")] fc_errs } if (dim(fc_errs)[1] > 0) { for (i in 1:dim(fc_errs)[1]) { qpcr_data$fc.mean[qpcr_data$Group==fc_errs[i,1]] = fc_errs[i,4] } } ## Remove extra (unused) columns remove_cols = c("ddCt.sd", "fc.sd", "baseline.dCt.V2", "ddCt.mean.V2", "fc.mean.V2") qpcr_data_v2 = qpcr_data[,-as.vector(unlist(sapply(remove_cols,function(x)which(x==names(qpcr_data)))))] ## Add baseline.dCt.sd column qpcr_data_v2$baseline.dCt.sd = NA # compute baseline sd, use baseline = 12 saved above for (i in unique(qpcr_data_v2[,"Group_g"])){ qpcr_data_v2[which(qpcr_data_v2[,"Group_g"] == i),"baseline.dCt.sd"] <- qpcr_data_v2[which(qpcr_data_v2[,"Group_g"] == i & qpcr_data_v2[,"Virus"] == "Mock" & qpcr_data_v2[,"Timepoint"] == baseline),"dCt.sd"] } ## Add ddCt.se ddCt.se = sapply(1:dim(qpcr_data_v2)[1],function(x){ sqrt((qpcr_data_v2[x,"dCt.sd"]^2/qpcr_data_v2[x,"N"]) + (qpcr_data_v2[x,"baseline.dCt.sd"]^2/qpcr_data_v2[which(qpcr_data_v2[,"Group_g"] == qpcr_data_v2[x,"Group_g"] & qpcr_data_v2[,"Virus"] == "Mock" & qpcr_data_v2[,"Timepoint"] == baseline),"N"]) ) }) qpcr_data_v2$ddCt.se <- ddCt.se ## Order by group qpcr_data_v3_final = qpcr_data_v2[order(qpcr_data_v2[,"Group"]),] ## Double-check all calculations are correct ## dCt mean isTRUE(all.equal(dct_mean_order[,2], qpcr_data_v3_final[,"dCt.mean"])) ## N isTRUE(all.equal(bymouse_n_order[,2], qpcr_data_v3_final[,"N"])) ## dCt SD isTRUE(all.equal(dct_sd_order[,2], qpcr_data_v3_final[,"dCt.sd"])) ## baseline dCt isTRUE(all.equal(qpcr_data[order(qpcr_data[,"Group"]),"baseline.dCt.V2"],qpcr_data_v3_final[,"baseline.dCt"])) ## ddCt mean isTRUE(all.equal(qpcr_data[order(qpcr_data[,"Group"]),"ddCt.mean.V2"],qpcr_data_v3_final[,"ddCt.mean"])) ## FC mean isTRUE(all.equal(qpcr_data[order(qpcr_data[,"Group"]),"fc.mean.V2"],qpcr_data_v3_final[,"fc.mean"])) ## baseline SD, computed using corrected dCt SD at timepoint 12 ## ddCt SE, computed using correct dCt SD and baseline SD ## Remove Group_g column qpcr_data_final_format <- qpcr_data_v3_final[,!(names(qpcr_data_v3_final) %in% "Group_g")] ## Order columns according to data dictionary (byLine) ## Note: you may have to change the path to the data dictionary data_dict <- read.xls(xls="./data/WNV_Data_Dictionary.xlsx", sheet="qPCR Data - By Line", as.is=T) qpcr_data_final_format_order = qpcr_data_final_format[, data_dict[,1]] ## Order columns according to data dictionary (byMouse) data_dict <- read.xls(xls="./data/WNV_Data_Dictionary.xlsx", sheet="qPCR Data - By Mouse", as.is=T) qpcr_data_mouse_order = qpcr_data_mouse[, data_dict[,1]] ## Check both byLine and byMouse have the same UW lines names(summary(as.factor(qpcr_data_mouse_order[,"UW_Line"]))) == names(summary(as.factor(qpcr_data_final_format_order[,"UW_Line"]))) dim(qpcr_data_final_format_order) dim(qpcr_data_mouse_order) ## Read in previous version of byLine data cleaned_data_dir = "~/Documents/SIG/WNV/Cleaned_Data_Releases/23-Mar-2016/" prev_qpcr_byline = read.xls(xls=file.path(cleaned_data_dir, "Gale_qPCR_byLine_23-Mar-2016_final.xlsx"), sheet=1, as.is=T) dim(prev_qpcr_byline) ## Check for duplicates (new data will overwrite old) dup_groups = intersect(qpcr_data_final_format_order$Group, prev_qpcr_byline$Group) length(dup_groups) ## Remove duplicated groups from previous data prev_qpcr_byline = prev_qpcr_byline[!prev_qpcr_byline$Group %in% dup_groups, ] dim(prev_qpcr_byline) ## Combine old and new data qpcr_byline_updated = rbind(prev_qpcr_byline, qpcr_data_final_format_order) dim(qpcr_byline_updated) # Set blanks to NA qpcr_byline_updated_cleaned = clean_na(qpcr_byline_updated) # Remove duplicates if(sum(duplicated(qpcr_byline_updated_cleaned)) != 0){ qpcr_byline_updated_cleaned = qpcr_byline_updated_cleaned[!duplicated(qpcr_byline_updated_cleaned),] } dim(qpcr_byline_updated_cleaned) ## Read in previous version of byMouse data prev_qpcr_bymouse = read.xls(xls=file.path(cleaned_data_dir, "Gale_qPCR_byMouse_23-Mar-2016_final.xlsx"), sheet=1) dim(prev_qpcr_bymouse) ## Check for duplicates (new data will overwrite old) prev_ids = paste(prev_qpcr_bymouse$ID, prev_qpcr_bymouse$Tissue, prev_qpcr_bymouse$Experiment, sep='_') new_ids = paste(qpcr_data_mouse_order$ID, qpcr_data_mouse_order$Tissue, qpcr_data_mouse_order$Experiment, sep='_') dup_ids = intersect(prev_ids, new_ids) length(dup_ids) ## Remove duplicated IDs from previous data prev_qpcr_bymouse = prev_qpcr_bymouse[which(!prev_ids %in% dup_ids), ] dim(prev_qpcr_bymouse) ## Combine old and new data qpcr_bymouse_updated = rbind(prev_qpcr_bymouse, qpcr_data_mouse_order) dim(qpcr_bymouse_updated) ## Set blanks to NA qpcr_bymouse_updated_cleaned = clean_na(qpcr_bymouse_updated) ## Remove duplicates if(sum(duplicated(qpcr_bymouse_updated_cleaned)) != 0){ qpcr_bymouse_updated_cleaned = qpcr_byline_updated_cleaned[!duplicated(qpcr_bymouse_updated_cleaned),] } dim(qpcr_bymouse_updated_cleaned) ## Save ByLine Data write.table(qpcr_byline_updated_cleaned, file=file.path(data_dir, "23-May-2016/Gale_qPCR_byLine_5-23-16_MM_updated.txt"), col.names=T, row.names=F, sep='\t', quote=F, na="") ## Save ByMouse Data write.table(qpcr_bymouse_updated_cleaned, file=file.path(data_dir, "23-May-2016/Gale_qPCR_byMouse_5-23-16_MM_updated.txt"), col.names=T, row.names=F, sep='\t', quote=F, na="")