#!/usr/bin/env python
# coding: utf-8

# In[ ]:

from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

# ## Calculate confidence intervals on prevalence that appear in Skowronski et al preprint (July 15, 2020)
#
# For each value of prev (= prevalence), calculate the distribution of possible outcomes (i.e. the number of observed cases). The 95% confidence belt contains the central 95% of possible outcomes. The 95% confidence interval contains those values of prev for which the actual observation lies within the central 95% of possible outcomes.
#
# * n_trial: number of trials
# * obs: observed number of positives
# * n_control: number of negative samples used to measure the false positive rate (i.e. specificity)
# * n_fp: number of false positives in the control
# * sensitivity: fraction of positive samples that would be identified as positive
#
# The sensitivity is assumed to be known well. The confidence intervals are not very sensitive to this uncertainty.
#
# Distribution of possible outcomes: use a binomial for signal+background on "n_trial" trials.
#
# The background is from false positives. We know the control: "n_fp" false positives in a negative sample of size "n_control".
# The knowledge from the control test is encapsulated as the posterior probability distribution of fpr from that control study. Use a uniform prior for fpr; the posterior probability for fpr is then proportional to the likelihood.
#
# Make many repetitions of the experiment. For each repetition, draw fpr from its distribution, then do a binomial throw given the expectation of signal+background.
# In[ ]:

# Study/scenario configurations from the preprint.  Each entry gives:
#   obs         - observed number of positives in each serosurvey
#   n_trial     - number of samples tested in each serosurvey
#   n_control   - known-negative control sample sizes (specificity studies)
#   n_fp        - false positives seen in each control study
#   sensitivity - assumed assay sensitivity (fraction of true positives detected)
#   joint       - if True, a sample counts as positive only when BOTH assays
#                 fire, so the combined false-positive rate is the product of
#                 the individual rates.
_STUDIES = {
    1: dict(joint=False, obs=[7], n_trial=[870],
            n_control=[470 + 390 + 99], n_fp=[3], sensitivity=[0.85]),
    2: dict(joint=False, obs=[4], n_trial=[869],
            n_control=[149 + 105], n_fp=[1], sensitivity=[0.93]),
    3: dict(joint=False, obs=[7, 4], n_trial=[870, 869],
            n_control=[470 + 390 + 99, 149 + 105], n_fp=[3, 1],
            sensitivity=[0.85, 0.93]),
    4: dict(joint=False, obs=[2], n_trial=[869],
            n_control=[1091 + 399 + 100], n_fp=[2],
            sensitivity=[0.85 * 0.86]),
    5: dict(joint=False, obs=[6], n_trial=[889],
            n_control=[470 + 390 + 99], n_fp=[3], sensitivity=[0.85]),
    6: dict(joint=False, obs=[7], n_trial=[885],
            n_control=[149 + 105], n_fp=[1], sensitivity=[0.93]),
    7: dict(joint=True, obs=[4], n_trial=[885],
            n_control=[470 + 390 + 99, 149 + 105], n_fp=[3, 1],
            sensitivity=[0.93 * 0.85]),
}

study = 7
_cfg = _STUDIES[study]
joint = _cfg['joint']
obs = _cfg['obs']
n_trial = _cfg['n_trial']
n_control = _cfg['n_control']
n_fp = _cfg['n_fp']
sensitivity = _cfg['sensitivity']

# In[ ]:

# Plot the un-normalized posterior for the false-positive rate (fpr) after
# each control experiment, assuming a uniform prior: it is proportional to
# the binomial likelihood of seeing n_fp positives in n_control negatives.
fpr_plot = np.arange(0., 0.1, 0.001)
for fp, ctrl in zip(n_fp, n_control):
    p_fpr_plot = stats.binom.pmf(fp, ctrl, fpr_plot)
    plt.plot(fpr_plot, p_fpr_plot, label=str(fp) + '/' + str(ctrl))
plt.legend()
plt.show()

# In[ ]:


def get_fpr(i_fp, i_control, max_fpr=0.05):
    """Draw one false-positive rate from its posterior distribution.

    Uses accept/reject sampling against the (un-normalized) binomial
    likelihood of the control study — i_fp false positives observed in
    i_control known-negative samples — with a uniform prior on fpr.

    Parameters
    ----------
    i_fp : int
        Number of false positives seen in the control study.
    i_control : int
        Number of known-negative control samples; must be positive.
    max_fpr : float, optional
        Upper bound for the sampled fpr.  Choose it from the posterior
        plot above so essentially all probability lies below it
        (default 0.05, matching the original hard-coded cutoff).

    Returns
    -------
    float
        A single fpr draw restricted to (0, max_fpr].
    """
    if i_control <= 0:
        raise ValueError("i_control must be a positive sample size")
    # The binomial likelihood in fpr is maximized at the MLE i_fp/i_control;
    # its value there is the envelope height for rejection sampling.
    mle = i_fp / i_control
    envelope = stats.binom.pmf(i_fp, i_control, mle)
    while True:
        fpr_trial = stats.uniform.rvs(0., max_fpr)
        pdf = stats.binom.pmf(i_fp, i_control, fpr_trial)
        # Accept with probability pdf/envelope.
        if stats.uniform.rvs(0., envelope) < pdf:
            return fpr_trial


# In[ ]:

# Central confidence interval: the cl band keeps the middle fraction cl of
# the outcome distribution, trimming (1-cl)/2 from each tail.
cl = 0.95
upper = 1. - (1 - cl) / 2.
lower = (1 - cl) / 2.
# Do two studys, one coarse to find the approximate boundaries (3 digits) then a fine one to 4 digits # number of repititions to use: n_rep_coarse = 1000 n_rep_fine = 10000 n_reps = [n_rep_coarse, n_rep_fine] # range of prevalences to consider: prevs_course = np.arange(0., 0.025, 0.001) prevs_fine = np.arange(0.0142,0.0148,0.0001) prevs = [prevs_course,prevs_fine] sum_obs = 0 for ob in obs: sum_obs += ob for i_study in range(1): first_in = -1 last_in = -1 prev_list = prevs[i_study] for prev in prev_list: results = [] for i in range(n_reps[i_study]): total = 0 if not joint: for j in range(len(n_fp)): fpr = get_fpr(n_fp[j], n_control[j]) prob = prev*sensitivity[j] + fpr total += stats.binom.rvs(n_trial[j], prob) results.append(total) else: fpr = 1. for j in range(len(n_fp)): fpr *= get_fpr(n_fp[j], n_control[j]) prob = prev*sensitivity[0] + fpr total += stats.binom.rvs(n_trial[0], prob) results.append(total) results.sort() i_upper = int(upper*n_reps[i_study]) i_lower = int(lower*n_reps[i_study]) print('prevalence = {:.4f}'.format(prev)+' '+str(cl*100)+'% range =['+ str(results[i_lower])+','+str(results[i_upper])+']') if first_in<0: if sum_obs>=results[i_lower] and sum_obs<=results[i_upper]: first_in = prev if sum_obs>=results[i_lower] and sum_obs<=results[i_upper]: last_in = prev if last_in > 0 and sum_obs