from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
For each value of prev (= prevalence), calculate the distribution of possible outcomes (i.e. the number of observed cases). The 95% confidence belt contains the central 95% of possible outcomes. The 95% confidence interval consists of those values of prev for which the actual observation lies in the central 95% of possible outcomes.
The sensitivity is assumed to be known well. The confidence intervals are not very sensitive to this uncertainty.
Distribution of possible outcomes: use binomial for signal+background on "n_trial" trials
The background is from false positives. We know the control: "n_fp" false positives in "n_control" negative sample size. The knowledge from the control test is encapsulated as the posterior probability distribution of fpr from that control study. Use a uniform prior for fpr. The posterior probability for fpr is proportional to the likelihood.
Make many repetitions of the experiment. For each repetition of the experiment draw fpr from its distributions, and then do a binomial throw given the expectation of signal+background.
# Select which published study's inputs to analyze.
study = 7

# Per-study inputs, keyed by study number:
#   (obs, n_trial, n_control, n_fp, sensitivity, joint)
# obs         - observed positive counts
# n_trial     - size of the tested sample(s)
# n_control   - size of the negative control sample(s)
# n_fp        - false positives seen in the control sample(s)
# sensitivity - assumed test sensitivity (well known by assumption)
# joint       - True when two tests are combined under an independence
#               assumption (combined fpr = product of individual fprs)
_study_configs = {
    1: ([7], [870], [470 + 390 + 99], [3], [0.85], False),
    2: ([4], [869], [149 + 105], [1], [0.93], False),
    3: ([7, 4], [870, 869], [470 + 390 + 99, 149 + 105], [3, 1],
        [0.85, 0.93], False),
    4: ([2], [869], [1091 + 399 + 100], [2], [0.85 * 0.86], False),
    5: ([6], [889], [470 + 390 + 99], [3], [0.85], False),
    6: ([7], [885], [149 + 105], [1], [0.93], False),
    7: ([4], [885], [470 + 390 + 99, 149 + 105], [3, 1],
        [0.93 * 0.85], True),
}
obs, n_trial, n_control, n_fp, sensitivity, joint = _study_configs[study]
# Plot the un-normalized posterior for the false-positive rate after each
# control experiment (uniform prior, so posterior ∝ binomial likelihood).
fpr_plot = np.arange(0., 0.1, 0.001)
for fp, nc in zip(n_fp, n_control):
    posterior = stats.binom.pmf(fp, nc, fpr_plot)
    plt.plot(fpr_plot, posterior, label=str(fp) + '/' + str(nc))
plt.legend()
plt.show()
# draw a fpr from that distribution
def get_fpr(i_fp, i_control):
nu = 1.*i_fp/i_control
big = stats.binom.pmf(i_fp, i_control, nu)
max_fpr = 0.05 # choose this value using the plot above, fpr will be restricted to be below this value
fpr_result = -1.
while fpr_result < 0.:
fpr_trial = stats.uniform.rvs(0.,max_fpr)
pdf = stats.binom.pmf(i_fp, i_control, fpr_trial)
if stats.uniform.rvs(0.,big) < pdf:
fpr_result = fpr_trial
return fpr_result
# Do repetitions of the experiment to find boundaries of the x% CL interval on prevalence
#
cl = 0.95              # confidence level for the central interval
upper = 1.-(1-cl)/2.   # upper quantile of the central band (0.975 for cl=0.95)
lower = (1-cl)/2.      # lower quantile of the central band (0.025 for cl=0.95)
# Do two studies, one coarse to find the approximate boundaries (3 digits) then a fine one to 4 digits
# number of repetitions to use:
n_rep_coarse = 1000
n_rep_fine = 10000
n_reps = [n_rep_coarse, n_rep_fine]
# range of prevalences to consider:
prevs_course = np.arange(0., 0.025, 0.001)
prevs_fine = np.arange(0.0142,0.0148,0.0001)
prevs = [prevs_course,prevs_fine]
# Total observed positives summed over the studies being combined.
sum_obs = 0
for ob in obs:
    sum_obs += ob
# NOTE(review): the comment above promises a coarse scan followed by a fine
# one, but range(1) runs only the coarse pass, so the i_study==1 branch at
# the bottom is currently unreachable — presumably limited for runtime;
# change to range(2) to enable the fine scan. Confirm intent.
for i_study in range(1):
    first_in = -1   # smallest prevalence whose central band contains sum_obs
    last_in = -1    # largest such prevalence seen so far
    prev_list = prevs[i_study]
    for prev in prev_list:
        # Simulate many repetitions of the experiment at this prevalence.
        results = []
        for i in range(n_reps[i_study]):
            total = 0
            if not joint:
                # Independent studies: each gets its own fpr draw and its
                # own binomial throw; outcomes are summed across studies.
                for j in range(len(n_fp)):
                    fpr = get_fpr(n_fp[j], n_control[j])
                    prob = prev*sensitivity[j] + fpr
                    total += stats.binom.rvs(n_trial[j], prob)
                results.append(total)
            else:
                # Joint case: combined false-positive rate is the product of
                # the individual fpr draws (independence assumption); a
                # single binomial throw on the first trial count.
                fpr = 1.
                for j in range(len(n_fp)):
                    fpr *= get_fpr(n_fp[j], n_control[j])
                prob = prev*sensitivity[0] + fpr
                total += stats.binom.rvs(n_trial[0], prob)
                results.append(total)
        # Central cl band of the simulated outcomes at this prevalence.
        results.sort()
        i_upper = int(upper*n_reps[i_study])
        i_lower = int(lower*n_reps[i_study])
        print('prevalence = {:.4f}'.format(prev)+' '+str(cl*100)+'% range =['+
        str(results[i_lower])+','+str(results[i_upper])+']')
        # Record the first and most recent prevalence whose central band
        # contains the actual observation.
        if first_in<0:
            if sum_obs>=results[i_lower] and sum_obs<=results[i_upper]:
                first_in = prev
        if sum_obs>=results[i_lower] and sum_obs<=results[i_upper]:
            last_in = prev
        # Stop scanning once we are clearly past the interval's upper edge.
        if last_in > 0 and sum_obs<results[i_lower]-1:
            break
    if i_study==0:
        print(str(cl*100)+'% CL central interval for prevalence is: [{0:.3f},{1:.3f}]'.format(first_in,last_in))
    else:
        print(str(cl*100)+'% CL central interval upper bound for prevalence is: {0:.4f}'.format(last_in))
study # | n_trial | obs | n_control | n_fp | sensitivity | quoted CI (%) | correct CI (%) |
---|---|---|---|---|---|---|---|
1 | 870 | 7 | 959 | 3 | 0.85 | 0.32-1.65 | 0.0-1.5 |
2 | 869 | 4 | 254 | 1 | 0.93 | 0.13-1.17 | 0.0-0.8 |
3 | 869 | 11 | [959,254] | [3,1] | [0.85,0.93] | 0.63-2.25 | 0.0-0.8 |
4 | 869 | 2 | 1590 | 2 | 0.73 | 0.03-0.83 | 0.0-0.8 |
5 | 889 | 6 | 959 | 3 | 0.85 | 0.25-1.46 | 0.0-1.3 |
6 | 885 | 7 | 254 | 1 | 0.93 | 0.32-1.62 | 0.0-1.3 |
7 | 885 | 4 | - | - | 0.79 | 0.25-1.46 | 0.2-1.4 |
Correct confidence intervals are calculated to 0.1%. A fine scan can be made to get another digit if needed.
Studies:
In the March snapshot, all 7 S1 positives were nucleocapsid negative. We can reject the following hypotheses:
Let $C$ be the combined $S1$ and nucleocapsid ($N$) tests. The probability for a false positive is
$$ P(C|n) = P(S1|n) P(N|S1,n) $$where $n$ means a negative sample.
The worst case (in terms of power of test) has $P(N|S1,n) = 1$. This is ruled out by the March snapshot. The hypothesis that the tests are independent is that $P(N|S1,n) = P(N|n)$. Is there good reason to suggest that they are independent?
All 95% confidence intervals contain 0% prevalence, except for the final one when the assumption of independence is applied.
Summarizing the results as "we estimate ~8 times more infections than reported cases" does not properly convey the very large uncertainty on that number. Perhaps better to say "we estimate 4-20 times more infections..."?