#!/usr/bin/env python
# coding: utf-8

# # Hiding failures with travis_retry
#
# `travis_retry` runs tests up to 3 times, potentially hiding intermittent failures.
# Let's check how likely we are to hide real failures, given their frequency:

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns  # imported only for its nicer default plot styling

# We start with the probability of a single test failure, $b$:
#
# $$
# P(1) = b
# $$
#
# Since each retry is an independent attempt, the probability of a single run
# failing all $r$ retries is:
#
# $$
# P(r) = b^r
# $$
#
# And if we are doing $n$ simultaneous test runs with $r$ retries each,
# the probability that we will register *any* failure is:
#
# $$
# \begin{aligned}
# P(\text{any fail}) &= 1 - P(\text{all succeed}) \\
# &= 1 - (1 - P(r))^{n} \\
# &= 1 - (1 - b^r)^{n}
# \end{aligned}
# $$
#
# which we can plot against $b$ for various $n$:

# In[2]:

import numpy as np

badness = np.linspace(0, 1)  # probability of a single failure
retries = 3  # number of times a single test matrix will be run before accepting defeat

p_retry_ok = 1 - (badness ** retries)  # P(one run passes within `retries` attempts)

for runs in range(1, 8):
    p_any_fail = 1 - (p_retry_ok ** runs)
    plt.plot(badness, p_any_fail, label='n=%i' % runs)

plt.legend(loc=0)
plt.title("%i retries" % retries)
plt.xlabel("$P($single failure$)$")
plt.ylabel("$P($registered failure$)$");

# In the notebook right now, $n$ is either 2 or 20, depending on where the bug lies.

# In[3]:

def any_fail(b, n, r=3):
    """Probability of registering at least one failure across n runs,
    each retried up to r times, when each attempt fails with probability b."""
    return 1 - (1 - b**r) ** n

# In[4]:

print("n=2,b=0.5: %.2f" % any_fail(b=0.5, n=2))
print("n=20,b=0.25: %.2f" % any_fail(b=0.25, n=20))

# So if there's a failure happening 50% of the time in a single test group,
# we'll still only see it on about 25% of test runs.
# If it occurs 25% of the time across *all* js test groups, we'll see
# at least a single failure on only about 25% of test runs.
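# We can sanity-check the closed form above with a quick Monte Carlo
# simulation (a sketch added here, not part of the original analysis):
# simulate many CI builds of $n$ runs, each retried up to $r$ times, and
# count how often at least one run exhausts its retries. `any_fail_mc` is
# a helper defined just for this check.

# In[5]:

rng = np.random.default_rng(0)

def any_fail_mc(b, n, r=3, trials=100_000):
    # each attempt fails independently with probability b
    attempts = rng.random((trials, n, r)) < b
    # a run fails only if all r of its attempts fail
    run_failed = attempts.all(axis=2)
    # fraction of simulated builds registering at least one failure
    return run_failed.any(axis=1).mean()

print("closed form: %.3f" % any_fail(b=0.5, n=2))
print("simulated:   %.3f" % any_fail_mc(b=0.5, n=2))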
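# The flip side is how often `travis_retry` *hides* the failure entirely:
# the chance that CI goes green despite the bug is $P(\text{all succeed}) =
# (1 - b^r)^n$, straight from the derivation above. A small sketch
# (`p_masked` is a name introduced here for illustration):

# In[6]:

def p_masked(b, n, r=3):
    # probability that every one of the n runs passes within r attempts,
    # i.e. the intermittent failure never registers
    return (1 - b**r) ** n

for b in (0.1, 0.25, 0.5):
    print("b=%.2f: CI green on %.1f%% of builds (n=2)" % (b, 100 * p_masked(b, 2)))

# Even a failure occurring half the time slips through unnoticed on roughly
# three quarters of builds when $n = 2$.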