#!/usr/bin/env python
# coding: utf-8

# ## HLL Sketch Examples

# ### Basic Sketch Usage

# In[1]:

from datasketches import hll_sketch, hll_union, tgt_hll_type

# We'll create a sketch with log2(k) = 12

# In[2]:

sk = hll_sketch(12)

# Insert ~2 million points. Values are hashed, so using sequential integers
# is fine for demonstration purposes.

# In[3]:

n = 1 << 21
for i in range(n):
    sk.update(i)
print(sk)

# Since we know the exact value of n we can look at the estimate and
# upper/lower bounds as a % of the true value. We'll look at the bounds at
# 1 standard deviation. In this case, the true value does lie within the
# bounds, but since these are probabilistic bounds the true value will
# sometimes be outside them (especially at 1 standard deviation).

# In[4]:

print("Upper bound (1 std. dev) as % of true value: ",
      round(100 * sk.get_upper_bound(1) / n, 4))

# In[5]:

print("Estimate as % of true value: ",
      round(100 * sk.get_estimate() / n, 4))

# In[6]:

print("Lower bound (1 std. dev) as % of true value: ",
      round(100 * sk.get_lower_bound(1) / n, 4))

# Finally, we can serialize and deserialize the sketch, which will give us
# back the same structure.

# In[7]:

sk_bytes = sk.serialize_compact()
# In the notebook this was a bare expression whose value the cell displayed;
# a script has to print it explicitly for the size to be visible.
print(len(sk_bytes))

# In[8]:

sk2 = hll_sketch.deserialize(sk_bytes)
print(sk2)

# ### Sketch Union Usage

# Here, we'll create two sketches with partial overlap in values. For good
# measure, we'll let k be larger in one sketch. For most applications we'd
# generally create all new data using the same size sketch, allowing
# differences to creep in when combining new and historical data.

# In[9]:

k = 12
n = 1 << 20
# Shift the second sketch's values by 3/4 of n so that only the last quarter
# of the first range overlaps the second range.
offset = 3 * n // 4

# In[10]:

sk1 = hll_sketch(k)
# NOTE: this rebinds sk2, replacing the deserialized sketch from In[8].
sk2 = hll_sketch(k + 1)
for i in range(n):
    sk1.update(i)
    sk2.update(i + offset)

# Create a union object and add the sketches to that. To demonstrate
# smoothly handling multiple sketch sizes, we'll use a size of k+1 here.

# In[11]:

union = hll_union(k + 1)
union.update(sk1)
union.update(sk2)

# Note how log config k has automatically adopted the value of the smaller
# input sketch.

# In[12]:

result = union.get_result()
print(result)

# We can again compare against the exact result, in this case 1.75*n

# In[13]:

print("Estimate as % of true value: ",
      round(100 * result.get_estimate() / (7 * n / 4), 4))

# In[ ]: