#!/usr/bin/env python
# coding: utf-8

# # Checking whether an ip is in a large ip set
#
# We (Binder) want to block traffic from well-known datacenters (AWS, GCP, Azure, etc.).
# These services publish their ip ranges (it's a lot!).
# Since there are so many, it's worthwhile to measure how much it costs to check whether an ip is in any of these ranges.
#
# I'm going to measure a standard-library-only implementation with [ipaddress](https://docs.python.org/3/library/ipaddress.html)
# as well as an implementation with [netaddr](https://netaddr.readthedocs.io)
# to see if the performance is worth adding the dependency.
#
# The main reason for the difference is that `netaddr` has an IPSet construct for non-contiguous collections of addresses.
# The standard library doesn't have this,
# so I need to implement the `any(ip in network for network in networks)` logic myself.
# The simplest version is O(number of networks), whereas netaddr has a smarter implementation.
#
# The reasoning behind this notebook is in https://github.com/jupyterhub/mybinder.org-deploy/pull/1829
#
# The scale we are interested in is ~3000, which is the number of unique networks found for GCP, Azure, and AWS.

# In[1]:

import ipaddress

import netaddr

# We want to sample random ip addresses, so we measure evenly across the address space.

# In[2]:

import random


def random_ip():
    """return a random ip as a string

    since this will be the input we are measuring
    """
    return str(ipaddress.ip_address(random.randint(0, 2**32 - 1)))

# In[3]:

def random_networks(n):
    networks = set()
    while len(networks) < n:
        # exclude 0.* and 255.* just because
        net = ipaddress.ip_network(random.randint(2**24, 2**32 - 2**24 - 1))
        net = net.supernet(new_prefix=random.randint(20, 32))
        networks.add(str(net))
    return sorted(networks)

# The standard library doesn't have a single "IPSet" construct, so we need to check against each network CIDR individually.
# This means it's O(number of networks) to check any ip *not* in our set (i.e. all allowed requests in the case we are interested in),
# which is ~3k for Google, Azure, and AWS.

# In[4]:

cidrs = random_networks(3000)
# show a sample of the generated CIDRs
cidrs[::500]

# In[5]:

# construct the list of IPv4Network objects
networks = [ipaddress.ip_network(cidr) for cidr in cidrs]


def check_ip_stdlib_list(ip):
    addr = ipaddress.ip_address(ip)
    for net in networks:
        if addr in net:
            return True
    return False

# In[6]:

get_ipython().run_cell_magic('timeit', 'ip = random_ip()', 'check_ip_stdlib_list(ip)\n')

# netaddr has an IPSet object which can represent a *collection* of networks,
# which is exactly what we have.
#
# It has an O(1) [implementation](https://netaddr.readthedocs.io/en/latest/_modules/netaddr/ip/sets.html#IPSet.__contains__) of checking whether an ip is in a collection.

# In[7]:

ip_set = netaddr.IPSet(cidrs)


def check_ip_netaddr(ip):
    return ip in ip_set

# In[8]:

get_ipython().run_cell_magic('timeit', 'ip = random_ip()', 'check_ip_netaddr(ip)\n')
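# A quick aside before reimplementing this ourselves (illustrative only; the
# addresses below are arbitrary, not real datacenter ranges): IPSet also
# normalizes its input, merging adjacent or overlapping CIDRs into a canonical
# set of ranges, and its membership check accepts plain strings.

# In[ ]:

demo_set = netaddr.IPSet(["10.0.0.0/25", "10.0.0.128/25", "192.168.1.0/24"])
# the two /25s are merged into a single 10.0.0.0/24
print(demo_set.iter_cidrs())
print("10.0.0.42" in demo_set, "8.8.8.8" in demo_set)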
# We can also reimplement the IPSet lookup with the standard library,
# to get similar O(1) behavior.
# Let's see how it compares.
#
# First, create a set of networks.
#
# Then, for every prefix length (at most 32 for ipv4),
# check whether the ip's covering network of that length is in our set:

# In[9]:

network_set = set(networks)


def check_ip_stdlib_set(ip):
    # start from the /32 network containing just this ip
    check_net = ipaddress.ip_network(ip)
    while check_net.prefixlen:
        if check_net in network_set:
            return True
        # widen by one bit and try again
        check_net = check_net.supernet(1)
    return False

# In[10]:

get_ipython().run_cell_magic('timeit', 'ip = random_ip()', 'check_ip_stdlib_set(ip)\n')

# We can re-run the tests with 50k CIDRs, to see how they scale.
#
# The list implementation scales linearly,
# while the others stay flat.

# In[11]:

cidrs = random_networks(50000)
# this will be a worst-case scenario for all implementations,
# as it's never in the set, so all iterations will be exhausted
ip = '255.255.255.255'

networks = [ipaddress.ip_network(cidr) for cidr in cidrs]
network_set = set(networks)
ip_set = netaddr.IPSet(cidrs)

print("stdlib_list")
get_ipython().run_line_magic('timeit', 'check_ip_stdlib_list(ip)')
print("netaddr")
get_ipython().run_line_magic('timeit', 'check_ip_netaddr(ip)')
print("stdlib_set")
get_ipython().run_line_magic('timeit', 'check_ip_stdlib_set(ip)')

# ## Scaling the list check
#
# We can scale down the list check to find the number of CIDRs below which
# the simple list check is cheaper than the O(1) implementations.

# In[12]:

n = 1000
while n > 100:
    cidrs = random_networks(n)
    networks = [ipaddress.ip_network(cidr) for cidr in cidrs]
    print(f"n={n}")
    tr = get_ipython().run_line_magic('timeit', '-o check_ip_stdlib_list(ip)')
    if tr.average < 100e-6:
        break
    n = n // 2

# ### Conclusions
#
# The standard library takes ~2ms on average to check an ip against 3k CIDRs (30ms for 50k),
# a small but not insignificant cost to incur on every request.
# Netaddr's native IPSet construct, on the other hand, takes about 50µs,
# ~40x faster, and much closer to a negligible contribution to request duration.
# The stdlib reimplementation of `IPSet.__contains__` is ~2-3x slower than netaddr
# (probably because the `supernet()` calls create new Network objects, while IPSet just modifies private integer attributes),
# but still an order of magnitude faster than the O(N) implementation,
# and most importantly a fixed cost that doesn't go up as we add networks to check.
# It's still ~50µs for 50k networks, and faster than the list check for any number of CIDRs above ~100.
#
# So I think using netaddr is appropriate here.
# If we care strongly about avoiding a small, pure-Python dependency,
# we can use the set implementation with the stdlib.
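# As a closing sketch (not part of the benchmark): roughly how the netaddr
# approach could be wired into the real use case. The URL and JSON layout are
# AWS's published ip-ranges feed; `fetch_aws_ip_set` is a hypothetical name,
# and the GCP/Azure feeds, error handling, and refresh logic are omitted for brevity.

# In[ ]:

import json
from urllib.request import urlopen


def fetch_aws_ip_set():
    """Build an IPSet from AWS's published IPv4 ranges."""
    with urlopen("https://ip-ranges.amazonaws.com/ip-ranges.json") as f:
        data = json.load(f)
    return netaddr.IPSet(prefix["ip_prefix"] for prefix in data["prefixes"])


# usage in a (hypothetical) request handler:
# datacenter_ips = fetch_aws_ip_set()
# if request_ip in datacenter_ips:
#     reject_request()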