#!/usr/bin/env python
# coding: utf-8

# In[1]:


import ipyparallel as ipp

rc = ipp.Client()
engines = rc[:]


# Approximate π with random $x,y$ samples on a unit square:
#
# $$
# x \in [0, 1],\\
# y \in [0, 1]
# $$
#
# where the probability that any given point $x,y$ is inside the unit circle is
# the relative area of the circle within the square:
#
# $$
# P(x^2 + y^2 \leq 1) = \frac{\pi}{4}
# $$
#
# so we can approximate π by collecting random samples on the unit square and
# counting how many are inside the unit circle:
#
# $$
# \pi \approx \frac{4}{N} \sum_{i=1}^{N} \left[ x_i^2 + y_i^2 \leq 1 \right]
# $$
#
# where the bracket is 1 when the condition holds and 0 otherwise.

# In[2]:


def mc_pi(n):
    """Monte Carlo approximation of π

    Throw darts uniformly distributed on the unit square and count how many
    land inside the quarter of the unit circle it contains.
    The fraction inside the circle approaches π / 4.
    """
    import random

    samples = []
    for i in range(n):
        x = random.random()
        y = random.random()
        in_circle = (x * x) + (y * y) <= 1
        samples.append(in_circle)
    return 4 * sum(samples) / n


# Run our tests for a series of sample sizes in serial and parallel,
# comparing the times of each to see how the parallel performance
# improves over serial.

# In[3]:


n_engines = len(engines)
print(f"Using {n_engines} processes")
for n_samples in [100, 1000, 10_000, 100_000, 1_000_000]:
    if n_samples % n_engines:
        # round up to the next multiple of n_engines
        # so the work divides evenly across engines
        n_samples += n_engines - (n_samples % n_engines)
    samples_per_engine = n_samples // n_engines
    print(f"\nMonte Carlo sampling of π: {n_samples} samples ({samples_per_engine} per engine)")
    print("serial:")
    tr_serial = get_ipython().run_line_magic('timeit', '-o mc_pi(n_samples)')
    print("parallel:")
    tr_parallel = get_ipython().run_line_magic('timeit', '-o sum(engines.apply(mc_pi, samples_per_engine)) / len(engines)')
    print(f"speedup: {tr_serial.average / tr_parallel.average:.2f}x")


# We see in this example, with four local engines, that the parallel implementation
# is slower until about 100,000 samples, where the serial case takes about 30 milliseconds.
#
# The overhead of scheduling and waiting for a single parallel task
# means that for tasks quicker than several milliseconds,
# the overhead overwhelms the benefit of the concurrent workload.
#
# Tasks must take long enough for the overhead of managing parallel and distributed tasks to be worth it.
# In the case of IPython Parallel, that's on the order of tens of milliseconds per task.
# If there are many tasks that can be queued concurrently, the overhead can be pipelined and shared,
# reducing the effective per-task overhead.
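
# To see that fixed per-task cost directly, we can time a task that does no work
# at all (an illustrative extra cell, not part of the benchmark above; it assumes
# the same `engines` view, so everything measured is scheduling and messaging
# overhead):

# In[4]:


# round-trip time for a no-op task: pure IPython Parallel overhead
tr_noop = get_ipython().run_line_magic('timeit', '-o engines.apply_sync(lambda: None)')
print(f"per-task overhead: {tr_noop.average * 1000:.1f} ms")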
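
# And a minimal sketch of that last point about pipelining (an added
# illustration: `n_tasks` and `samples_per_task` are made-up parameters,
# and a load-balanced view is used so each task goes to whichever engine
# is free). Submitting all tasks with `apply_async` before waiting on any
# of them lets their scheduling overhead overlap instead of being paid
# serially.

# In[5]:


import time

view = rc.load_balanced_view()
n_tasks = 50
samples_per_task = 10_000

start = time.perf_counter()
# each apply_async call returns an AsyncResult immediately,
# so all n_tasks submissions are in flight before we wait on any result
async_results = [view.apply_async(mc_pi, samples_per_task) for _ in range(n_tasks)]
# AsyncResult.get() blocks until that task's result is ready
estimates = [ar.get() for ar in async_results]
elapsed = time.perf_counter() - start

print(f"π ≈ {sum(estimates) / len(estimates):.5f}")
print(f"{n_tasks} pipelined tasks in {elapsed * 1000:.1f} ms "
      f"({elapsed / n_tasks * 1000:.2f} ms effective per task)")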