In [1]:

function mcpi(L=10^9)
    c = 0
    for i in 1:L
        c += rand()^2 + rand()^2 ≤ 1
    end
    4c/L
end

@time mcpi()
@time mcpi()
@time mcpi()

  2.978052 seconds
  2.977178 seconds
  2.951632 seconds

Out[1]:

3.141649096

In [2]:

function mcpi_threads(L=10^9)
    c = zeros(Int, Threads.nthreads())
    Threads.@threads for i in 1:L
        tid = Threads.threadid()
        c[tid] += rand()^2 + rand()^2 ≤ 1
    end
    4sum(c)/L
end

@time mcpi_threads()
@time mcpi_threads()
@time mcpi_threads()

  0.597769 seconds (14.40 k allocations: 962.984 KiB, 49.09% compilation time)
  0.586309 seconds (186 allocations: 18.641 KiB)
  0.569634 seconds (185 allocations: 18.266 KiB)

Out[2]:

3.141586064

In [3]:

using LoopVectorization

# The result shall be 4.0 or 0.0.
function mcpi_turbo_incorrect(L=10^9)
    c = 0
    @turbo for i in 1:L
        c += rand()^2 + rand()^2 ≤ 1
    end
    4c/L
end

@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()

  0.009801 seconds
  0.010009 seconds
  0.009699 seconds

Out[3]:

4.0

In [4]:

using LoopVectorization

rand_is_in_unit_circle(i) = rand()^2 + rand()^2 ≤ 1

function mcpi_turbo(L=10^9)
    c = 0
    @turbo for i in 1:L
        c += rand_is_in_unit_circle(i)
    end
    4c/L
end

@time mcpi_turbo()
@time mcpi_turbo()
@time mcpi_turbo()

  0.473665 seconds
  0.477346 seconds
  0.477735 seconds

Out[4]:

3.141584064

In [5]:

using LoopVectorization

rand_is_in_unit_circle(i) = rand()^2 + rand()^2 ≤ 1

function mcpi_tturbo(L=10^9)
    c = 0
    @tturbo for i in 1:L
        c += rand_is_in_unit_circle(i)
    end
    4c/L
end

@show Threads.nthreads()
@time mcpi_tturbo()
@time mcpi_tturbo()
@time mcpi_tturbo()

Threads.nthreads() = 12
  0.137297 seconds (27.16 k allocations: 1.736 MiB, 33.36% compilation time)
  0.121686 seconds (9 allocations: 336 bytes)
  0.111965 seconds (4 allocations: 496 bytes)

Out[5]:

3.141732224

In [6]:

using CUDA
using BenchmarkTools

function mcpi_cuda_count(L=10^8, t::Type{T}=Float32) where T
    4count(≤(1), sum(x->x^2, CUDA.rand(T, 2, L); dims=1)) / L
end

a = @btime mcpi(10^8)
b = @btime mcpi_threads(10^8) # Threads.@threads
c = @btime mcpi_turbo(10^8) # LoopVectorization.@turbo (single thread)
d = @btime mcpi_tturbo(10^8) # LoopVectorization.@tturbo (multi-thread)
e = @btime mcpi_cuda_count(10^8) # CUDA (GPU)
a, b, c, d, e

  292.623 ms (0 allocations: 0 bytes)
  55.793 ms (62 allocations: 7.61 KiB)
  46.542 ms (0 allocations: 0 bytes)
  8.269 ms (0 allocations: 0 bytes)
  18.527 ms (122 allocations: 6.25 KiB)

Out[6]:

(3.1416882, 3.14183216, 3.14119872, 3.14136, 3.14176212)

In [ ]: