function mcpi(L=10^9)
c = 0
for i in 1:L
c += rand()^2 + rand()^2 ≤ 1
end
4c/L
end
@time mcpi()
@time mcpi()
@time mcpi()
2.978052 seconds 2.977178 seconds 2.951632 seconds
3.141649096
function mcpi_threads(L=10^9)
c = zeros(Int, Threads.nthreads())
Threads.@threads for i in 1:L
tid = Threads.threadid()
c[tid] += rand()^2 + rand()^2 ≤ 1
end
4sum(c)/L
end
@time mcpi_threads()
@time mcpi_threads()
@time mcpi_threads()
0.597769 seconds (14.40 k allocations: 962.984 KiB, 49.09% compilation time) 0.586309 seconds (186 allocations: 18.641 KiB) 0.569634 seconds (185 allocations: 18.266 KiB)
3.141586064
using LoopVectorization
# The result shall be 4.0 or 0.0.
function mcpi_turbo_incorrect(L=10^9)
c = 0
@turbo for i in 1:L
c += rand()^2 + rand()^2 ≤ 1
end
4c/L
end
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()
@time mcpi_turbo_incorrect()
0.009801 seconds 0.010009 seconds 0.009699 seconds
4.0
using LoopVectorization
rand_is_in_unit_circle(i) = rand()^2 + rand()^2 ≤ 1
function mcpi_turbo(L=10^9)
c = 0
@turbo for i in 1:L
c += rand_is_in_unit_circle(i)
end
4c/L
end
@time mcpi_turbo()
@time mcpi_turbo()
@time mcpi_turbo()
0.473665 seconds 0.477346 seconds 0.477735 seconds
3.141584064
using LoopVectorization
rand_is_in_unit_circle(i) = rand()^2 + rand()^2 ≤ 1
function mcpi_tturbo(L=10^9)
c = 0
@tturbo for i in 1:L
c += rand_is_in_unit_circle(i)
end
4c/L
end
@show Threads.nthreads()
@time mcpi_tturbo()
@time mcpi_tturbo()
@time mcpi_tturbo()
Threads.nthreads() = 12 0.137297 seconds (27.16 k allocations: 1.736 MiB, 33.36% compilation time) 0.121686 seconds (9 allocations: 336 bytes) 0.111965 seconds (4 allocations: 496 bytes)
3.141732224
using CUDA
using BenchmarkTools
function mcpi_cuda_count(L=10^8, t::Type{T}=Float32) where T
4count(≤(1), sum(x->x^2, CUDA.rand(T, 2, L); dims=1)) / L
end
a = @btime mcpi(10^8)
b = @btime mcpi_threads(10^8) # Threads.@threads
c = @btime mcpi_turbo(10^8) # LoopVectorization.@turbo (single thread)
d = @btime mcpi_tturbo(10^8) # LoopVectorization.@tturbo (multi-thread)
e = @btime mcpi_cuda_count(10^8) # CUDA (GPU)
a, b, c, d, e
292.623 ms (0 allocations: 0 bytes) 55.793 ms (62 allocations: 7.61 KiB) 46.542 ms (0 allocations: 0 bytes) 8.269 ms (0 allocations: 0 bytes) 18.527 ms (122 allocations: 6.25 KiB)
(3.1416882, 3.14183216, 3.14119872, 3.14136, 3.14176212)