f(x) = 3x.^2 + 5x + 2

# traditional-style vectorization:
vec!(X) = X .= f(2X.^2 + 6X.^3 - sqrt.(X))

# new-style vectorization (dot operations = syntactic loop fusion):
newvec!(X) = X .= f.(2 .* X.^2 .+ 6 .* X.^3 .- sqrt.(X))

# devectorized (explicit loops):
function devec!(X)
    for i in eachindex(X)
        x = X[i]
        X[i] = f(2x^2 + 6x^3 - sqrt(x))
    end
    return X
end

using BenchmarkTools
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 # use 10s benchmarks to reduce timing noise

X = zeros(10^6)

t_vec = @benchmark vec!($X)

t_devec = @benchmark devec!($X)

t_newvec = @benchmark newvec!($X)

println("traditional vectorization slowdown = ", time(minimum(t_vec)) / time(minimum(t_devec)), "×")
println("new-style vectorization slowdown = ", time(minimum(t_newvec)) / time(minimum(t_devec)), "×")
println("\ntraditional vectorization memory overhead = ", memory(minimum(t_vec)) / sizeof(X), "×")

function vec_prealloc!(X, TWELVE_TEMP_ARRAYS)
    T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12 = TWELVE_TEMP_ARRAYS
    
    # evaluate T7 = 2X.^2 + 6X.^3 - sqrt.(X):
    T1 .= X.^2
    T2 .= 2 .* T1
    T3 .= X.^3
    T4 .= 6 .* T3
    T5 .= T2 .+ T4
    T6 .= sqrt.(X)
    T7 .= T5 .- T6
    
    # evaluate T12 = f(T7):
    T8 .= T7.^2
    T9 .= 3 .* T8
    T10 .= 5 .* T7
    T11 .= T9 .+ T10
    T12 .= T11 .+ 2
    
    # store result in X
    X .= T12
    return X
end
vec_prealloc!(X) = vec_prealloc!(X, ntuple(i -> similar(X), 12))

Y = rand(100)
vec!(copy(Y)) == vec_prealloc!(copy(Y)) == devec!(copy(Y)) == newvec!(copy(Y))

t_vec_prealloc = @benchmark vec_prealloc!($X, $(ntuple(i -> similar(X), 12)))

println("preallocated traditional vectorization slowdown = ", time(minimum(t_vec_prealloc)) / time(minimum(t_devec)), "×")

n = [1,5,10,20,50,100,1000,5000,10^4,10^5,10^6,10^7] # array sizes to benchmark

t = Array{Float64}(length(n), 4)
for i = 1:length(n)
    X = zeros(n[i])
    println("benchmarking n = ", n[i])
    t[i,1] = time(minimum(@benchmark devec!($X)))
    t[i,2] = time(minimum(@benchmark vec!($X)))
    t[i,3] = time(minimum(@benchmark vec_prealloc!($X, $(ntuple(i -> similar(X), 12)))))
    t[i,4] = time(minimum(@benchmark newvec!($X)))
end

["n" "devec" "vec" "vec_pre" "newvec"; n (t ./ t[:,1]) ]

C_code = """
#include <stddef.h>
#include <math.h>
void devec(size_t n, double *X) {
    for (size_t i = 0; i < n; ++i) {
        double x = X[i];
        double x2 = x*x;
        double y = 2*x2 + 6*x*x2 - sqrt(x);
        X[i] = 3*y*y + 5*y + 2;
    }
}
"""
# compile to a shared library by piping C_code to gcc:
# (only works if you have gcc installed)
const Clib = tempname()
open(`gcc -fPIC -O3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    print(f, C_code)
end

function c_devec!(X::Array{Float64})
    ccall(("devec", Clib), Void, (Csize_t, Ptr{Float64}), length(X), X)
    return X
end

devec!(copy(Y)) ≈ c_devec!(copy(Y))

tc = Array{Float64}(length(n))
for i = 1:length(n)
    X = zeros(n[i])
    println("benchmarking n = ", n[i])
    tc[i] = time(minimum(@benchmark c_devec!($X)))
end
println("\nJulia devec time / C time: ", t[:,1] ./ tc)