f(x) = 3x.^2 + 5x + 2 # traditional-style vectorization: vec!(X) = X .= f(2X.^2 + 6X.^3 - sqrt.(X)) # new-style vectorization (dot operations = syntactic loop fusion): newvec!(X) = X .= f.(2 .* X.^2 .+ 6 .* X.^3 .- sqrt.(X)) # devectorized (explicit loops): function devec!(X) for i in eachindex(X) x = X[i] X[i] = f(2x^2 + 6x^3 - sqrt(x)) end return X end using BenchmarkTools BenchmarkTools.DEFAULT_PARAMETERS.seconds = 10 # use 10s benchmarks to reduce timing noise X = zeros(10^6) t_vec = @benchmark vec!($X) t_devec = @benchmark devec!($X) t_newvec = @benchmark newvec!($X) println("traditional vectorization slowdown = ", time(minimum(t_vec)) / time(minimum(t_devec)), "×") println("new-style vectorization slowdown = ", time(minimum(t_newvec)) / time(minimum(t_devec)), "×") println("\ntraditional vectorization memory overhead = ", memory(minimum(t_vec)) / sizeof(X), "×") function vec_prealloc!(X, TWELVE_TEMP_ARRAYS) T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12 = TWELVE_TEMP_ARRAYS # evaluate T7 = 2X.^2 + 6X.^3 - sqrt.(X): T1 .= X.^2 T2 .= 2 .* T1 T3 .= X.^3 T4 .= 6 .* T3 T5 .= T2 .+ T4 T6 .= sqrt.(X) T7 .= T5 .- T6 # evaluate T12 = f(T7): T8 .= T7.^2 T9 .= 3 .* T8 T10 .= 5 .* T7 T11 .= T9 .+ T10 T12 .= T11 .+ 2 # store result in X X .= T12 return X end vec_prealloc!(X) = vec_prealloc!(X, ntuple(i -> similar(X), 12)) Y = rand(100) vec!(copy(Y)) == vec_prealloc!(copy(Y)) == devec!(copy(Y)) == newvec!(copy(Y)) t_vec_prealloc = @benchmark vec_prealloc!($X, $(ntuple(i -> similar(X), 12))) println("preallocated traditional vectorization slowdown = ", time(minimum(t_vec_prealloc)) / time(minimum(t_devec)), "×") n = [1,5,10,20,50,100,1000,5000,10^4,10^5,10^6,10^7] # array sizes to benchmark t = Array{Float64}(length(n), 4) for i = 1:length(n) X = zeros(n[i]) println("benchmarking n = ", n[i]) t[i,1] = time(minimum(@benchmark devec!($X))) t[i,2] = time(minimum(@benchmark vec!($X))) t[i,3] = time(minimum(@benchmark vec_prealloc!($X, $(ntuple(i -> similar(X), 12))))) t[i,4] = time(minimum(@benchmark newvec!($X))) end ["n" "devec" "vec" "vec_pre" "newvec"; n (t ./ t[:,1]) ] C_code = """ #include #include void devec(size_t n, double *X) { for (size_t i = 0; i < n; ++i) { double x = X[i]; double x2 = x*x; double y = 2*x2 + 6*x*x2 - sqrt(x); X[i] = 3*y*y + 5*y + 2; } } """ # compile to a shared library by piping C_code to gcc: # (only works if you have gcc installed) const Clib = tempname() open(`gcc -fPIC -O3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f print(f, C_code) end function c_devec!(X::Array{Float64}) ccall(("devec", Clib), Void, (Csize_t, Ptr{Float64}), length(X), X) return X end devec!(copy(Y)) ≈ c_devec!(copy(Y)) tc = Array{Float64}(length(n)) for i = 1:length(n) X = zeros(n[i]) println("benchmarking n = ", n[i]) tc[i] = time(minimum(@benchmark c_devec!($X))) end println("\nJulia devec time / C time: ", t[:,1] ./ tc)