using BenchmarkTools, Base.Test, PyPlot

function E₁_slow{T<:AbstractFloat}(z::Union{T,Complex{T}})
    real(z) < 0 && error("real(z) < 0 not implemented")
    return quadgk(u -> exp(-z/u)/u, 0, 1, reltol=eps(T)*10)[1]
end
E₁_slow{T<:Real}(z::Union{T,Complex{T}}) = E₁_slow(float(z))

# compute the relative error of approx compared to exact
relerr(approx,exact) = norm(approx - exact) / norm(exact)

relerr(E₁_slow(1.2), 0.158408436851462561424955970710861738534157976840578963378)

relerr(E₁_slow(1.2 + 3.4im),
       -0.0196798781439709839467398951111963946354437628483798953 +
        0.0708764302707789307217865597073426047954413415009995511im)

function E₁_taylor(z::Number)
    E₁ = -eulergamma - log(z)
    # iteratively compute the terms in the series, starting with k=1
    term = oftype(E₁, z) # use oftype to ensure type stability
    E₁ += term
    for k=2:1000000
        term = -term * z * (k-1) / (k * k)
        prev_E₁ = E₁
        E₁ += term
        E₁ == prev_E₁ && return (E₁, k-1)
    end
    return E₁, typemax(Int)
end

relerr(E₁_taylor(1.2)[1], 0.158408436851462561424955970710861738534157976840578963378)

relerr(E₁_taylor(1.2 + 3.4im)[1],
       -0.0196798781439709839467398951111963946354437628483798953 +
        0.0708764302707789307217865597073426047954413415009995511im)

x = linspace(0,10,200)
pcolor(x',x, [E₁_taylor(x+y*im)[2] for y in x, x in x], cmap="hsv")
colorbar()
xlabel("real(z)")
ylabel("imag(z)")
title(L"number of Taylor-series terms for $E_1(z)$")

# SOLUTION code
# n coefficients of the Taylor series of E₁(z) + log(z), in type T:
function E₁_taylor_coefficients{T<:Number}(::Type{T}, n::Integer)
    n < 0 && throw(ArgumentError("$n ≥ 0 is required"))
    n == 0 && return T[]
    n == 1 && return T[-eulergamma]
    # iteratively compute the terms in the series, starting with k=1
    term::T = 1
    terms = T[-eulergamma, term]
    for k=2:n
        term = -term * (k-1) / (k * k)
        push!(terms, term)
    end
    return terms
end

E₁_taylor_coefficients(Float64, 10)

# SOLUTION code
macro E₁_taylor64(z, n::Integer)
    c = E₁_taylor_coefficients(Float64, n)
    taylor = Expr(:macrocall, Symbol("@evalpoly"), :t, c...)
    quote
        let t = $(esc(z))
            $taylor - log(t)
        end
    end
end

relerr(@E₁_taylor64(2.0, 30), E₁_taylor(2.0)[1])

# compute E₁ via n terms of the continued-fraction expansion, implemented
# in the simplest way:
function E₁_cf(z::Number, n::Integer)
    # starting with z seems to give many fewer terms for intermediate |z| ~ 3
    cf::typeof(inv(z)) = z
    for i = n:-1:1
        cf = z + (1+i)/cf
        cf = 1 + i/cf
    end
    return exp(-z) / (z + inv(cf))
end

[relerr(E₁_cf(4, n), E₁_slow(4)) for n in 1:20]

[relerr(E₁_cf(8, n), E₁_slow(8)) for n in 1:20]

function E₁_cf_nterms(z::Number, reltol=1e-14)
    for n = 1:100 # give up after 100 terms
        doubled = E₁_cf(z, 2n)
        if abs(E₁_cf(z, n) - doubled) ≤ reltol*abs(doubled)
            return n
        end
    end
    return 101
end

x = linspace(1,10,100)
pcolor(x',x, [E₁_cf_nterms(x+y*im) for y in x, x in x], cmap="hsv")
colorbar()
xlabel("real(z)")
ylabel("imag(z)")
title(L"number of cf-series terms for $E_1(z)$")

using SymPy
display("text/latex", E₁_cf(Sym(:z), 4))

# SOLUTION code
# for numeric-literal coefficients: simplify to a ratio of two polynomials:
import Polynomials
# return (p,q): the polynomials p(x) / q(x) corresponding to E₁_cf(x, a...),
# but without the exp(-x) term
function E₁_cfpoly{T<:Real}(n::Integer, ::Type{T}=BigInt)
    q = Polynomials.Poly(T[1])
    p = x = Polynomials.Poly(T[0,1])
    for i = n:-1:1
        p, q = x*p+(1+i)*q, p # from cf = x + (1+i)/cf = x + (1+i)*q/p
        p, q = p + i*q, p     # from cf = 1 + i/cf = 1 + i*q/p
    end
    # do final 1/(x + inv(cf)) = 1/(x + q/p) = p/(x*p + q)
    return p, x*p + q
end
macro E₁_cf64(x, n::Integer)
    p,q = E₁_cfpoly(n, BigInt)
    evalpoly = Symbol("@evalpoly")
    num_expr = Expr(:macrocall, evalpoly, :t, Float64.(Polynomials.coeffs(p))...)
    den_expr = Expr(:macrocall, evalpoly, :t, Float64.(Polynomials.coeffs(q))...)
    quote
        let t = $(esc(x))
            exp(-t) * $num_expr / $den_expr
        end
    end
end

display("text/latex", cancel(E₁_cf(Sym(:z), 4)))

E₁_cfpoly(4)

display("text/latex", @E₁_cf64 Sym(:z) 4)

semilogy(0:30, [relerr(@eval(@E₁_cf64(3.0, $n)), E₁_cf(3.0, n)) for n in 0:30], "ro")
xlabel("order n")
ylabel("relative error")

function time_taylor(z)
    E₁, n = E₁_taylor(z)
    f = gensym() # generate a function name so that we can benchmark in a function
    @eval $f(z) = @E₁_taylor64 z $n
    b = @eval @benchmark $f($z)
    return time(minimum(b)) # time in ns
end
function time_cf(z)
    n = E₁_cf_nterms(z)
    f = gensym() # generate a function name so that we can benchmark in a function
    @eval $f(z) = @E₁_cf64 z $n
    b = @eval @benchmark $f($z)
    return time(minimum(b)) # time in ns
end

x = linspace(2,5, 10)
semilogy(x, [time_taylor(x) for x in x], "ro-")
semilogy(x, [time_cf(x) for x in x], "bs-")
xlabel("real(z)")
ylabel("time (ns)")
legend(["taylor", "cf"])
title("timing comparison for inlined Taylor vs. cf (real z)")

x = linspace(2,5, 10)
semilogy(x, [time_taylor(x + 0im) for x in x], "ro-")
semilogy(x, [time_cf(x + 0im) for x in x], "bs-")
xlabel("real(z)")
ylabel("time (ns)")
legend(["taylor", "cf"])
title("timing comparison for inlined Taylor vs. cf (complex z)")

x = linspace(3,8, 10)
semilogy(x, [time_taylor(x*im) for x in x], "ro-")
semilogy(x, [time_cf(x*im) for x in x], "bs-")
xlabel("real(z)")
ylabel("time (ns)")
legend(["taylor", "cf"])
title("timing comparison for inlined Taylor vs. cf (imaginary z)")

E₁_cf_nterms(3.0), E₁_cf_nterms(5.0im)

E₁_taylor(3.0)[2], E₁_taylor(5.0im)[2]

x = 1:0.2:1000
cf_cutoffs = hcat([ x[findfirst(x -> E₁_cf_nterms(x) < n, x)] for n in (30,15,8,4) ],
                  [ x[findfirst(x -> E₁_cf_nterms(x*im) < n, x)] for n in (30,15,8,4) ])

hcat( (cf_cutoffs[:,1]./cf_cutoffs[:,2]).^2, cf_cutoffs[:,1].^2 )

x = logspace(-8,1,10000)
taylor_cutoffs = hcat([ x[findlast(x -> E₁_taylor(x)[2] < n, x)] for n in (30,15,8,4) ],
                      [ x[findlast(x -> E₁_taylor(x*im)[2] < n, x)] for n in (30,15,8,4) ])

hcat( (taylor_cutoffs[:,1]./taylor_cutoffs[:,2]).^2, taylor_cutoffs[:,1].^2 )

E₁_taylor(5.8*im)[2]

x = linspace(1,2000,10000)
x[findfirst(x -> E₁_cf(x,0) == 0.0, x)]

E₁_cf(739 - 4im,0)

739^2

# SOLUTION:
function E₁(z::Union{Float64,Complex{Float64}})
    x² = real(z)^2
    y² = imag(z)^2
    if x² + 0.233*y² ≥ 7.84 # use cf expansion, ≤ 30 terms
        if (x² ≥ 546121) & (real(z) > 0) # underflow
            return zero(z)
        elseif x² + 0.401*y² ≥ 58.0 # ≤ 15 terms
            if x² + 0.649*y² ≥ 540.0 # ≤ 8 terms
                x² + y² ≥ 4e4 && return @E₁_cf z 4
                return @E₁_cf64 z 8
            end
            return @E₁_cf64 z 15
        end
        return @E₁_cf64 z 30
    else # use Taylor expansion, ≤ 37 terms
        r² = x² + y²
        return r² ≤ 0.36 ? (r² ≤ 2.8e-3 ? (r² ≤ 2e-7 ? @E₁_taylor64(z,4) :
                                                       @E₁_taylor64(z,8)) :
                                         @E₁_taylor64(z,15)) :
                          @E₁_taylor64(z,37)
    end
end
E₁{T<:Integer}(z::Union{T,Complex{T},Rational{T},Complex{Rational{T}}}) = E₁(float(z))

E₁_test(z::Union{Float64,Complex{Float64}}) =
    real(z)^2 + imag(z)^2 > 6 ? E₁_cf(z,100) : E₁_taylor(z)[1]

x = linspace(1e-10,10,100)
pcolor(x',x, [log10(1e-16 + relerr(E₁(x+y*im),E₁_test(x+y*im))) for y in x, x in x], cmap="cool")
colorbar()
xlabel("real(z)")
ylabel("imag(z)")
title(L"$\log_{10}$ relative error in optimized $E_1(z)$")

@btime E₁(2+2im)
@btime E₁_slow(2+2im)

@btime E₁(10+2im)
@btime E₁_slow(10+2im)

@btime E₁(5)
@btime E₁_slow(5)

using PyCall
exp1 = pyimport_conda("scipy.special", "scipy")["exp1"]

relerr(E₁(2+2im), exp1(2+2im))

z = rand(10^6)*10 + rand(10^6)*10im
@btime pycall($exp1, PyObject, $(PyObject(z))) evals=1;

E₁(z::AbstractVector) = map(E₁, z)
@btime E₁($z) evals=1;

@btime pycall($exp1, PyObject, $(PyObject(real(z)))) evals=1;
@btime E₁($(real(z))) evals=1;

function preduce(⊕, lst::AbstractVector)
    if nprocs() == 1
        return reduce(⊕, lst)
    end
    n = length(lst)
    queue = Base.copymutable(lst)
    @sync begin
        for p in workers()
            @async begin
                while length(queue) > 1
                    a = pop!(queue)
                    b = pop!(queue)
                    # compute results[p] = results[p] ⊕ lst[idx]
                    push!(queue, remotecall_fetch(⊕, p, a, b))
                end
            end
        end
    end
    if isempty(queue)
        return ⊕() # return empty call if it is defined
    else
        assert(length(queue) == 1)
        return queue[1]
    end
end
preduce(⊕, n::Integer) = preduce(⊕, 1:n)

preduce(+, 1000) == 500500

function weirdplus(x,y)
    sleep(1e-3 / (x+y)) # wait for some number of seconds
    return x + y
end
preduce(weirdplus, 1000)

@btime preduce(weirdplus, 1000) evals=1

addprocs(4) # parallelize over 4 workers
workers()

@everywhere function weirdplus(x,y)
    sleep(1e-3 / (x+y)) # wait for some number of seconds
    return x + y
end
@btime preduce(weirdplus, 1000) evals=1

1.376 / 0.434293