using TensorOperations, BenchmarkTools

function expr_ijk(N)
    is = Symbol.(:i, 1:N)   # :i1, :i2, ..., :iN
    js = Symbol.(:j, 1:N)   # :j1, :j2, ..., :jN
    ks = circshift(is, -1)  # :i2, ..., :iN, :i1 (closes the ring)
    is, js, ks
end

function expr_Gprod(G, is, js, ks)
    N = length(is)
    Gs = Expr.(:ref, Ref(G), 1:N)        # G[1], ..., G[N]
    Gsijk = Expr.(:ref, Gs, is, js, ks)  # G[n][i_n, j_n, i_{n+1}]
    Expr(:call, :*, Gsijk...)
end

function expr_substitution(A, G, N)
    is, js, ks = expr_ijk(N)
    Gprod = expr_Gprod(G, is, js, ks)
    Aj = Expr(:ref, A, js...)
    :($Aj := $Gprod)
end

"""
# Example

`@multr_simple(G, 4)`
"""
macro multr_simple(G, N)
    A, G = esc(gensym(:A)), esc(G)
    subst = expr_substitution(A, G, N)
    :(@tensor $subst)
end

genG(is = [100, 8, 6, 5], js = [6, 7, 5, 4]) = randn.(is, js, circshift(is, -1))
G = genG()

@macroexpand1 @multr_simple(G, 4)

"""
# Example

`@multr_order(G, (i3, i1, i2, i4))`
"""
macro multr_order(G, S)
    A, G = esc(gensym(:A)), esc(G)
    N = length(S.args)
    subst = expr_substitution(A, G, N)
    :(@tensor $subst order=$S)
end

@macroexpand1 @multr_order(G, (i1, i2, i3, i4))

# Use the size of each dimension as its cost for @tensoropt.
# This is just a first attempt; whether this cost function is the best is unclear.

"""
# Example

```julia-repl
julia> expr_cost1(:i1, 100)
:(i1 => 100χ)
```
"""
expr_cost1(isym, n) = Expr(:call, :(=>), isym, Expr(:call, :*, n, :χ))

function expr_cost(isize, jsize)
    N = length(isize)
    is, js, _ = expr_ijk(N)
    costs = expr_cost1.(Iterators.flatten((is, js)), Iterators.flatten((isize, jsize)))
    Expr(:tuple, costs...)
end

expr_cost((100, 8, 6, 5), (6, 7, 5, 4))

"""
# Example

`@multr_opt(G, ((100, 8, 6, 5), (6, 7, 5, 4)))`
"""
macro multr_opt(G, S)
    #Core.println("macro: ", S)
    A, G = esc(gensym(:A)), esc(G)
    isize, jsize = S.args
    N = length(isize.args)
    cost = expr_cost(isize.args, jsize.args)
    subst = expr_substitution(A, G, N)
    :(@tensoropt $cost $subst)
end

@macroexpand1 @multr_opt(G, ((100, 8, 6, 5), (6, 7, 5, 4)))

# When called from the generated function below, S arrives as a Tuple,
# so this method matches.
macro multr_opt(G, S::Tuple)
    #Core.println("macro: ", S)
    A, G = esc(gensym(:A)), esc(G)
    isize, jsize = S
    N = length(isize)
    cost = expr_cost(isize, jsize)
    subst = expr_substitution(A, G, N)
    #Core.println("out: ", cost, subst)
    :(@tensoropt $cost $subst)
end

#Expr(:macrocall, Symbol("@multr_opt"), nothing, :G, ((100, 8, 6, 5), (6, 7, 5, 4))) |> eval |> size

# When the macro is called directly, S is an Expr.
# Evaluate S into a tuple, then hand the work off to the generated function.
# (Passing an argument from one macro to another seems to turn it into an Expr?)
macro multr_opt(G, S::Expr)
    :(multr($(esc(G)), Val($S)))
end

@macroexpand1 @multr_opt(G, ((100, 8, 6, 5), (6, 7, 5, 4)))

function getijsize(G)
    N = length(G)
    isize = ntuple(i -> size(G[i], 1), N)
    jsize = ntuple(i -> size(G[i], 2), N)
    (isize, jsize)
end

getijsize(G)

@generated function multr(G, ::Val{S}) where S
    #Core.println("generated: ", S)
    :(@multr_opt(G, $S))  # here the macro receives S as a Tuple, not an Expr
end

multr(G) = multr(G, Val(getijsize(G)))

multr(G) ≈ @multr_order(G, (i3, i1, i2, i4)) ≈ @multr_simple(G, 4)

# optimized by @tensoropt
# cost = length of each dimension
@benchmark multr($G)

@optimalcontractiontree(
    (i1 => 100χ, i2 => 8χ, i3 => 6χ, i4 => 5χ, j1 => 6χ, j2 => 7χ, j3 => 5χ, j4 => 4χ),
    A[j1, j2, j3, j4] := G[1][i1, j1, i2] * G[2][i2, j2, i3] * G[3][i3, j3, i4] * G[4][i4, j4, i1]
)

# the order optimized above
@benchmark @multr_order($G, (i3, i1, i2, i4))

# default order of @tensor
@benchmark @multr_simple($G, 4)

# maybe the same order as @multr_simple
@benchmark @multr_order($G, (i2, i3, i4, i1))
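
# For reference, the same 4-tensor ring written out by hand with @tensor and
# no metaprogramming, as a sanity check against the macros above. A minimal
# sketch: `multr_byhand` is a name introduced here, not defined elsewhere.
function multr_byhand(G)
    @tensor A[j1, j2, j3, j4] :=
        G[1][i1, j1, i2] * G[2][i2, j2, i3] * G[3][i3, j3, i4] * G[4][i4, j4, i1]
end

multr_byhand(G) ≈ multr(G)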
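
# The macros above bake the contraction into compiled code, so N must be known
# at macro-expansion (or type-inference) time. If the ring length is only known
# at runtime, TensorOperations' `ncon` offers a dynamic alternative: label each
# G[n] as (i_n, j_n, i_{n+1}), with positive labels for contracted indices and
# negative labels for the open j's. A sketch, assuming the installed
# TensorOperations version exports `ncon`; `multr_ncon` is a name introduced here.
function multr_ncon(G)
    N = length(G)
    network = [[n, -n, mod1(n + 1, N)] for n in 1:N]  # G[n] carries (i_n, j_n, i_{n+1})
    ncon(G, network)  # open indices ordered -1, -2, ... => (j1, j2, j3, j4)
end

multr_ncon(G) ≈ multr(G)

# The contraction plan is built at runtime here, so some overhead relative to
# the @generated version is expected.
@benchmark multr_ncon($G)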