# import Pkg; Pkg.instantiate()
using DifferentiableStateSpaceModels, DifferenceEquations, LinearAlgebra, Zygote,
      Distributions, Plots, DiffEqBase, Symbolics, BenchmarkTools
∞ = Inf
@variables α, β, ρ, δ, σ, Ω_1
@variables t::Integer, k(..), z(..), c(..), q(..)

x = [k, z] # states
y = [c, q] # controls
p = [α, β, ρ, δ, σ, Ω_1] # parameters

# System of model equations
H = [1 / c(t) - (β / c(t + 1)) * (α * exp(z(t + 1)) * k(t + 1)^(α - 1) + (1 - δ)),
     c(t) + k(t + 1) - (1 - δ) * k(t) - q(t),
     q(t) - exp(z(t)) * k(t)^α,
     z(t + 1) - ρ * z(t)]

# Analytic solutions for the steady state. Alternatively, pass initial values via
# steady_states_iv and let a nonlinear solver find the steady state.
steady_states = [k(∞) ~ (((1 / β) - 1 + δ) / α)^(1 / (α - 1)),
                 z(∞) ~ 0,
                 c(∞) ~ (((1 / β) - 1 + δ) / α)^(α / (α - 1)) -
                        δ * (((1 / β) - 1 + δ) / α)^(1 / (α - 1)),
                 q(∞) ~ (((1 / β) - 1 + δ) / α)^(α / (α - 1))]

Γ = [σ;;] # loading matrix for the single shock; the [;;] notation makes it a matrix rather than a vector in Julia
η = [0; -1;;] # η is the n_x × n_ϵ matrix loading the shocks into the states

# Observation matrix. The order is "y" then "x" variables, i.e. [c, q, k, z] in this example
Q = [1.0 0 0 0; # select c as the first observable in z
     0 0 1.0 0] # select k as the second observable in z

# Diagonal Cholesky factor of the observation-noise covariance matrix (i.e. standard deviations).
# Non-diagonal observation noise is not currently supported.
Ω = [Ω_1, Ω_1]

# Generates the model files and includes them if required. If the model has already been
# created, it is simply loaded.
overwrite_model_cache = true
model_rbc = @make_and_include_perturbation_model("rbc_notebook_example", H,
                                                 (; t, y, x, p, steady_states, Γ, Ω, η, Q,
                                                  overwrite_model_cache)) # Convenience macro. Saves as ".function_cache/rbc_notebook_example.jl"

# After generation you could include the files directly, or add the following to prevent
# overwriting the model:
# isdefined(Main, :rbc_notebook_example) || include(joinpath(pkgdir(DifferentiableStateSpaceModels),
#                                                            "test/generated_models/rbc_notebook_example.jl"))
# Alternatively, the same model is available as a prebuilt example:
# model_rbc = @include_example_module(DifferentiableStateSpaceModels.Examples.rbc_observables)

model_H_latex(model_rbc)
model_steady_states_latex(model_rbc) # the closed-form steady state; could instead pass steady_states_iv to give the optimizer approximate initial conditions
@show model_rbc.n_x, model_rbc.n_y, model_rbc.n_ϵ, model_rbc.n_z

p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # fixed parameters
p_d = (α = 0.5, β = 0.95) # pseudo-true values of the parameters to estimate
m = model_rbc # the model generated above
sol = generate_perturbation(model_rbc, p_d, p_f) # solution to the first-order RBC
sol_2 = generate_perturbation(model_rbc, p_d, p_f, Val(2)); # solution to the second-order RBC
@show sol.retcode, sol_2.retcode, verify_steady_state(m, p_d, p_f) # the final call checks that the analytically provided steady state is correct
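# A minimal numeric sanity check (an addition, not part of the original notebook): evaluate
# the closed-form steady state at the numeric parameter values and compare it with the
# packaged solution, assuming sol.x and sol.y stack variables in the declared orders
# [k, z] and [c, q].
let α = p_d.α, β = p_d.β, δ = p_f.δ
    k_ss = (((1 / β) - 1 + δ) / α)^(1 / (α - 1))
    q_ss = (((1 / β) - 1 + δ) / α)^(α / (α - 1))
    c_ss = q_ss - δ * k_ss # from the resource constraint at the steady state
    @show sol.x ≈ [k_ss, 0.0] # z_ss = 0
    @show sol.y ≈ [c_ss, q_ss]
end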
@show sol.y, sol.x # steady states y_ss and x_ss, i.e. values such that y ≡ ŷ + sol.y and x ≡ x̂ + sol.x
@show sol.g_x # the policy function
@show sol.A, sol.B # the state evolution equation, so that x̂' = A x̂ + B ϵ
@show sol.C, sol.D; # the observation equation, so that z = C x̂ + ν with the variance of ν given by D D'
@show sol.x_ergodic_var; # covariance matrix of the ergodic distribution of x̂, which is mean zero since x̂ ≡ x - x_ss
@show sol.Q, sol.η, sol.Γ; # various matrices for the loading of shocks and observations
@show sol.n_y, sol.n_x, sol.n_ϵ, sol.n_z, sol.n_p # model sizes
@show sol.x_symbols, sol.y_symbols, sol.p_symbols; # the ordering of symbols in the x, y, and p vectors

function IRF(p_d, ϵ_0; m, p_f, steps)
    sol = generate_perturbation(m, p_d, p_f) # first-order perturbation by default; pass Val(2) as an additional argument for second order
    x = sol.B * ϵ_0 # start after applying the impulse with the model's shock loadings η and Γ(p)
    for _ in 1:steps
        # Note: mutating expressions do not work with most AD code, i.e. x .= sol.A * x would fail.
        # For more elaborate simulations, use DifferenceEquations.jl in practice.
        x = sol.A * x # iterate forward with the first-order state evolution
    end
    return [0, 1]' * sol.C * x # select the second observable via the observation equation z = C x̂, exact at first order
end

m = model_rbc # the model generated above
p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # not differentiated
p_d = (α = 0.5, β = 0.95) # differentiated parameters
steps = 10 # steps ahead to forecast
ϵ_0 = [1.0] # shock size
IRF(p_d, ϵ_0; m, p_f, steps) # the function works on its own, calculating the perturbation internally

# Using the Zygote auto-differentiation library loaded above. The closure captures m, p_f,
# and steps, and the gradient is taken with respect to the remaining arguments.
IRF_grad = gradient((p_d, ϵ_0) -> IRF(p_d, ϵ_0; m, p_f, steps), p_d, ϵ_0)
println("d/dα IRF(p_d, ϵ_0) = $(IRF_grad[1].α), d/dβ IRF(p_d, ϵ_0) = $(IRF_grad[1].β), d/dϵ_0 IRF(p_d, ϵ_0) = $(IRF_grad[2])")
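# A hedged cross-check (added; not in the original notebook): compare the Zygote gradient
# in α against a central finite difference. IRF is smooth in the parameters, so the two
# should agree to several significant digits.
let h = 1e-6
    fd_α = (IRF((α = p_d.α + h, β = p_d.β), ϵ_0; m, p_f, steps) -
            IRF((α = p_d.α - h, β = p_d.β), ϵ_0; m, p_f, steps)) / (2h)
    @show fd_α, IRF_grad[1].α
end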
function IRF_cache(p_d, ϵ_0; m, p_f, steps, cache)
    sol = generate_perturbation(m, p_d, p_f; cache) # the only difference is passing a pre-allocated cache to the perturbation solver
    x = sol.B * ϵ_0
    for _ in 1:steps
        x = sol.A * x
    end
    return [0, 1]' * sol.C * x
end

# To call it
m = model_rbc # the model generated above
p_d = (α = 0.5, β = 0.95) # differentiated parameters
p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01)
steps = 10 # steps ahead to forecast
ϵ_0 = [1.0] # shock size
cache = SolverCache(m, Val(1), p_d) # the p_d passed into the cache only provides the symbols; its values are unused
@show IRF_cache(p_d, ϵ_0; m, p_f, steps, cache)
gradient((p_d, ϵ_0) -> IRF_cache(p_d, ϵ_0; m, p_f, steps, cache), p_d, ϵ_0) # as before; do not try to differentiate with respect to the cache values themselves

# However, in a problem this small you will not see much of a performance difference
@btime IRF($p_d, $ϵ_0; m = $m, p_f = $p_f, steps = $steps)
@btime IRF_cache($p_d, $ϵ_0; m = $m, p_f = $p_f, steps = $steps, cache = $cache)

p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # fixed parameters
p_d = (α = 0.5, β = 0.95) # pseudo-true values
m = model_rbc
sol = generate_perturbation(m, p_d, p_f) # solution to the first-order RBC

# Simulate T observations, drawing the initial condition from the ergodic distribution
T = 20
x_iv = MvNormal(sol.x_ergodic_var)
problem = LinearStateSpaceProblem(sol, x_iv, (0, T))
plot(solve(problem))

# The ϵ shocks are called "noise" in DifferenceEquations for SciML compatibility
noise = Matrix([1.0; zeros(T - 1)]')
x_iv = [0.0, 0.0] # a point initial condition can be passed instead of a distribution
problem = LinearStateSpaceProblem(sol, x_iv, (0, T); noise)
plot(solve(problem))

function last_observable(p_d, noise, x_iv; m, p_f, T)
    sol = generate_perturbation(m, p_d, p_f)
    problem = LinearStateSpaceProblem(sol, x_iv, (0, T); noise, observables_noise = nothing) # removing observation noise
    return solve(problem).z[end][2] # return the second element of the last observable
end

T = 100
noise = Matrix([1.0; zeros(T - 1)]')
x_iv = [0.0, 0.0]
p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # fixed parameters
p_d = (α = 0.5, β = 0.95) # pseudo-true values
m = model_rbc
last_observable(p_d, noise, x_iv; m, p_f, T)
gradient((p_d, noise, x_iv) -> last_observable(p_d, noise, x_iv; m, p_f, T), p_d, noise, x_iv)

p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # fixed parameters
p_d = (α = 0.5, β = 0.95) # pseudo-true values
m = model_rbc
sol = generate_perturbation(model_rbc, p_d, p_f)
ϵ0 = [1.0]
T = 10
sim = irf(sol, ϵ0, T) # built-in impulse response utility
plot(sim)

# Simulate multiple trajectories with T observations
trajectories = 40
x_iv = MvNormal(sol.x_ergodic_var)
problem = LinearStateSpaceProblem(sol, x_iv, (0, T))

# Solve multiple trajectories and plot an ensemble
ensemble_results = solve(EnsembleProblem(problem), DirectIteration(), EnsembleThreads();
                         trajectories)
summ = EnsembleSummary(ensemble_results) # calculates the median and other quantiles automatically; see the SciML documentation
summ.med # median values for the simulated "x" ensembles
plot(summ, fillalpha = 0.2) # by default plots the median and quantiles of both variables; the transparency is modified as an example

# Simulate T observations to use as data for estimation
T = 20
p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # fixed parameters
p_d = (α = 0.5, β = 0.95) # pseudo-true values
sol = generate_perturbation(model_rbc, p_d, p_f) # solution to the first-order RBC
x_iv = MvNormal(sol.x_ergodic_var) # draw the initial condition from the ergodic distribution
problem = LinearStateSpaceProblem(sol, x_iv, (0, T))
sim = solve(problem, DirectIteration())
ϵ = sim.W # store the underlying noise of the simulation

# Collapse the simulated observables to a matrix, as required by the current
# DifferenceEquations.jl likelihood; see https://github.com/SciML/DifferenceEquations.jl/issues/55
# for direct support of this data structure.
z_rbc = hcat(sim.z...)
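# Optional diagnostic (an addition, not from the original notebook; it assumes the solution
# stores the simulated state path in the SciML-style field sim.u, mirroring sim.z and sim.W):
# with the ergodic initial condition the states x̂ are mean zero, although with only
# T = 20 draws the sample moments are very noisy.
X = hcat(sim.u...) # n_x × (T + 1) matrix of simulated states
@show mean(X; dims = 2) # approximately zero in long samples
@show cov(X; dims = 2) # compare against sol.x_ergodic_var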
using Turing
using Turing: @addlogprob!
Turing.setadbackend(:zygote); # sampling the latent noise requires high-dimensional gradients, so we use reverse-mode AD

# Turing model definition: marginal likelihood via the Kalman filter
@model function rbc_kalman(z, m, p_f, settings)
    α ~ Uniform(0.2, 0.8) # priors
    β ~ Uniform(0.5, 0.99)
    p_d = (; α, β)
    T = size(z, 2)
    sol = generate_perturbation(m, p_d, p_f, Val(1); settings) # first-order perturbation; p_f supplies the fixed values
    if !(sol.retcode == :Success)
        # if the perturbation solver failed, return -Inf for the likelihood and resample
        @addlogprob! -Inf
        return
    end
    problem = LinearStateSpaceProblem(sol, zeros(2), (0, T), observables = z) # utility constructing a LinearStateSpaceProblem from sol.A, sol.B, etc.
    # Linear-Gaussian, so the latents can be marginalized out with a Kalman filter.
    # The KalmanFilter algorithm would be chosen automatically in this case if not provided.
    @addlogprob! solve(problem, KalmanFilter()).logpdf
end

cache = SolverCache(model_rbc, Val(1), [:α, :β])
settings = PerturbationSolverSettings(; print_level = 0)
p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # fixed parameters
z = z_rbc # simulated in the previous steps
turing_model = rbc_kalman(z, model_rbc, p_f, settings) # passing the observables from before
n_samples = 1000
n_adapts = 250
δ = 0.65
alg = NUTS(n_adapts, δ)
println("Sampling. Ignore any 'rejected due to numerical errors' warnings") # at this point, Turing cannot turn those warnings off
chain_1_marginal = sample(turing_model, alg, n_samples; progress = true)

# Turing model definition: joint likelihood over the parameters and latent shocks
@model function rbc_1_joint(z, m, p_f, cache, settings)
    α ~ Uniform(0.2, 0.8)
    β ~ Uniform(0.5, 0.99)
    p_d = (; α, β)
    T = size(z, 2)
    ϵ_draw ~ MvNormal(m.n_ϵ * T, 1.0) # sample the latent shocks jointly with the parameters
    ϵ = reshape(ϵ_draw, m.n_ϵ, T)
    sol = generate_perturbation(m, p_d, p_f, Val(1); cache, settings)
    if !(sol.retcode == :Success)
        @addlogprob! -Inf
        return
    end
    problem = LinearStateSpaceProblem(sol, zeros(2), (0, T), observables = z, noise = ϵ)
    @addlogprob! solve(problem, DirectIteration()).logpdf # DirectIteration() would be chosen by default if not provided
end

cache = SolverCache(model_rbc, Val(1), [:α, :β])
settings = PerturbationSolverSettings(; print_level = 0)
p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # fixed parameters
z = z_rbc # simulated in the previous steps
turing_model = rbc_1_joint(z, model_rbc, p_f, cache, settings) # passing the observables from before
n_samples = 300
n_adapts = 50
δ = 0.65
alg = NUTS(n_adapts, δ)
chain_1_joint = sample(turing_model, alg, n_samples; progress = true)
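# Added illustration (not in the original notebook): summarize the first-order joint
# posterior of the structural parameters before moving on to the second-order model.
# The pseudo-true values used to simulate the data were α = 0.5 and β = 0.95.
describe(chain_1_joint[:, [:α, :β], :])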
# Turing model definition: second-order joint likelihood
@model function rbc_2_joint(z, m, p_f, cache, settings)
    α ~ Uniform(0.2, 0.8)
    β ~ Uniform(0.5, 0.99)
    p_d = (; α, β)
    T = size(z, 2)
    ϵ_draw ~ MvNormal(m.n_ϵ * T, 1.0) # sample the latent shocks jointly with the parameters
    ϵ = reshape(ϵ_draw, m.n_ϵ, T)
    sol = generate_perturbation(m, p_d, p_f, Val(2); cache, settings) # second-order perturbation
    if !(sol.retcode == :Success)
        @addlogprob! -Inf
        return
    end
    problem = QuadraticStateSpaceProblem(sol, zeros(2), (0, T), observables = z, noise = ϵ)
    @addlogprob! solve(problem, DirectIteration()).logpdf
end

cache = SolverCache(model_rbc, Val(2), [:α, :β])
settings = PerturbationSolverSettings(; print_level = 0)
z = z_rbc # simulated in the previous steps
p_f = (ρ = 0.2, δ = 0.02, σ = 0.01, Ω_1 = 0.01) # fixed parameters
turing_model = rbc_2_joint(z, model_rbc, p_f, cache, settings) # passing the observables from before
n_samples = 300
n_adapts = 50
δ = 0.65
alg = NUTS(n_adapts, δ)
chain_2_joint = sample(turing_model, alg, n_samples; progress = true)

# Extract the integer index from a Symbol such as Symbol("ϵ_draw[10]"); index 9 skips the
# 8 UTF-8 bytes of "ϵ_draw[" and end-1 drops the closing bracket
symbol_to_int(s) = parse(Int, string(s)[9:end-1])

# Plot the posterior of the latent shocks against the true simulated values
ϵ_chain = sort(chain_1_joint[:, [Symbol("ϵ_draw[$a]") for a in 1:21], 1],
               lt = (x, y) -> symbol_to_int(x) < symbol_to_int(y))
tmp = describe(ϵ_chain)
ϵ_mean = tmp[1][:, 2]
ϵ_std = tmp[1][:, 3]
plot(ϵ_mean[2:end], ribbon = 2 * ϵ_std[2:end], label = "Posterior mean",
     title = "First-Order Joint: Estimated Latents")
plot!(ϵ', label = "True values")

ϵ_chain = sort(chain_2_joint[:, [Symbol("ϵ_draw[$a]") for a in 1:21], 1],
               lt = (x, y) -> symbol_to_int(x) < symbol_to_int(y))
tmp = describe(ϵ_chain)
ϵ_mean = tmp[1][:, 2]
ϵ_std = tmp[1][:, 3]
plot(ϵ_mean[2:end], ribbon = 2 * ϵ_std[2:end], label = "Posterior mean",
     title = "Second-Order Joint: Estimated Latents")
plot!(ϵ', label = "True values")
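# Added wrap-up (not in the original notebook): compare the posterior means of the
# structural parameters across the two joint chains with the pseudo-true values
# α = 0.5 and β = 0.95 used to simulate the data.
@show mean(chain_1_joint[:α]), mean(chain_2_joint[:α])
@show mean(chain_1_joint[:β]), mean(chain_2_joint[:β])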