# After installing and starting Julia, run the following once to install the required packages:
# Pkg.init(); Pkg.update()
# for p in ("CUDAdrv","IJulia","PyCall","JLD2","Knet"); Pkg.add(p); end
# Pkg.checkout("Knet","ilkarman") # make sure we have the right Knet version
# Pkg.build("Knet")

using Knet
True = true # define Python's True so the shared .py params file parses as Julia
include("common/params_lstm.py");
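# The .py params file consists only of simple NAME=value assignments, so once
# True is defined it parses as Julia as well. It is expected to define the
# constants used below: EMBEDSIZE, NUMHIDDEN, MAXFEATURES, MAXLEN, BATCHSIZE,
# EPOCHS, LR, BETA_1, BETA_2 and EPS.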

println("OS: ", Sys.KERNEL)
println("Julia: ", VERSION)
println("Knet: ", Pkg.installed("Knet"))
println("GPU: ", readstring(`nvidia-smi --query-gpu=name --format=csv,noheader`))

# define model
function initmodel()
    rnnSpec,rnnWeights = rnninit(EMBEDSIZE,NUMHIDDEN; rnnType=:gru) # GRU: X=EMBEDSIZE inputs, H=NUMHIDDEN hidden units
    inputMatrix = KnetArray(xavier(Float32,EMBEDSIZE,MAXFEATURES))  # word embedding matrix (X,V)
    outputMatrix = KnetArray(xavier(Float32,2,NUMHIDDEN))           # binary classification layer (2,H)
    return rnnSpec,(rnnWeights,inputMatrix,outputMatrix)
end;
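# Optional shape check (a sketch, assuming the constants from params_lstm.py):
# spec, (wr, wx, wy) = initmodel()
# @assert size(wx) == (EMBEDSIZE, MAXFEATURES)  # embedding matrix (X,V)
# @assert size(wy) == (2, NUMHIDDEN)            # output matrix (2,H)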

# define loss and its gradient
function predict(weights, inputs, rnnSpec)
    rnnWeights, inputMatrix, outputMatrix = weights # (1,1,W), (X,V), (2,H)
    indices = hcat(inputs...)' # (B,T)
    rnnInput = inputMatrix[:,indices] # (X,B,T)
    rnnOutput = rnnforw(rnnSpec, rnnWeights, rnnInput)[1] # (H,B,T)
    return outputMatrix * rnnOutput[:,:,end] # (2,H) * (H,B) = (2,B)
end

loss(w,x,y,r) = nll(predict(w,x,r), y) # negative log likelihood
lossgradient = grad(loss);             # gradient of loss w.r.t. its first argument (the weights)
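# Optional sanity check (a sketch; run it after the model and data are set up
# below): the loss is a scalar and the gradient mirrors the weights tuple.
# (x,y) = first(minibatch(xtrn,ytrn,BATCHSIZE))
# @show size(predict(weights,x,rnnSpec))   # expect (2,BATCHSIZE)
# @show loss(weights,x,y,rnnSpec)          # scalar negative log likelihood
# grads = lossgradient(weights,x,y,rnnSpec); @show typeof(grads)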

# load data
include(Knet.dir("data","imdb.jl"))
@time (xtrn,ytrn,xtst,ytst,imdbdict)=imdb(maxlen=MAXLEN,maxval=MAXFEATURES)
for d in (xtrn,ytrn,xtst,ytst); println(summary(d)); end
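# Optional: inspect an example (a sketch, assuming imdbdict maps word => index
# as in the Knet IMDB demo):
# imdbvocab = Dict(index => word for (word,index) in imdbdict)
# println(join((get(imdbvocab,i,"?") for i in xtrn[1]), " "))  # first review as text
# println(ytrn[1])                                             # its class label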

# prepare for training
weights = nothing; knetgc(); # Reclaim memory from previous run
rnnSpec,weights = initmodel()
optim = optimizers(weights, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS);
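# optimizers builds one Adam state per weight array, mirroring the structure
# of weights, so update! can pair each gradient with its optimizer state.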

# cold start: the first pass triggers JIT compilation, so time it separately and discard the result
@time for (x,y) in minibatch(xtrn,ytrn,BATCHSIZE;shuffle=true)
    grads = lossgradient(weights,x,y,rnnSpec)
    update!(weights, grads, optim)
end

# prepare for training again: discard the cold-start model and re-initialize
weights = nothing; knetgc(); # Reclaim memory from previous run
rnnSpec,weights = initmodel()
optim = optimizers(weights, Adam; lr=LR, beta1=BETA_1, beta2=BETA_2, eps=EPS);

# reference timing: ~29s for the loop below
info("Training...")
@time for epoch in 1:EPOCHS
    @time for (x,y) in minibatch(xtrn,ytrn,BATCHSIZE;shuffle=true)
        grads = lossgradient(weights,x,y,rnnSpec)
        update!(weights, grads, optim)
    end
end
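# To monitor convergence, the same loop can track average training loss per
# epoch (a sketch; the extra forward pass makes each epoch a bit slower):
# for epoch in 1:EPOCHS
#     Jsum = 0.0; nbatches = 0
#     for (x,y) in minibatch(xtrn,ytrn,BATCHSIZE;shuffle=true)
#         grads = lossgradient(weights,x,y,rnnSpec)
#         update!(weights, grads, optim)
#         Jsum += loss(weights,x,y,rnnSpec); nbatches += 1
#     end
#     println("epoch $epoch: average loss = $(Jsum/nbatches)")
# end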

info("Testing...")
@time accuracy(weights, minibatch(xtst,ytst,BATCHSIZE), (w,x)->predict(w,x,rnnSpec))
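# Optional: persist the trained weights with JLD2 (installed above). KnetArrays
# live on the GPU, so convert them to host Arrays first; the filename is
# illustrative:
# using JLD2
# trainedweights = map(w -> convert(Array, w), weights)
# @save "imdb_gru_weights.jld2" trainedweights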