using Transformers, CUDA

# List the available GPUs, select device 0, and forbid scalar indexing on the GPU.
CUDA.devices()
CUDA.device!(0)
CUDA.allowscalar(false)

# Make `todevice` move data to the GPU.
enable_gpu(true)

using Transformers.HuggingFace

# Download (or load from cache) the Dolly tokenizer and model weights.
textenc = hgf"databricks/dolly-v2-12b:tokenizer"
model = todevice(hgf"databricks/dolly-v2-12b:ForCausalLM") # move to gpu with `todevice` (or `Flux.gpu`)

using Flux
using StatsBase

# Rescale the logits by `temperature` before normalizing: higher values flatten
# the distribution, lower values sharpen it.
function temp_softmax(logits; temperature = 1.2)
    return softmax(logits ./ temperature)
end

# Sample one index from the `k` highest-probability tokens, weighted by their
# probabilities. With `k = 1` this reduces to greedy decoding.
function top_k_sample(probs; k = 1)
    sorted = sort(probs, rev = true)
    indexes = partialsortperm(probs, 1:k, rev = true)
    index = sample(indexes, ProbabilityWeights(sorted[1:k]), 1)
    return index
end

using Transformers.TextEncoders

# Autoregressive generation loop: feed the running token sequence through the
# model, sample the next token from the logits at the last position, and stop
# after `max_length` steps or when the end symbol is produced.
function generate_text(textenc, model, context = ""; max_length = 512, k = 1, temperature = 1.2, ends = textenc.endsym)
    encoded = encode(textenc, context).token
    ids = encoded.onehots
    ends_id = lookup(textenc.vocab, ends)
    for i in 1:max_length
        input = (; token = encoded) |> todevice
        outputs = model(input)
        logits = @view outputs.logit[:, end, 1]
        probs = temp_softmax(logits; temperature)
        new_id = top_k_sample(collect(probs); k)[1]
        # `ids` is the underlying storage of `encoded`, so this extends the input.
        push!(ids, new_id)
        new_id == ends_id && break
    end
    return decode(textenc, encoded)
end

# Wrap the instruction in Dolly's prompt format and generate until the model
# emits the "### End" marker.
function generate(textenc, model, instruction; max_length = 512, k = 1, temperature = 1.2)
    prompt = """
    Below is an instruction that describes a task. Write a response that appropriately completes the request.

    ### Instruction:
    $instruction

    ### Response:
    """
    text_token = generate_text(textenc, model, prompt; max_length, k, temperature, ends = "### End")
    gen_text = join(text_token)
    println(gen_text)
end

generate(textenc, model, "Explain to me the difference between nuclear fission and fusion.")
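
# Optional: a minimal sanity check for the two sampling helpers, sketched on a
# made-up logits vector so it runs without loading the 12B model. The values
# (and the 4-token "vocabulary") are illustrative assumptions, not real data.
toy_logits = [2.0, 1.0, 0.5, -1.0]
toy_probs = temp_softmax(toy_logits; temperature = 1.2)
@assert isapprox(sum(toy_probs), 1; atol = 1e-6) # temperature softmax still normalizes

# With k = 1 the sampler always returns the argmax; with k = 3 it samples among
# the three most probable tokens, weighted by their probabilities.
@show top_k_sample(toy_probs; k = 1)
@show top_k_sample(toy_probs; k = 3)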