using Transformers, CUDA

# List the available GPUs, select device 0, and forbid scalar indexing on the GPU.
CUDA.devices()
CUDA.device!(0)
CUDA.allowscalar(false)

# Make `todevice` move data to the GPU.
enable_gpu(true)

using Transformers.HuggingFace

# Download (or load from cache) the Dolly tokenizer and model weights.
textenc = hgf"databricks/dolly-v2-12b:tokenizer"
model = todevice(hgf"databricks/dolly-v2-12b:ForCausalLM") # move to gpu with `todevice` (or `Flux.gpu`)

using Flux
using StatsBase

# Rescale the logits by `temperature` before normalizing: higher values flatten
# the distribution, lower values sharpen it.
function temp_softmax(logits; temperature = 1.2)
    return softmax(logits ./ temperature)
end

# Sample one index from the `k` highest-probability tokens, weighted by their
# probabilities. With `k = 1` this reduces to greedy decoding.
function top_k_sample(probs; k = 1)
    sorted = sort(probs, rev = true)
    indexes = partialsortperm(probs, 1:k, rev = true)
    index = sample(indexes, ProbabilityWeights(sorted[1:k]), 1)
    return index
end

using Transformers.TextEncoders

# Autoregressive generation loop: feed the running token sequence through the
# model, sample the next token from the logits at the last position, and stop
# after `max_length` steps or when the end symbol is produced.
function generate_text(textenc, model, context = ""; max_length = 512, k = 1, temperature = 1.2, ends = textenc.endsym)
    encoded = encode(textenc, context).token
    ids = encoded.onehots
    ends_id = lookup(textenc.vocab, ends)
    for i in 1:max_length
        input = (; token = encoded) |> todevice
        outputs = model(input)
        logits = @view outputs.logit[:, end, 1]
        probs = temp_softmax(logits; temperature)
        new_id = top_k_sample(collect(probs); k)[1]
        # `ids` is the underlying storage of `encoded`, so this extends the input.
        push!(ids, new_id)
        new_id == ends_id && break
    end
    return decode(textenc, encoded)
end

# Wrap the instruction in Dolly's prompt format and generate until the model
# emits the "### End" marker.
function generate(textenc, model, instruction; max_length = 512, k = 1, temperature = 1.2)
    prompt = """
    Below is an instruction that describes a task. Write a response that appropriately completes the request.

    ### Instruction:
    $instruction

    ### Response:
    """
    text_token = generate_text(textenc, model, prompt; max_length, k, temperature, ends = "### End")
    gen_text = join(text_token)
    println(gen_text)
end

generate(textenc, model, "Explain to me the difference between nuclear fission and fusion.")
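
# Optional: a minimal sanity check for the two sampling helpers, sketched on a
# made-up logits vector so it runs without loading the 12B model. The values
# (and the 4-token "vocabulary") are illustrative assumptions, not real data.
toy_logits = [2.0, 1.0, 0.5, -1.0]
toy_probs = temp_softmax(toy_logits; temperature = 1.2)
@assert isapprox(sum(toy_probs), 1; atol = 1e-6) # temperature softmax still normalizes

# With k = 1 the sampler always returns the argmax; with k = 3 it samples among
# the three most probable tokens, weighted by their probabilities.
@show top_k_sample(toy_probs; k = 1)
@show top_k_sample(toy_probs; k = 3)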