"""Multilingual inference examples with Hugging Face Transformers.

Three independent examples are run in sequence:

1. XLM with language embeddings (``xlm-clm-enfr-1024``): a forward pass
   that passes an explicit ``langs`` tensor alongside the input ids.
2. M2M100 (``facebook/m2m100_418M``): Chinese -> English translation.
3. mBART-50 (``facebook/mbart-large-50-many-to-many-mmt``):
   Finnish -> English translation.

Each ``from_pretrained`` call loads a pretrained checkpoint by name.
"""

# Transformers installation (notebook shell commands, kept for reference):
# ! pip install transformers datasets
# To install from source instead of the last release, comment the command
# above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

import torch
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    XLMTokenizer,
    XLMWithLMHeadModel,
)

# Reference English sentence; the Chinese and Finnish inputs below are
# translations of it, so the generated English should read similarly.
en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger."
chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒."
fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia."

# ---------------------------------------------------------------------------
# 1. XLM with language embeddings
# ---------------------------------------------------------------------------
tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")

# Show the language-code -> language-id mapping of this checkpoint.
print(tokenizer.lang2id)

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1

language_id = tokenizer.lang2id["en"]  # 0
# One language id per input token: torch.tensor([0, 0, 0, ..., 0])
langs = torch.tensor([language_id] * input_ids.shape[1])
# Reshape to (batch_size, sequence_length); the batch size here is 1.
langs = langs.view(1, -1)

outputs = model(input_ids, langs=langs)

# ---------------------------------------------------------------------------
# 2. M2M100: Chinese -> English
# ---------------------------------------------------------------------------
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

encoded_zh = tokenizer(chinese_text, return_tensors="pt")
# The target language is selected by forcing its id as the first generated token.
generated_tokens = model.generate(
    **encoded_zh,
    forced_bos_token_id=tokenizer.get_lang_id("en"),
)
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))

# ---------------------------------------------------------------------------
# 3. mBART-50: Finnish -> English
# ---------------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt",
    src_lang="fi_FI",
)
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# The source language is Finnish (src_lang="fi_FI"), so the Finnish text is
# what must be tokenized.
encoded_fi = tokenizer(fi_text, return_tensors="pt")
# `lang_code_to_id` is a mapping, so it is indexed rather than called.
generated_tokens = model.generate(
    **encoded_fi,
    forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
)
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))