!pip install datasets evaluate transformers[sentencepiece]

from datasets import load_dataset

# Load the WikiText-2 training split to use as the training corpus
dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")


def get_training_corpus():
    # Yield the corpus in batches of 1,000 texts
    for i in range(0, len(dataset), 1000):
        yield dataset[i : i + 1000]["text"]


# Also dump the corpus to a text file, to show training from files later on
with open("wikitext-2.txt", "w", encoding="utf-8") as f:
    for i in range(len(dataset)):
        f.write(dataset[i]["text"] + "\n")

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Building a WordPiece tokenizer from scratch (BERT-style)
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# The ready-made BERT normalizer...
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

# ...or an equivalent normalization assembled by hand
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

# The ready-made BERT pre-tokenizer...
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

# ...or a Whitespace pre-tokenizer, which splits on whitespace and punctuation
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

# Splitting on whitespace only
pre_tokenizer = pre_tokenizers.WhitespaceSplit()
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

# Combining several pre-tokenizers in a Sequence
pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

# Alternatively, reset the model and train from the text file instead of the iterator
tokenizer.model = models.WordPiece(unk_token="[UNK]")
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

# Add [CLS]/[SEP] and the right token type IDs with a template post-processor
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
)

encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences.")
print(encoding.tokens)
print(encoding.type_ids)

tokenizer.decoder = decoders.WordPiece(prefix="##")

tokenizer.decode(encoding.ids)

tokenizer.save("tokenizer.json")

new_tokenizer = Tokenizer.from_file("tokenizer.json")

# Wrap the tokenizer so it can be used with the Transformers library
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

from transformers import BertTokenizerFast

wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)
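
# Quick sanity check (not part of the original notebook; a minimal sketch using the
# wrapped WordPiece tokenizer built above, with arbitrary example sentences): the
# wrapped object behaves like any Transformers fast tokenizer.
enc = wrapped_tokenizer("Let's test this tokenizer...", "on a pair of sentences.")
print(enc.tokens())  # [CLS]/[SEP] are inserted by the template post-processor
print(enc["token_type_ids"])  # the sentence pair gets token type IDs 0 and 1
print(wrapped_tokenizer.decode(enc["input_ids"]))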

# Building a BPE tokenizer from scratch (GPT-2-style)
tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

# Alternatively, reset the model and train from the text file instead of the iterator
tokenizer.model = models.BPE()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

# Keep byte-level offsets untouched so they map back to the original text
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

tokenizer.decoder = decoders.ByteLevel()

tokenizer.decode(encoding.ids)

from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
)

from transformers import GPT2TokenizerFast

wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)

# Building a Unigram tokenizer from scratch (XLNet-style)
tokenizer = Tokenizer(models.Unigram())

from tokenizers import Regex

tokenizer.normalizer = normalizers.Sequence(
    [
        normalizers.Replace("``", '"'),
        normalizers.Replace("''", '"'),
        normalizers.NFKD(),
        normalizers.StripAccents(),
        normalizers.Replace(Regex(" {2,}"), " "),
    ]
)

tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()

tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!")

special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
trainer = trainers.UnigramTrainer(
    vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

# Alternatively, reset the model and train from the text file instead of the iterator
tokenizer.model = models.Unigram()
tokenizer.train(["wikitext-2.txt"], trainer=trainer)

encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

cls_token_id = tokenizer.token_to_id("<cls>")
sep_token_id = tokenizer.token_to_id("<sep>")
print(cls_token_id, sep_token_id)

# XLNet puts <cls> at the end of the sequence, with token type ID 2
tokenizer.post_processor = processors.TemplateProcessing(
    single="$A:0 <sep>:0 <cls>:2",
    pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
    special_tokens=[("<sep>", sep_token_id), ("<cls>", cls_token_id)],
)

encoding = tokenizer.encode("Let's test this tokenizer...", "on a pair of sentences!")
print(encoding.tokens)
print(encoding.type_ids)

tokenizer.decoder = decoders.Metaspace()

from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
    padding_side="left",
)

from transformers import XLNetTokenizerFast

wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)
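
# Optional follow-up (not in the original notebook; a hedged sketch): persist the
# wrapped Unigram tokenizer and reload it. The directory name "unigram-tokenizer"
# is an arbitrary example; reloading through AutoTokenizer should give back a fast
# XLNet-style tokenizer backed by the tokenizer.json saved here.
wrapped_tokenizer.save_pretrained("unigram-tokenizer")

from transformers import AutoTokenizer

reloaded_tokenizer = AutoTokenizer.from_pretrained("unigram-tokenizer")
print(reloaded_tokenizer("Let's test this tokenizer."))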