!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs
!git config --global user.email "you@example.com"
!git config --global user.name "Your Name"

from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub so we can push the tokenizer later
notebook_login()

from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!
raw_datasets = load_dataset("code_search_net", "python")

raw_datasets["train"]

# Look at one example: the full source code of a Python function
print(raw_datasets["train"][123456]["whole_func_string"])

# Don't uncomment the following line unless your dataset is small!
# training_corpus = [raw_datasets["train"][i : i + 1000]["whole_func_string"] for i in range(0, len(raw_datasets["train"]), 1000)]

# A generator expression yields batches of 1,000 texts lazily instead of
# materializing the whole corpus in memory at once
training_corpus = (
    raw_datasets["train"][i : i + 1000]["whole_func_string"]
    for i in range(0, len(raw_datasets["train"]), 1000)
)

# Caveat: a generator can only be consumed once -- the second list() is empty
gen = (i for i in range(10))
print(list(gen))
print(list(gen))


# Wrapping the generator expression in a function lets us get a fresh
# generator whenever we need one
def get_training_corpus():
    return (
        raw_datasets["train"][i : i + 1000]["whole_func_string"]
        for i in range(0, len(raw_datasets["train"]), 1000)
    )


training_corpus = get_training_corpus()


# An equivalent version using yield, which leaves room for more complex
# logic than a single expression
def get_training_corpus():
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["whole_func_string"]


from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

example = '''def add_numbers(a, b):
    """Add the two numbers `a` and `b`."""
    return a + b'''

# The GPT-2 tokenizer was trained on English text, so it splits Python
# constructs like indentation and docstrings into many small pieces
tokens = old_tokenizer.tokenize(example)
tokens

# Train a new tokenizer with the same pipeline as GPT-2 but a fresh
# vocabulary of 52,000 tokens learned from our Python corpus
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)

tokens = tokenizer.tokenize(example)
tokens

# The new tokenizer needs fewer tokens for the same code
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weight + self.bias
    """
tokenizer.tokenize(example)

# Save the tokenizer locally...
tokenizer.save_pretrained("code-search-net-tokenizer")

from huggingface_hub import notebook_login

notebook_login()

# ...and push it to the Hub under your namespace
tokenizer.push_to_hub("code-search-net-tokenizer")

# Replace "huggingface-course" below with your actual namespace to use your own tokenizer
tokenizer = AutoTokenizer.from_pretrained("huggingface-course/code-search-net-tokenizer")
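
# A quick sanity check (a sketch: it assumes the push above succeeded and that
# you swapped in your own namespace). The tokenizer reloaded from the Hub should
# produce exactly the same tokens as the copy we saved locally;
# `local_tokenizer` is just an illustrative name, not from the course.
local_tokenizer = AutoTokenizer.from_pretrained("code-search-net-tokenizer")
assert tokenizer.tokenize(example) == local_tokenizer.tokenize(example)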
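
# Note: train_new_from_iterator() only works with the "fast" (Rust-backed)
# tokenizers from the Tokenizers library, which is what AutoTokenizer returns
# by default -- both of these checks should print True.
print(old_tokenizer.is_fast)
print(tokenizer.is_fast)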
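
# A rough measure of the gain on this domain: average tokens per function
# over a small sample. The sample size of 100 and the variable names below
# are arbitrary choices for illustration.
sample = raw_datasets["train"][:100]["whole_func_string"]
old_avg = sum(len(old_tokenizer.tokenize(f)) for f in sample) / len(sample)
new_avg = sum(len(tokenizer.tokenize(f)) for f in sample) / len(sample)
print(f"GPT-2 tokenizer:     {old_avg:.1f} tokens per function")
print(f"retrained tokenizer: {new_avg:.1f} tokens per function")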