# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

# --- Preprocess text with a tokenizer ---
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
print(encoded_input)

# Decode the input ids back into text; note the special tokens ([CLS], [SEP]) the tokenizer added.
tokenizer.decode(encoded_input["input_ids"])

# Tokenize a batch of sentences.
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_inputs = tokenizer(batch_sentences)
print(encoded_inputs)

# Pad the shorter sentences so every sequence in the batch has the same length.
encoded_input = tokenizer(batch_sentences, padding=True)
print(encoded_input)

# Also truncate sequences that exceed the model's maximum length.
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
print(encoded_input)

# Return PyTorch tensors.
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded_input)

# Or return TensorFlow tensors.
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
print(encoded_input)

# --- Preprocess audio with a feature extractor ---
from datasets import load_dataset, Audio

dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
dataset[0]["audio"]

# Resample to 16kHz, the sampling rate wav2vec2 was pretrained on.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
dataset[0]["audio"]

from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

audio_input = [dataset[0]["audio"]["array"]]
feature_extractor(audio_input, sampling_rate=16000)

# The raw audio arrays have different lengths.
dataset[0]["audio"]["array"].shape
dataset[1]["audio"]["array"].shape

# Pad and truncate the batch to a uniform length.
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=16000,
        padding=True,
        max_length=100000,
        truncation=True,
    )
    return inputs

processed_dataset = preprocess_function(dataset[:5])
# Both samples now share the same shape.
processed_dataset["input_values"][0].shape
processed_dataset["input_values"][1].shape

# --- Preprocess images with an image processor ---
from datasets import load_dataset

dataset = load_dataset("food101", split="train[:100]")
dataset[0]["image"]

from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")

# Combine torchvision data augmentation with the image processor's normalization.
from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose

size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)])

def transforms(examples):
    images = [_transforms(img.convert("RGB")) for img in examples["image"]]
    # The augmentation already resized the images, so skip resizing in the processor.
    examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"]
    return examples

# Apply the transforms on the fly whenever examples are accessed.
dataset.set_transform(transforms)
dataset[0].keys()

import numpy as np
import matplotlib.pyplot as plt

img = dataset[0]["pixel_values"]
plt.imshow(img.permute(1, 2, 0))
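
# A minimal sketch, not from the original tutorial: the image processor normalizes
# pixel values, so the image plotted above may look washed out. Assuming the
# processor exposes `image_mean` and `image_std` (most image processors do), the
# normalization can be undone for display:
mean = np.array(image_processor.image_mean).reshape(3, 1, 1)
std = np.array(image_processor.image_std).reshape(3, 1, 1)
denormalized = img.numpy() * std + mean
plt.imshow(np.clip(denormalized.transpose(1, 2, 0), 0, 1))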
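
# Another hypothetical usage sketch: since every transformed image now has the same
# size, the dataset can feed a PyTorch DataLoader with a simple collate function
# (`simple_collate` is an illustrative name, not part of the tutorial):
import torch
from torch.utils.data import DataLoader

def simple_collate(batch):
    # Stack the fixed-size pixel_values and gather the integer class labels.
    return {
        "pixel_values": torch.stack([example["pixel_values"] for example in batch]),
        "labels": torch.tensor([example["label"] for example in batch]),
    }

loader = DataLoader(dataset, batch_size=4, collate_fn=simple_collate)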
# --- Pad images in a collate function ---
# Some vision models, such as DETR, apply scale augmentation at training time, so the
# images in a batch can have different sizes. Their image processors provide a padding
# method that also creates a pixel mask marking which pixels are real:
def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]
    encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt")
    labels = [item["labels"] for item in batch]
    batch = {}
    batch["pixel_values"] = encoding["pixel_values"]
    batch["pixel_mask"] = encoding["pixel_mask"]
    batch["labels"] = labels
    return batch

# --- Preprocess multimodal data with a processor ---
from datasets import load_dataset

lj_speech = load_dataset("lj_speech", split="train")

# Keep only the audio and text columns.
lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"])

lj_speech[0]["audio"]
lj_speech[0]["text"]

# Resample to 16kHz, the sampling rate wav2vec2 was pretrained on.
lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000))

from transformers import AutoProcessor

# A processor combines a feature extractor (for the audio) and a tokenizer (for the
# transcription) in a single object.
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

def prepare_dataset(example):
    audio = example["audio"]
    example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
    return example

prepare_dataset(lj_speech[0])
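
# A minimal sketch, assuming you want to preprocess the whole dataset rather than a
# single example: Dataset.map applies prepare_dataset to every row, and
# remove_columns drops the raw columns once they have been processed.
lj_speech = lj_speech.map(prepare_dataset, remove_columns=["audio", "text"])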