# Semantic-segmentation fine-tuning walkthrough exported from a notebook.
#
# The script fine-tunes the "nvidia/mit-b0" checkpoint on a 50-sample slice of
# "scene_parse_150" and then runs inference.  It contains two parallel sets of
# cells: one built on torch/torchvision and one built on TensorFlow.  The
# TensorFlow cells rebind names defined by the PyTorch cells
# (`train_transforms`, `val_transforms`, `compute_metrics`, `model`, ...), so
# executing the file top to bottom leaves the later definitions in effect.

# Transformers installation
# (IPython shell escapes are not valid Python; run these in a notebook cell or a shell.)
# ! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

#@title
from IPython.display import HTML

HTML('')

from huggingface_hub import notebook_login

notebook_login()

# ---------------------------------------------------------------------------
# Dataset
# ---------------------------------------------------------------------------
from datasets import load_dataset

ds = load_dataset("scene_parse_150", split="train[:50]")
# NOTE: from here on `ds` is the mapping of splits returned by
# train_test_split, not the original dataset.
ds = ds.train_test_split(test_size=0.2)
train_ds = ds["train"]
test_ds = ds["test"]

train_ds[0]

# ---------------------------------------------------------------------------
# Label maps
# ---------------------------------------------------------------------------
import json
from huggingface_hub import hf_hub_download

repo_id = "huggingface/label-files"
filename = "ade20k-id2label.json"
# hf_hub_download resolves and caches the file in one call; the context
# manager makes sure the file handle is closed after parsing.
with open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r") as label_file:
    id2label = json.load(label_file)
# JSON object keys are strings; the model config wants integer class ids.
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)

# ---------------------------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------------------------
from transformers import AutoImageProcessor

checkpoint = "nvidia/mit-b0"
image_processor = AutoImageProcessor.from_pretrained(checkpoint, reduce_labels=True)

# --- torchvision variant ---------------------------------------------------
from torchvision.transforms import ColorJitter

jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)


def train_transforms(example_batch):
    """Colour-jitter each image, then run images and annotations through the image processor."""
    images = [jitter(x) for x in example_batch["image"]]
    labels = [x for x in example_batch["annotation"]]
    inputs = image_processor(images, labels)
    return inputs


def val_transforms(example_batch):
    """Run images and annotations through the image processor without augmentation."""
    images = [x for x in example_batch["image"]]
    labels = [x for x in example_batch["annotation"]]
    inputs = image_processor(images, labels)
    return inputs


train_ds.set_transform(train_transforms)
test_ds.set_transform(val_transforms)

# --- TensorFlow variant (rebinds train_transforms / val_transforms) --------
import tensorflow as tf


def aug_transforms(image):
    """Convert an image to an array, apply random colour augmentations and move channels first."""
    image = tf.keras.utils.img_to_array(image)
    image = tf.image.random_brightness(image, 0.25)
    image = tf.image.random_contrast(image, 0.5, 2.0)
    image = tf.image.random_saturation(image, 0.75, 1.25)
    image = tf.image.random_hue(image, 0.1)
    image = tf.transpose(image, (2, 0, 1))
    return image


def transforms(image):
    """Convert an image to an array and move channels first, with no augmentation."""
    image = tf.keras.utils.img_to_array(image)
    image = tf.transpose(image, (2, 0, 1))
    return image


def train_transforms(example_batch):
    """Augment each RGB-converted image, then run images and annotations through the image processor."""
    images = [aug_transforms(x.convert("RGB")) for x in example_batch["image"]]
    labels = [x for x in example_batch["annotation"]]
    inputs = image_processor(images, labels)
    return inputs


def val_transforms(example_batch):
    """Convert each image to RGB and channels-first, then run the image processor."""
    images = [transforms(x.convert("RGB")) for x in example_batch["image"]]
    labels = [x for x in example_batch["annotation"]]
    inputs = image_processor(images, labels)
    return inputs


train_ds.set_transform(train_transforms)
test_ds.set_transform(val_transforms)

# ---------------------------------------------------------------------------
# Evaluation
# ---------------------------------------------------------------------------
import evaluate

metric = evaluate.load("mean_iou")

# The PyTorch cells below use these names; they were previously referenced
# without being imported.
import numpy as np
import torch
from torch import nn


# --- PyTorch variant ---------------------------------------------------------
def compute_metrics(eval_pred):
    """Compute mean-IoU metrics from a `(logits, labels)` pair.

    The logits are resized bilinearly to the last two dimensions of `labels`
    before the arg-max over dim 1 is taken.  Any NumPy array in the metric
    result is converted to a plain list.
    """
    with torch.no_grad():
        logits, labels = eval_pred
        logits_tensor = torch.from_numpy(logits)
        logits_tensor = nn.functional.interpolate(
            logits_tensor,
            size=labels.shape[-2:],
            mode="bilinear",
            align_corners=False,
        ).argmax(dim=1)

        pred_labels = logits_tensor.detach().cpu().numpy()
        metrics = metric.compute(
            predictions=pred_labels,
            references=labels,
            num_labels=num_labels,
            ignore_index=255,
            reduce_labels=False,
        )
        for key, value in metrics.items():
            if isinstance(value, np.ndarray):
                metrics[key] = value.tolist()
        return metrics


# --- TensorFlow variant (rebinds compute_metrics) ----------------------------
def compute_metrics(eval_pred):
    """Compute mean-IoU metrics from a `(logits, labels)` pair using TensorFlow ops.

    The logits are transposed with perm [0, 2, 3, 1], resized bilinearly to
    `tf.shape(labels)[1:]` and arg-maxed over the last axis.  Per-category
    accuracy and IoU are flattened into one entry per label name, and every
    key in the returned dict is prefixed with "val_".
    """
    logits, labels = eval_pred
    logits = tf.transpose(logits, perm=[0, 2, 3, 1])
    logits_resized = tf.image.resize(
        logits,
        size=tf.shape(labels)[1:],
        method="bilinear",
    )

    pred_labels = tf.argmax(logits_resized, axis=-1)
    metrics = metric.compute(
        predictions=pred_labels,
        references=labels,
        num_labels=num_labels,
        ignore_index=-1,
        reduce_labels=image_processor.do_reduce_labels,
    )

    per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
    per_category_iou = metrics.pop("per_category_iou").tolist()

    metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
    metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})
    return {"val_" + k: v for k, v in metrics.items()}


# ---------------------------------------------------------------------------
# Training -- PyTorch (Trainer)
# ---------------------------------------------------------------------------
from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer

model = AutoModelForSemanticSegmentation.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)

training_args = TrainingArguments(
    output_dir="segformer-b0-scene-parse-150",
    learning_rate=6e-5,
    num_train_epochs=50,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_total_limit=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=20,
    eval_steps=20,
    logging_steps=1,
    eval_accumulation_steps=5,
    remove_unused_columns=False,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.push_to_hub()

# ---------------------------------------------------------------------------
# Training -- TensorFlow (Keras)
# ---------------------------------------------------------------------------
from transformers import create_optimizer

batch_size = 2
num_epochs = 50
# NOTE(review): this counts one step per sample, while the tf.data pipelines
# below batch by `batch_size` -- confirm whether the schedule length should be
# divided by `batch_size`.
num_train_steps = len(train_ds) * num_epochs
learning_rate = 6e-5
weight_decay_rate = 0.01

optimizer, lr_schedule = create_optimizer(
    init_lr=learning_rate,
    num_train_steps=num_train_steps,
    weight_decay_rate=weight_decay_rate,
    num_warmup_steps=0,
)

from transformers import TFAutoModelForSemanticSegmentation

model = TFAutoModelForSemanticSegmentation.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)
model.compile(optimizer=optimizer)

from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

tf_train_dataset = train_ds.to_tf_dataset(
    columns=["pixel_values", "label"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_eval_dataset = test_ds.to_tf_dataset(
    columns=["pixel_values", "label"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

# NOTE(review): label_cols uses "labels" while the datasets above select the
# "label" column -- confirm the collator renames it.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"]
)

push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor)

callbacks = [metric_callback, push_to_hub_callback]

model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    callbacks=callbacks,
    epochs=num_epochs,
)

# ---------------------------------------------------------------------------
# Inference
# ---------------------------------------------------------------------------
# `ds` now maps split names to datasets (so `ds[0]` raises KeyError), and both
# splits had `set_transform` applied, which replaces the rows they return with
# the transform output.  Load the same slice again to get an unprocessed image.
inference_ds = load_dataset("scene_parse_150", split="train[:50]")
image = inference_ds[0]["image"]
image

from transformers import pipeline

segmenter = pipeline("image-segmentation", model="my_awesome_seg_model")
segmenter(image)

# --- PyTorch variant ---------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available, otherwise use a CPU
encoding = image_processor(image, return_tensors="pt")
pixel_values = encoding.pixel_values.to(device)

outputs = model(pixel_values=pixel_values)
logits = outputs.logits.cpu()

upsampled_logits = nn.functional.interpolate(
    logits,
    size=image.size[::-1],
    mode="bilinear",
    align_corners=False,
)

pred_seg = upsampled_logits.argmax(dim=1)[0]

# --- TensorFlow variant (rebinds image_processor, model, logits, pred_seg) ---
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("MariaK/scene_segmentation")
inputs = image_processor(image, return_tensors="tf")

from transformers import TFAutoModelForSemanticSegmentation

model = TFAutoModelForSemanticSegmentation.from_pretrained("MariaK/scene_segmentation")
logits = model(**inputs).logits

logits = tf.transpose(logits, [0, 2, 3, 1])

upsampled_logits = tf.image.resize(
    logits,
    # We reverse the shape of `image` because `image.size` returns width and height.
    image.size[::-1],
)

pred_seg = tf.math.argmax(upsampled_logits, axis=-1)[0]

# ---------------------------------------------------------------------------
# Visualisation
# ---------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np

color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8)
# NOTE(review): `ade_palette` is not defined or imported anywhere in this
# file; it must be provided (presumably a sequence with one colour triple per
# class id -- confirm) before this cell can run.
palette = np.array(ade_palette())
for label, color in enumerate(palette):
    color_seg[pred_seg == label, :] = color
color_seg = color_seg[..., ::-1]  # convert to BGR

img = np.array(image) * 0.5 + color_seg * 0.5  # plot the image with the segmentation map
img = img.astype(np.uint8)

plt.figure(figsize=(15, 10))
plt.imshow(img)
plt.show()