This notebook is based on ImageNet training in PyTorch.
The goal of this notebook is to demonstrate how to use the Neural Network Compression Framework NNCF 8-bit quantization to optimize a PyTorch model for inference with OpenVINO Toolkit. The optimization process contains the following steps:
FP32
model to INT8
For more advanced usage, refer to these examples.
This tutorial uses the ResNet-18 model with the Tiny ImageNet-200 dataset. ResNet-18 is the version of ResNet models that contains the fewest layers (18). Tiny ImageNet-200 is a subset of the larger ImageNet dataset with smaller images. The dataset will be downloaded in the notebook. Using the smaller model and dataset will speed up training and download time. To see other ResNet models, visit PyTorch hub.
NOTE: This notebook requires a C++ compiler.
On Windows, add the required C++ directories to the system PATH.
Import NNCF and all auxiliary packages from your Python code. Set a name for the model, and the image width and height that will be used for the network. Also define paths where PyTorch, ONNX and OpenVINO IR versions of the models will be stored.
NOTE: All NNCF logging messages below ERROR level (INFO and WARNING) are disabled to simplify the tutorial. For production use, it is recommended to enable logging by removing
set_log_level(logging.ERROR)
.
# On Windows, add the directory that contains cl.exe to the PATH to enable PyTorch to find the
# required C++ tools. This code assumes that Visual Studio 2019 is installed in the default
# directory. If you have a different C++ compiler, add the correct path to os.environ["PATH"]
# directly. Note that the C++ Redistributable is not enough to run this notebook.
# Adding the path to os.environ["LIB"] is not always required - it depends on the system configuration
import sys
if sys.platform == "win32":
import distutils.command.build_ext
import os
from pathlib import Path
VS_INSTALL_DIR = r"C:/Program Files (x86)/Microsoft Visual Studio"
cl_paths = sorted(list(Path(VS_INSTALL_DIR).glob("**/Hostx86/x64/cl.exe")))
if len(cl_paths) == 0:
raise ValueError(
"Cannot find Visual Studio. This notebook requires a C++ compiler. If you installed "
"a C++ compiler, please add the directory that contains cl.exe to `os.environ['PATH']`."
)
else:
# If multiple versions of MSVC are installed, get the most recent one.
cl_path = cl_paths[-1]
vs_dir = str(cl_path.parent)
os.environ["PATH"] += f"{os.pathsep}{vs_dir}"
# The code for finding the library dirs is from:
# https://stackoverflow.com/questions/47423246/get-pythons-lib-path
d = distutils.core.Distribution()
b = distutils.command.build_ext.build_ext(d)
b.finalize_options()
os.environ["LIB"] = os.pathsep.join(b.library_dirs)
print(f"Added {vs_dir} to PATH")
import sys
import time
import warnings # To disable warnings on export to ONNX.
import zipfile
from pathlib import Path
import logging
import torch
import nncf # Important - should be imported directly after torch.
import torch.nn as nn
import torch.nn.parallel
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms
from nncf.common.utils.logger import set_log_level
set_log_level(logging.ERROR) # Disables all NNCF info and warning messages.
from nncf import NNCFConfig
from nncf.torch import create_compressed_model, register_default_init_args
from openvino.runtime import Core
from torch.jit import TracerWarning
sys.path.append("../utils")
from notebook_utils import download_file
torch.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")
MODEL_DIR = Path("model")
OUTPUT_DIR = Path("output")
DATA_DIR = Path("data")
BASE_MODEL_NAME = "resnet18"
image_size = 64
OUTPUT_DIR.mkdir(exist_ok=True)
MODEL_DIR.mkdir(exist_ok=True)
DATA_DIR.mkdir(exist_ok=True)
# Paths where PyTorch, ONNX and OpenVINO IR models will be stored.
fp32_pth_path = Path(MODEL_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".pth")
fp32_onnx_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_fp32")).with_suffix(".onnx")
fp32_ir_path = fp32_onnx_path.with_suffix(".xml")
int8_onnx_path = Path(OUTPUT_DIR / (BASE_MODEL_NAME + "_int8")).with_suffix(".onnx")
int8_ir_path = int8_onnx_path.with_suffix(".xml")
# It is possible to train FP32 model from scratch, but it might be slow. Therefore, the pre-trained weights are downloaded by default.
pretrained_on_tiny_imagenet = True
fp32_pth_url = "https://storage.openvinotoolkit.org/repositories/nncf/openvino_notebook_ckpts/302_resnet18_fp32_v1.pth"
download_file(fp32_pth_url, directory=MODEL_DIR, filename=fp32_pth_path.name)
Download Tiny ImageNet dataset
def download_tiny_imagenet_200(
data_dir: Path,
url="http://cs231n.stanford.edu/tiny-imagenet-200.zip",
tarname="tiny-imagenet-200.zip",
):
archive_path = data_dir / tarname
download_file(url, directory=data_dir, filename=tarname)
zip_ref = zipfile.ZipFile(archive_path, "r")
zip_ref.extractall(path=data_dir)
zip_ref.close()
def prepare_tiny_imagenet_200(dataset_dir: Path):
# Format validation set the same way as train set is formatted.
val_data_dir = dataset_dir / 'val'
val_annotations_file = val_data_dir / 'val_annotations.txt'
with open(val_annotations_file, 'r') as f:
val_annotation_data = map(lambda line: line.split('\t')[:2], f.readlines())
val_images_dir = val_data_dir / 'images'
for image_filename, image_label in val_annotation_data:
from_image_filepath = val_images_dir / image_filename
to_image_dir = val_data_dir / image_label
if not to_image_dir.exists():
to_image_dir.mkdir()
to_image_filepath = to_image_dir / image_filename
from_image_filepath.rename(to_image_filepath)
val_annotations_file.unlink()
val_images_dir.rmdir()
DATASET_DIR = DATA_DIR / "tiny-imagenet-200"
if not DATASET_DIR.exists():
download_tiny_imagenet_200(DATA_DIR)
prepare_tiny_imagenet_200(DATASET_DIR)
print(f"Successfully downloaded and prepared dataset at: {DATASET_DIR}")
Using NNCF for model compression assumes that a pre-trained model and a training pipeline are already in use.
This tutorial demonstrates one possible training pipeline: a ResNet-18 model pre-trained on 1000 classes from ImageNet is fine-tuned with 200 classes from Tiny-Imagenet.
Subsequently, the training and validation functions will be reused as is for quantization-aware training.
def train(train_loader, model, criterion, optimizer, epoch):
batch_time = AverageMeter("Time", ":3.3f")
losses = AverageMeter("Loss", ":2.3f")
top1 = AverageMeter("Acc@1", ":2.2f")
top5 = AverageMeter("Acc@5", ":2.2f")
progress = ProgressMeter(
len(train_loader), [batch_time, losses, top1, top5], prefix="Epoch:[{}]".format(epoch)
)
# Switch to train mode.
model.train()
end = time.time()
for i, (images, target) in enumerate(train_loader):
images = images.to(device)
target = target.to(device)
# Compute output.
output = model(images)
loss = criterion(output, target)
# Measure accuracy and record loss.
acc1, acc5 = accuracy(output, target, topk=(1, 5))
losses.update(loss.item(), images.size(0))
top1.update(acc1[0], images.size(0))
top5.update(acc5[0], images.size(0))
# Compute gradient and do opt step.
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Measure elapsed time.
batch_time.update(time.time() - end)
end = time.time()
print_frequency = 50
if i % print_frequency == 0:
progress.display(i)
def validate(val_loader, model, criterion):
batch_time = AverageMeter("Time", ":3.3f")
losses = AverageMeter("Loss", ":2.3f")
top1 = AverageMeter("Acc@1", ":2.2f")
top5 = AverageMeter("Acc@5", ":2.2f")
progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5], prefix="Test: ")
# Switch to evaluate mode.
model.eval()
with torch.no_grad():
end = time.time()
for i, (images, target) in enumerate(val_loader):
images = images.to(device)
target = target.to(device)
# Compute output.
output = model(images)
loss = criterion(output, target)
# Measure accuracy and record loss.
acc1, acc5 = accuracy(output, target, topk=(1, 5))
losses.update(loss.item(), images.size(0))
top1.update(acc1[0], images.size(0))
top5.update(acc5[0], images.size(0))
# Measure elapsed time.
batch_time.update(time.time() - end)
end = time.time()
print_frequency = 10
if i % print_frequency == 0:
progress.display(i)
print(" * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5))
return top1.avg
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=":f"):
self.name = name
self.fmt = fmt
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def __str__(self):
fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})"
return fmtstr.format(**self.__dict__)
class ProgressMeter(object):
def __init__(self, num_batches, meters, prefix=""):
self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
self.meters = meters
self.prefix = prefix
def display(self, batch):
entries = [self.prefix + self.batch_fmtstr.format(batch)]
entries += [str(meter) for meter in self.meters]
print("\t".join(entries))
def _get_batch_fmtstr(self, num_batches):
num_digits = len(str(num_batches // 1))
fmt = "{:" + str(num_digits) + "d}"
return "[" + fmt + "/" + fmt.format(num_batches) + "]"
def accuracy(output, target, topk=(1,)):
"""Computes the accuracy over the k top predictions for the specified values of k"""
with torch.no_grad():
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
res.append(correct_k.mul_(100.0 / batch_size))
return res
А pre-trained floating-point model is a prerequisite for quantization. It can be obtained by tuning from scratch with the code below. However, this usually takes a lot of time. Therefore, this code has already been run and received good enough weights after 4 epochs (for the sake of simplicity, tuning was not done until the best accuracy). By default, this notebook just loads these weights without launching training. To train the model yourself on a model pre-trained on ImageNet, set pretrained_on_tiny_imagenet = False
in the Imports and Settings section at the top of this notebook.
num_classes = 200 # 200 is for Tiny ImageNet, default is 1000 for ImageNet
init_lr = 1e-4
batch_size = 128
epochs = 4
model = models.resnet18(pretrained=not pretrained_on_tiny_imagenet)
# Update the last FC layer for Tiny ImageNet number of classes.
model.fc = nn.Linear(in_features=512, out_features=num_classes, bias=True)
model.to(device)
# Data loading code.
train_dir = DATASET_DIR / "train"
val_dir = DATASET_DIR / "val"
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
train_dataset = datasets.ImageFolder(
train_dir,
transforms.Compose(
[
transforms.Resize(image_size),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
normalize,
]
),
)
val_dataset = datasets.ImageFolder(
val_dir,
transforms.Compose(
[
transforms.Resize(image_size),
transforms.ToTensor(),
normalize,
]
),
)
train_loader = torch.utils.data.DataLoader(
train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, sampler=None
)
val_loader = torch.utils.data.DataLoader(
val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True
)
# Define loss function (criterion) and optimizer.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)
if pretrained_on_tiny_imagenet:
#
# ** WARNING: The `torch.load` functionality uses Python's pickling module that
# may be used to perform arbitrary code execution during unpickling. Only load data that you
# trust.
#
checkpoint = torch.load(str(fp32_pth_path), map_location="cpu")
model.load_state_dict(checkpoint["state_dict"], strict=True)
acc1_fp32 = checkpoint["acc1"]
else:
best_acc1 = 0
# Training loop.
for epoch in range(0, epochs):
# Run a single training epoch.
train(train_loader, model, criterion, optimizer, epoch)
# Evaluate on validation set.
acc1 = validate(val_loader, model, criterion)
is_best = acc1 > best_acc1
best_acc1 = max(acc1, best_acc1)
if is_best:
checkpoint = {"state_dict": model.state_dict(), "acc1": acc1}
torch.save(checkpoint, fp32_pth_path)
acc1_fp32 = best_acc1
print(f"Accuracy of FP32 model: {acc1_fp32:.3f}")
Export the FP32
model to ONNX, which is supported by OpenVINO™ Toolkit, to benchmark it in comparison with the INT8
model.
dummy_input = torch.randn(1, 3, image_size, image_size).to(device)
torch.onnx.export(model, dummy_input, fp32_onnx_path)
print(f"FP32 ONNX model was exported to {fp32_onnx_path}.")
NNCF enables compression-aware training by integrating into regular training pipelines. The framework is designed so that modifications to your original training code are minor. Quantization is the simplest scenario and requires only 3 modifications.
nncf_config_dict = {
"input_info": {"sample_size": [1, 3, image_size, image_size]},
"log_dir": str(OUTPUT_DIR), # The log directory for NNCF-specific logging outputs.
"compression": {
"algorithm": "quantization", # Specify the algorithm here.
},
}
nncf_config = NNCFConfig.from_dict(nncf_config_dict)
nncf_config = register_default_init_args(nncf_config, train_loader)
FP32
model and a configuration object.compression_ctrl, model = create_compressed_model(model, nncf_config)
Evaluate the new model on the validation set after initialization of quantization. The accuracy should be close to the accuracy of the floating-point FP32
model for a simple case like the one being demonstrated here.
acc1 = validate(val_loader, model, criterion)
print(f"Accuracy of initialized INT8 model: {acc1:.3f}")
At this step, a regular fine-tuning process is applied to further improve quantized model accuracy. Normally, several epochs of tuning are required with a small learning rate, the same that is usually used at the end of the training of the original model. No other changes in the training pipeline are required. Here is a simple example.
compression_lr = init_lr / 10
optimizer = torch.optim.Adam(model.parameters(), lr=compression_lr)
# Train for one epoch with NNCF.
train(train_loader, model, criterion, optimizer, epoch=0)
# Evaluate on validation set after Quantization-Aware Training (QAT case).
acc1_int8 = validate(val_loader, model, criterion)
print(f"Accuracy of tuned INT8 model: {acc1_int8:.3f}")
print(f"Accuracy drop of tuned INT8 model over pre-trained FP32 model: {acc1_fp32 - acc1_int8:.3f}")
if not int8_onnx_path.exists():
warnings.filterwarnings("ignore", category=TracerWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# Export INT8 model to ONNX that is supported by OpenVINO™ Toolkit
compression_ctrl.export_model(int8_onnx_path)
print(f"INT8 ONNX model exported to {int8_onnx_path}.")
Use Model Optimizer to convert the ONNX model to OpenVINO IR, with FP16
precision. The models are saved to the current directory. Then, add the mean values to the model and scale the output with the standard deviation by the --mean_values
and --scale_values
arguments. It is not necessary to normalize input data before propagating it through the network with these options.
For more information about Model Optimizer, see the Model Optimizer Developer Guide.
Executing this command may take a while. There may be some errors or warnings in the output. When Model Optimizer successfully converts the model to OpenVINO IR, the last lines of the output will include: [ SUCCESS ] Generated IR version 10 model
if not fp32_ir_path.exists():
!mo --input_model $fp32_onnx_path --input_shape "[1,3, $image_size, $image_size]" --mean_values "[123.675, 116.28 , 103.53]" --scale_values "[58.395, 57.12 , 57.375]" --data_type FP16 --output_dir $OUTPUT_DIR
if not int8_ir_path.exists():
!mo --input_model $int8_onnx_path --input_shape "[1,3, $image_size, $image_size]" --mean_values "[123.675, 116.28 , 103.53]" --scale_values "[58.395, 57.12 , 57.375]" --data_type FP16 --output_dir $OUTPUT_DIR
Finally, measure the inference performance of the FP32
and INT8
models, using Benchmark Tool - inference performance measurement tool in OpenVINO. By default, Benchmark Tool runs inference for 60 seconds in asynchronous mode on CPU. It returns inference speed as latency (milliseconds per image) and throughput (frames per second) values.
NOTE: This notebook runs
benchmark_app
for 15 seconds to give a quick indication of performance. For more accurate performance, it is recommended to runbenchmark_app
in a terminal/command prompt after closing other applications. Runbenchmark_app -m model.xml -d CPU
to benchmark async inference on CPU for one minute. Change CPU to GPU to benchmark on GPU. Runbenchmark_app --help
to see an overview of all command-line options.
def parse_benchmark_output(benchmark_output):
parsed_output = [line for line in benchmark_output if not (line.startswith(r"[") or line.startswith(" ") or line == "")]
print(*parsed_output, sep='\n')
print('Benchmark FP32 model (IR)')
benchmark_output = ! benchmark_app -m $fp32_ir_path -d CPU -api async -t 15
parse_benchmark_output(benchmark_output)
print('Benchmark INT8 model (IR)')
benchmark_output = ! benchmark_app -m $int8_ir_path -d CPU -api async -t 15
parse_benchmark_output(benchmark_output)
Show CPU Information for reference.
ie = Core()
ie.get_property("CPU", "FULL_DEVICE_NAME")