Notebook

In [1]:

from IPython import display
import sys
sys.path.append("../")

import onnxruntime as ort
import numpy as np
import cv2
from PIL import Image

from constants import classes

In [2]:

# ONNX model file loaded by the inference session.
path_to_model = "../mvit16-1.onnx"
# Video that is read frame by frame and classified.
path_to_input_video = "f17a6060-6ced-4bd1-9886-8578cfbb864f.mp4"
# Annotated video written by the inference loop and shown in the last cell.
path_to_output_video = "output_onnx.mp4"

In [3]:

# Tunable inference settings.
threshold = 0.5      # minimum top score required to accept a prediction
frame_interval = 1   # only every `frame_interval`-th frame is fed to the model

# Per-channel normalisation constants on the 0-255 scale; the inference loop
# applies them after converting frames to RGB.
mean = [123.675, 116.28, 103.53]
std = [58.395, 57.12, 57.375]

# Load the model and look up the names/shapes needed to call it.
session = ort.InferenceSession(path_to_model)
model_input = session.get_inputs()[0]
input_name = model_input.name
input_shape = model_input.shape
# Axis 3 of the model input is the frame axis in the tensor built by the
# inference loop, so it gives the number of frames per clip.
window_size = input_shape[3]
output_names = [node.name for node in session.get_outputs()]

In [4]:

def resize(im, new_shape=(224, 224)):
    """
    Letterbox an image: scale it to fit ``new_shape`` and pad the remainder.

    The aspect ratio is preserved; the leftover area is filled with a
    constant grey border (114, 114, 114) split between both sides.

    Parameters
    ----------
    im : np.ndarray
        Source image; its first two axes are read as (height, width).
    new_shape : Tuple[int, int] or int
        Target (height, width). A single int is treated as a square size.

    Returns
    -------
    np.ndarray
        The scaled and padded image.
    """
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    target_h, target_w = new_shape[0], new_shape[1]
    src_h, src_w = im.shape[:2]

    # Largest uniform scale at which the image still fits inside the target.
    scale = min(target_h / src_h, target_w / src_w)
    scaled_w = int(round(src_w * scale))
    scaled_h = int(round(src_h * scale))

    # Padding needed per side (may be fractional; resolved below).
    pad_w = (target_w - scaled_w) / 2
    pad_h = (target_h - scaled_h) / 2

    if (src_w, src_h) != (scaled_w, scaled_h):
        im = cv2.resize(im, (scaled_w, scaled_h), interpolation=cv2.INTER_LINEAR)

    # The -0.1 / +0.1 offsets send an odd leftover pixel to the bottom/right.
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    return cv2.copyMakeBorder(
        im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
    )

In [5]:

# Classify the input video clip by clip and write an annotated copy: each
# output frame is the source frame with a 50 px caption strip below it that
# lists the glosses recognised so far.
cap = cv2.VideoCapture(path_to_input_video)
writer = None
try:
    if not cap.isOpened():
        raise RuntimeError(f"Could not open input video: {path_to_input_video}")

    # The first frame fixes the output canvas size. It is kept and processed
    # by the loop below rather than being discarded.
    _, frame = cap.read()
    if frame is None:
        raise RuntimeError(f"Could not read any frame from: {path_to_input_video}")
    shape = frame.shape

    # Reuse the source frame rate so the output plays at the original speed;
    # fall back to 30 when the backend cannot report it (0, negative or NaN).
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not (fps > 0):
        fps = 30

    fourcc = cv2.VideoWriter_fourcc(*'H264')
    writer = cv2.VideoWriter(path_to_output_video, fourcc, fps, (shape[1], shape[0] + 50))
    if not writer.isOpened():
        # Without this check a missing codec silently produces no output file.
        raise RuntimeError(f"Could not open output video for writing: {path_to_output_video}")

    tensors_list = []
    # Seeded with the placeholder so there is always a "previous" prediction.
    prediction_list = ["---"]

    frame_counter = 0
    while frame is not None:
        frame_counter += 1
        if frame_counter == frame_interval:
            # Preprocess: BGR -> RGB, letterbox to 224x224, normalise, HWC -> CHW.
            image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = resize(image, (224, 224))
            image = (image - mean) / std
            image = np.transpose(image, [2, 0, 1])
            tensors_list.append(image)

            if len(tensors_list) == window_size:
                # Stack frames along a new axis 1 and add two leading axes:
                # (3, T, 224, 224) -> (1, 1, 3, T, 224, 224).
                input_tensor = np.stack(tensors_list, axis=1)[None][None]
                outputs = session.run(output_names, {input_name: input_tensor.astype(np.float32)})[0]
                gloss = str(classes[outputs.argmax()])
                # Accept only confident predictions that are neither the
                # placeholder nor a repeat of the last accepted gloss.
                if outputs.max() > threshold and gloss != "---" and gloss != prediction_list[-1]:
                    prediction_list.append(gloss)
                tensors_list.clear()
            frame_counter = 0

        text = "  ".join(prediction_list)
        text_div = np.zeros((50, frame.shape[1], 3), dtype=np.uint8)
        cv2.putText(text_div, text, (10, 30), cv2.FONT_HERSHEY_COMPLEX, 0.7, (255, 255, 255), 2)

        writer.write(np.concatenate((frame, text_div), axis=0))
        _, frame = cap.read()
finally:
    # Release both handles even if inference or writing raised.
    if writer is not None:
        writer.release()
    cap.release()

In [6]:

from IPython.display import display, HTML

In [7]:

# Embed the annotated video in the notebook output. <video> is not a void
# element, so it is closed with an explicit end tag rather than "/>".
video_tag = f'<video controls src="{path_to_output_video}" width="640" height="480"></video>'
display(HTML(video_tag))

In [ ]: