#!/usr/bin/env python # coding: utf-8 # In[1]: from IPython import display import sys sys.path.append("../") import onnxruntime as ort import numpy as np import cv2 from PIL import Image from constants import classes # In[2]: path_to_model = "../mvit16-1.onnx" path_to_input_video = "f17a6060-6ced-4bd1-9886-8578cfbb864f.mp4" path_to_output_video = "output_onnx.mp4" # In[3]: session = ort.InferenceSession(path_to_model) input_name = session.get_inputs()[0].name input_shape = session.get_inputs()[0].shape window_size = input_shape[3] output_names = [output.name for output in session.get_outputs()] threshold = 0.5 frame_interval = 1 mean = [123.675, 116.28, 103.53] std = [58.395, 57.12, 57.375] # In[4]: def resize(im, new_shape=(224, 224)): """ Resize and pad image while preserving aspect ratio. Parameters ---------- im : np.ndarray Image to be resized. new_shape : Tuple[int] Size of the new image. Returns ------- np.ndarray Resized image. """ shape = im.shape[:2] # current shape [height, width] if isinstance(new_shape, int): new_shape = (new_shape, new_shape) # Scale ratio (new / old) r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) # Compute padding new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding dw /= 2 dh /= 2 if shape[::-1] != new_unpad: # resize im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)) # add border return im # In[5]: cap = cv2.VideoCapture(path_to_input_video) _,frame = cap.read() shape = frame.shape fourcc = cv2.VideoWriter_fourcc(*'H264') writer = cv2.VideoWriter(path_to_output_video, fourcc, 30, (frame.shape[1], frame.shape[0]+50)) tensors_list = [] prediction_list = [] prediction_list.append("---") frame_counter = 0 while True: _, frame = cap.read() if frame is None: break frame_counter += 1 if frame_counter == frame_interval: image = cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2RGB) image = resize(image, (224, 224)) image = (image - mean) / std image = np.transpose(image, [2, 0, 1]) tensors_list.append(image) if len(tensors_list) == window_size: input_tensor = np.stack(tensors_list[: window_size], axis=1)[None][None] outputs = session.run(output_names, {input_name: input_tensor.astype(np.float32)})[0] gloss = str(classes[outputs.argmax()]) if outputs.max() > threshold: if gloss != prediction_list[-1] and len(prediction_list): if gloss != "---": prediction_list.append(gloss) tensors_list.clear() frame_counter = 0 text = " ".join(prediction_list) text_div = np.zeros((50, frame.shape[1], 3), dtype=np.uint8) cv2.putText(text_div, text, (10, 30), cv2.FONT_HERSHEY_COMPLEX, 0.7, (255, 255, 255), 2) frame = np.concatenate((frame, text_div), axis=0) writer.write(frame) writer.release() cap.release() # In[6]: from IPython.display import display, HTML # In[7]: video_tag = f'