#!/usr/bin/env python
# coding: utf-8
# # Monodepth Estimation with OpenVINO
#
# This tutorial demonstrates Monocular Depth Estimation with MidasNet in OpenVINO. Model information can be found [here](https://github.com/openvinotoolkit/open_model_zoo/blob/master/models/public/midasnet/README.md).
#
# 
#
# ### What is Monodepth?
# Monocular Depth Estimation is the task of estimating scene depth using a single image. It has many potential applications in robotics, 3D reconstruction, medical imaging and autonomous systems. This tutorial uses a neural network model called [MiDaS](https://github.com/intel-isl/MiDaS), which was developed by the [Embodied AI Foundation](https://www.embodiedaifoundation.org/). See the research paper below to learn more.
#
# R. Ranftl, K. Lasinger, D. Hafner, K. Schindler and V. Koltun, ["Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer,"](https://ieeexplore.ieee.org/document/9178977) in IEEE Transactions on Pattern Analysis and Machine Intelligence, doi: `10.1109/TPAMI.2020.3019967`.
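#
# Note that MiDaS predicts *relative* (inverse) depth: the raw network output is a disparity-like map with no absolute scale. For display, this notebook later min-max normalizes the result to the range [0, 1] before applying a colormap: `d_norm = (d - d_min) / (d_max - d_min)`.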
#
#
# #### Table of contents:
#
# - [Preparation](#Preparation)
# - [Install requirements](#Install-requirements)
# - [Imports](#Imports)
# - [Download the model](#Download-the-model)
# - [Functions](#Functions)
# - [Select inference device](#Select-inference-device)
# - [Load the Model](#Load-the-Model)
# - [Monodepth on Image](#Monodepth-on-Image)
# - [Load, resize and reshape input image](#Load,-resize-and-reshape-input-image)
# - [Do inference on the image](#Do-inference-on-the-image)
# - [Display monodepth image](#Display-monodepth-image)
# - [Monodepth on Video](#Monodepth-on-Video)
# - [Video Settings](#Video-Settings)
# - [Load the Video](#Load-the-Video)
# - [Do Inference on a Video and Create Monodepth Video](#Do-Inference-on-a-Video-and-Create-Monodepth-Video)
# - [Display Monodepth Video](#Display-Monodepth-Video)
#
#
# ### Installation Instructions
#
# This is a self-contained example that relies solely on its own code.
#
# We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
# For details, please refer to [Installation Guide](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/README.md#-installation-guide).
#
#
#
# ## Preparation
# [back to top ⬆️](#Table-of-contents:)
#
# ### Install requirements
# [back to top ⬆️](#Table-of-contents:)
#
# In[ ]:
get_ipython().run_line_magic('pip', 'install -q "openvino>=2023.1.0"')
get_ipython().run_line_magic('pip', 'install -q opencv-python requests tqdm')
get_ipython().run_line_magic('pip', 'install -q "matplotlib>=3.4"')
# Fetch `notebook_utils` module
import requests
from pathlib import Path
if not Path("notebook_utils.py").exists():
r = requests.get(
url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
)
open("notebook_utils.py", "w").write(r.text)
# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry
from notebook_utils import collect_telemetry
collect_telemetry("vision-monodepth.ipynb")
# ### Imports
# [back to top ⬆️](#Table-of-contents:)
#
# In[ ]:
import time
import cv2
import matplotlib.cm
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import (
HTML,
FileLink,
Pretty,
ProgressBar,
Video,
clear_output,
display,
)
import openvino as ov
from notebook_utils import download_file, load_image, device_widget
# ### Download the model
# [back to top ⬆️](#Table-of-contents:)
#
# The model is in the [OpenVINO Intermediate Representation (IR)](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) format.
# In[ ]:
model_folder = Path("model")
ir_model_url = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/models/depth-estimation-midas/FP32/"
ir_model_name_xml = "MiDaS_small.xml"
ir_model_name_bin = "MiDaS_small.bin"
download_file(ir_model_url + ir_model_name_xml, filename=ir_model_name_xml, directory=model_folder)
download_file(ir_model_url + ir_model_name_bin, filename=ir_model_name_bin, directory=model_folder)
model_xml_path = model_folder / ir_model_name_xml
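#
# As an optional, illustrative check, the cell below confirms that both IR files (the `.xml` topology and the `.bin` weights) are on disk before the model is loaded.
# In[ ]:
# Optional sanity check (illustrative): list the downloaded IR files and their sizes.
for model_file in (model_folder / ir_model_name_xml, model_folder / ir_model_name_bin):
    status = f"{model_file.stat().st_size} bytes" if model_file.exists() else "MISSING"
    print(f"{model_file}: {status}")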
# ## Functions
# [back to top ⬆️](#Table-of-contents:)
#
# In[4]:
def normalize_minmax(data):
"""Normalizes the values in `data` between 0 and 1"""
return (data - data.min()) / (data.max() - data.min())
def convert_result_to_image(result, colormap="viridis"):
"""
Convert network result of floating point numbers to an RGB image with
integer values from 0-255 by applying a colormap.
`result` is expected to be a single network result in 1,H,W shape
`colormap` is a matplotlib colormap.
See https://matplotlib.org/stable/tutorials/colors/colormaps.html
"""
    # Use the colormap registry; `matplotlib.cm.get_cmap` is deprecated in recent matplotlib releases.
    cmap = matplotlib.colormaps[colormap]
result = result.squeeze(0)
result = normalize_minmax(result)
result = cmap(result)[:, :, :3] * 255
result = result.astype(np.uint8)
return result
def to_rgb(image_data) -> np.ndarray:
"""
Convert image_data from BGR to RGB
"""
return cv2.cvtColor(image_data, cv2.COLOR_BGR2RGB)
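#
# As a quick, illustrative test of the helpers above, the cell below runs `convert_result_to_image` on a random array with the expected (1, H, W) layout and prints the output shape and dtype.
# In[ ]:
# Illustrative example: a fake "network result" with shape (1, H, W).
fake_result = np.random.rand(1, 96, 128).astype(np.float32)
demo_image = convert_result_to_image(fake_result)
print(demo_image.shape, demo_image.dtype)  # expected: (96, 128, 3) uint8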
# ## Select inference device
# [back to top ⬆️](#Table-of-contents:)
#
# Select a device from the dropdown list to run inference using OpenVINO.
# In[6]:
device = device_widget()
device
# ## Load the Model
# [back to top ⬆️](#Table-of-contents:)
#
# Load the model in OpenVINO Runtime with `core.read_model` and compile it for the specified device with `core.compile_model`. Get input and output keys and the expected input shape for the model.
# In[7]:
import openvino.properties as props
# Create cache folder
cache_folder = Path("cache")
cache_folder.mkdir(exist_ok=True)
core = ov.Core()
core.set_property({props.cache_dir(): cache_folder})
model = core.read_model(model_xml_path)
compiled_model = core.compile_model(model=model, device_name=device.value)
input_key = compiled_model.input(0)
output_key = compiled_model.output(0)
network_input_shape = list(input_key.shape)
network_image_height, network_image_width = network_input_shape[2:]
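#
# Optionally, print the input and output names and shapes to verify what the compiled model expects (an illustrative check; the height and width are also used in the preprocessing steps below).
# In[ ]:
# Illustrative check: MiDaS_small takes a single NCHW input and produces one output.
print(f"Input '{input_key.any_name}' shape: {network_input_shape}")
print(f"Output '{output_key.any_name}' shape: {list(output_key.shape)}")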
# ## Monodepth on Image
# [back to top ⬆️](#Table-of-contents:)
#
# ### Load, resize and reshape input image
# [back to top ⬆️](#Table-of-contents:)
#
# The input image is read with OpenCV, resized to network input size, and reshaped to (N,C,H,W) (N=number of images, C=number of channels, H=height, W=width).
# In[ ]:
IMAGE_URL = "https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/image/coco_bike.jpg"
IMAGE_NAME = "coco_bike.jpg"
image = load_image(IMAGE_NAME, IMAGE_URL)
# Resize the image to the network input shape. Note that `cv2.resize` expects `dsize` as (width, height).
resized_image = cv2.resize(src=image, dsize=(network_image_width, network_image_height))
# Reshape the image to network input shape NCHW.
input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0)
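#
# As an illustrative sanity check, the preprocessed array should now match the network input shape exactly.
# In[ ]:
# Illustrative check: the reshaped image must have the NCHW shape the model expects.
assert list(input_image.shape) == network_input_shape, f"Unexpected input shape: {input_image.shape}"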
# ### Do inference on the image
# [back to top ⬆️](#Table-of-contents:)
#
# Do inference, convert the result to an image, and resize it to the original image shape.
# In[9]:
result = compiled_model([input_image])[output_key]
# Convert the network's disparity-map result to an image that shows
# distance as colors.
result_image = convert_result_to_image(result=result)
# Resize back to original image shape. The `cv2.resize` function expects shape
# in (width, height), [::-1] reverses the (height, width) shape to match this.
result_image = cv2.resize(result_image, image.shape[:2][::-1])
# ### Display monodepth image
# [back to top ⬆️](#Table-of-contents:)
#
# In[10]:
fig, ax = plt.subplots(1, 2, figsize=(20, 15))
ax[0].imshow(to_rgb(image))
ax[1].imshow(result_image);
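#
# If you want to keep the result, the cell below (illustrative, with a hypothetical output filename) writes the colorized depth map to disk. OpenCV expects BGR channel order, so the RGB result is converted before saving.
# In[ ]:
# Optional: save the colorized depth map (hypothetical output path).
result_image_path = Path("monodepth_result.png")
cv2.imwrite(str(result_image_path), cv2.cvtColor(result_image, cv2.COLOR_RGB2BGR))
print(f"Result saved to {result_image_path.resolve()}")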
# ## Monodepth on Video
# [back to top ⬆️](#Table-of-contents:)
#
# By default, only the first 4 seconds are processed in order to quickly check that everything works. Change `NUM_SECONDS` in the cell below to modify this. Set `NUM_SECONDS` to 0 to process the whole video.
# ### Video Settings
# [back to top ⬆️](#Table-of-contents:)
#
# In[ ]:
# Video source: https://www.youtube.com/watch?v=fu1xcQdJRws (Public Domain)
VIDEO_FILE = "Coco-Walking-in-Berkeley.mp4"
download_file(
"https://storage.openvinotoolkit.org/repositories/openvino_notebooks/data/data/video/Coco%20Walking%20in%20Berkeley.mp4",
VIDEO_FILE,
)
# Number of seconds of input video to process. Set `NUM_SECONDS` to 0 to process
# the full video.
NUM_SECONDS = 4
# Set `ADVANCE_FRAMES` to 1 to process every frame from the input video
# Set `ADVANCE_FRAMES` to 2 to process every second frame. This reduces
# the time it takes to process the video.
ADVANCE_FRAMES = 2
# Set `SCALE_OUTPUT` to reduce the size of the result video
# If `SCALE_OUTPUT` is 0.5, the width and height of the result video
# will be half the width and height of the input video.
SCALE_OUTPUT = 0.5
# The format to use for video encoding. The `vp09` codec is slow,
# but it works on most systems.
# Try the `THEO` encoding if you have FFMPEG installed.
# FOURCC = cv2.VideoWriter_fourcc(*"THEO")
FOURCC = cv2.VideoWriter_fourcc(*"vp09")
# Create Path objects for the input video and the result video.
output_directory = Path("output")
output_directory.mkdir(exist_ok=True)
result_video_path = output_directory / f"{Path(VIDEO_FILE).stem}_monodepth.mp4"
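#
# If the `vp09` codec is not available in your OpenCV build, a commonly available alternative (illustrative, not part of the original settings) is `mp4v`:
# In[ ]:
# Illustrative fallback: uncomment to use the `mp4v` codec instead of `vp09`.
# FOURCC = cv2.VideoWriter_fourcc(*"mp4v")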
# ### Load the Video
# [back to top ⬆️](#Table-of-contents:)
#
# Load the video from `VIDEO_FILE`, set in the *Video Settings* cell above. Open the video to read the frame width, height, and FPS, and compute the corresponding values for the monodepth video.
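#
# For example (hypothetical numbers): a 1920x1080 input at 30 fps with `ADVANCE_FRAMES = 2` and `SCALE_OUTPUT = 0.5` produces a monodepth video of 960x540 at 15 fps.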
# In[12]:
cap = cv2.VideoCapture(str(VIDEO_FILE))
ret, image = cap.read()
if not ret:
raise ValueError(f"The video at {VIDEO_FILE} cannot be read.")
input_fps = cap.get(cv2.CAP_PROP_FPS)
input_video_frame_height, input_video_frame_width = image.shape[:2]
target_fps = input_fps / ADVANCE_FRAMES
target_frame_height = int(input_video_frame_height * SCALE_OUTPUT)
target_frame_width = int(input_video_frame_width * SCALE_OUTPUT)
cap.release()
print(f"The input video has a frame width of {input_video_frame_width}, " f"frame height of {input_video_frame_height} and runs at {input_fps:.2f} fps")
print(
"The monodepth video will be scaled with a factor "
f"{SCALE_OUTPUT}, have width {target_frame_width}, "
f" height {target_frame_height}, and run at {target_fps:.2f} fps"
)
# ### Do Inference on a Video and Create Monodepth Video
# [back to top ⬆️](#Table-of-contents:)
#
# In[13]:
# Initialize variables.
input_video_frame_nr = 0
start_time = time.perf_counter()
total_inference_duration = 0
# Open the input video
cap = cv2.VideoCapture(str(VIDEO_FILE))
# Create a result video.
out_video = cv2.VideoWriter(
str(result_video_path),
FOURCC,
target_fps,
(target_frame_width * 2, target_frame_height),
)
num_frames = int(NUM_SECONDS * input_fps)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) if num_frames == 0 else num_frames
progress_bar = ProgressBar(total=total_frames)
progress_bar.display()
try:
while cap.isOpened():
ret, image = cap.read()
if not ret:
cap.release()
break
if input_video_frame_nr >= total_frames:
break
        # Only every `ADVANCE_FRAMES`-th frame is processed; the skipping itself
        # happens at the end of the loop by advancing the capture position.
# Prepare a frame for inference.
        # Resize the frame to the network input shape; `cv2.resize` expects `dsize` as (width, height).
        resized_image = cv2.resize(src=image, dsize=(network_image_width, network_image_height))
# Reshape the image to network input shape NCHW.
input_image = np.expand_dims(np.transpose(resized_image, (2, 0, 1)), 0)
# Do inference.
inference_start_time = time.perf_counter()
result = compiled_model([input_image])[output_key]
inference_stop_time = time.perf_counter()
inference_duration = inference_stop_time - inference_start_time
total_inference_duration += inference_duration
if input_video_frame_nr % (10 * ADVANCE_FRAMES) == 0:
clear_output(wait=True)
progress_bar.display()
            # input_video_frame_nr // ADVANCE_FRAMES gives the number of
            # frames that have been processed by the network.
display(
Pretty(
f"Processed frame {input_video_frame_nr // ADVANCE_FRAMES}"
f"/{total_frames // ADVANCE_FRAMES}. "
f"Inference time per frame: {inference_duration:.2f} seconds "
f"({1/inference_duration:.2f} FPS)"
)
)
# Transform the network result to a RGB image.
result_frame = to_rgb(convert_result_to_image(result))
# Resize the image and the result to a target frame shape.
result_frame = cv2.resize(result_frame, (target_frame_width, target_frame_height))
image = cv2.resize(image, (target_frame_width, target_frame_height))
# Put the image and the result side by side.
stacked_frame = np.hstack((image, result_frame))
# Save a frame to the video.
out_video.write(stacked_frame)
input_video_frame_nr = input_video_frame_nr + ADVANCE_FRAMES
        # Jump ahead by ADVANCE_FRAMES frames in the input video.
        cap.set(cv2.CAP_PROP_POS_FRAMES, input_video_frame_nr)
progress_bar.progress = input_video_frame_nr
progress_bar.update()
except KeyboardInterrupt:
print("Processing interrupted.")
finally:
clear_output()
    # Count the frames that were actually processed (also correct when NUM_SECONDS is 0).
    processed_frames = input_video_frame_nr // ADVANCE_FRAMES
out_video.release()
cap.release()
end_time = time.perf_counter()
duration = end_time - start_time
print(
f"Processed {processed_frames} frames in {duration:.2f} seconds. "
f"Total FPS (including video processing): {processed_frames/duration:.2f}."
f"Inference FPS: {processed_frames/total_inference_duration:.2f} "
)
print(f"Monodepth Video saved to '{str(result_video_path)}'.")
# ### Display Monodepth Video
# [back to top ⬆️](#Table-of-contents:)
#
# In[14]:
video = Video(result_video_path, width=800, embed=True)
if not result_video_path.exists():
plt.imshow(stacked_frame)
raise ValueError("OpenCV was unable to write the video file. Showing one video frame.")
else:
print(f"Showing monodepth video saved at\n{result_video_path.resolve()}")
print("If you cannot see the video in your browser, please click on the " "following link to download the video ")
video_link = FileLink(result_video_path)
video_link.html_link_str = "%s"
display(HTML(video_link._repr_html_()))
display(video)