#!/usr/bin/env python
# coding: utf-8

# # Line-level text detection with Surya
#
# [![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/surya-line-level-text-detection/surya-line-level-text-detection.ipynb)
#
# In this tutorial we will perform line-level text detection using the [Surya](https://github.com/VikParuchuri/surya) toolkit and OpenVINO.
#
# ![line-level text detection](https://github.com/VikParuchuri/surya/blob/master/static/images/excerpt.png?raw=true)
#
# [*image source*](https://github.com/VikParuchuri/surya)
#
# The model used for line-level text detection is based on [SegFormer](https://arxiv.org/pdf/2105.15203.pdf). It has the following characteristics:
# * It is specialized for document OCR. It will likely not work on photos or other images.
# * It is for printed text, not handwriting.
# * It has been trained to ignore advertisements.
# * Languages with very different character sets may not work well.
#
#
# #### Table of contents:
#
# - [Fetch test image](#Fetch-test-image)
# - [Run PyTorch inference](#Run-PyTorch-inference)
# - [Convert model to OpenVINO Intermediate Representation (IR) format](#Convert-model-to-OpenVINO-Intermediate-Representation-(IR)-format)
# - [Run OpenVINO model](#Run-OpenVINO-model)
# - [Apply post-training quantization using NNCF](#Apply-post-training-quantization-using-NNCF)
#     - [Prepare dataset](#Prepare-dataset)
#     - [Quantize model](#Quantize-model)
# - [Run quantized OpenVINO model](#Run-quantized-OpenVINO-model)
# - [Interactive inference](#Interactive-inference)
#
#
# ### Installation Instructions
#
# This is a self-contained example that relies solely on its own code.
#
# We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
# For details, please refer to the [Installation Guide](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/README.md#-installation-guide).

# ## Fetch test image
# [back to top ⬆️](#Table-of-contents:)
#
# We will use an image from a randomly sampled subset of the [DocLayNet](https://github.com/DS4SD/DocLayNet) dataset.

# In[ ]:


import os

os.environ["GIT_CLONE_PROTECTION_ACTIVE"] = "false"

get_ipython().run_line_magic('pip', 'install -q "openvino>=2024.2.0" "nncf>=2.11.0"')
get_ipython().run_line_magic('pip', 'install -q --extra-index-url https://download.pytorch.org/whl/cpu "surya-ocr==0.4.0" torch datasets "gradio>=4.19" Pillow')


# In[2]:


from datasets import load_dataset
import requests
from pathlib import Path

if not Path("notebook_utils.py").exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
    )
    open("notebook_utils.py", "w").write(r.text)

# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry
from notebook_utils import collect_telemetry

collect_telemetry("surya-line-level-text-detection.ipynb")


def fetch_image():
    dataset = load_dataset("vikp/doclaynet_bench", split="train", streaming=True)
    return next(iter(dataset))["image"]


test_image = fetch_image()
test_image


# ## Run PyTorch inference
# [back to top ⬆️](#Table-of-contents:)
#
# To perform line-level text detection we will use the `load_model` and `load_processor` functions from the `surya` package. We will also use the `batch_text_detection` function, which performs pre- and post-processing.

# In[3]:


# Predictions visualization function
from PIL import ImageDraw


def visualize_prediction(image, prediction):
    image = image.copy()
    draw = ImageDraw.Draw(image)
    for polygon_box in prediction.bboxes:
        draw.rectangle(polygon_box.bbox, width=1, outline="red")

    display(image)


# In[4]:


from surya.detection import batch_text_detection
from surya.model.detection.segformer import load_model, load_processor

model, processor = load_model(), load_processor()

predictions = batch_text_detection([test_image], model, processor)

visualize_prediction(test_image, predictions[0])


# ## Convert model to OpenVINO Intermediate Representation (IR) format
# [back to top ⬆️](#Table-of-contents:)
#
# For best results with OpenVINO, it is recommended to convert the model to OpenVINO IR format. OpenVINO supports PyTorch models via the model conversion API.
# To convert the PyTorch model to OpenVINO IR format we will use `ov.convert_model` from the [model conversion API](https://docs.openvino.ai/2024/openvino-workflow/model-preparation.html). The `ov.convert_model` Python function returns an OpenVINO Model object ready to be loaded on a device and used for making predictions.
#
# `ov.convert_model` requires a sample of the original model input. We will use the image pre-processing from the `surya` package to prepare an example input.

# In[5]:


# Build example input
from surya.input.processing import prepare_image
import torch


def build_example_input(image, processor):
    input_values = prepare_image(image.convert("RGB"), processor)

    return {"pixel_values": torch.unsqueeze(input_values, 0)}


example_input = build_example_input(test_image, processor)


# In[6]:


# Convert model
import openvino as ov
from pathlib import Path

ov_model = ov.convert_model(model, example_input=example_input)

FP_MODEL_PATH = Path("model.xml")
INT8_MODEL_PATH = Path("int8_model.xml")

ov.save_model(ov_model, FP_MODEL_PATH)


# ## Run OpenVINO model
# [back to top ⬆️](#Table-of-contents:)
#
# Select a device from the dropdown list for running inference with OpenVINO.

# In[7]:


from notebook_utils import device_widget

device = device_widget()

device


# We want to reuse the results post-processing implemented in the `batch_text_detection` function. To do that, we implement a simple wrapper for the OpenVINO model that exposes the interface `batch_text_detection` expects.

# In[8]:


core = ov.Core()

# Compile OpenVINO model for loading on device
compiled_ov_model = core.compile_model(ov_model, device.value)


class OVModelWrapperResult:
    def __init__(self, logits):
        self.logits = logits


class OVModelWrapper:
    dtype = torch.float32
    device = model.device
    config = model.config

    def __init__(self, ov_model) -> None:
        self.ov_model = ov_model

    def __call__(self, **kwargs):
        # Run inference on preprocessed data and wrap the detection logits
        # to mimic the PyTorch model output
        logits = self.ov_model(kwargs)[0]
        return OVModelWrapperResult(torch.from_numpy(logits))


ov_model_wrapper = OVModelWrapper(compiled_ov_model)

ov_predictions = batch_text_detection([test_image], ov_model_wrapper, processor)

visualize_prediction(test_image, ov_predictions[0])


# ## Apply post-training quantization using NNCF
# [back to top ⬆️](#Table-of-contents:)
#
# [NNCF](https://github.com/openvinotoolkit/nncf/) enables post-training quantization by adding quantization layers into the model graph and then using a subset of the training dataset to initialize the parameters of these additional quantization layers. The framework is designed so that modifications to your original training code are minor. Quantization is the simplest scenario and requires only a few modifications.
#
# The optimization process consists of the following steps:
#
# 1. Create a dataset for quantization.
# 2. Run `nncf.quantize` to obtain a quantized model.
#
# Please select below whether you would like to run quantization to improve model inference speed.
#
# > **NOTE**: Quantization is a time- and memory-consuming operation. Running the quantization code below may take a long time.

# In[9]:


from notebook_utils import quantization_widget

to_quantize = quantization_widget()

to_quantize


# In[10]:


import requests

r = requests.get(
    url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/skip_kernel_extension.py",
)
open("skip_kernel_extension.py", "w").write(r.text)

get_ipython().run_line_magic('load_ext', 'skip_kernel_extension')


# Free resources before quantization.

# In[11]:


import gc

del model
del ov_model
del compiled_ov_model
del ov_model_wrapper
gc.collect();


# ### Prepare dataset
# [back to top ⬆️](#Table-of-contents:)
#
# We create a calibration dataset from a randomly sampled set of images from [DocLayNet](https://github.com/DS4SD/DocLayNet).

# In[12]:


get_ipython().run_cell_magic('skip', 'not $to_quantize.value', '\nfrom surya.input.processing import split_image\n\n\ndef prepare_calibration_dataset(size=1, buffer_size=1):\n\n    def collate_fn(data):\n        image = data[0]["image"].convert("RGB")\n        image_splits, _ = split_image(image, processor)\n        image_splits = prepare_image(image_splits[0], processor)\n\n        return image_splits\n\n    dataset = load_dataset("vikp/doclaynet_bench", split="train", streaming=True)\n    train_dataset = dataset.shuffle(seed=42, buffer_size=buffer_size)\n    dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n\n    def prepare_calibration_data(dataloader, size):\n        data = []\n        counter = 0\n        for batch in dataloader:\n            if counter == size:\n                break\n            counter += 1\n            batch = batch.to(torch.float32)\n            batch = batch.to("cpu")\n            data.append({"pixel_values": torch.stack([batch])})\n        return data\n\n    return prepare_calibration_data(dataloader, size)\n\n\ncalibration_dataset = prepare_calibration_dataset()\n')


# ### Quantize model
# [back to top ⬆️](#Table-of-contents:)
#
# Create a quantized model from the `FP16` model.

# In[13]:


get_ipython().run_cell_magic('skip', 'not $to_quantize.value', '\nimport nncf\n\nquantized_ov_model = nncf.quantize(\n    model=core.read_model(FP_MODEL_PATH),\n    calibration_dataset=nncf.Dataset(calibration_dataset),\n    advanced_parameters=nncf.AdvancedQuantizationParameters(\n        activations_quantization_params=nncf.quantization.advanced_parameters.QuantizationParameters(per_channel=False)\n    ),\n)\n\nov.save_model(quantized_ov_model, INT8_MODEL_PATH)\n')


# ## Run quantized OpenVINO model
# [back to top ⬆️](#Table-of-contents:)
#
# Now we are ready to detect lines with the `int8` OpenVINO model.

# In[14]:


get_ipython().run_cell_magic('skip', 'not $to_quantize.value', '\n# Compile OpenVINO model for loading on device\ncompiled_int8_ov_model = core.compile_model(quantized_ov_model, device.value)\n\nint8_ov_model_wrapper = OVModelWrapper(compiled_int8_ov_model)\n\nint8_ov_predictions = batch_text_detection([test_image], int8_ov_model_wrapper, processor)\n\nvisualize_prediction(test_image, int8_ov_predictions[0])\n')


# ## Interactive inference
# [back to top ⬆️](#Table-of-contents:)
#
# Now, it is your turn! Feel free to upload an image using the file upload window.
#
# Below you can select which model to run: original or quantized.

# In[15]:


from pathlib import Path
import ipywidgets as widgets

quantized_model_present = Path(INT8_MODEL_PATH).exists()

use_quantized_model = widgets.Checkbox(
    value=True if quantized_model_present else False,
    description="Use quantized model",
    disabled=not quantized_model_present,
)

use_quantized_model


# In[ ]:


compiled_model = ov.compile_model(INT8_MODEL_PATH if use_quantized_model.value else FP_MODEL_PATH, device.value)

ov_model = OVModelWrapper(compiled_model)

if not Path("gradio_helper.py").exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/surya-line-level-text-detection/gradio_helper.py"
    )
    open("gradio_helper.py", "w").write(r.text)

from gradio_helper import make_demo

demo = make_demo(ov_model, processor, test_image)

try:
    demo.launch(debug=True, height=1000)
except Exception:
    demo.launch(share=True, debug=True, height=1000)
# If you are launching remotely, specify server_name and server_port
# EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')`
# To learn more please refer to the Gradio docs: https://gradio.app/docs/
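
# Quantization is applied above to improve inference speed. As an optional sanity check, the cell below is a minimal latency-comparison sketch (not part of the original workflow): it reuses `core`, `device`, and `example_input` defined earlier, assumes the saved IR files exist on disk, and averages a handful of synchronous requests rather than performing a rigorous benchmark. Run it after stopping the Gradio demo above, since `demo.launch(debug=True)` blocks the kernel.

# In[ ]:


import time


def measure_latency(model_path, n_iter=10):
    # Compile the saved IR and average the latency of several synchronous requests
    compiled = core.compile_model(model_path, device.value)
    compiled(example_input)  # warm-up run, excluded from timing
    start = time.perf_counter()
    for _ in range(n_iter):
        compiled(example_input)
    return (time.perf_counter() - start) / n_iter


print(f"Original model latency: {measure_latency(FP_MODEL_PATH):.3f} s per image")
if Path(INT8_MODEL_PATH).exists():
    print(f"Quantized model latency: {measure_latency(INT8_MODEL_PATH):.3f} s per image")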