#!/usr/bin/env python # coding: utf-8 # # Document AI | ACCURACY at paragraph level with a Document Understanding layout model (Layout XLM base) fine-tuned on DocLayNet dataset # - Credit: # - notebook created from the notebook [Fine_tuning_LayoutXLM_on_XFUND_for_token_classification_using_HuggingFace_Trainer.ipynb](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutXLM/Fine_tuning_LayoutXLM_on_XFUND_for_token_classification_using_HuggingFace_Trainer.ipynb) # - dataset from IBM Research ([DocLayNet](https://github.com/DS4SD/DocLayNet)) # - Author of this notebook: [Pierre GUILLOU](https://www.linkedin.com/in/pierreguillou/) # - Date: 22/04/2023 # - Blog posts: # - Layout XLM base # - (03/31/2023) [Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level with LayoutXLM base](https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-3507af80573d) # - (03/25/2023) [Document AI | APP to compare the Document Understanding LiLT and LayoutXLM (base) models at line level](https://medium.com/@pierre_guillou/document-ai-app-to-compare-the-document-understanding-lilt-and-layoutxlm-base-models-at-line-1c53eb481a15) # - (03/05/2023) [Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base](https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-line-level-with-b08fdca5f4dc) # - LiLT base # - (02/16/2023) [Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level](https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-c18d16e53cf8) # - (02/14/2023) [Document AI | Inference APP for Document Understanding at line level](https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893) # - (02/10/2023) [Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset](https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8) # - (01/31/2023) [Document AI | DocLayNet image viewer APP](https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956) # - (01/27/2023) [Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)](https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb) # - Notebooks (paragraph level) # - Layout XLM base # - [Document AI | Inference at paragraph level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet dataset)](https://github.com/piegu/language-models/blob/master/inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb) # - [Document AI | Inference APP at paragraph level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet base dataset)](https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb) # - [Document AI | Fine-tune LayoutXLM base on DocLayNet base in any language at paragraph level (chunk of 512 tokens with 
overlap)](https://github.com/piegu/language-models/blob/master/Fine_tune_LayoutXLM_base_on_DocLayNet_base_in_any_language_at_paragraphlevel_ml_512.ipynb) # - LiLT base # - [Document AI | Inference APP at paragraph level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)](https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb) # - [Document AI | Inference at paragraph level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)](https://github.com/piegu/language-models/blob/master/inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb) # - [Document AI | Fine-tune LiLT on DocLayNet base in any language at paragraph level (chunk of 512 tokens with overlap)](https://github.com/piegu/language-models/blob/master/Fine_tune_LiLT_on_DocLayNet_base_in_any_language_at_paragraphlevel_ml_512.ipynb) # - Notebooks (line level) # - Layout XLM base # - [Document AI | Inference at line level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet dataset)](https://github.com/piegu/language-models/blob/master/inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb) # - [Document AI | Inference APP at line level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet base dataset)](https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb) # - [Document AI | Fine-tune LayoutXLM base on DocLayNet base in any language at line level (chunk of 384 tokens with overlap)](https://github.com/piegu/language-models/blob/master/Fine_tune_LayoutXLM_base_on_DocLayNet_base_in_any_language_at_linelevel_ml_384.ipynb) # - LiLT base # - [Document AI | Inference at line level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)](https://github.com/piegu/language-models/blob/master/inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb) # - [Document AI | Inference APP at line level with a Document Understanding model (LiLT fine-tuned on DocLayNet dataset)](https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LiLT_model_finetuned_on_DocLayNet_base_in_any_language_at_levellines_ml384.ipynb) # - [Document AI | Fine-tune LiLT on DocLayNet base in any language at line level (chunk of 384 tokens with overlap)](https://github.com/piegu/language-models/blob/master/Fine_tune_LiLT_on_DocLayNet_base_in_any_language_at_linelevel_ml_384.ipynb) # - [DocLayNet image viewer APP](https://github.com/piegu/language-models/blob/master/DocLayNet_image_viewer_APP.ipynb) # - [Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)](processing_DocLayNet_dataset_to_be_used_by_layout_models_of_HF_hub.ipynb)
# ## Inference at paragraph level
# ### LayoutXLM
# LayoutXLM was proposed in [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. # # It is a Document Understanding model that uses both layout and text to predict the label of each bounding box. Moreover, it is a **multilingual extension of the LayoutLMv2 model trained on 53 languages**.
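# Like LayoutLMv2, the model consumes token bounding boxes normalized to a 0-1000 coordinate grid. Below is a minimal, self-contained sketch of that normalization (it mirrors the `normalize_box` helper defined later in this notebook); the page size 595x842 is an arbitrary assumption for the example.
# In[ ]:

def normalize_box_example(bbox, width, height):
    # (x0, y0, x1, y1) in pixels -> 0-1000 scale
    return [int(1000 * bbox[0] / width), int(1000 * bbox[1] / height),
            int(1000 * bbox[2] / width), int(1000 * bbox[3] / height)]

page_width, page_height = 595, 842  # hypothetical page size in pixels
print(normalize_box_example([50, 100, 300, 140], page_width, page_height))
# [84, 118, 504, 166]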
# # It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, we run an OCR engine ourselves in this notebook ([PyTesseract](https://github.com/madmaze/pytesseract#python-tesseract)), as we would have to in real life to get the bounding boxes, then run LayoutXLM base (already fine-tuned on the DocLayNet dataset at paragraph level: [pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512](https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512)) on the individual tokens and visualize the result at paragraph level!
# ## Install Detectron 2, Pypdfium2, LangDetect & PyTesseract OCR
# ### Detectron 2
# In[ ]: get_ipython().run_cell_magic('capture', '', "!pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html\n!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'\n")
# ### Pypdfium2
# In[ ]: get_ipython().run_cell_magic('capture', '', '# !sudo apt-get install poppler-utils\n# !pip install pdf2image\n\n# source: https://levelup.gitconnected.com/4-python-libraries-to-convert-pdf-to-images-7a09eba83a09\n# source: https://pypi.org/project/pypdfium2/\n!pip install -U pypdfium2\n')
# ### LangDetect
# In[ ]: get_ipython().run_cell_magic('capture', '', '!pip install -q langdetect\n')
# ### Tesseract OCR
# In[ ]: get_ipython().run_cell_magic('capture', '', "!sudo apt install tesseract-ocr-all # english + osd (Orientation and script detection module)\n# !sudo apt-get install tesseract-ocr-por # portuguese\n\n# import os\n# print(os.popen(f'cat /etc/debian_version').read())\n# print(os.popen(f'cat /etc/issue').read())\n# print(os.popen(f'apt search tesseract').read())\n\n!pip install pytesseract\n")
# ## Set-up environment
# In[ ]: from google.colab import drive drive.mount('/content/drive', force_remount=True)
# ### Libraries
# In[ ]: get_ipython().system('pip install -q transformers sentencepiece datasets pypdf')
# In[ ]: import os from operator import itemgetter import collections import string import re import pypdf from pypdf import PdfReader from pypdf.errors import PdfReadError import pypdfium2 as pdfium import langdetect from langdetect import detect_langs import pytesseract import pandas as pd import numpy as np import random from google.colab import files import tempfile import matplotlib.pyplot as plt from matplotlib import font_manager from PIL import Image, ImageDraw, ImageFont font = ImageFont.load_default() import cv2 # In Colab, use cv2_imshow instead of cv2.imshow from google.colab.patches import cv2_imshow from IPython.display import display import itertools import pathlib from pathlib import Path import shutil from ipywidgets import widgets from IPython.display import display, HTML import transformers import datasets from datasets import concatenate_datasets
# ### Key parameters
# In[ ]: # model model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" # layout # model_id = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" # lilt if "layout" in model_id: # tokenizer tokenizer_id = "xlm-roberta-base"
# In[ ]: # category colors label2color = { 'Caption': 'brown', 'Footnote': 'orange', 'Formula': 'gray', 'List-item': 'yellow', 'Page-footer': 'red', 'Page-header': 'red', 'Picture': 'violet', 'Section-header': 'orange', 'Table': 'green', 'Text': 'blue', 'Title': 'pink' } domains = ["Financial Reports",
"Manuals", "Scientific Articles", "Laws & Regulations", "Patents", "Government Tenders"] domain_names = [domain_name.lower().replace(" ", "_").replace("&", "and") for domain_name in domains] # bounding boxes start and end of a sequence if "layout" in model_id: cls_box = [0, 0, 0, 0] sep_box = [1000, 1000, 1000, 1000] elif "lilt" in model_id: cls_box = [0, 0, 0, 0] sep_box = cls_box # DocLayNet dataset # dataset_name = "pierreguillou/DocLayNet-small" dataset_name = "pierreguillou/DocLayNet-base" dataset_name_suffix = dataset_name.replace("pierreguillou/DocLayNet-", "") # PAD token index label_pad_token_id = -100 # parameters de TrainingArguments batch_size=8 # WARNING: change this value according to your GPU RAM # (tokenization) The maximum length of a feature (sequence) if str(384) in model_id: max_length = 384 elif str(512) in model_id: max_length = 512 else: print("Error with max_length of chunks!") # (tokenization) overlap doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed. # ### Functions # #### General # In[ ]: # get text and bounding boxes from an image # https://stackoverflow.com/questions/61347755/how-can-i-get-line-coordinates-that-readed-by-tesseract # https://medium.com/geekculture/tesseract-ocr-understanding-the-contents-of-documents-beyond-their-text-a98704b7c655 def get_data_paragraph(results, factor, conf_min=0): data = {} for i in range(len(results['line_num'])): level = results['level'][i] block_num = results['block_num'][i] par_num = results['par_num'][i] line_num = results['line_num'][i] top, left = results['top'][i], results['left'][i] width, height = results['width'][i], results['height'][i] conf = results['conf'][i] text = results['text'][i] if not (text == '' or text.isspace()): if conf >= conf_min: tup = (text, left, top, width, height) if block_num in list(data.keys()): if par_num in list(data[block_num].keys()): if line_num in list(data[block_num][par_num].keys()): data[block_num][par_num][line_num].append(tup) else: data[block_num][par_num][line_num] = [tup] else: data[block_num][par_num] = {} data[block_num][par_num][line_num] = [tup] else: data[block_num] = {} data[block_num][par_num] = {} data[block_num][par_num][line_num] = [tup] # get paragraphs dicionnary with list of lines par_data = {} par_idx = 1 for _, b in data.items(): for _, p in b.items(): line_data = {} line_idx = 1 for _, l in p.items(): line_data[line_idx] = l line_idx += 1 par_data[par_idx] = line_data par_idx += 1 # get lines of texts, grouped by paragraph texts_pars = list() row_indexes = list() texts_lines = list() texts_lines_par = list() row_index = 0 for _,par in par_data.items(): count_lines = 0 lines_par = list() for _,line in par.items(): if count_lines == 0: row_indexes.append(row_index) line_text = ' '.join([item[0] for item in line]) texts_lines.append(line_text) lines_par.append(line_text) count_lines += 1 row_index += 1 # lines.append("\n") row_index += 1 texts_lines_par.append(lines_par) texts_pars.append(' '.join(lines_par)) # lines = lines[:-1] # get paragraphes boxes (par_boxes) # get lines boxes (line_boxes) par_boxes = list() par_idx = 1 line_boxes, lines_par_boxes = list(), list() line_idx = 1 for _, par in par_data.items(): xmins, ymins, xmaxs, ymaxs = list(), list(), list(), list() line_boxes_par = list() count_line_par = 0 for _, line in par.items(): xmin, ymin = line[0][1], line[0][2] xmax, ymax = (line[-1][1] + line[-1][3]), (line[-1][2] + line[-1][4]) line_boxes.append([int(xmin/factor), int(ymin/factor), 
int(xmax/factor), int(ymax/factor)]) line_boxes_par.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)]) xmins.append(xmin) ymins.append(ymin) xmaxs.append(xmax) ymaxs.append(ymax) line_idx += 1 count_line_par += 1 xmin, ymin, xmax, ymax = min(xmins), min(ymins), max(xmaxs), max(ymaxs) par_bbox = [int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)] par_boxes.append(par_bbox) lines_par_boxes.append(line_boxes_par) par_idx += 1 return texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes # rescale image to get 300dpi def set_image_dpi_resize(image): """ Rescaling image to 300dpi while resizing :param image: An image :return: A rescaled image """ length_x, width_y = image.size factor = min(1, float(1024.0 / length_x)) size = int(factor * length_x), int(factor * width_y) # image_resize = image.resize(size, Image.Resampling.LANCZOS) image_resize = image.resize(size, Image.LANCZOS) temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='1.png') temp_filename = temp_file.name image_resize.save(temp_filename, dpi=(300, 300)) return factor, temp_filename
# In[ ]: # it is important that each bounding box should be in (upper left, lower right) format. # source: https://github.com/NielsRogge/Transformers-Tutorials/issues/129 def upperleft_to_lowerright(bbox): x0, y0, x1, y1 = tuple(bbox) if bbox[2] < bbox[0]: x0 = bbox[2] x1 = bbox[0] if bbox[3] < bbox[1]: y0 = bbox[3] y1 = bbox[1] return [x0, y0, x1, y1] # convert bounding boxes from (left, top, width, height) format to (left, top, left+width, top+height) format. def convert_box(bbox): x, y, w, h = tuple(bbox) # the row comes in (left, top, width, height) format return [x, y, x+w, y+h] # we turn it into (left, top, left+width, top+height) to get the actual box # the model expects bounding boxes normalized to a 1000x1000 scale def normalize_box(bbox, width, height): return [ int(1000 * (bbox[0] / width)), int(1000 * (bbox[1] / height)), int(1000 * (bbox[2] / width)), int(1000 * (bbox[3] / height)), ] # convert normalized boxes (1000x1000 scale) back to the image size def denormalize_box(bbox, width, height): return [ int(width * (bbox[0] / 1000)), int(height * (bbox[1] / 1000)), int(width* (bbox[2] / 1000)), int(height * (bbox[3] / 1000)), ] # get back original size def original_box(box, original_width, original_height, coco_width, coco_height): return [ int(original_width * (box[0] / coco_width)), int(original_height * (box[1] / coco_height)), int(original_width * (box[2] / coco_width)), int(original_height* (box[3] / coco_height)), ] def get_blocks(bboxes_block, categories, texts): # get list of unique block boxes bbox_block_dict, bboxes_block_list, bbox_block_prec = dict(), list(), list() for count_block, bbox_block in enumerate(bboxes_block): if bbox_block != bbox_block_prec: bbox_block_indexes = [i for i, bbox in enumerate(bboxes_block) if bbox == bbox_block] bbox_block_dict[count_block] = bbox_block_indexes bboxes_block_list.append(bbox_block) bbox_block_prec = bbox_block # get list of categories and texts by unique block boxes category_block_list, text_block_list = list(), list() for bbox_block in bboxes_block_list: count_block = bboxes_block.index(bbox_block) bbox_block_indexes = bbox_block_dict[count_block] category_block = np.array(categories, dtype=object)[bbox_block_indexes].tolist()[0] category_block_list.append(category_block) text_block = np.array(texts, dtype=object)[bbox_block_indexes].tolist() text_block = [text.replace("\n","").strip() for text in text_block] if id2label[category_block] ==
"Text" or id2label[category_block] == "Caption" or id2label[category_block] == "Footnote": text_block = ' '.join(text_block) else: text_block = '\n'.join(text_block) text_block_list.append(text_block) return bboxes_block_list, category_block_list, text_block_list def get_blocks_evaluation(bboxes_block, categories, texts): # get list of unique block boxes bbox_block_dict, bboxes_block_list, bbox_block_prec = dict(), list(), list() for count_block, bbox_block in enumerate(bboxes_block): if bbox_block != bbox_block_prec: bbox_block_indexes = [i for i, bbox in enumerate(bboxes_block) if bbox == bbox_block] bbox_block_dict[count_block] = bbox_block_indexes bboxes_block_list.append(bbox_block) bbox_block_prec = bbox_block # get list of categories and texts by unique block boxes category_block_list, texts_lines_list, text_block_list = list(), list(), list() for bbox_block in bboxes_block_list: count_block = bboxes_block.index(bbox_block) bbox_block_indexes = bbox_block_dict[count_block] category_block = np.array(categories, dtype=object)[bbox_block_indexes].tolist()[0] category_block_list.append(category_block) text_block = np.array(texts, dtype=object)[bbox_block_indexes].tolist() text_block = [text.replace("\n","").strip() for text in text_block] texts_lines_list.append(text_block) # list of text lines by block if id2label[category_block] == "Text" or id2label[category_block] == "Caption" or id2label[category_block] == "Footnote": text_block = ' '.join(text_block) else: text_block = '\n'.join(text_block) text_block_list.append(text_block) lines_par_boxes_list = [[bboxes_block[group[j]] for j in range(len(group))]for group in list(bbox_block_dict.values())] return bboxes_block_list, category_block_list, texts_lines_list, text_block_list, lines_par_boxes_list # function to sort bounding boxes def get_sorted_boxes(bboxes): # sort by y from page top to bottom sorted_bboxes = sorted(bboxes, key=itemgetter(1), reverse=False) y_list = [bbox[1] for bbox in sorted_bboxes] # sort by x from page left to right when boxes with same y if len(list(set(y_list))) != len(y_list): y_list_duplicates_indexes = dict() y_list_duplicates = [item for item, count in collections.Counter(y_list).items() if count > 1] for item in y_list_duplicates: y_list_duplicates_indexes[item] = [i for i, e in enumerate(y_list) if e == item] bbox_list_y_duplicates = sorted(np.array(sorted_bboxes, dtype=object)[y_list_duplicates_indexes[item]].tolist(), key=itemgetter(0), reverse=False) np_array_bboxes = np.array(sorted_bboxes) np_array_bboxes[y_list_duplicates_indexes[item]] = np.array(bbox_list_y_duplicates) sorted_bboxes = np_array_bboxes.tolist() return sorted_bboxes # sort data from y = 0 to end of page (and after, x=0 to end of page when necessary) def sort_data(bboxes, categories, texts): sorted_bboxes = get_sorted_boxes(bboxes) sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes] sorted_categories = np.array(categories, dtype=object)[sorted_bboxes_indexes].tolist() sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist() return sorted_bboxes, sorted_categories, sorted_texts # sort data from y = 0 to end of page (and after, x=0 to end of page when necessary) def sort_data_wo_labels(bboxes, texts): sorted_bboxes = get_sorted_boxes(bboxes) sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes] sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist() return sorted_bboxes, sorted_texts # #### Dataset # In[ ]: # get PDF image and its data def 
# #### Dataset
# In[ ]: # get PDF image and its data def generate_annotated_image(index_image=None, split="all"): # get dataset example = dataset # get split if split == "all": example = concatenate_datasets([example["train"], example["validation"], example["test"]]) else: example = example[split] # get random image & PDF data if index_image is None: index_image = random.randint(0, len(example)-1) example = example[index_image] image = example["image"] # original image coco_width, coco_height = example["coco_width"], example["coco_height"] original_width, original_height = example["original_width"], example["original_height"] original_filename = example["original_filename"] page_no = example["page_no"] num_pages = example["num_pages"] # resize image to original image = image.resize((original_width, original_height)) # get corresponding annotations texts = example["texts"] bboxes_block = example["bboxes_block"] bboxes_line = example["bboxes_line"] categories = example["categories"] domain = example["doc_category"] # get domain name index_domain = domain_names.index(domain) domain = domains[index_domain] # convert boxes to original original_bboxes_block = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_block] original_bboxes_line = [original_box(convert_box(box), original_width, original_height, coco_width, coco_height) for box in bboxes_line] ##### block boxes ##### # get unique blocks and their data bboxes_blocks_list, category_block_list, text_block_list = get_blocks(original_bboxes_block, categories, texts) # sort data from y = 0 to end of page (and after, x=0 to end of page when necessary) sorted_original_bboxes_block_list, sorted_category_block_list, sorted_text_block_list = sort_data(bboxes_blocks_list, category_block_list, text_block_list) ##### line boxes #### # sort data from y = 0 to end of page (and after, x=0 to end of page when necessary) sorted_original_bboxes_line_list, sorted_category_line_list, sorted_text_line_list = sort_data(original_bboxes_line, categories, texts) # group paragraphs and lines outputs sorted_original_bboxes = [sorted_original_bboxes_block_list, sorted_original_bboxes_line_list] sorted_categories = [sorted_category_block_list, sorted_category_line_list] sorted_texts = [sorted_text_block_list, sorted_text_line_list] # get annotated bounding boxes on images images = [image.copy(), image.copy()] imgs, df_paragraphs, df_lines = dict(), pd.DataFrame(), pd.DataFrame() for i, img in enumerate(images): img = img.convert('RGB') # Convert to RGB draw = ImageDraw.Draw(img) for box, label_idx, text in zip(sorted_original_bboxes[i], sorted_categories[i], sorted_texts[i]): label = id2label[label_idx] color = label2color[label] draw.rectangle(box, outline=color) text = text.encode('latin-1', 'replace').decode('latin-1') # https://stackoverflow.com/questions/56761449/unicodeencodeerror-latin-1-codec-cant-encode-character-u2013-writing-to draw.text((box[0] + 10, box[1] - 10), text=label, fill=color, font=font) if i == 0: imgs["paragraphs"] = img df_paragraphs["paragraphs"] = list(range(len(sorted_original_bboxes_block_list))) df_paragraphs["categories"] = [id2label[label_idx] for label_idx in sorted_category_block_list] df_paragraphs["texts"] = sorted_text_block_list df_paragraphs["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_block_list] else: imgs["lines"] = img df_lines["lines"] = list(range(len(sorted_original_bboxes_line_list))) df_lines["categories"] = [id2label[label_idx] for label_idx in sorted_category_line_list] df_lines["texts"] =
sorted_text_line_list df_lines["bounding boxes"] = [str(bbox) for bbox in sorted_original_bboxes_line_list] return imgs, original_filename, page_no, num_pages, domain, df_paragraphs, df_lines
# In[ ]: # display PDF image and its data def display_pdf_blocks_lines(index_image=None, split="all"): # get image and image data images, original_filename, page_no, num_pages, domain, df_paragraphs, df_lines = generate_annotated_image(index_image=index_image, split=split) print(f"PDF: {original_filename} (page: {page_no+1} / {num_pages}; domain: {domain})\n") # left widget style1 = {'overflow': 'scroll' ,'white-space': 'nowrap', 'width':'50%'} output1 = widgets.Output(description = "PDF image with bounding boxes of paragraphs", style=style1) with output1: # display image print(">> PDF image with bounding boxes of paragraphs\n") open_cv_image = np.array(images["paragraphs"]) # PIL to cv2 # Convert RGB to BGR open_cv_image = open_cv_image[:, :, ::-1].copy() # cv2.imshow('',open_cv_image) # lambda cv2_imshow(open_cv_image) # Colab cv2.waitKey(0) # display DataFrame print("\n>> Paragraphs dataframe\n") display(df_paragraphs) # right widget style2 = style1 output2 = widgets.Output(description = "PDF image with bounding boxes of lines", style=style2) with output2: # display image print(">> PDF image with bounding boxes of lines\n") open_cv_image = np.array(images["lines"]) # PIL to cv2 # Convert RGB to BGR open_cv_image = open_cv_image[:, :, ::-1].copy() # cv2.imshow('',open_cv_image) # lambda cv2_imshow(open_cv_image) # Colab cv2.waitKey(0) # display DataFrame print("\n>> Lines dataframe\n") display(df_lines) ## Side by side thanks to HBox widgets sidebyside = widgets.HBox([output1,output2]) ## Finally, show. display(sidebyside)
# #### PDF processing
# In[ ]: # get filename and images of PDF pages def pdf_to_images(uploaded_pdf): # file name of the uploaded PDF filename = next(iter(uploaded_pdf)) try: PdfReader(filename) except PdfReadError: print("Invalid PDF file.") else: try: # images = convert_from_path(path_to_file, last_page=max_imgboxes) pdf = pdfium.PdfDocument(str(filename)) version = pdf.get_version() # get the PDF standard version n_pages = len(pdf) # get the number of pages in the document page_indices = [i for i in range(n_pages)] # pages until last_page images = list(pdf.render( pdfium.PdfBitmap.to_pil, page_indices = page_indices, scale = 300/72, # 300dpi resolution )) num_imgs = len(images) print(f'The PDF "{filename}" was converted into {num_imgs} images.') print("Now, you can extract data from these images (text, bounding boxes...).") except: print(f"Error with the PDF {filename}: it was not converted into images.") print() else: # display images if num_imgs > 0: import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') plt.figure(figsize=(20,10)) columns = 5 for i, image in enumerate(images): plt.subplot(int(num_imgs / columns + 1), columns, i + 1) plt.xticks(color="white") plt.yticks(color="white") plt.tick_params(bottom = False) plt.tick_params(left = False) plt.imshow(image) return filename, images
# In[ ]: # Extraction of image data (text and bounding boxes) def extraction_data_from_dataset(example, model_id): ids = example['id'] texts = example['texts'] bboxes_blocks = example['bboxes_block'] bboxes_lines = example['bboxes_line'] categories = example['categories'] images = example['image'] page_hashs = example['page_hash'] original_filenames = example['original_filename'] page_nos = example['page_no'] num_pages = example['num_pages'] original_widths =
example['original_width'] original_heights = example['original_height'] coco_widths = example['coco_width'] coco_heights = example['coco_height'] collections = example['collection'] doc_categorys = example['doc_category'] num_imgs = len(images) if num_imgs > 0: # results, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes, images_pixels = dict(), dict(), dict(), dict(), dict(), dict(), dict(), dict(), dict() if "layout" in model_id: images_ids_list, texts_lines_list, texts_pars_list, texts_lines_par_list, par_boxes_list, line_boxes_list, lines_par_boxes_list, images_list, images_pixels_list, page_no_list, num_pages_list, category_block_list = list(), list(), list(), list(), list(), list(), list(), list(), list(), list(), list(), list() elif "lilt" in model_id: images_ids_list, texts_lines_list, texts_pars_list, texts_lines_par_list, par_boxes_list, line_boxes_list, lines_par_boxes_list, images_list, page_no_list, num_pages_list, category_block_list = list(), list(), list(), list(), list(), list(), list(), list(), list(), list(), list() for i in range(num_imgs): images_ids_list.append(i) image = images[i].resize((original_widths[i], original_heights[i])) images_list.append(image) if "layout" in model_id: images_pixels = feature_extractor(image, return_tensors="pt").pixel_values images_pixels_list.append(images_pixels) page_no_list.append(1) num_pages_list.append(1) # convert boxes to original original_bboxes_block = [upperleft_to_lowerright(original_box(convert_box(box), original_widths[i], original_heights[i], coco_widths[i], coco_heights[i])) for box in bboxes_blocks[i]] # original_bboxes_line = [upperleft_to_lowerright(original_box(convert_box(box), original_widths[i], original_heights[i], coco_widths[i], coco_heights[i]) for box in bboxes_lines[i]] ##### block boxes ##### # get unique blocks and their data bboxes_blocks_list_img, category_block_list_img, texts_lines_par_list_img, text_block_list_img, lines_par_boxes_list_img = get_blocks_evaluation(original_bboxes_block, categories[i], texts[i]) # bboxes_block_list, category_block_list, texts_lines_list, text_block_list, lines_par_boxes_list # sort data from y = 0 to end of page (and after, x=0 to end of page when necessary) sorted_original_bboxes_block_list_img, sorted_category_block_list_img, sorted_text_block_list_img = sort_data(bboxes_blocks_list_img, category_block_list_img, text_block_list_img) # texts_lines_par_list.append(texts_lines_par_list_img) texts_pars_list.append(sorted_text_block_list_img) par_boxes_list.append(sorted_original_bboxes_block_list_img) # bboxes_block_list # lines_par_boxes_list.append(lines_par_boxes_list_img) category_block_list.append(sorted_category_block_list_img) # except: # print(f"There was an error within the extraction of PDF text by the OCR!") from datasets import Dataset if "layout" in model_id: dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "images_pixels": images_pixels_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts_par": texts_pars_list, "bboxes_par": par_boxes_list, "category_block_list": category_block_list}) elif "lilt" in model_id: dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts_par": texts_pars_list, "bboxes_par": par_boxes_list, "category_block_list": category_block_list}) # print(f"The text data was successfully extracted by the OCR!") return dataset, texts_pars_list, par_boxes_list, category_block_list
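# For reference, a minimal sketch of how `Dataset.from_dict` packs the per-image lists into a Hugging Face dataset, as done at the end of the extraction function above. The field names mirror the "lilt" branch; the values are toy placeholders (e.g. the label ids are assumptions for two "Text" blocks).
# In[ ]:

from datasets import Dataset

toy = Dataset.from_dict({
    "images_ids": [0],
    "page_no": [1],
    "num_pages": [1],
    "texts_par": [["A first paragraph.", "A second paragraph."]],
    "bboxes_par": [[[10, 10, 200, 40], [10, 60, 200, 90]]],
    "category_block_list": [[9, 9]],  # toy label ids
})
print(toy)
print(toy[0]["texts_par"])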
# #### Inference
# In[ ]: def prepare_inference_features_paragraph(example, model_id=model_id, cls_box = cls_box, sep_box = sep_box): if "layout" in model_id: images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list, cc_list, images_pixels_list = list(), list(), list(), list(), list(), list(), list() elif "lilt" in model_id: images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list, cc_list = list(), list(), list(), list(), list(), list() # get batch # batch_page_hash = example["page_hash"] batch_images_ids = example["images_ids"] batch_images = example["images"] if "layout" in model_id: batch_images_pixels = example["images_pixels"] batch_bboxes_par = example["bboxes_par"] batch_texts_par = example["texts_par"] batch_categories_blocks = example["category_block_list"] batch_images_size = [image.size for image in batch_images] batch_width, batch_height = [image_size[0] for image_size in batch_images_size], [image_size[1] for image_size in batch_images_size] # add a dimension if not a batch but only one image if not isinstance(batch_images_ids, list): batch_images_ids = [batch_images_ids] batch_images = [batch_images] if "layout" in model_id: batch_images_pixels = [batch_images_pixels] batch_bboxes_par = [batch_bboxes_par] batch_texts_par = [batch_texts_par] batch_categories_blocks = [batch_categories_blocks] batch_width, batch_height = [batch_width], [batch_height] # process all images of the batch if "layout" in model_id: for num_batch, (image_id, image_pixels, boxes, texts_par, width, height, categories_blocks) in enumerate(zip(batch_images_ids, batch_images_pixels, batch_bboxes_par, batch_texts_par, batch_width, batch_height, batch_categories_blocks)): tokens_list = [] bboxes_list = [] categories_blocks_list = [] # add a dimension if only one image if not isinstance(texts_par, list): texts_par, boxes, categories_blocks = [texts_par], [boxes], [categories_blocks] # normalize bboxes normalize_bboxes_par = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes] # sort boxes with texts # we want sorted lists from top to bottom of the image # boxes, texts_par = sort_data_wo_labels(normalize_bboxes_par, texts_par) boxes, categories_blocks, texts_par = sort_data(normalize_bboxes_par, categories_blocks, texts_par) count = 0 for box, category_block, text_par in zip(boxes, categories_blocks, texts_par): tokens_par = tokenizer.tokenize(text_par) num_tokens_par = len(tokens_par) # get number of tokens tokens_list.extend(tokens_par) bboxes_list.extend([box] * num_tokens_par) # number of boxes must be the same as the number of tokens categories_blocks_list.extend([category_block] * num_tokens_par) # use of return_overflowing_tokens=True / stride=doc_stride # to get parts of image with overlap # source: https://huggingface.co/course/chapter6/3b?fw=tf#handling-long-contexts encodings = tokenizer(" ".join(texts_par), truncation=True, padding="max_length", max_length=max_length, stride=doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True ) otsm = encodings.pop("overflow_to_sample_mapping") offset_mapping = encodings.pop("offset_mapping") # Let's label those examples and get their boxes sequence_length_prev = 0 for i, offsets in enumerate(offset_mapping): # truncate tokens, boxes and labels based on length of chunk - 2 (special tokens <s> and </s>) sequence_length = len(encodings.input_ids[i]) - 2 if i == 0: start = 0 else: start += sequence_length_prev - doc_stride end = start + sequence_length
sequence_length_prev = sequence_length # get tokens, boxes and labels of this image chunk bb = [cls_box] + bboxes_list[start:end] + [sep_box] cc = [-100] + categories_blocks_list[start:end] + [-100] # as the last chunk can have a length < max_length # we must add [tokenizer.pad_token] (tokens), [sep_box] (boxes) and [-100] (labels) if len(bb) < max_length: bb = bb + [sep_box] * (max_length - len(bb)) cc = cc + [-100] * (max_length - len(cc)) # append results input_ids_list.append(encodings["input_ids"][i]) attention_mask_list.append(encodings["attention_mask"][i]) bb_list.append(bb) cc_list.append(cc) images_ids_list.append(image_id) chunks_ids_list.append(i) images_pixels_list.append(image_pixels) return { "images_ids": images_ids_list, "chunk_ids": chunks_ids_list, "input_ids": input_ids_list, "attention_mask": attention_mask_list, "normalized_bboxes": bb_list, "images_pixels": images_pixels_list, "category_blocks": cc_list } elif "lilt" in model_id: for num_batch, (image_id, boxes, texts_par, width, height, categories_blocks) in enumerate(zip(batch_images_ids, batch_bboxes_par, batch_texts_par, batch_width, batch_height, batch_categories_blocks)): tokens_list = [] bboxes_list = [] categories_blocks_list = [] # add a dimension if only one image if not isinstance(texts_par, list): texts_par, boxes, categories_blocks = [texts_par], [boxes], [categories_blocks] # normalize bboxes normalize_bboxes_par = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes] # sort boxes with texts # we want sorted lists from top to bottom of the image # boxes, texts_par = sort_data_wo_labels(normalize_bboxes_par, texts_par) boxes, categories_blocks, texts_par = sort_data(normalize_bboxes_par, categories_blocks, texts_par) count = 0 for box, category_block, text_par in zip(boxes, categories_blocks, texts_par): tokens_par = tokenizer.tokenize(text_par) num_tokens_par = len(tokens_par) # get number of tokens tokens_list.extend(tokens_par) bboxes_list.extend([box] * num_tokens_par) # number of boxes must be the same as the number of tokens categories_blocks_list.extend([category_block] * num_tokens_par) # use of return_overflowing_tokens=True / stride=doc_stride # to get parts of image with overlap # source: https://huggingface.co/course/chapter6/3b?fw=tf#handling-long-contexts encodings = tokenizer(" ".join(texts_par), truncation=True, padding="max_length", max_length=max_length, stride=doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True ) otsm = encodings.pop("overflow_to_sample_mapping") offset_mapping = encodings.pop("offset_mapping") # Let's label those examples and get their boxes sequence_length_prev = 0 for i, offsets in enumerate(offset_mapping): # truncate tokens, boxes and labels based on length of chunk - 2 (special tokens <s> and </s>) sequence_length = len(encodings.input_ids[i]) - 2 if i == 0: start = 0 else: start += sequence_length_prev - doc_stride end = start + sequence_length sequence_length_prev = sequence_length # get tokens, boxes and labels of this image chunk bb = [cls_box] + bboxes_list[start:end] + [sep_box] cc = [-100] + categories_blocks_list[start:end] + [-100] # as the last chunk can have a length < max_length # we must add [tokenizer.pad_token] (tokens), [sep_box] (boxes) and [-100] (labels) if len(bb) < max_length: bb = bb + [sep_box] * (max_length - len(bb)) cc = cc + [-100] * (max_length - len(cc)) # append results input_ids_list.append(encodings["input_ids"][i]) attention_mask_list.append(encodings["attention_mask"][i])
bb_list.append(bb) cc_list.append(cc) images_ids_list.append(image_id) chunks_ids_list.append(i) return { "images_ids": images_ids_list, "chunk_ids": chunks_ids_list, "input_ids": input_ids_list, "attention_mask": attention_mask_list, "normalized_bboxes": bb_list, "category_blocks": cc_list }
# In[ ]: if "layout" in model_id: from torch.utils.data import Dataset class CustomDataset(Dataset): def __init__(self, dataset, tokenizer): self.dataset = dataset self.tokenizer = tokenizer def __len__(self): return len(self.dataset) def __getitem__(self, idx): # get item example = self.dataset[idx] encoding = dict() encoding["images_ids"] = example["images_ids"] encoding["chunk_ids"] = example["chunk_ids"] encoding["input_ids"] = example["input_ids"] encoding["attention_mask"] = example["attention_mask"] encoding["bbox"] = example["normalized_bboxes"] encoding["images_pixels"] = example["images_pixels"] encoding["category_blocks"] = example["category_blocks"] return encoding elif "lilt" in model_id: from torch.utils.data import Dataset class CustomDataset(Dataset): def __init__(self, dataset, tokenizer): self.dataset = dataset self.tokenizer = tokenizer def __len__(self): return len(self.dataset) def __getitem__(self, idx): # get item example = self.dataset[idx] encoding = dict() encoding["images_ids"] = example["images_ids"] encoding["chunk_ids"] = example["chunk_ids"] encoding["input_ids"] = example["input_ids"] encoding["attention_mask"] = example["attention_mask"] encoding["bbox"] = example["normalized_bboxes"] encoding["category_blocks"] = example["category_blocks"] return encoding
# In[ ]: import torch.nn.functional as F # get predictions at token level def predictions_token_level(images, custom_encoded_dataset, model_id): num_imgs = len(images) if num_imgs > 0: if "layout" in model_id: chunk_ids, input_ids, bboxes, pixels_values, outputs, token_predictions, categories_blocks = dict(), dict(), dict(), dict(), dict(), dict(), dict() elif "lilt" in model_id: chunk_ids, input_ids, bboxes, outputs, token_predictions, categories_blocks = dict(), dict(), dict(), dict(), dict(), dict() images_ids_list = list() for i,encoding in enumerate(custom_encoded_dataset): # get custom encoded data image_id = encoding['images_ids'] chunk_id = encoding['chunk_ids'] input_id = torch.tensor(encoding['input_ids'])[None] attention_mask = torch.tensor(encoding['attention_mask'])[None] bbox = torch.tensor(encoding['bbox'])[None] if "layout" in model_id: pixel_values = torch.tensor(encoding["images_pixels"]) category_blocks = torch.tensor(encoding['category_blocks'])[None] # save data in dictionaries if image_id not in images_ids_list: images_ids_list.append(image_id) if image_id in chunk_ids: chunk_ids[image_id].append(chunk_id) else: chunk_ids[image_id] = [chunk_id] if image_id in input_ids: input_ids[image_id].append(input_id) else: input_ids[image_id] = [input_id] if image_id in bboxes: bboxes[image_id].append(bbox) else: bboxes[image_id] = [bbox] if "layout" in model_id: if image_id in pixels_values: pixels_values[image_id].append(pixel_values) else: pixels_values[image_id] = [pixel_values] if image_id in categories_blocks: categories_blocks[image_id].append(category_blocks) else: categories_blocks[image_id] = [category_blocks] # get prediction with forward pass with torch.no_grad(): if "layout" in model_id: output = model( input_ids=input_id.to(device), attention_mask=attention_mask.to(device), bbox=bbox.to(device), image=pixel_values.to(device) ) elif "lilt" in model_id: output = model( input_ids=input_id.to(device),
attention_mask=attention_mask.to(device), bbox=bbox.to(device) ) # save probabilities of predictions in dictionary if image_id in outputs: outputs[image_id].append(F.softmax(output.logits.squeeze(), dim=-1)) else: outputs[image_id] = [F.softmax(output.logits.squeeze(), dim=-1)] return outputs, images_ids_list, chunk_ids, input_ids, bboxes, categories_blocks else: print("An error occurred while getting predictions!")
# In[ ]: from functools import reduce # Get predictions (paragraph level) def predictions_paragraph_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes, categories_blocks): ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict() bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, categories_blocks_dict_dict, df = dict(), dict(), dict(), dict(), dict() if len(images_ids_list) > 0: for i, image_id in enumerate(images_ids_list): # get image information images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"] image = images_list[0] width, height = image.size # get data chunk_ids_list = chunk_ids[image_id] outputs_list = outputs[image_id] input_ids_list = input_ids[image_id] bboxes_list = bboxes[image_id] categories_blocks_list = categories_blocks[image_id] # create zeros tensors ten_probs = torch.zeros((outputs_list[0].shape[0] - 2)*len(outputs_list), outputs_list[0].shape[1]).to(device) ten_input_ids = torch.ones(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list)), dtype =int).to(device) ten_bboxes = torch.zeros(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list), 4), dtype =int).to(device) ten_categories_blocks = torch.ones(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list)), dtype =int).to(device) if len(outputs_list) > 1: for num_output, (output, input_id, bbox, category_blocks) in enumerate(zip(outputs_list, input_ids_list, bboxes_list, categories_blocks_list)): start = num_output*(max_length - 2) - max(0,num_output)*doc_stride end = start + (max_length - 2) if num_output == 0: ten_probs[start:end,:] += output[1:-1] ten_input_ids[:,start:end] = input_id[:,1:-1] ten_bboxes[:,start:end,:] = bbox[:,1:-1,:] ten_categories_blocks[:,start:end] = category_blocks[:,1:-1] else: ten_probs[start:start + doc_stride,:] += output[1:1 + doc_stride] ten_probs[start:start + doc_stride,:] = ten_probs[start:start + doc_stride,:] * 0.5 ten_probs[start + doc_stride:end,:] += output[1 + doc_stride:-1] ten_input_ids[:,start:start + doc_stride] = input_id[:,1:1 + doc_stride] ten_input_ids[:,start + doc_stride:end] = input_id[:,1 + doc_stride:-1] ten_bboxes[:,start:start + doc_stride,:] = bbox[:,1:1 + doc_stride,:] ten_bboxes[:,start + doc_stride:end,:] = bbox[:,1 + doc_stride:-1,:] ten_categories_blocks[:,start:start + doc_stride] = category_blocks[:,1:1 + doc_stride] ten_categories_blocks[:,start + doc_stride:end] = category_blocks[:,1 + doc_stride:-1] else: ten_probs += outputs_list[0][1:-1] ten_input_ids = input_ids_list[0][:,1:-1] ten_bboxes = bboxes_list[0][:,1:-1] ten_categories_blocks = categories_blocks_list[0][:,1:-1] ten_probs_list, ten_input_ids_list, ten_bboxes_list, ten_categories_blocks_list = ten_probs.tolist(), ten_input_ids.tolist()[0], ten_bboxes.tolist()[0], ten_categories_blocks.tolist()[0] bboxes_list = list() input_ids_dict, probs_dict, categories_blocks_dict = dict(), dict(), dict() bbox_prev = [-100, -100, -100, -100] for probs, input_id, bbox, category_block in zip(ten_probs_list, ten_input_ids_list, ten_bboxes_list, ten_categories_blocks_list): bbox = denormalize_box(bbox, width, height) if bbox !=
bbox_prev and bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]: bboxes_list.append(bbox) input_ids_dict[str(bbox)] = [input_id] probs_dict[str(bbox)] = [probs] categories_blocks_dict[str(bbox)] = category_block elif bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]: input_ids_dict[str(bbox)].append(input_id) probs_dict[str(bbox)].append(probs) # categories_blocks_dict[str(bbox)].append(category_block) bbox_prev = bbox probs_bbox = dict() for num_box,bbox in enumerate(bboxes_list): probs = probs_dict[str(bbox)] probs = np.array(probs).T.tolist() probs_label = list() for probs_list in probs: prob_label = reduce(lambda x, y: x*y, probs_list) prob_label = prob_label**(1./(len(probs_list))) # geometric mean (normalization) probs_label.append(prob_label) max_value = max(probs_label) max_index = probs_label.index(max_value) probs_bbox[str(bbox)] = max_index bboxes_list_dict[image_id] = bboxes_list input_ids_dict_dict[image_id] = input_ids_dict probs_dict_dict[image_id] = probs_bbox categories_blocks_dict_dict[image_id] = categories_blocks_dict df[image_id] = pd.DataFrame() df[image_id]["bboxes"] = bboxes_list df[image_id]["texts"] = [tokenizer.decode(input_ids_dict[str(bbox)]) for bbox in bboxes_list] df[image_id]["pred_labels"] = [id2label[probs_bbox[str(bbox)]] for bbox in bboxes_list] df[image_id]["true_labels"] = [id2label[categories_blocks_dict[str(bbox)]] for bbox in bboxes_list] if (i % 10 == 0 and i != 0) or (i == len(images_ids_list) - 1): print(f"{i}/{len(images_ids_list)}") return probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, categories_blocks_dict_dict, df else: print("An error occurred while getting predictions!")
# In[ ]: # Get labeled images with paragraph bounding boxes def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict): labeled_images = list() for i, image_id in enumerate(images_ids_list): # get image images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"] image = images_list[0] width, height = image.size # get predicted boxes and labels bboxes_list = bboxes_list_dict[image_id] probs_bbox = probs_dict_dict[image_id] draw = ImageDraw.Draw(image) # https://stackoverflow.com/questions/66274858/choosing-a-pil-imagefont-by-font-name-rather-than-filename-and-cross-platform-f font = font_manager.FontProperties(family='sans-serif', weight='bold') font_file = font_manager.findfont(font) font_size = 30 font = ImageFont.truetype(font_file, font_size) for bbox in bboxes_list: predicted_label = id2label[probs_bbox[str(bbox)]] draw.rectangle(bbox, outline=label2color[predicted_label]) draw.text((bbox[0] + 10, bbox[1] - font_size), text=predicted_label, fill=label2color[predicted_label], font=font) labeled_images.append(image) return labeled_images
# In[ ]: # get data of encoded chunk def get_encoded_chunk_inference(dataset, index_chunk=None): # get datasets example = dataset encoded_example = encoded_dataset print(encoded_dataset) # get a random chunk from the encoded dataset if index_chunk is None: index_chunk = random.randint(0, len(encoded_example)-1) encoded_example = encoded_example[index_chunk] encoded_image_ids = encoded_example["images_ids"] print(encoded_image_ids) # get the image example = example.filter(lambda example: example["images_ids"] == encoded_image_ids)[0] image = example["images"] # original image width, height = image.size page_no = example["page_no"] num_pages = example["num_pages"] # get boxes, texts, categories bboxes, input_ids =
encoded_example["normalized_bboxes"][1:-1], encoded_example["input_ids"][1:-1] bboxes = [denormalize_box(bbox, width, height) for bbox in bboxes] num_tokens = len(input_ids) + 2 # get unique bboxes and corresponding labels bboxes_list, input_ids_list = list(), list() input_ids_dict = dict() bbox_prev = [-100, -100, -100, -100] for i, (bbox, input_id) in enumerate(zip(bboxes, input_ids)): if bbox != bbox_prev: bboxes_list.append(bbox) input_ids_dict[str(bbox)] = [input_id] else: input_ids_dict[str(bbox)].append(input_id) # start_indexes_list.append(i) bbox_prev = bbox # do not keep "..." if input_ids_dict[str(bboxes_list[-1])][0] == (tokenizer.convert_tokens_to_ids('')): del input_ids_dict[str(bboxes_list[-1])] bboxes_list = bboxes_list[:-1] # get texts by line input_ids_list = input_ids_dict.values() texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list] # display DataFrame df = pd.DataFrame({"texts": texts_list, "input_ids": input_ids_list, "bboxes": bboxes_list}) return image, df, num_tokens, page_no, num_pages # In[ ]: # display chunk of PDF image and its data def display_chunk_paragraphs_inference(dataset, index_chunk=None): # get image and image data image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(dataset, index_chunk=index_chunk) # get data from dataframe input_ids = df["input_ids"] texts = df["texts"] bboxes = df["bboxes"] print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n') # display image with bounding boxes print(">> PDF image with bounding boxes of paragraphs\n") draw = ImageDraw.Draw(image) labels = list() for box, text in zip(bboxes, texts): color = "red" draw.rectangle(box, outline=color) # resize image to original width, height = image.size image = image.resize((int(0.5*width), int(0.5*height))) # convert to cv and display img = np.array(image, dtype='uint8') # PIL to cv2 cv2_imshow(img) cv2.waitKey(0) # display image dataframe print("\n>> Dataframe of annotated paragraphs\n") cols = ["texts", "bboxes"] df = df[cols] display(df) # #### Accuracy # In[ ]: def blocks_acc_by_image(df): image_ids, num_blocks, exact_matches, acc_imgs, pct_accs = list(), list(), list(), list(), list() mat_res = dict() for label in labels: mat_res[label] = [0]* len(labels) for image_id in df.keys(): df_img = df[image_id] num_blocks_img = len(df_img) if num_blocks_img > 0: # global exact matches by image exact_match_img = sum(df_img["pred_labels"] == df_img["true_labels"]) # exact match by label and by image for label in labels: df_img_label = df_img[df_img["true_labels"] == label].copy() num_blocks_img_label = len(df_img_label) # exact_match_img_label = sum(df_img_label["pred_labels"] == df_img_label["true_labels"]) for index, row in df_img_label.iterrows(): pred_label = row["pred_labels"] mat_res[label][labels.index(pred_label)] += 1 # acc by img (number of blocks with correct predicted label for each image) acc_img = exact_match_img / num_blocks_img pct_acc_img = round(acc_img*100, 2) # append image_ids.append(image_id) num_blocks.append(num_blocks_img) exact_matches.append(exact_match_img) acc_imgs.append(acc_img) pct_accs.append(pct_acc_img) # acc for all images (number of blocks with correct predicted label for all images) acc_dataset = sum(exact_matches) / sum(num_blocks) pct_acc_dataset = round(acc_dataset*100, 2) return image_ids, num_blocks, exact_matches, acc_imgs, pct_accs, acc_dataset, pct_acc_dataset, mat_res # ## HF login # In[ ]: get_ipython().system('huggingface-cli login') # ## Model & tokenizer # In[ ]: if 
"layout" in model_id: from transformers import LayoutLMv2ForTokenClassification # LayoutXLMTokenizerFast, import torch device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # model # tokenizer = LayoutXLMTokenizerFast.from_pretrained(model_id) model = LayoutLMv2ForTokenClassification.from_pretrained(model_id); model.to(device); # feature extractor from transformers import LayoutLMv2FeatureExtractor feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False) # tokenizer from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) elif "lilt" in model_id: from transformers import AutoTokenizer, AutoModelForTokenClassification import torch device = torch.device("cuda" if torch.cuda.is_available() else "cpu") tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForTokenClassification.from_pretrained(model_id); model.to(device); # In[ ]: # get labels id2label = model.config.id2label label2id = model.config.label2id num_labels = len(id2label) # ## Download DocLayNet # ### Download # In[ ]: local_dataset_name = "/content/drive/MyDrive/DocLayNet/datasets/" + dataset_name.replace("pierreguillou/DocLayNet-", "") # In[ ]: # from datasets import load_dataset # dataset = load_dataset(dataset_name) # # save locally # dataset.save_to_disk(local_dataset_name) # In[ ]: # load from datasets import load_from_disk dataset = load_from_disk(local_dataset_name) # In[ ]: dataset # In[ ]: dataset["test"].features # In[ ]: labels = dataset["test"].features["categories"].feature.names id2label = {id:label for id, label in enumerate(labels)} label2id = {label:id for id, label in enumerate(labels)} num_labels = len(labels) print(id2label) # In[ ]: example = dataset["test"][1] example["image"] # In[ ]: example["texts"] # In[ ]: example["bboxes_block"] # In[ ]: example["bboxes_line"] # In[ ]: example # ### Checking of the dataset # Select a dataset split and display a random annotated image from it and its dataframe. # # In[ ]: # choose your dataset splits = ["all", "train", "validation", "test"] index_split = 3 split = splits[index_split] # display random PDF image and its data display_pdf_blocks_lines(split=split) # ## Inference # As Colab can stop the evaluation (lack of time, etc.), we chunck our test dataset into 3 and save results. # In[ ]: dset = dataset["test"].select(list(range(170))) # dset = dataset["test"].select(list(range(340))[170:]) # dset = dataset["test"].select(list(range(499))[340:]) dataset_inference, texts_pars_list, par_boxes_list, category_block_list = extraction_data_from_dataset(dset, model_id) dataset_inference # ### Data encoding # Now, we need to prepare our data in the format of the model. # In[ ]: encoded_dataset = dataset_inference.map(prepare_inference_features_paragraph, batched=True, batch_size=64, remove_columns=dataset_inference.column_names) encoded_dataset # #### Checking of the encoded dataset # (Checking) Display a random annotated chunk image and its dataframe. # # **Note**: the image is squared because of its normalization to 1000px vs 1000px in the encoded dataset (necessary for training the model). # In[ ]: # get and image from random chunk display_chunk_paragraphs_inference(dataset = dataset_inference) # #### Create a custom dataset # We end our data preparation with a new class that keeps only the information needed for inference. # In[ ]: custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer) # Now, we can get the predictions! 
# ### Get predictions
# LayoutXLM outputs labels at the token level, but we are interested in the predicted labels at the paragraph level.
# In[ ]: #@title Get predictions (token level) outputs, images_ids_list, chunk_ids, input_ids, bboxes, categories_blocks = predictions_token_level(dataset_inference["images"], custom_encoded_dataset, model_id)
# In[ ]: #@title Get predictions (paragraph level) probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, categories_blocks_dict_dict, df = predictions_paragraph_level(dataset_inference, outputs, images_ids_list, chunk_ids, input_ids, bboxes, categories_blocks)
# In[ ]: # save import pickle num_data = 170 # num_data = 340 # num_data = 499 # path to main folder if "layout" in model_id: path_to_main = "/content/drive/MyDrive/DocLayNet/results_" + "LayoutXLM_paragraph512_" + "DocLayNet_base_test/" elif "lilt" in model_id: path_to_main = "/content/drive/MyDrive/DocLayNet/results_" + "LiLT_paragraph512_" + "DocLayNet_base_test/" Path(path_to_main).mkdir(parents=True, exist_ok=True) # path to results folder path_to_results_folder = path_to_main + str(num_data) + "/" Path(path_to_results_folder).mkdir(parents=True, exist_ok=True) # bboxes_list_dict path_to_bboxes_list_dict = path_to_results_folder + "bboxes_list_dict" + "_" + str(num_data) + ".pkl" with open(path_to_bboxes_list_dict, 'wb') as f: pickle.dump(bboxes_list_dict, f) # input_ids_dict_dict path_to_input_ids_dict_dict = path_to_results_folder + "input_ids_dict_dict" + "_" + str(num_data) + ".pkl" with open(path_to_input_ids_dict_dict, 'wb') as f: pickle.dump(input_ids_dict_dict, f) # probs_dict_dict path_to_probs_dict_dict = path_to_results_folder + "probs_dict_dict" + "_" + str(num_data) + ".pkl" with open(path_to_probs_dict_dict, 'wb') as f: pickle.dump(probs_dict_dict, f) # categories_blocks_dict_dict path_to_categories_blocks_dict_dict = path_to_results_folder + "categories_blocks_dict_dict" + "_" + str(num_data) + ".pkl" with open(path_to_categories_blocks_dict_dict, 'wb') as f: pickle.dump(categories_blocks_dict_dict, f) # df (predictions dataframes) path_to_df = path_to_results_folder + "df" + "_" + str(num_data) + ".pkl" with open(path_to_df, 'wb') as f: pickle.dump(df, f)
# In[ ]: # Load # import pickle # num_data = 170 # num_data = 340 # num_data = 499 # path_to_bboxes_list_dict = path_to_main + str(num_data) + "/" + "bboxes_list_dict" + "_" + str(num_data) + ".pkl" # path_to_input_ids_dict_dict = path_to_main + str(num_data) + "/" + "input_ids_dict_dict" + "_" + str(num_data) + ".pkl" # path_to_probs_dict_dict = path_to_main + str(num_data) + "/" + "probs_dict_dict" + "_" + str(num_data) + ".pkl" # path_to_categories_blocks_dict_dict = path_to_main + str(num_data) + "/" + "categories_blocks_dict_dict" + "_" + str(num_data) + ".pkl" # path_to_df = path_to_main + str(num_data) + "/" + "df" + "_" + str(num_data) + ".pkl" # with open(path_to_bboxes_list_dict, 'rb') as f: # bboxes_list_dict_170 = pickle.load(f) # with open(path_to_input_ids_dict_dict, 'rb') as f: # input_ids_dict_dict_170 = pickle.load(f) # with open(path_to_probs_dict_dict, 'rb') as f: # probs_dict_dict_170 = pickle.load(f) # with open(path_to_categories_blocks_dict_dict, 'rb') as f: # categories_blocks_dict_dict_170 = pickle.load(f) # with open(path_to_df, 'rb') as f: # df_170 = pickle.load(f)
# In[ ]: #@title Get labeled images with paragraph bounding boxes labeled_images = get_labeled_images(dataset_inference, images_ids_list, bboxes_list_dict, probs_dict_dict)
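# Optionally, the labeled page images can be written to Drive for later review. This is a small sketch, not part of the original pipeline; the `labeled_images` subfolder name is an arbitrary assumption.
# In[ ]:

output_dir = Path(path_to_results_folder) / "labeled_images"  # hypothetical folder
output_dir.mkdir(parents=True, exist_ok=True)
for idx, img in enumerate(labeled_images):
    img.save(output_dir / f"page_{idx}.png")
print(f"Saved {len(labeled_images)} labeled images to {output_dir}")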
# ### Display labeled images and get their dataframes

# In[ ]:


print(f"Number of PDF page images: {len(labeled_images)}")


# In[ ]:


# #@title Labeled images
# import matplotlib.pyplot as plt
# %matplotlib inline

# plt.figure(figsize=(15,10))
# columns = 3
# for i, image in enumerate(labeled_images):
#     plt.subplot(int(len(labeled_images) / columns + 1), columns, i + 1)
#     plt.xticks(color="white")
#     plt.yticks(color="white")
#     plt.tick_params(bottom = False)
#     plt.tick_params(left = False)
#     plt.imshow(image)


# In[ ]:


#@title Display one labeled image (full size)
num_page = 0
print(f"Image of the labeled page {num_page} (at paragraph level).")
labeled_images[num_page]


# ## Accuracy (dataset DocLayNet base test)

# Since we obtained the test predictions in 3 parts, we concatenate them before computing the accuracy on the whole DocLayNet base test dataset.

# In[ ]:


import pickle

num_data = 170
path_to_df = path_to_main + str(num_data) + "/" + "df" + "_" + str(num_data) + ".pkl"
with open(path_to_df, 'rb') as f:
    df_170 = pickle.load(f)

num_data = 340
path_to_df = path_to_main + str(num_data) + "/" + "df" + "_" + str(num_data) + ".pkl"
with open(path_to_df, 'rb') as f:
    df_340 = pickle.load(f)

num_data = 499
path_to_df = path_to_main + str(num_data) + "/" + "df" + "_" + str(num_data) + ".pkl"
with open(path_to_df, 'rb') as f:
    df_499 = pickle.load(f)

print(len(df_170), len(df_340), len(df_499), len(df_170) + len(df_340) + len(df_499))

# shift the keys of the second and third parts so they do not collide when merging
df1 = df_170.copy()
df2 = {k + 170: v for k, v in df_340.items()}
df3 = {k + 2*170: v for k, v in df_499.items()}
df = df1
df.update(df2)
df.update(df3)
len(df)


# In[ ]:


image_ids, num_blocks, exact_matches, acc_imgs, pct_accs, acc_dataset, pct_acc_dataset, mat_res = blocks_acc_by_image(df)

if "layout" in model_id:
    print("\nAccuracy of the finetuned LayoutXLM base model\n(number of blocks with correct predicted label for all images of the test DocLayNet base dataset)")
elif "lilt" in model_id:
    print("\nAccuracy of the finetuned LiLT base model\n(number of blocks with correct predicted label for all images of the test DocLayNet base dataset)")
print(f"{pct_acc_dataset}%")


# In[ ]:


import numpy as np

# accuracy by label, i.e. the share of true blocks of each label that received
# the correct predicted label (per-label recall)
accs = list()
for label in labels:
    index = label2id[label]
    res_label = mat_res[label]
    num_true_blocks_label = sum(res_label)
    num_pred_blocks_label = res_label[index]
    acc = num_pred_blocks_label / num_true_blocks_label
    acc = round(100 * acc, 2)
    accs.append(acc)

if "layout" in model_id:
    print("Accuracy by label of the finetuned LayoutXLM base model")
elif "lilt" in model_id:
    print("Accuracy by label of the finetuned LiLT base model")
for acc, label in zip(accs, labels):
    print(f"- {label}: {acc}%")
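# Cross-check (a sketch, not in the original notebook): assuming `mat_res[label]` is the confusion-matrix row for true label `label`, in the order given by `labels`, the same figures can be derived directly from the matrix with numpy.

# In[ ]:


# import numpy as np

# cm_check = np.array([mat_res[label] for label in labels])
# per_label_acc = np.round(100 * cm_check.diagonal() / cm_check.sum(axis=1), 2)  # per-label recall
# overall_acc = round(100 * cm_check.diagonal().sum() / cm_check.sum(), 2)       # block-level accuracy
# print(dict(zip(labels, per_label_acc)))
# print(f"{overall_acc}%")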
# In[ ]:


import matplotlib.pyplot as plt
import pandas as pd

data = {'labels': labels, 'accuracy (%)': accs}
df = pd.DataFrame(data)
df = df.sort_values(by=['accuracy (%)'], ascending=False)

# plot the labels with accuracy >= 80% in green and the others in red
plt.bar(df['labels'][df['accuracy (%)'] >= 80], df['accuracy (%)'][df['accuracy (%)'] >= 80], color="green")
plt.bar(df['labels'][df['accuracy (%)'] < 80], df['accuracy (%)'][df['accuracy (%)'] < 80], color="red")

if "layout" in model_id:
    title = 'Accuracy by label\nof the labeled paragraphs\n\ndataset: test DocLayNet base\nmodel: LayoutXLM base finetuned on DocLayNet base\n'
elif "lilt" in model_id:
    title = 'Accuracy by label\nof the labeled paragraphs\n\ndataset: test DocLayNet base\nmodel: LiLT base finetuned on DocLayNet base\n'
plt.title(title, fontsize=14)
plt.xlabel('Labels', fontsize=14)
plt.xticks(rotation=70)
plt.ylabel('accuracy (%)', fontsize=14)
plt.grid(True)
plt.legend([">= 80%", "< 80%"], loc=1)
plt.show()


# In[ ]:


# print("Accuracy by image (number of blocks with correct predicted label by image)")
# for image_id, pct_acc in zip(image_ids, pct_accs):
#     print(f"Image n°{image_id}: {pct_acc}%")


# In[ ]:


# # Confusion matrix
# # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
# import numpy as np

# cm = list()
# for label in labels:
#     cm.append(mat_res[label])
# cm = np.array(cm)

# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
# disp.plot(xticks_rotation=70)

# if "layout" in model_id:
#     plt.title('Confusion matrix of the labeled paragraphs\nof the test DocLayNet base dataset\n(model: LayoutXLM base finetuned on DocLayNet base)\n')
# elif "lilt" in model_id:
#     plt.title('Confusion matrix of the labeled paragraphs\nof the test DocLayNet base dataset\n(model: LiLT base finetuned on DocLayNet base)\n')
# plt.show()


# In[ ]:


# Confusion matrix with a normalized color map
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html
# https://stackoverflow.com/questions/64559225/normalizing-a-color-map-for-plotting-a-confusion-matrix-with-confusionmatrixdisp
import matplotlib.pyplot as plt
import numpy as np

cm = list()
for label in labels:
    cm.append(mat_res[label])
cm = np.array(cm)

# normalize each row by its total number of true blocks (keepdims=True ensures
# each row is divided by its own sum rather than by the column-aligned sums)
accuracies = cm / cm.sum(axis=1, keepdims=True)

fig, ax = plt.subplots(figsize=(10, 8))
cb = ax.imshow(accuracies, cmap='Blues')
plt.xticks(range(len(labels)), labels, rotation=70)
plt.yticks(range(len(labels)), labels)
for i in range(len(labels)):
    for j in range(len(labels)):
        color = 'blue' if accuracies[i, j] < 0.5 else 'white'
        ax.annotate(f'{cm[i, j]}', (j, i), color=color, va='center', ha='center')
ax.set(
    ylabel="True label",
    xlabel="Predicted label",
)
plt.colorbar(cb, ax=ax)
if "layout" in model_id:
    plt.title('Confusion matrix of the labeled paragraphs\nof the test DocLayNet base dataset\n(model: LayoutXLM base finetuned on DocLayNet base)\n')
elif "lilt" in model_id:
    plt.title('Confusion matrix of the labeled paragraphs\nof the test DocLayNet base dataset\n(model: LiLT base finetuned on DocLayNet base)\n')
plt.show()
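# Complementary view (a sketch, not part of the original notebook): the per-label accuracies above are recall values (diagonal over row sums). Normalizing the columns of the same confusion matrix gives the precision of each predicted label.

# In[ ]:


# precision per predicted label (reuses `cm`, `np` and `labels` from the cell above;
# a label that is never predicted would give a division-by-zero warning here)
# precision_by_label = np.round(100 * cm.diagonal() / cm.sum(axis=0), 2)
# for label, prec in zip(labels, precision_by_label):
#     print(f"- {label}: {prec}% precision")


# # END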