# Notebook setup (Google Colab): install the system and Python packages the
# pipeline needs, with the installer output captured. This is the plain-Python
# spelling of an IPython `%%capture` cell holding three `!` shell commands.
# NOTE(review): the original cell boundary is not visible in this file; only
# the install commands are captured here - confirm against the notebook.
# Versions in use when this was written:
#   unstructured 0.11.5
#   unstructured-inference 0.7.19
get_ipython().run_cell_magic(  # noqa: F821 - get_ipython is provided by IPython
    "capture",
    "",
    "!apt-get install poppler-utils\n"
    "!apt-get install tesseract-ocr-all\n"
    "!pip install unstructured[all-docs] unstructured-inference\n",
)

import pathlib
from pathlib import Path

# select the partition function
from unstructured.partition.pdf import partition_pdf  # version unstructured 0.11.5

# Define parameters for Unstructured's library

## include_page_breaks
# include page breaks (default is False)
include_page_breaks = True

## strategy
# The strategy to use for partitioning the PDF. Valid strategies are "hi_res", "ocr_only", and "fast".
# When using the "hi_res" strategy, the function uses a layout detection model to identify document elements.
# "hi_res" is used for analyzing PDFs and extracting table structure (default is "auto")
strategy = "hi_res"

## infer_table_structure
# Only applicable if `strategy=hi_res`.
# If True, any Table elements that are extracted will also have a metadata field named "text_as_html"
# where the table's text content is rendered into an html string, i.e. rows and cells are preserved.
# Whether True or False, the "text" field is always present in any Table element and is the text
# content of the table (no structure).
infer_table_structure = strategy == "hi_res"

## extract_element_types
# Get images of tables
extract_element_types = ["Table"] if infer_table_structure else None

## max_characters
# The maximum number of characters to include in a partition (document element).
# If None is passed, no maximum is applied.
# Only applies to the "ocr_only" strategy (default is 1500)
# Always assigned, so the later partition_pdf(...) call cannot hit an undefined
# name when the strategy is switched to "ocr_only".
max_characters = 1500 if strategy == "ocr_only" else None

## languages
# The languages to use for the Tesseract agent.
# To use a language, you'll first need to install the appropriate Tesseract language pack.
languages = ["eng"]  # example if more than one: "eng+por" (default is "eng")

## model_name
# @requires_dependencies("unstructured_inference")
# yolox: best model for table extraction. Other options are yolox_quantized,
# detectron2_onnx and chipper, depending on file layout.
# source: https://unstructured-io.github.io/unstructured/best_practices/models.html
hi_res_model_name = "yolox"

path = "/content/"
filename = path + "Quarterly.Financial.Report.Template.pdf"

import html
import json

# NOTE(review): the markup inside the original string literals is missing from
# this file (only blank lines and the words "Document Information" remain).
# The page skeleton and the <h2>/<p> wrappers below are a reconstruction -
# confirm the exact tags against the original notebook.
_HTML_START = """<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Document Information</title>
</head>
<body>"""

_HTML_END = """</body>
</html>"""

_JSON_SUFFIX = ".json"


def _element_to_html(entry):
    """Return the HTML fragment for one element dictionary from the JSON file."""
    kind = entry["type"]
    if kind == "Table":
        # "text_as_html" is only read when present; a Table without it falls
        # back to its plain text instead of raising KeyError.
        table_html = entry.get("metadata", {}).get("text_as_html")
        if table_html:
            return table_html
    # Element text is plain text, so "&", "<" and ">" must be escaped before
    # being embedded in the page.
    text = html.escape(entry["text"], quote=False)
    if kind == "Title":
        return "<h2>" + text + "</h2>"
    return "<p>" + text + "</p>"


def process_json_file(input_filename, output_dir=None, model_name=None):
    """Convert a JSON list of element dictionaries into an HTML file.

    Each entry must provide "type" and "text"; Table entries contribute
    ``entry["metadata"]["text_as_html"]`` when it is available. A fragment
    identical to the one immediately before it is written only once.

    Args:
        input_filename: Path of the JSON file to read.
        output_dir: Directory for the HTML file. Defaults to the module-level
            ``path``.
        model_name: Label appended to the output file name. Defaults to the
            module-level ``hi_res_model_name`` (the original body read an
            undefined ``model_name`` and raised NameError).

    Returns:
        The path of the written HTML file, as a string. The file is named
        ``<input name without a trailing ".json">_<model_name>.html``.
    """
    if output_dir is None:
        output_dir = path
    if model_name is None:
        model_name = hi_res_model_name

    # Read the JSON file
    with open(input_filename, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Convert every element, dropping consecutive duplicates
    extracted_elements = []
    text_prev = ""
    for entry in data:
        fragment = _element_to_html(entry)
        if fragment != text_prev:
            extracted_elements.append(fragment)
        text_prev = fragment

    # Strip only a trailing ".json" so other parts of the name are left alone
    base_name = Path(input_filename).name
    if base_name.endswith(_JSON_SUFFIX):
        base_name = base_name[: -len(_JSON_SUFFIX)]
    output_file_html = Path(output_dir) / (base_name + "_" + model_name + ".html")

    # Write the extracted elements to the output file
    with open(output_file_html, "w", encoding="utf-8") as output_file:
        output_file.write(_HTML_START + "\n")
        output_file.writelines(element + "\n" for element in extracted_elements)
        output_file.write(_HTML_END + "\n")

    return str(output_file_html)


# The pipeline runs when this file is executed as a notebook or script
# (where __name__ is "__main__"), but not when process_json_file is imported.
if __name__ == "__main__":
    from unstructured.staging.base import elements_to_json

    # Returns a List[Element] present in the pages of the parsed pdf document
    elements = partition_pdf(
        filename=filename,
        include_page_breaks=include_page_breaks,
        strategy=strategy,
        infer_table_structure=infer_table_structure,
        extract_element_types=extract_element_types,
        max_characters=max_characters,
        languages=languages,
        hi_res_model_name=hi_res_model_name,
    )

    # get output as json
    # Takes a while for file to show up on the Google Colab
    elements_to_json(elements, filename=f"{filename}.json")

    # It can take a while for the .html file to show up in Colab
    output_file_html = process_json_file(f"{filename}.json")

    from google.colab import files

    files.download(output_file_html)