%%capture
!apt-get install poppler-utils
!apt-get install tesseract-ocr-all
# unstructured 0.11.5
# unstructured-inference 0.7.19
!pip install unstructured[all-docs] unstructured-inference
Restart the session in Colab!
You must restart your session before running the next cell.
import pathlib
from pathlib import Path

# Select the partition function (unstructured 0.11.5).
from unstructured.partition.pdf import partition_pdf

# --- Parameters for Unstructured's partition_pdf ---

## include_page_breaks
# Include page breaks in the output (default is False).
include_page_breaks = True

## strategy
# Strategy used for partitioning the PDF. Valid strategies are "hi_res",
# "ocr_only", and "fast" (the library default is "auto").
# "hi_res" runs a layout-detection model to identify document elements and
# is the one used for analyzing PDFs and extracting table structure.
strategy = "hi_res"

## infer_table_structure
# Only applicable with strategy="hi_res". When True, extracted Table
# elements also carry a metadata field "text_as_html" where the table is
# rendered as an HTML string (rows and cells preserved). Whether True or
# False, the plain "text" field is always present on Table elements.
infer_table_structure = strategy == "hi_res"

## extract_element_types
# Also export images of tables when table structure is inferred.
extract_element_types = ['Table'] if infer_table_structure else None

## max_characters
# Maximum number of characters per partition (document element). Only
# applies to the "ocr_only" strategy; the library default is 1500 and
# None means no maximum.
# FIX: the original only assigned max_characters when strategy was NOT
# "ocr_only", so choosing strategy = "ocr_only" raised NameError at the
# partition_pdf() call below. Use the documented default in that case.
max_characters = None if strategy != "ocr_only" else 1500

## languages
# Languages for the Tesseract agent; the matching Tesseract language pack
# must be installed first. Example for several: "eng+por" (default "eng").
languages = ["eng"]

## model_name
# yolox: best model for table extraction. Other options are
# yolox_quantized, detectron2_onnx and chipper, depending on file layout.
# source: https://unstructured-io.github.io/unstructured/best_practices/models.html
hi_res_model_name = "yolox"

path = "/content/"
filename = path + "Quarterly.Financial.Report.Template.pdf"

# Returns a List[Element] present in the pages of the parsed PDF document.
elements = partition_pdf(
    filename=filename,
    include_page_breaks=include_page_breaks,
    strategy=strategy,
    infer_table_structure=infer_table_structure,
    extract_element_types=extract_element_types,
    max_characters=max_characters,
    languages=languages,
    hi_res_model_name=hi_res_model_name,
)

# Serialize the elements to JSON. It can take a while for the file to show
# up in the Google Colab file browser.
from unstructured.staging.base import elements_to_json
elements_to_json(elements, filename="(unknown).json")
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip. [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /root/nltk_data... [nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
yolox_l0.05.onnx: 0%| | 0.00/217M [00:00<?, ?B/s]
config.json: 0%| | 0.00/1.47k [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/115M [00:00<?, ?B/s]
model.safetensors: 0%| | 0.00/46.8M [00:00<?, ?B/s]
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked'] - This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model). - This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
def process_json_file(input_filename, output_dir=None, model_name=None):
    """Render an Unstructured JSON export as a standalone HTML document.

    Title elements become <h1>, Table elements use their pre-rendered
    "text_as_html" metadata, and every other element becomes a <p>.
    Consecutive duplicate elements are dropped (page breaks can repeat
    an element across pages).

    Parameters
    ----------
    input_filename : str
        Path to the JSON file produced by elements_to_json().
    output_dir : str, optional
        Directory where the HTML file is written. Defaults to the
        module-level ``path`` used by the rest of the script.
    model_name : str, optional
        Suffix appended to the output filename. Defaults to the
        module-level ``hi_res_model_name``.
        FIX: the original referenced an undefined name ``model_name``
        here, which raised NameError on every call.

    Returns
    -------
    str
        The path of the HTML file that was written.
    """
    if output_dir is None:
        output_dir = path  # module-level default from the script
    if model_name is None:
        model_name = hi_res_model_name  # was an undefined `model_name`

    # Read the JSON export.
    with open(input_filename, 'r') as file:
        data = json.load(file)

    # Convert each element to an HTML fragment, skipping consecutive
    # duplicates.
    extracted_elements = []
    text_prev = ""
    for entry in data:
        if entry["type"] == "Title":
            text = "<h1>" + entry["text"] + "</h1>"
        elif entry["type"] == "Table":
            text = entry["metadata"]["text_as_html"]
        else:
            text = "<p>" + entry["text"] + "</p>"
        if text != text_prev:
            extracted_elements.append(text)
        text_prev = text

    # Static HTML wrapper with basic table styling.
    html_start = """
<!DOCTYPE html>
<html>
<head>
<title>Document Information</title>
<style>
table {
width: 100%;
border-collapse: collapse;
}
th, td {
border: 1px solid black;
padding: 8px;
text-align: left;
}
th {
background-color: #f2f2f2;
}
</style>
</head>
<body>
"""
    html_end = """
</body>
</html>
"""

    # e.g. "report.json" -> "<output_dir>/report_<model_name>.html"
    output_file_html = str(
        Path(output_dir) / f"{Path(input_filename).stem}_{model_name}.html"
    )
    with open(output_file_html, 'w') as output_file:
        output_file.write(html_start + "\n")
        for element in extracted_elements:
            output_file.write(element + "\n")
        output_file.write(html_end + "\n")
    return output_file_html
import json

# Convert the JSON export to HTML. It can take a while for the .html file
# to appear in the Colab file browser.
output_file_html = process_json_file("(unknown).json")

# Download the generated HTML file from the Colab runtime.
from google.colab import files

files.download(output_file_html)