# Install the johnsnowlabs library to access Spark-OCR and Spark-NLP for Healthcare, Finance, and Legal.
! pip install -q johnsnowlabs
# Additional Python dependency installed for this notebook.
! pip install transformers
# detectron2 is installed directly from its GitHub repository rather than from a package index.
! python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
Using my.johnsnowlabs.com SSO
from johnsnowlabs import nlp, finance, visual
# nlp.install(force_browser=True, visual=True)
If you are not registered on my.johnsnowlabs.com, you received your license via email, or you are using Safari, you may need to upload the license manually.
# Manual license path (Google Colab only): ask the user to upload the license file.
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
# Keep the value returned by the upload call.
license_keys = files.upload()
# Run the John Snow Labs installer (presumably it picks up the uploaded license — confirm).
nlp.install()
# Pin Pillow to 9.4.0; the notebook asks for a session restart right after this cell.
! pip install -U Pillow==9.4.0
Restart the session and run the cells from here.
import os
# Send signal 9 to the current process so the notebook session is restarted.
os.kill(os.getpid(), 9)
# After the restart: re-import the library and start Spark with visual=True.
from johnsnowlabs import nlp, finance, visual
spark = nlp.start(visual=True)
Restart the session and run the cells from here.
# NOTE(review): this cell repeats the restart/start cell immediately above it.
import os
# Send signal 9 to the current process so the notebook session is restarted.
os.kill(os.getpid(), 9)
# After the restart: re-import the library and start Spark with visual=True.
from johnsnowlabs import nlp, finance, visual
spark = nlp.start(visual=True)
# Spark SQL column functions, aliased as F.
import pyspark.sql.functions as F
# Download the sample images into the current working directory.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-ocr-workshop/master/jupyter/data/signature/image_147.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_advertisement2.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_%20letter2.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_advertisement.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_budget2.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_email2.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_form.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_invoice3.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_invoice4.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_letter.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_news%20article.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_presentation.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/streamlit_notebooks/ocr/data/visual_document_examples/Document_resume.png
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-ocr-workshop/master/jupyter/data/tab_images/cTDaR_t10168.jpg
# NOTE(review): the next URL embeds an access token in its query string and is unquoted;
# with no -O option the file is presumably saved under a name that includes "?token=..." —
# confirm, and prefer a token-free URL with an explicit output name.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-ocr/master/python/sparkocr/resources/ocr/images/check.jpg?token=GHSAT0AAAAAABVMES4Q2ZOJEOUXLTCY7TDCYZDDW3A
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/finance-nlp/data/visual_document_sample_images/financial_table_extraction.png
# Load every file matching *.png in the working directory with Spark's
# binaryFile reader and cache the resulting DataFrame for reuse below.
sample_images = (
    spark.read
    .format("binaryFile")
    .load("*.png")
    .cache()
)
sample_images.show()
+--------------------+--------------------+-------+--------------------+ | path| modificationTime| length| content| +--------------------+--------------------+-------+--------------------+ |file:/content/ima...|2023-01-13 15:43:...|1799448|[FF D8 FF E0 00 1...| |file:/content/Doc...|2023-01-13 15:43:...|1364836|[89 50 4E 47 0D 0...| |file:/content/Doc...|2023-01-13 15:43:...| 876092|[89 50 4E 47 0D 0...| |file:/content/Doc...|2023-01-13 15:43:...| 465896|[FF D8 FF E0 00 1...| |file:/content/Doc...|2023-01-13 15:43:...| 463618|[FF D8 FF E0 00 1...| |file:/content/Doc...|2023-01-13 15:43:...| 341744|[FF D8 FF E0 00 1...| |file:/content/Doc...|2023-01-13 15:43:...| 305933|[FF D8 FF E0 00 1...| |file:/content/Doc...|2023-01-13 15:43:...| 235098|[FF D8 FF DB 00 8...| |file:/content/Doc...|2023-01-13 15:43:...| 230268|[FF D8 FF E0 00 1...| |file:/content/Doc...|2023-01-13 15:43:...| 215445|[89 50 4E 47 0D 0...| |file:/content/Doc...|2023-01-13 15:43:...| 147605|[FF D8 FF E0 00 1...| |file:/content/Doc...|2023-01-13 15:43:...| 137875|[FF D8 FF E0 00 1...| |file:/content/Doc...|2023-01-13 15:43:...| 92964|[FF D8 FF E0 00 1...| |file:/content/fin...|2023-01-13 15:43:...| 20043|[89 50 4E 47 0D 0...| +--------------------+--------------------+-------+--------------------+
# Run the loaded binary files through BinaryToImage and display the result.
image_df = visual.BinaryToImage().transform(sample_images)
visual.display_images(image_df)
🔎With this model, you can classify financial documents using text and layout data with new features offered by Spark OCR.
🔎Classes detected by the model:
letter
form
email
handwritten
advertisement
scientific report
scientific publication
specification
file folder
news article
budget
invoice
presentation
questionnaire
resume
memo
# Stage 1: convert the binary file content into a 3-byte BGR image written
# to the "image" column.
binary_to_image = (
    visual.BinaryToImage()
    .setOutputCol("image")
    .setImageType(visual.ImageType.TYPE_3BYTE_BGR)
)

# Stage 2: pretrained visual document classifier that reads "image" and
# writes its prediction to "label".
doc_class = (
    visual.VisualDocumentClassifierV3()
    .pretrained("dit_base_finetuned_rvlcdip", "en", "clinical/ocr")
    .setInputCols(["image"])
    .setOutputCol("label")
)

pipeline = nlp.PipelineModel(stages=[binary_to_image, doc_class])

# Classify all sample images; cache the output because it is displayed more
# than once below.
results = pipeline.transform(sample_images).cache()
results
image | path | modificationTime | length | pagenum | label | exception |
---|---|---|---|---|---|---|
{file:/content/im... | file:/content/ima... | 2023-01-13 15:43:... | 1799448 | 0 | letter | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 1364836 | 0 | invoice | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 876092 | 0 | presentation | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 465896 | 0 | resume | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 463618 | 0 | form | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 341744 | 0 | budget | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 305933 | 0 | advertisement | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 235098 | 0 | advertisement | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 230268 | 0 | letter | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 215445 | 0 | letter | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 147605 | 0 | news article | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 137875 | 0 | invoice | null |
{file:/content/Do... | file:/content/Doc... | 2023-01-13 15:43:... | 92964 | 0 | null | |
{file:/content/fi... | file:/content/fin... | 2023-01-13 15:43:... | 20043 | 0 | scientific report | null |
# Show only the image and predicted label columns.
results.select("image", "label")
image | label |
---|---|
{file:/content/im... | letter |
{file:/content/Do... | invoice |
{file:/content/Do... | presentation |
{file:/content/Do... | resume |
{file:/content/Do... | form |
{file:/content/Do... | budget |
{file:/content/Do... | advertisement |
{file:/content/Do... | advertisement |
{file:/content/Do... | letter |
{file:/content/Do... | letter |
{file:/content/Do... | news article |
{file:/content/Do... | invoice |
{file:/content/Do... | |
{file:/content/fi... | scientific report |
financial
# Sample image used for the signature detection example below.
imagePath = "./image_147.png"
# Read image file as binary file
df = spark.read.format("binaryFile").load(imagePath)
# Convert the binary content to an image and display its "image" column.
visual.display_images(visual.BinaryToImage().transform(df), "image")
Image #0: Origin: file:/content/image_147.png Resolution: 0 dpi Width: 2549 px Height: 3506 px Mode: ImageType.TYPE_BYTE_GRAY Number of channels: 1
ImageHandwrittenDetector is a DL model for detecting handwritten text in an image.
The detector supports the following labels:
signature
date
name
title
address
others
We will detect the signature
here:
# Stage 1: binary content -> image, using the 3-byte BGR image type.
binary_to_image = visual.BinaryToImage()
binary_to_image.setImageType(visual.ImageType.TYPE_3BYTE_BGR)

# (name, language, remote location) of the pretrained handwritten detector.
pretrained_model = ("image_handwritten_detector_gsa0628", "en", "public/ocr/models")

# Stage 2: detect handwritten regions in "image", keeping only the
# "signature" label with a score threshold of 0.4.
signature_detector = (
    visual.ImageHandwrittenDetector()
    .pretrained(*pretrained_model)
    .setInputCol("image")
    .setOutputCol("signature_regions")
    .setOutputLabels(["signature"])
    .setScoreThreshold(0.4)
)

# Stage 3: draw the detected regions on the image as red rectangles and
# write the annotated image to "image_with_regions".
draw_regions = (
    visual.ImageDrawRegions()
    .setInputCol("image")
    .setInputRegionsCol("signature_regions")
    .setOutputCol("image_with_regions")
    .setFontSize(16)
    .setRectColor(visual.Color.red)
)

pipeline = nlp.PipelineModel(
    stages=[binary_to_image, signature_detector, draw_regions]
)
image_handwritten_detector_gsa0628 download started this may take some time. Approximate size to download 243.9 MB
# Run the signature pipeline on the loaded image and cache the output.
result = pipeline.transform(df).cache()
# Display the column that holds the image with the detected regions drawn on it.
visual.display_images(result, "image_with_regions")
Image #0: Origin: file:/content/image_147.png Resolution: 0 dpi Width: 2549 px Height: 3506 px Mode: ImageType.TYPE_3BYTE_BGR Number of channels: 3