#!/usr/bin/env python
# coding: utf-8

# # Extract text from PDF images using Tesseract
#
# Although I was able to [extract text from the PDFs directly](tas-pod-save-text-images.ipynb), I wasn't happy with the quality. In particular, column layout detection was quite variable, munging values from different columns together. After a few tests, I decided that re-OCRing the images using [Tesseract](https://pypi.org/project/pytesseract/) would produce better results. Tesseract's automatic page layout detection does a pretty good job of identifying the columns, and the OCR quality in general seems better. There's still some munging of values across columns and various other errors, but I think the quality is good enough for searching.

# In[36]:


from pathlib import Path

import pytesseract
from natsort import natsorted, ns
from PIL import Image


# In[ ]:


# Get a list of volumes
vols = natsorted(
    [d for d in Path("tasmania").glob("AUTAS*") if d.is_dir()], alg=ns.PATH
)

# Loop through each volume
for vol in vols:
    print(vol.name)
    # Create a directory for the OCRd text
    ocr_path = Path(vol, "tesseract")
    ocr_path.mkdir(exist_ok=True)
    # Loop through all the images in the volume
    vol_images = natsorted(Path(vol, "images").glob("*.jpg"), alg=ns.PATH)
    for img_file in vol_images:
        with Image.open(img_file) as img:
            # Extract the text from the image.
            # This is the simplest text-extraction method; you can get a lot
            # more information about word positions if you need it (see the
            # sketch below).
            text = pytesseract.image_to_string(img)
            # Save the text
            Path(ocr_path, f"{img_file.stem}.txt").write_text(text)
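# Tesseract can also report where each word sits on the page. The cell below is a minimal, hypothetical sketch of this (it's not part of the processing loop above), using pytesseract's `image_to_data` with `Output.DICT`; the `"page.jpg"` path is just a placeholder for one of the volume images.

# In[ ]:


# A sketch only -- swap "page.jpg" for a real image path before running.
with Image.open("page.jpg") as img:
    # image_to_data returns parallel lists, one entry per detected element,
    # including the word text, its bounding box, and a confidence score
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

for word, left, top, conf in zip(
    data["text"], data["left"], data["top"], data["conf"]
):
    # Skip empty strings and low-confidence detections
    if word.strip() and float(conf) > 50:
        print(f"{word} at x={left}, y={top} (confidence {conf})")


# ----
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/) as part of the [Everyday Heritage](https://everydayheritage.au/) project.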