#!/usr/bin/env python
# coding: utf-8

# # Extract text from PDF images using Tesseract
#
# Although I was able to [extract text from the PDFs directly](tas-pod-save-text-images.ipynb), I wasn't happy with the quality. In particular, column layout detection was quite variable, munging values from different columns together. After a few tests, I decided that re-OCRing the images using [Tesseract](https://pypi.org/project/pytesseract/) would produce better results. Tesseract's automatic page layout detection does a pretty good job of identifying the columns, and the OCR quality in general seems better. There's still some munging of values across columns and various other errors, but I think the quality is good enough for searching.

# In[36]:


from pathlib import Path

import pytesseract
from natsort import natsorted, ns
from PIL import Image


# In[ ]:


# Get a list of volumes
vols = natsorted(
    [d for d in Path("tasmania").glob("AUTAS*") if d.is_dir()], alg=ns.PATH
)

# Loop through each volume
for vol in vols:
    print(vol.name)
    # Create a directory for the OCRd text
    ocr_path = Path(vol, "tesseract")
    ocr_path.mkdir(exist_ok=True)
    # Loop through all the images in the volume
    vol_images = natsorted(Path(vol, "images").glob("*.jpg"), alg=ns.PATH)
    for img_file in vol_images:
        with Image.open(img_file) as img:
            # Extract the text from the image.
            # This is the simplest text-extraction method; you can get a lot
            # more information about word positions if you need it (see the
            # sketch below).
            text = pytesseract.image_to_string(img)
            # Save the text
            Path(ocr_path, f"{img_file.stem}.txt").write_text(text)
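# Tesseract can also report where each word sits on the page. The cell below is a minimal, hypothetical sketch of this (it's not part of the processing loop above), using pytesseract's `image_to_data` with `Output.DICT`; the `"page.jpg"` path is just a placeholder for one of the volume images.

# In[ ]:


# A sketch only -- swap "page.jpg" for a real image path before running.
with Image.open("page.jpg") as img:
    # image_to_data returns parallel lists, one entry per detected element,
    # including the word text, its bounding box, and a confidence score
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

for word, left, top, conf in zip(
    data["text"], data["left"], data["top"], data["conf"]
):
    # Skip empty strings and low-confidence detections
    if word.strip() and float(conf) > 50:
        print(f"{word} at x={left}, y={top} (confidence {conf})")


# ----
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/) as part of the [Everyday Heritage](https://everydayheritage.au/) project.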