PDFs of the Tasmanian Post Office Directory from 1890 to 1948 are available from Libraries Tasmania. This notebook downloads all 48 PDFs, then extracts images and text from the PDFs using PyMuPDF.
Further processing:
import re
from pathlib import Path
import fitz
import requests
# The base URL for downloads from Libraries Tasmania.
# A volume identifier is appended directly to this value to build the download URL for each PDF.
download_url = "https://stors.tas.gov.au/download/"
# This HTML list of PDFs was just copied from the page source of the Libraries Tas viewer. It could of course be scraped automatically.
pdf_list = """
<li pid="AUTAS001126438076P1896-97PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1896-97PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Tasmanian Post Office Directory 1896-97</span></a></li><li pid="AUTAS001126438076P1900PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1900PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1900</span></a></li><li pid="AUTAS001126438076P1890-91PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1890-91PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Tasmanian Post Office Directory 1890-91</span></a></li><li pid="AUTAS001126438076P1892-93PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1892-93PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Tasmanian Post Office Directory 1892-93</span></a></li><li pid="AUTAS001126438076P1894-95PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1894-95PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Tasmanian Post Office Directory 1894-95</span></a></li><li pid="AUTAS001126438076P1898PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1898PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1898</span></a></li><li pid="AUTAS001126438076P1906PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1906PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1906</span></a></li><li pid="AUTAS001126438076P1899PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1899PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1899</span></a></li><li pid="AUTAS001126438076P1907PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1907PDF" 
target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1907</span></a></li><li pid="AUTAS001126438076P1908PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1908PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1908</span></a></li><li pid="AUTAS001126438076P1901PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1901PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1901</span></a></li><li pid="AUTAS001126438076P1909PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1909PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1909</span></a></li><li pid="AUTAS001126438076P1902PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1902PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1902</span></a></li><li pid="AUTAS001126438076P1910PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1910PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1910</span></a></li><li pid="AUTAS001126438076P1903PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1903PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1903</span></a></li><li pid="AUTAS001126438076P1911PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1911PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1911</span></a></li><li pid="AUTAS001126438076P1904PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1904PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1904</span></a></li><li 
pid="AUTAS001126438076P1912PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1912PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1912</span></a></li><li pid="AUTAS001126438076P1905PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1905PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1905</span></a></li><li pid="AUTAS001126438076P1913PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1913PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1913</span></a></li><li pid="AUTAS001126438076P1914PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1914PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1914</span></a></li><li pid="AUTAS001126438076P1915PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1915PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1915</span></a></li><li pid="AUTAS001126438076P1916PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1916PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1916</span></a></li><li pid="AUTAS001126438076P1917PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1917PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1917</span></a></li><li pid="AUTAS001126438076P1918PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1918PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1918</span></a></li><li pid="AUTAS001126438076P1919PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1919PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> 
<span class="label">Wise's Tasmanian Directory 1919</span></a></li><li pid="AUTAS001126438076_1920" parent="AUTAS001126438076"><a href="/AUTAS001126438076_1920" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1920 </span></a></li><li pid="AUTAS001126438076P1921PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1921PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1921</span></a></li><li pid="AUTAS001126438076P1922PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1922PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1922</span></a></li><li pid="AUTAS001126438076P1923PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1923PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1923</span></a></li><li pid="AUTAS001126438076P1924PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1924PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1924</span></a></li><li pid="AUTAS001126438076P1925PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1925PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1925</span></a></li><li pid="AUTAS001126438076P1926PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1926PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1926</span></a></li><li pid="AUTAS001126438076P1927PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1927PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1927</span></a></li><li pid="AUTAS001126438076P1928PDF" parent="AUTAS001126438076"><a 
href="/AUTAS001126438076P1928PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1928</span></a></li><li pid="AUTAS001126438076P1929PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1929PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1929</span></a></li><li pid="AUTAS001126438076P1930PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1930PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1930</span></a></li><li pid="AUTAS001126438076P1931PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1931PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1931</span></a></li><li pid="AUTAS001126438076P1932PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1932PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1932</span></a></li><li pid="AUTAS001126438076_1933-34" parent="AUTAS001126438076"><a href="/AUTAS001126438076_1933-34" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise\'s Tasmanian Directory 1933-34 </span></a></li><li pid="AUTAS001126438076P1935PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1935PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1935</span></a></li><li pid="AUTAS001126438076P1936PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1936PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1936</span></a></li><li pid="AUTAS001126438076P1937PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1937PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 
1937</span></a></li><li pid="AUTAS001126438076P1938PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1938PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1938</span></a></li><li pid="AUTAS001126438076P1939PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1939PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1939</span></a></li><li pid="AUTAS001126438076_1940-41" parent="AUTAS001126438076"><a href="/AUTAS001126438076_1940-41" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise\'s Tasmanian Directory1940-41 </span></a></li><li pid="AUTAS001126438076_1941-42" parent="AUTAS001126438076"><a href="/AUTAS001126438076_1941-42" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise\'s Tasmanian Directory1941-42 </span></a></li><li pid="AUTAS001126438076_1942-43" parent="AUTAS001126438076"><a href="/AUTAS001126438076_1942-43" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1942-43</span></a></li><li pid="AUTAS001126438076_1943-44" parent="AUTAS001126438076"><a href="/AUTAS001126438076_1943-44" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise\'s Tasmanian Directory1943-44 </span></a></li><li pid="AUTAS001126438076_1944-45" parent="AUTAS001126438076"><a href="/AUTAS001126438076_1944-45" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise\'s Tasmanian Directory1944-45 </span></a></li><li pid="AUTAS001126438076_1945-46" parent="AUTAS001126438076"><a href="/AUTAS001126438076_1945-46" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise\'s Tasmanian Directory1945-46 </span></a></li><li pid="AUTAS001126438076P1945PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1945PDF" 
target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1945</span></a></li><li pid="AUTAS001126438076P1947PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1947PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1947</span></a></li><li pid="AUTAS001126438076P1948PDF" parent="AUTAS001126438076"><a href="/AUTAS001126438076P1948PDF" target="imageFrame"><span class="fiv-cla fiv-icon-pdf"></span> <span class="label">Wise's Tasmanian Directory 1948</span></a></li>
"""
We'll extract all the volume identifiers from the HTML, then download each PDF.
# Download all the PDFs

# Extract all the volume identifiers from the HTML fragment.
# Most identifiers look like "AUTAS001126438076P1921PDF", but some volumes use
# the form "AUTAS001126438076_1920", so the underscore has to be part of the
# character class -- otherwise those volumes are silently left out.
pids = re.findall(r"href=\"/([A-Z0-9_\-]+)\"", pdf_list)

# Make sure the output directory exists before anything is written into it
output_dir = Path("tasmania")
output_dir.mkdir(exist_ok=True)

# Loop through the identifiers, downloading and saving each PDF
for pid in pids:
    print(pid)
    response = requests.get(f"{download_url}{pid}")
    # Don't save an HTTP error page under a .pdf name; report it and move on
    # so one failed volume doesn't stop the remaining downloads.
    if not response.ok:
        print(f"Download failed for {pid}: HTTP {response.status_code}")
        continue
    Path(output_dir, f"{pid}.pdf").write_bytes(response.content)
# Loop through all the PDFs (sorted so volumes are processed in a repeatable order)
for pdf in sorted(Path("tasmania").glob("*.pdf")):
    print(pdf.name)
    # The volume identifier is the file name without its ".pdf" extension
    pid = pdf.stem

    # Create directory for volume
    data_dir = Path("tasmania", pid)
    data_dir.mkdir(exist_ok=True)

    # Create directories for text and images
    text_dir = Path(data_dir, "text")
    image_dir = Path(data_dir, "images")
    text_dir.mkdir(exist_ok=True)
    image_dir.mkdir(exist_ok=True)

    # Open the PDF with PyMuPDF; the context manager closes the document
    # once all of its pages have been processed.
    with fitz.open(pdf) as doc:
        for i, page in enumerate(doc):
            # Get images
            images = page.get_images()
            for n, image in enumerate(images, start=1):
                # The first item of each image entry is the xref used to load it
                pix = fitz.Pixmap(doc, image[0])
                # A page can hold more than one image. The last one keeps the
                # plain "<pid>-<page>.jpg" name (the file that previously
                # survived on disk); earlier ones get a numeric suffix so they
                # are kept instead of being overwritten.
                suffix = "" if n == len(images) else f"-{n}"
                image_file = Path(image_dir, f"{pid}-{i+1}{suffix}.jpg")
                pix.save(image_file)

            # Get text
            text_path = Path(text_dir, f"{pid}-{i+1}.txt")
            # The sort option tries to organise the text into a natural reading view.
            # However, this doesn't always manage to identify column boundaries,
            # so values from adjacent columns can be munged together.
            text = page.get_text(sort=True)
            # Write as UTF-8 explicitly so the output doesn't depend on the
            # platform's default encoding.
            text_path.write_text(text, encoding="utf-8")
Created by Tim Sherratt for the GLAM Workbench as part of the Everyday Heritage project.