After downloading all the PDFs of the Tasmanian Post Office Directories from 1890 to 1948, I extracted the images, uploaded the images to Amazon s3, and extracted text from the images using Tesseract. This notebook brings everything together in an SQLite database ready for delivery through Datasette. Each page of text is indexed by line and linked to the page images.
This is a slightly modified version of the code I used with data from the Trove journals section to generate new search interfaces for the NSW Post Office directories, and the Sydney Telephone directories.
Explore the completed search interface for the Tasmanian Post Office Directories!
# Let's import the libraries we need.
import json
import re
from pathlib import Path
import requests
from natsort import natsorted, ns
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from sqlite_utils import Database
# A shared requests session that automatically retries transient
# server errors (502/503/504) with exponential backoff.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for scheme in ("https://", "http://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))
First we'll create a skeleton metadata file for Datasette.
# Directory that will hold the SQLite db and the Datasette metadata file.
db_path = Path("tasmania-datasette")
db_path.mkdir(exist_ok=True)

# This is the basic metadata file that will be used by Datasette.
# The processing steps below will add details for each table/volume into it.
# Obviously you'd modify this for a different publication!
db_details = {
    "title": "Tasmanian Post Office Directories, 1890 to 1948",
    "source": "Trove",
    "source_url": "https://stors.tas.gov.au/ILS/SD_ILS-981598",
    "tables": {},
}
metadata = {
    "title": "Tasmanian Post Office Directories",
    "description_html": "<p>This is an experimental interface to the Tasmanian Post Office Directories, which have been digitised from the collections of the State Library of NSW and are <a href='https://nla.gov.au/nla.obj-820136958'>now available on Trove</a>.</p><p>Searching for entries in the directories on Trove is difficult, because Trove's results are grouped by 'article', which is not very helpful for publications like these where information is organised by row. This interface searches for individual lines of text, rather than pages or articles. So it points you straight to entries of interest. Once you've found something, you can view the entry within the context of the complete page, or click back to Trove to explore further.</p>",
    "databases": {"tasmanian-post-office-directories": db_details},
}
Now we'll populate the SQLite database.
# Create the database -- change the name as appropriate!
db = Database(db_path / "tasmanian-post-office-directories.db")
# Tables shared across all volumes: one row per page image, one per volume.
page_table = db["pages"]
vols_table = db["volumes"]
# Loop through the directories of each volume created by the harvesting process (above)
# Use natsorted so that they're processed in date order
# Loop through the directories of each volume created by the harvesting process (above).
# Use natsorted so that they're processed in date order.
vols = natsorted(
    [d for d in Path("tasmania").glob("AUTAS*") if d.is_dir()], alg=ns.PATH
)
# Compile the filename patterns once, outside the loops.
# NOTE: the page pattern escapes the dot before "txt" -- the original bare "."
# would have matched any single character.
year_pattern = re.compile(r"AUTAS\d+P([0-9\-]+)PDF")
page_pattern = re.compile(r"PDF-(\d+)\.txt")
for vol in vols:
    print(vol)
    obj_id = vol.name
    # In the case of the PO Directories each volume has a different year (or year range)
    # in the title. Here we're extracting the year and id, but extracting the year in
    # this way might not work for other titles.
    # The year is used as the title of the table in Datasette.
    # An AttributeError here means a directory name didn't match the expected pattern.
    year = year_pattern.search(vol.name).group(1)
    # Add a record for this volume to the database
    vols_table.insert({"vol_id": obj_id, "year": year}, pk="vol_id")
    # Update the metadata file with details of this volume
    metadata["databases"]["tasmanian-post-office-directories"]["tables"][year] = {
        "title": f"Tasmanian Post Office Directory, {year}",
        "source": "Libraries Tasmania",
        "source_url": f"https://stors.tas.gov.au/{obj_id}",
        "searchmode": "raw",
    }
    # Create a table for this volume. For the PO directories I'm using the year as the
    # table name. If year isn't available, some other way of naming the table would be
    # necessary, such as the full title.
    vol_table = db[year]
    # Loop through all the text page by page for this volume
    pages = natsorted(
        [p for p in Path(vol, "tesseract").glob("*.txt") if p.is_file()], alg=ns.PATH
    )
    for page in pages:
        lines = []
        # Insert details about this page into the pages table
        page_num = int(page_pattern.search(page.name).group(1))
        page_table.insert(
            {"page_id": page.stem, "page": page_num, "vol_id": obj_id},
            pk="page_id",
            foreign_keys=[("vol_id", "volumes")],
        )
        # Open the text file and loop through the lines.
        # line_num counts every physical line (blanks included) so stored line
        # numbers match positions in the original OCR file.
        with page.open("r") as txt:
            for line_num, line in enumerate(txt, start=1):
                # Strip the newline and surrounding whitespace
                line = line.replace("\n", "").strip()
                # If line is not blank, add details to a list of lines from this page
                if line:
                    lines.append(
                        {
                            "page": page_num,
                            "line": line_num,
                            "text": line,
                            "page_id": page.stem,
                        }
                    )
        # Insert all the lines from this page into the db
        vol_table.insert_all(
            lines, pk=("page", "line"), foreign_keys=[("page_id", "pages", "page_id")]
        )
    # Add a full text index on the line text
    vol_table.enable_fts(["text"])
# vol_table.optimize()
# Save the updated metadata file
# Write out the metadata file (now including per-volume tables) for Datasette.
Path(db_path, "datasette-metadata.json").write_text(json.dumps(metadata, indent=4))
You should now have an SQLite database containing the indexed text of all the volumes in the collection. You can explore the database using Datasette. In a Python environment you'll need to install the datasette package (`pip install datasette`).
Then, from within the directory containing the database, run:
datasette tasmanian-post-office-directories.db -m datasette-metadata.json
To share your database publicly, look at Datasette's publishing options.
Created by Tim Sherratt for the GLAM Workbench as part of the Everyday Heritage project.