The Trove web interface doesn't provide a way of getting high-resolution page images from newspapers. This simple app lets you download page images as complete, high-resolution JPG files.
import base64
import os
import re
from collections import OrderedDict
from io import BytesIO
from operator import itemgetter
import arrow
import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
/home/tim/.pyenv/versions/3.8.12/envs/trove-newspapers/lib/python3.8/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.12) doesn't match a supported version! warnings.warn(
%%capture
# Load environment variables from a local .env file using the dotenv
# IPython extension. %%capture suppresses any output from this cell.
# The TESTING cell at the end of the notebook reads GW_STATUS and
# TROVE_API_KEY through os.getenv().
%load_ext dotenv
%dotenv
# Shared notebook state.
# `out` is the output area that the button callbacks clear and write into.
out = widgets.Output()
# `titles` starts out as an empty mapping.
titles = dict()
def get_page_id(article_url):
    """Look up the Trove page identifier for a newspaper article.

    Fetches the article's web page and reads the ``data-page-id`` attribute
    from the first OCR zone that is on the current page.

    Args:
        article_url: URL of a Trove newspaper article.

    Returns:
        The page identifier as a string, or ``None`` when the fetched page
        contains no on-page OCR zone carrying a page id (for example, when
        the URL does not point at an article).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout stops the notebook hanging forever on a stalled connection.
    response = requests.get(article_url, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # Lines of OCR are in divs with the class 'zone'
    # 'onPage' limits to those on the current page
    zone = soup.select_one("div.zone.onPage[data-page-id]")
    # Return None rather than raising IndexError when nothing matches.
    return zone["data-page-id"] if zone is not None else None
def get_page_image(b):
    """Download a newspaper page image and show it with a download link.

    The page is identified, in order of preference, by:

    1. a page id embedded in the url entered in ``article_url``;
    2. the page that the article at that url appears on (via ``get_page_id``);
    3. a Trove API search using the ``date``, ``page`` and ``title`` widgets.

    Results and status messages are written to the shared ``out`` widget.

    Args:
        b: The button that was clicked. Unused; it is supplied by
            ``Button.on_click``.
    """
    out.clear_output()
    # display_button()
    url = article_url.value.strip()
    if not url and date.value is None:
        # Nothing to search with -- avoid passing None to arrow.get().
        _show_message("Enter an article or page url, or select a date.")
        return
    try:
        page_id = _resolve_page_id(url)
        if not page_id:
            _show_message("Page not found!")
            return
        image_bytes = _download_page_image(page_id)
    except requests.RequestException:
        # The exception text can contain the request url (including the API
        # key), so show a generic message instead of the error itself.
        _show_message(
            "Request failed. Check your connection, API key and inputs, then try again."
        )
        return
    filename = f"{page_id}-level{size.value}.jpg"
    # For the download link we can use a data uri -- a base64 encoded version of the file
    encoded_string = "data:image/jpeg;base64," + base64.b64encode(image_bytes).decode()
    with out:
        display(
            HTML(
                f'<a download="{filename}" href="{encoded_string}">Download {filename}</a>'
            )
        )
        display(widgets.Image(value=image_bytes, format="jpg"))


def _show_message(message):
    """Display a short HTML status message in the shared output area."""
    with out:
        display(HTML(message))


def _resolve_page_id(url):
    """Return the page id for a page url, an article url, or the search widgets.

    Returns None when no page can be identified.
    """
    if not url:
        return _search_for_page_id()
    # Page urls carry the id directly. Falling through when the pattern does
    # not match avoids crashing on urls that merely contain the word "page".
    match = re.search(r"page\/{0,1}(\d+)", url)
    if match:
        return match.group(1)
    try:
        return get_page_id(url)
    except (IndexError, KeyError):
        # The url did not lead to a page with identifiable OCR zones.
        return None


def _search_for_page_id():
    """Find a page id through the Trove API using the date, page and title widgets.

    Returns the page id as a string, or None if the search yields no usable record.
    """
    end = arrow.get(date.value)
    start = end.shift(days=-1)
    date_query = "date:[{}Z TO {}Z]".format(
        start.format("YYYY-MM-DDT00:00:00"), end.format("YYYY-MM-DDT00:00:00")
    )
    params = {
        "zone": "newspaper",
        "reclevel": "full",
        "encoding": "json",
        "n": "1",
        "q": "{} firstpageseq:{}".format(date_query, page.value),
        "l-title": title.value,
        "key": api_key.value,
    }
    # Use https so the API key is not sent in clear text.
    response = requests.get(
        "https://api.trove.nla.gov.au/v2/result", params=params, timeout=60
    )
    response.raise_for_status()
    try:
        article = response.json()["response"]["zone"][0]["records"]["article"][0]
        page_url = article["trovePageUrl"]
    except (KeyError, IndexError, ValueError):
        # No matching record, or the response body was not the expected JSON.
        return None
    match = re.search(r"page\/(\d+)", page_url)
    return match.group(1) if match else None


def _download_page_image(page_id):
    """Download the page image at the selected size and return its bytes."""
    # Construct the url we need to download the image
    page_url = (
        "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
            page_id, size.value
        )
    )
    response = requests.get(page_url, timeout=120)
    # Without this check an error page would be shown and offered as a JPG.
    response.raise_for_status()
    return response.content
def get_titles(b):
    """Load the newspaper titles from the Trove API into the ``title`` dropdown.

    Titles are sorted alphabetically and each label maps to the title's id.
    If the request fails or the response is not in the expected shape
    (for example because the API key is missing or invalid), a message is
    written to the shared ``out`` widget and the dropdown is left unchanged.

    Args:
        b: The button that was clicked. Unused; it is supplied by
            ``Button.on_click``.
    """
    params = {"encoding": "json", "key": api_key.value}
    try:
        # Use https so the API key is not sent in clear text.
        response = requests.get(
            "https://api.trove.nla.gov.au/v2/newspaper/titles",
            params=params,
            timeout=60,
        )
        response.raise_for_status()
        newspapers = response.json()["response"]["records"]["newspaper"]
    except (requests.RequestException, KeyError, ValueError):
        out.clear_output()
        with out:
            display(HTML("Couldn't load titles. Check your API key and try again."))
        return
    title_list = sorted(
        ((t["title"], t["id"]) for t in newspapers), key=itemgetter(0)
    )
    # Assign directly to the widget; the previous local variable only
    # shadowed the module-level `titles` without ever updating it.
    title.options = OrderedDict(title_list)
You can use the URL from your browser's location bar, or an article or page permalink.
# Text box for an article or page url; its value is read when the
# "Get page image" button is clicked.
article_url = widgets.Text(
    description="Article/Page:",
    placeholder="Enter an article or page url",
    disabled=False,
)
display(article_url)
Text(value='', description='Article/Page:', placeholder='Enter an article or page url')
If you've provided a URL above, these settings will be ignored.
Get your own Trove API key and enter it below.
# Text box for the Trove API key that is sent with every API request.
api_key = widgets.Text(
    description="API key:",
    placeholder="Enter your Trove API key",
    disabled=False,
)
display(api_key)
Text(value='', description='API key:', placeholder='Enter your Trove API key')
# Date picker for the issue date used in the API search.
date = widgets.DatePicker(
    disabled=False,
    description="Date:",
)
display(date)
DatePicker(value=None, description='Date:')
# Page number within the issue; defaults to the first page.
page = widgets.IntText(
    description="Page:",
    value=1,
    disabled=False,
)
display(page)
IntText(value=1, description='Page:')
# Newspaper selector. It starts with a single placeholder option and is
# filled in by get_titles() when the "Load titles" button is clicked.
title = widgets.Dropdown(
    description="Newspaper:",
    options=["Click the button to load titles"],
    disabled=False,
)
titles_button = widgets.Button(
    description="Load titles",
    tooltip="Click to load titles",
    button_style="",  # 'success', 'info', 'warning', 'danger' or ''
    icon="",
    disabled=False,
)
titles_button.on_click(get_titles)
# Show the dropdown and its button side by side.
display(widgets.HBox(children=[title, titles_button]))
HBox(children=(Dropdown(description='Newspaper:', options=('Click the button to load titles',), value='Click t…
Page images are available in seven resolutions that correspond to the zoom levels in the Trove web interface. As a rough guide:
# Image resolution level, restricted to the range 1-7 (default 4).
size = widgets.BoundedIntText(
    description="Size:",
    value=4,
    min=1,
    max=7,
    disabled=False,
)
display(size)
BoundedIntText(value=4, description='Size:', max=7, min=1)
# Main action button: clicking it runs get_page_image(), which writes its
# results into the `out` area displayed underneath.
button = widgets.Button(
    description="Get page image",
    tooltip="Click to download",
    button_style="primary",
    icon="",
    disabled=False,
)
button.on_click(get_page_image)
display(button)
display(out)
Button(button_style='primary', description='Get page image', style=ButtonStyle(), tooltip='Click to download')
Output()
# TESTING
# Only runs when GW_STATUS is "dev" and a Trove API key is present in the
# environment: fill in the form with a known article and click the button.
if os.environ.get("GW_STATUS") == "dev" and os.environ.get("TROVE_API_KEY"):
    article_url.value = "https://trove.nla.gov.au/newspaper/article/61389505"
    api_key.value = os.environ.get("TROVE_API_KEY")
    button.click()
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.