The Trove web interface doesn't provide a way of getting high-resolution page images from newspapers. This simple app lets you download page images as complete, high-resolution JPG files.
import ipywidgets as widgets
import requests
import datetime
import arrow
import random
import re
import shutil
from collections import OrderedDict
from operator import itemgetter
from IPython.display import display, HTML, FileLink, clear_output
# Module-level placeholder for a newspaper-title lookup.
# NOTE(review): get_titles works on its own local data and never updates this
# module-level name -- confirm whether it is still needed.
titles = dict()
def display_button():
    """Create the 'Get page image' button, attach get_page_image as its click handler, and display it."""
    button_options = dict(
        description='Get page image',
        tooltip='Click to download',
        button_style='primary',  # 'success', 'info', 'warning', 'danger' or ''
        icon='',
        disabled=False,
    )
    download_button = widgets.Button(**button_options)
    download_button.on_click(get_page_image)
    display(download_button)
def get_page_image(b):
    """Download a Trove newspaper page image and display a link and preview.

    Click handler for the 'Get page image' button. The output area is cleared
    and the button redrawn, then the page id is worked out from, in order:

    1. a page id in ``article_url`` (matches ``page/123`` or ``page123``);
    2. an article id in ``article_url`` (matches ``article/123`` or
       ``article123``), resolved to its page through the Trove API;
    3. if no url was entered, a search for the first article on page
       ``page`` of newspaper ``title`` on the selected ``date``.

    The image at zoom level ``size`` is saved to
    ``data/<page id>-level<size>.jpg`` (the ``data`` folder is created if
    needed), then a FileLink and an ``<img>`` preview are displayed.

    Missing ids, missing records, unreadable API replies and failed image
    downloads are reported with a short message in the output area instead
    of raising inside the callback. Connection-level errors raised by
    ``requests.get`` still propagate.

    Args:
        b: The button instance passed by ``on_click`` (unused).
    """
    import os  # function-scope import: only needed to create the output folder

    clear_output(wait=True)
    display_button()
    article = None
    page_id = None
    url = (article_url.value or '').strip()
    if url:
        # Decide on the actual id patterns rather than on the substring
        # 'page', which can also turn up elsewhere in an article url.
        page_match = re.search(r'page\/{0,1}(\d+)', url)
        article_match = re.search(r'article\/{0,1}(\d+)', url)
        if page_match:
            page_id = page_match.group(1)
        elif article_match:
            article_id = article_match.group(1)
            params = {
                'reclevel': 'full',
                'encoding': 'json',
                'key': api_key.value
            }
            response = requests.get('http://api.trove.nla.gov.au/v2/newspaper/{}'.format(article_id), params=params)
            try:
                article = response.json()['article']
            except (KeyError, TypeError, ValueError):
                # Non-JSON body or a payload without an article record.
                display(HTML('Article not found!'))
                return
        else:
            display(HTML('No page or article id found in that url!'))
            return
    else:
        if date.value is None:
            display(HTML('Enter a url or select a date!'))
            return
        # Search the day ending at midnight on the selected date.
        end = arrow.get(date.value)
        start = end.shift(days=-1)
        date_query = 'date:[{}Z TO {}Z]'.format(start.format('YYYY-MM-DDT00:00:00'), end.format('YYYY-MM-DDT00:00:00'))
        params = {
            'zone': 'newspaper',
            'reclevel': 'full',
            'encoding': 'json',
            'n': '1',
            'q': '{} firstpageseq:{}'.format(date_query, page.value),
            'l-title': title.value,
            'key': api_key.value
        }
        response = requests.get('http://api.trove.nla.gov.au/v2/result', params=params)
        try:
            article = response.json()['response']['zone'][0]['records']['article'][0]
        except (KeyError, IndexError, TypeError, ValueError):
            display(HTML('Page not found!'))
    if article:
        # The article record carries the url of the page it appears on.
        page_match = re.search(r'page\/(\d+)', article.get('trovePageUrl') or '')
        if page_match:
            page_id = page_match.group(1)
        else:
            display(HTML('Page not found!'))
    if not page_id:
        return
    # Construct the url we need to download the image
    page_url = 'http://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(page_id, size.value)
    filename = 'data/{}-level{}.jpg'.format(page_id, size.value)
    # Download the page image
    response = requests.get(page_url, stream=True)
    try:
        if not response.ok:
            # Don't save an error page under a .jpg name.
            display(HTML('Image not available (HTTP status {})!'.format(response.status_code)))
            return
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, 'wb') as out_file:
            # iter_content applies any transfer decoding; response.raw does not.
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
    finally:
        # Release the streamed connection even if writing fails.
        response.close()
    display(FileLink(filename))
    display(HTML('<img src="{}">'.format(filename)))
def get_titles(b):
    """Fetch the newspaper titles from Trove and load them into the ``title`` dropdown.

    Click handler for the 'Load titles' button. Each newspaper becomes a
    ``(title, id)`` option, sorted by title, so the dropdown shows the title
    and ``title.value`` yields the id. The options are passed as a list of
    pairs rather than a dict so that newspapers sharing a title are not
    collapsed into a single entry.

    If the reply is not JSON or lacks the expected records (for example
    because the API key is wrong), a message is displayed and the dropdown
    is left unchanged.

    Args:
        b: The button instance passed by ``on_click`` (unused).
    """
    params = {
        'encoding': 'json',
        'key': api_key.value
    }
    # NOTE(review): unlike the other Trove calls in this file, this url has no
    # '/v2/' segment -- confirm which endpoint is intended.
    response = requests.get('http://api.trove.nla.gov.au/newspaper/titles', params=params)
    try:
        newspapers = response.json()['response']['records']['newspaper']
        title_options = sorted(((t['title'], t['id']) for t in newspapers), key=itemgetter(0))
    except (KeyError, TypeError, ValueError):
        display(HTML('Titles could not be loaded! Check your API key and try again.'))
        return
    title.options = title_options
Get your own Trove API key and enter it below.
# Text box for the Trove API key; the button callbacks read it via api_key.value.
api_key = widgets.Text(
    description='API key:',
    placeholder='Enter your Trove API key',
    disabled=False,
)
display(api_key)
You can use the URL from your browser's location bar, or an article or page permalink.
# Text box for an article or page url; get_page_image reads it via article_url.value.
article_url = widgets.Text(
    description='Article/Page:',
    placeholder='Enter an article or page url',
    disabled=False,
)
display(article_url)
If you've provided a URL above, these settings will be ignored.
# Issue date; get_page_image only uses it when no url has been entered.
date = widgets.DatePicker(description='Date:', disabled=False)
display(date)

# Page number within the issue, starting from page 1.
page = widgets.IntText(description='Page:', value=1, disabled=False)
display(page)
# Newspaper dropdown; it shows a single placeholder entry until get_titles
# replaces its options.
title = widgets.Dropdown(
    description='Newspaper:',
    options=['Click the button to load titles'],
    disabled=False,
)

# Button that runs get_titles when clicked.
titles_button = widgets.Button(
    description='Load titles',
    tooltip='Click to load titles',
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    icon='',
    disabled=False,
)
titles_button.on_click(get_titles)

# Show the dropdown and its loader button side by side.
display(
    widgets.HBox([title, titles_button])
)
Page images are available in seven resolutions that correspond to the zoom levels in the Trove web interface. As a rough guide:
# Zoom level of the image to download, limited to 1-7 with 4 as the default.
size = widgets.BoundedIntText(
    description='Size:',
    value=4,
    min=1,
    max=7,
    disabled=False,
)
display(size)

# Draw the 'Get page image' button for the first time.
display_button()