In another notebook, I showed how to get high-resolution page images from newspapers. But what if you only want a nice square thumbnail for display purposes? This notebook gets the page image and then crops and resizes the top of the article to create a thumbnail.
Of course, if you're doing this to lots of articles you won't want to feed each one in manually. If you're viewing this notebook in app mode (no code visible), just click on the 'Edit app' button to see what's going on behind the scenes. You should be able to copy and modify the code to suit your purposes.
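Briefly, the steps to generate a thumbnail are:

1. Extract the article id from the URL and load the HTML version of the article.
2. Scrape the position (x, y, width, and height) of each line of OCR text on the page, and use these to calculate a bounding box around the top of the article (or around the first illustration, if that option is selected).
3. Download the image of the page that contains the article.
4. Crop the page image to the bounding box and resize it to the requested size.
5. Pad the result with white so that the thumbnail is square, then display it along with a download link.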
import base64
import os
import re
from io import BytesIO
import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from PIL import Image, ImageOps
%%capture
# TESTING
# The %%capture cell magic must remain the first line of its cell; it captures
# the cell's output so that nothing from the extension loading is displayed.
# Load the `dotenv` IPython extension and run its `%dotenv` magic
# (presumably to read environment variables such as GW_STATUS from a local
# .env file -- confirm against the extension's documentation).
%load_ext dotenv
%dotenv
# NOTE(review): `titles` is not referenced by any other code visible in this notebook.
titles = {}
# Output widget that the button callback clears and then writes the download
# link and thumbnail image into.
results = widgets.Output()
def get_box(zones, column_tolerance=200):
    """
    Calculate a bounding box around the top of an article.

    Only zones whose left edge is within `column_tolerance` of the left-most
    zone are used to find the top, right, and bottom limits. The box is then
    trimmed so that it is no taller than it is wide.

    Parameters:
        zones: a non-empty sequence of mapping-like objects (eg BeautifulSoup
            tags or dicts) that provide 'data-page-id', 'data-x', 'data-y',
            'data-w', and 'data-h' values.
        column_tolerance: how far a zone's left edge can be from the left-most
            zone and still be included. Must be greater than zero.

    Returns:
        A dict with 'page_id', 'left', 'top', 'right', and 'bottom' keys.

    Raises:
        IndexError: if `zones` is empty.
    """
    # The page id is taken from the first zone
    page_id = zones[0]["data-page-id"]
    # Find the real left-most edge rather than starting from an arbitrary
    # sentinel value, so large coordinates are handled correctly
    left = min(int(zone["data-x"]) for zone in zones)
    # Limit to the zones that start in (roughly) the left-most column
    column = [zone for zone in zones if int(zone["data-x"]) < (left + column_tolerance)]
    top = min(int(zone["data-y"]) for zone in column)
    right = max(int(zone["data-x"]) + int(zone["data-w"]) for zone in column)
    bottom = max(int(zone["data-y"]) + int(zone["data-h"]) for zone in column)
    # For a square image -- don't let the box be taller than it is wide
    bottom = min(bottom, top + (right - left))
    return {
        "page_id": page_id,
        "left": left,
        "top": top,
        "right": right,
        "bottom": bottom,
    }
def get_illustration(zone):
    """
    Convert the position of a single illustration zone into a bounding box.

    The zone's 'data-x', 'data-y', 'data-w', and 'data-h' values are read as
    integers; the right and bottom edges are the origin plus the width and height.
    Returns a dict with 'page_id', 'left', 'top', 'right', and 'bottom' keys.
    """
    box = {"page_id": zone["data-page-id"]}
    x_origin = int(zone["data-x"])
    width = int(zone["data-w"])
    y_origin = int(zone["data-y"])
    height = int(zone["data-h"])
    box["left"] = x_origin
    box["top"] = y_origin
    box["right"] = x_origin + width
    box["bottom"] = y_origin + height
    return box
def get_article_box(article_url, illustrated=False):
    """
    Positional information about the article is attached to each line of the OCR output in data attributes.
    This function loads the HTML version of the article and scrapes the x, y, and width values for each line of text
    to determine the coordinates of a box around the article.

    Parameters:
        article_url: url of the article's web page.
        illustrated: if True, and the article includes an illustration on the page,
            return the box of the first illustration instead of the text.

    Returns:
        A dict with 'page_id', 'left', 'top', 'right', and 'bottom' keys.

    Raises:
        requests.HTTPError: if the article page returns an error status.
        requests.Timeout: if the server doesn't respond in time.
        IndexError: if no text zones are found on the page.
    """
    # Without a timeout a stalled connection would hang the notebook indefinitely
    response = requests.get(article_url, timeout=60)
    # Fail here with a meaningful error rather than trying to scrape an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    # 'onPage' limits to those on the current page
    illustrations = soup.select("div.illustration.onPage")
    if illustrations and illustrated is True:
        # The positional data attributes are read from the illustration's parent element
        zone = illustrations[0].parent
        box = get_illustration(zone)
    else:
        # Lines of OCR are in divs with the class 'zone'
        zones = soup.select("div.zone.onPage")
        box = get_box(zones)
    return box
def get_article_thumbnail(b):
    """
    Extract a square thumbnail of the article from the page image.

    This is the click handler for the 'Get thumbnail' button. It reads its settings
    from the `article_url`, `size`, and `illustrated` widgets, and writes a download
    link and the thumbnail image to the `results` output widget.

    Parameters:
        b: the button that was clicked (supplied by `on_click`, not used).
    """
    results.clear_output(wait=True)
    match = re.search(r"article\/{0,1}(\d+)", article_url.value)
    if match is None:
        # No article id in the supplied url -- tell the user rather than crashing
        with results:
            print("Couldn't find an article id in that url.")
        return
    article_id = match.group(1)
    thumb_size = size.value
    # Get position of article on the page(s)
    box = get_article_box(
        "http://nla.gov.au/nla.news-article{}".format(article_id),
        illustrated=illustrated.value,
    )
    # Construct the url we need to download the page image
    # NOTE(review): level 7 is presumably the zoom level that the zone coordinates relate to -- confirm
    page_url = (
        "https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}".format(
            box["page_id"], 7
        )
    )
    # Download the page image
    # The timeout stops a stalled download from hanging the notebook
    response = requests.get(page_url, timeout=60)
    # Stop here if the image service returned an error instead of an image
    response.raise_for_status()
    # Open download as an image for editing
    img = Image.open(BytesIO(response.content))
    # Crop image to article box
    points = (box["left"], box["top"], box["right"], box["bottom"])
    thumb = img.crop(points)
    # Resize (in place, preserving the aspect ratio)
    # Image.ANTIALIAS was removed in Pillow 10 -- Image.LANCZOS is the same filter
    thumb.thumbnail((thumb_size, thumb_size), Image.LANCZOS)
    new_w, new_h = thumb.size
    # Squarify -- share the leftover space evenly between opposite sides
    delta_w = thumb_size - new_w
    delta_h = thumb_size - new_h
    padding = (
        delta_w // 2,
        delta_h // 2,
        delta_w - (delta_w // 2),
        delta_h - (delta_h // 2),
    )
    thumb = ImageOps.expand(thumb, padding, fill="white")
    # Create a filename for the thumbnail
    thumb_file = "nla.news-article{}-{}.jpg".format(article_id, thumb_size)
    # To avoid problems with saving & using local files, we're going to save the image as a file object
    image_file = BytesIO()
    thumb.save(image_file, "JPEG")
    image_bytes = image_file.getvalue()
    # For the download link we can use a data uri -- a base64 encoded version of the file
    # The image is saved as a JPEG, so the media type has to be image/jpeg
    encoded_string = "data:image/jpeg;base64," + base64.b64encode(image_bytes).decode()
    with results:
        # Create a download link using the data uri
        display(
            HTML(
                '<a download="{0}" href="{1}">Download {0}</a>'.format(
                    thumb_file, encoded_string
                )
            )
        )
        # Display the image
        display(widgets.Image(value=image_bytes, format="jpg"))
You can use the URL from your browser's location bar, or an article permalink.
# Text box for the url of the article to make a thumbnail from
article_url = widgets.Text(
    description="Article/Page:",
    placeholder="Enter an article url",
    disabled=False,
)
display(article_url)
Generate a square thumbnail with this height and width (in pixels).
# Width and height of the square thumbnail in pixels (100 to 800, default 500)
size = widgets.BoundedIntText(
    description="Size:",
    value=500,
    min=100,
    max=800,
    step=50,
    disabled=False,
)
display(size)
If there's an illustration in the article, check this box to use it as the thumbnail. The illustration will not be cropped, so whitespace will be added around the image to make it square.
# Checkbox to choose between cropping the article text and using an illustration
illustrated = widgets.Checkbox(
    description="Use illustration as thumbnail",
    value=False,
    disabled=False,
)
display(illustrated)
# Button that starts the thumbnail generation
button = widgets.Button(
    button_style="primary",
    description="Get thumbnail",
    tooltip="Click to download",
    icon="",
    disabled=False,
)
# Run get_article_thumbnail each time the button is clicked
button.on_click(get_article_thumbnail)
display(button)
# The download link and thumbnail are written into this output area
display(results)
# TESTING
# When GW_STATUS is set to 'dev', fill in a sample article and click the
# button programmatically so that the notebook runs without manual input
if os.environ.get("GW_STATUS") == "dev":
    article_url.value = "https://trove.nla.gov.au/newspaper/article/61389505"
    button.click()
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.