#!/usr/bin/env python
# coding: utf-8

# # Make composite images from lots of Trove newspaper thumbnails
#
# This notebook starts with a search in Trove's newspapers. It uses the Trove API to work its way through the search results. For each article it creates a thumbnail image using the [code from this notebook](Get-article-thumbnail.ipynb). Once this first stage is finished, you have a directory full of lots of thumbnails.
#
# The next stage takes all those thumbnails and pastes them one by one into a BIG image to create a composite, or mosaic.
#
# You'll need to think carefully about the number of results in your search, and the size of the image you want to create. Harvesting all the thumbnails can take a long time.
#
# Also, you need to be able to set a path to a font file, so it's probably best to run this notebook on your local machine rather than in a cloud service, so you have more control over things like fonts. You might also need to adjust the font size depending on the font you choose.
#
# Some examples:
#
# * [White Australia Policy](https://easyzoom.com/image/139535)
# * [Australian aviators, pilots, flyers, and airmen](https://www.easyzoom.com/imageaccess/9d26953ccdf5475cad9c11f308cd7988)

# In[ ]:


import ipywidgets as widgets
import requests
import random
import re
import os
from IPython.display import display, HTML, FileLink, clear_output
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm import tqdm_notebook

# Shared session that retries on 502/503/504 responses with a backoff
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))


# ## Set your parameters
#
# Edit the values below as required.

# In[ ]:


font_path = '/Library/Fonts/Courier New.ttf'
font_size = 12

# Insert your search query
query = 'title:"white australia policy"'

# Insert your Trove API key
api_key = ''

size = 200  # Size of the thumbnails
cols = 90  # The width of the final image will be cols x size
rows = 55  # The height of the final image will be rows x size


# ## Define some functions

# In[ ]:


def get_article_top(article_url):
    '''
    Positional information about the article is attached to each line of the
    OCR output in data attributes. This function loads the HTML version of the
    article and scrapes the x, y, and width values for the top line of text
    (ie the top of the article).

    Parameters:
        article_url: url of the HTML version of the article.

    Returns:
        A dict with integer 'x', 'y' and 'w' values for the top-most zone,
        or None if the page contains no OCR zones at all.
    '''
    # Use the shared session so this request gets the same retry behaviour
    # as every other request in this notebook
    response = s.get(article_url, timeout=60)
    soup = BeautifulSoup(response.text, 'lxml')
    # Lines of OCR are in divs with the class 'zone'
    # 'onPage' limits to those on the current page
    zones = soup.select('div.zone.onPage')
    if not zones:
        return None
    # Illustrations might come after text even if they're above them on the page
    # So look for the element with the lowest 'y' attribute
    top_element = min(zones, key=lambda zone: int(zone['data-y']))
    return {
        'x': int(top_element['data-x']),
        'y': int(top_element['data-y']),
        'w': int(top_element['data-w'])
    }


def get_thumbnail(article, size, font_path, font_size):
    '''
    Create a labelled, square thumbnail from the top of an article.

    Parameters:
        article: article record from the Trove API; the 'trovePageUrl',
            'troveUrl' and 'id' keys are read.
        size: maximum width and height of the thumbnail in pixels.
        font_path: path to a TrueType font file used for the label.
        font_size: size of the label font.

    Returns:
        A PIL image, or None if the page id or the article position can't be
        found, or the page image can't be opened or cropped.
    '''
    # Extra pixels to include on each side of the crop box
    buffer = 0
    try:
        page_id = re.search(r'page\/(\d+)', article['trovePageUrl']).group(1)
    except (AttributeError, KeyError):
        return None
    # Get position of top line of article
    article_top = get_article_top(article['troveUrl'])
    if article_top is None:
        return None
    # Construct the url we need to download the image
    page_url = 'https://trove.nla.gov.au/ndp/imageservice/nla.news-page{}/level{}'.format(page_id, 7)
    # Download the page image
    response = s.get(page_url, timeout=120)
    # Use coordinates of top line to create a square box to crop thumbnail
    box = (
        article_top['x'] - buffer,
        article_top['y'] - buffer,
        article_top['x'] + article_top['w'] + buffer,
        article_top['y'] + article_top['w'] + buffer
    )
    try:
        # Open download as an image for editing, then crop it to create thumb.
        # Opening is inside the try so that a failed download (no usable image
        # data) skips this article instead of stopping the whole harvest.
        img = Image.open(BytesIO(response.content))
        thumb = img.crop(box)
    except OSError:
        return None
    # Resize thumb (Image.ANTIALIAS was removed from Pillow, LANCZOS is the
    # same filter under its current name)
    thumb.thumbnail((size, size), Image.LANCZOS)
    article_id = 'nla.news-article{}'.format(article['id'])
    fnt = ImageFont.truetype(font_path, font_size)
    draw = ImageDraw.Draw(thumb)
    # The label sits on a white strip along the bottom edge
    label_origin = (0, size - 10)
    label_area = [label_origin, (size, size)]
    try:
        # Try colour values first
        draw.rectangle(label_area, fill=(255, 255, 255, 255))
        draw.text(label_origin, article_id, font=fnt, fill=(0, 0, 0, 255))
    except TypeError:
        # A TypeError here is taken to mean the image is grayscale,
        # so use single values instead
        draw.rectangle(label_area, fill=255)
        draw.text(label_origin, article_id, font=fnt, fill=0)
    return thumb


def get_total_results(params):
    '''
    Get the total number of results for a search.

    Parameters:
        params: dict of Trove API query parameters (not modified).

    Returns:
        The total reported for the first zone in the response, as an int.
    '''
    # Work on a copy and ask for zero records, only the total is needed
    these_params = params.copy()
    these_params['n'] = 0
    response = s.get('https://api.trove.nla.gov.au/v2/result', params=these_params, timeout=60)
    data = response.json()
    return int(data['response']['zone'][0]['records']['total'])


def get_thumbnails(query, api_key, size, font_path, font_size):
    '''
    Harvest a thumbnail for every article in a set of search results.

    Thumbnails are saved as JPEGs in the 'thumbs' directory, which is created
    if it doesn't exist. Articles that already have a thumbnail file are
    skipped, so an interrupted harvest can be restarted.

    Parameters:
        query: Trove newspaper search query.
        api_key: Trove API key.
        size: maximum width and height of each thumbnail in pixels.
        font_path: path to a TrueType font file used for the labels.
        font_size: size of the label font.
    '''
    params = {
        'q': query,
        'zone': 'newspaper',
        'encoding': 'json',
        'bulkHarvest': 'true',
        'n': 100,
        'key': api_key,
        'reclevel': 'full'
    }
    # Make sure there's somewhere to save the thumbnails
    os.makedirs('thumbs', exist_ok=True)
    start = '*'
    total = get_total_results(params)
    with tqdm_notebook(total=total) as pbar:
        while start:
            params['s'] = start
            response = s.get('https://api.trove.nla.gov.au/v2/result', params=params, timeout=60)
            data = response.json()
            records = data['response']['zone'][0]['records']
            # The nextStart parameter is used to get the next page of results.
            # If there's no nextStart then it means we're on the last page of results.
            start = records.get('nextStart')
            # A page of results without any articles is treated as empty
            for article in records.get('article', []):
                thumb_file = 'thumbs/{}-nla.news-article{}.jpg'.format(article['date'], article['id'])
                if not os.path.exists(thumb_file):
                    # get_thumbnail returns None for articles it can't process
                    thumb = get_thumbnail(article, size, font_path, font_size)
                    if thumb is not None:
                        thumb.save(thumb_file)
                pbar.update(1)


def create_composite(cols, rows, size, sort_by_date=False):
    '''
    Paste the harvested thumbnails into one big image.

    Thumbnails are read from the 'thumbs' directory and laid out left to
    right, top to bottom. The result is saved as 'composite-{cols}-{rows}.jpg'.

    Parameters:
        cols: number of thumbnails in each row.
        rows: number of rows.
        size: width and height of each grid cell in pixels.
        sort_by_date: if True, sort the thumbnail files by name before
            pasting. File names start with the article date, so this orders
            them by date. By default they're used in directory order.
    '''
    im = Image.new('RGB', (cols * size, rows * size))
    thumbs = [t for t in os.listdir('thumbs') if t.endswith('.jpg')]
    if sort_by_date:
        thumbs.sort()
    capacity = cols * rows
    # Grid position comes from the number of thumbnails actually pasted, so
    # one that can't be pasted doesn't leave a gap or break the row wrapping
    placed = 0
    for thumb_file in tqdm_notebook(thumbs):
        if placed >= capacity:
            # The grid is full, anything else would fall outside the image
            break
        x = (placed % cols) * size
        y = (placed // cols) * size
        with Image.open('thumbs/{}'.format(thumb_file)) as thumb:
            try:
                im.paste(thumb, (x, y, x + size, y + size))
            except ValueError:
                # Thumbnail couldn't be pasted into the cell, skip it
                continue
        placed += 1
    im.save('composite-{}-{}.jpg'.format(cols, rows), quality=90)


# ## Create all the thumbnails

# In[ ]:


get_thumbnails(query, api_key, size, font_path, font_size)


# ## Turn the thumbnails into one big image

# In[ ]:


create_composite(cols, rows, size)