Trove Newspaper & Gazette Harvester

Download large quantities of digitised newspaper and gazette articles from Trove with this simplified, web-based interface to the Trove Harvester command-line tool.

In [1]:
# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.
In [ ]:
import time
import argparse
import os
import shutil
import datetime
import arrow
import json
import ipywidgets as widgets
from IPython.display import display, HTML, FileLink, clear_output
from pprint import pprint
import re
import unicodecsv as csv
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Python 3 / Python 2 compatible import of the url-parsing helpers
try:
    from urllib.parse import urlparse, parse_qsl
except ImportError:
    from urlparse import urlparse, parse_qsl
# Import everything from the troveharvester package
from troveharvester.__main__ import *
# tqdm.auto picks the widget-based progress bar when running in a notebook
from tqdm.auto import tqdm

# Shared requests session that automatically retries on transient
# server errors (5 attempts, exponential backoff).
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
In [ ]:
# These basically replace functions in the TroveHarvester package
# Instead of getting parameters from the command line, they get them from the widgets.

def nb_save_meta(data_dir, harvest):
    '''
    Save the query metadata in a JSON file.
    Useful for documenting your harvest.

    Parameters:
        data_dir -- directory the harvest results are being written to
        harvest -- timestamp string identifying this harvest
    '''
    meta = {}
    # Harvest parameters come from the notebook widgets, not the command line
    meta['query'] = query_url.value
    meta['key'] = api_key.value
    meta['max'] = None
    meta['pdf'] = pdf.value
    meta['text'] = text.value
    meta['image'] = image.value
    meta['harvest'] = harvest
    # Record when the harvest started (ISO 8601)
    meta['date_started'] = arrow.now().isoformat()
    meta['start'] = '*'
    with open(os.path.join(data_dir, 'metadata.json'), 'w') as meta_file:
        json.dump(meta, meta_file, indent=4)

def nb_prepare_harvest(b):
    '''
    Create output directories, get parameters from widgets & run the harvest.
    Triggered by the Start button.

    Parameters:
        b -- the ipywidgets Button instance (supplied by on_click, unused)
    '''
    harvest = str(int(time.time()))  # Get rid of fractions
    data_dir = os.path.join(os.getcwd(), 'data', harvest)
    # Make sure the output directory exists before metadata is written into it
    make_dir(data_dir)
    nb_save_meta(data_dir, harvest)
    # Only create subdirectories for the output formats the user asked for
    if pdf.value == True:
        make_dir(os.path.join(data_dir, 'pdf'))
    if text.value == True:
        make_dir(os.path.join(data_dir, 'text'))
    if image.value == True:
        make_dir(os.path.join(data_dir, 'image'))
    with out:
        nb_start_harvest(data_dir=data_dir, key=api_key.value, query=query_url.value, pdf=pdf.value, text=text.value, image=image.value, start='*', max=None)
        # out.clear_output(wait=True)
        # Zip up the results and offer them as a download link
        shutil.make_archive(data_dir, 'zip', data_dir)
        display(HTML(f'<b>Download results</b>: <a download href="data/{harvest}.zip">data/{harvest}.zip</a>'))
def nb_start_harvest(data_dir, key, query, pdf, text, image, start, max):
    '''
    Start a harvest.

    Parameters:
        data_dir -- directory to write results into
        key -- Trove API key
        query -- Trove search url to harvest
        pdf, text, image -- booleans: which formats to save
        start -- cursor value for the first API request ('*' for the beginning)
        max -- maximum number of articles to harvest (None for no limit)
    '''
    # Turn the query url into a dictionary of parameters
    params = prepare_query(query, text, key)
    # Create the harvester
    harvester = nb_Harvester(query_params=params, data_dir=data_dir, pdf=pdf, text=text, image=image, start=start, max=max)
    # Go!
    harvester.harvest()

class nb_Harvester(Harvester):
    # Subclass of the troveharvester Harvester that reports progress
    # via a tqdm bar rendered inside the notebook's Output widget.

    def harvest(self):
        '''
        Start the harvest and loop over the result set until finished.
        '''
        number = self.number
        params = self.query_params.copy()
        params['n'] = self.number
        with out:
            with tqdm(total=self.maximum, unit='article') as pbar:
                # self.start holds the next-page cursor; it becomes falsy
                # when the API has no more results to return.
                while self.start and (self.harvested < self.maximum):
                    params['s'] = self.start
                    response = s.get(self.api_url, params=params, timeout=30)
                    # print(response.url)
                    try:
                        results = response.json()
                    except (AttributeError, ValueError):
                        # Response wasn't valid JSON — skip this page.
                        # Log errors?
                        pass
                    else:
                        records = results['response']['zone'][0]['records']
                        self.process_results(records, pbar)
                        # pbar.update(len(records['article']))
In [ ]:
# All the UI stuff

# Somewhere to put the results
out = widgets.Output(layout=widgets.Layout(padding='40px'))

api_key = widgets.Text(
    placeholder='Enter your Trove API key',
    description='API key:',
)

query_url = widgets.Text(
    placeholder='Enter the url of your search',
    description='Query url:',
)

text = widgets.Checkbox(
    description='Save full text',
)

pdf = widgets.Checkbox(
    description='Save PDFs',
)

image = widgets.Checkbox(
    description='Save articles as images',
)

start_button = widgets.Button(
        description='Start harvest',
        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Start harvest',
    )
# Wire the button to the harvest function (nb_prepare_harvest is
# documented as 'Triggered by the Start button').
start_button.on_click(nb_prepare_harvest)

Enter your Trove API key

The harvester gets its data from the Trove API. To use the API you need a key — the process is quick, painless, and free. Once you have a key, paste it in below.

In [ ]:

Enter your search query

Use the Trove web interface to construct your search. Remember that the harvester will get all of the matched results, not just the first 2,000 you see in the web interface. Once you're happy with your search, just copy the url and paste it below.

In [ ]:

Set harvest options

By default the harvester only saves the metadata (date, page, title, newspaper etc) from the search results. If you want to save the full text content of each article, just check the 'Save full text' box. You can also save JPEG and PDF copies of every article by checking the 'Save articles as images' or 'Save PDFs' boxes, but note that this will slow down your harvest and generate large download files.

In [ ]:
In [ ]:

Once your harvest is complete a link will appear to download the results as a single, zipped file. See the GLAM Workbench for more information about the contents and format of the results folder.

Created by Tim Sherratt (@wragge) for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.