Harvest the issues of a newspaper as PDFs

This notebook harvests issues of a newspaper as PDFs – one PDF per issue. If the newspaper has a long print run, the harvest will consume a lot of time and disk space, so you might want to limit it by date range.

The downloaded PDFs are saved in the data/issues folder. The PDF file names have the following structure:

[newspaper identifier]-[issue date as YYYYMMDD]-[issue identifier].pdf

For example:

903-19320528-1791051.pdf
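
If you later need to recover these components from a file name, you can just split on the hyphens. A minimal sketch, using the example file name above:

In [ ]:
from pathlib import Path

# Split a saved file name back into its three components
name = Path('903-19320528-1791051.pdf').stem
title_id, issue_date, issue_id = name.split('-')
print(title_id, issue_date, issue_id)  # 903 19320528 1791051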

Set up what we need

Make sure you paste in your Trove API key where indicated.

In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from requests.exceptions import HTTPError
import json
import time
import arrow
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm

s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
In [2]:
API_KEY = 'YOUR API KEY'
API_URL = 'https://api.trove.nla.gov.au/v2/newspaper/title/'

PARAMS = {
    'encoding': 'json',
    'key': API_KEY
}
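
Before going any further, you can check that your key works by requesting the details of a single title – 903 is the identifier used as an example later in this notebook. A quick sanity check (assuming the title record includes a title field, as these records usually do):

In [ ]:
# Quick sanity check – request one title record and confirm we get JSON back
response = s.get(f'{API_URL}903', params=PARAMS)
response.raise_for_status()
print(response.json()['newspaper']['title'])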

Get information about available issues

Before we start downloading huge numbers of PDFs, let's have a look at how many issues are available for the newspaper we're interested in. This code comes from harvest_newspaper_issues.ipynb.

In [3]:
# THIS CODE COMES FROM harvest_newspaper_issues.ipynb

# These are newspapers where the date ranges are off by more than a year
# In these cases we'll harvest all the issues in one hit, rather than year by year
dodgy_dates = ['1486', '1618', '586']

def get_title_summary(title_id):
    '''
    Get the details of a single newspaper title.
    '''
    response = s.get(f'{API_URL}{title_id}', params=PARAMS)
    data = response.json()
    return data['newspaper']

def get_issues_in_range(title_id, start_date, end_date):
    '''
    Get a list of issues available from a particular newspaper within the given date range.
    '''
    issues = []
    params = PARAMS.copy()
    params['include'] = 'years'
    params['range'] = f'{start_date.format("YYYYMMDD")}-{end_date.format("YYYYMMDD")}'
    response = s.get(f'{API_URL}{title_id}', params=params)
    try:
        data = response.json()
    except json.JSONDecodeError:
        print(response.url)
        print(response.text)
    else:
        for year in data['newspaper']['year']:
            if 'issue' in year:
                for issue in year['issue']:
                    issues.append({
                        'title_id': title_id,
                        'issue_id': issue['id'],
                        'issue_date': issue['date']
                    })
    time.sleep(0.2)
    return issues

def get_issues_full_range(title_id):
    '''
    In most cases we set date ranges to get issue data in friendly chunks. But sometimes the date ranges are missing or wrong.
    In these cases, we ask for everything at once, by setting the range to the limits of Trove.
    '''
    start_date = arrow.get('1803-01-01')
    range_end = arrow.now()
    issues = get_issues_in_range(title_id, start_date, range_end)
    return issues

def get_issues_from_title(title_id):
    '''
    Get a list of all the issues available for a particular newspaper.
    
    Params:
      * title_id - a newspaper identifier
    Returns:
      * A list containing details of available issues
    '''
    issues = []
    title_summary = get_title_summary(title_id)
    
    # Date range is off by more than a year, so get everything in one hit
    # (cast to str in case the identifier was supplied as an integer)
    if str(title_id) in dodgy_dates:
        issues += get_issues_full_range(title_id)
    else:
        try:
            # The date ranges are not always reliable, so to make sure we get everything
            # we'll set the range to the beginning and end of the given year
            start_date = arrow.get(title_summary['startDate']).replace(day=1, month=1)
            end_date = arrow.get(title_summary['endDate']).replace(day=31, month=12)
        except KeyError:
            # Some records have no start and end dates at all
            # In this case set the range to the full range of Trove's newspapers
            issues += get_issues_full_range(title_id)
        else:
            # If the date range is available, loop through it by year
            while start_date <= end_date:
                range_end = start_date.replace(month=12, day=31)
                issues += get_issues_in_range(title_id, start_date, range_end)
                start_date = start_date.shift(years=+1).replace(month=1, day=1)
    return issues

Harvest the issue data.

In [4]:
# Set the id of the newspaper you want to harvest from
# You can get the newspaper id from the title details page in Trove
trove_newspaper_id = 903

# Harvest the issue data
issues = get_issues_from_title(trove_newspaper_id)

Convert to a dataframe for analysis.

In [5]:
df = pd.DataFrame(issues)
df.head()
Out[5]:
title_id issue_id issue_date
0 903 1758310 1908-07-21
1 903 1758300 1908-07-24
2 903 1758316 1908-07-28
3 903 1758314 1908-07-31
4 903 1758315 1908-08-04
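
If you'd like to keep a copy of the issue list for later reuse, you could save the dataframe as a CSV file. A one-line sketch (the file name is just a suggestion):

In [ ]:
# Save the harvested issue metadata to a CSV file
df.to_csv(f'{trove_newspaper_id}-issues.csv', index=False)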

How many issues are available?

In [6]:
df.shape[0]
Out[6]:
6142

What is the date range of the issues?

In [7]:
df['issue_date'].min()
Out[7]:
'1908-07-21'
In [8]:
df['issue_date'].max()
Out[8]:
'1954-12-31'
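
If you want a quick sense of how the issues are distributed across the print run, you can count them by year. A small pandas sketch, grouping on the YYYY part of the date string:

In [ ]:
# Count issues per year using the first four characters of the date string
df['issue_date'].str[:4].value_counts().sort_index()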

Harvest the issues as PDFs

Now that we have the issue data, we can use it to download the PDFs.

In [9]:
# THIS CODE IS A SLIGHTLY MODIFIED VERSION OF WHAT'S IN THE TROVE NEWSPAPER HARVESTER

def ping_pdf(ping_url):
    '''
    Check to see if a PDF is ready for download.
    If a 200 status code is received, return True.
    '''
    ready = False
    try:
        response = s.get(ping_url, timeout=30)
        response.raise_for_status()
    except HTTPError:
        if response.status_code == 423:
            ready = False
        else:
            raise
    else:
        ready = True
    return ready

def get_pdf_url(issue_id):
    '''
    Get the url of the PDF version of an issue.
    PDFs can take a while to generate, so we ping the server to check that one is ready before we try to download it.
    '''
    pdf_url = None
    # Ask for the PDF to be created
    prep_url = f'https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}/prep'
    response = s.get(prep_url)
    # Get the hash
    prep_id = response.text
    # Url to check if the PDF is ready
    ping_url = f'https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}.ping?followup={prep_id}'
    tries = 0
    ready = False
    time.sleep(2)  # Give some time to generate pdf
    # Are you ready yet?
    while ready is False and tries < 5:
        ready = ping_pdf(ping_url)
        if not ready:
            tries += 1
            time.sleep(2)
    # Download if ready
    if ready:
        pdf_url = f'https://trove.nla.gov.au/newspaper/rendition/nla.news-issue{issue_id}.pdf?followup={prep_id}'
    return pdf_url

def harvest_pdfs(issues, start_date=None, end_date=None):
    '''
    Download all issue pdfs within the given date range.
    '''
    output_path = Path('data', 'issues')
    output_path.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame(issues)
    if start_date and end_date:
        df_range = df.loc[(df['issue_date'] >= start_date) & (df['issue_date'] <= end_date)]
    elif start_date:
        df_range = df.loc[(df['issue_date'] >= start_date)]
    elif end_date:
        df_range = df.loc[(df['issue_date'] <= end_date)]
    else:
        df_range = df
    for issue in tqdm(df_range.itertuples(), total=df_range.shape[0]):
        pdf_url = get_pdf_url(issue.issue_id)
        # The PDF might not be ready after five tries – skip this issue if there's no url
        if pdf_url:
            response = s.get(pdf_url)
            Path(output_path, f'{issue.title_id}-{issue.issue_date.replace("-", "")}-{issue.issue_id}.pdf').write_bytes(response.content)

In the cell below you can set a date range for your harvest. Adjust the start and end dates as required. If you want to harvest ALL the issues, set the start and end dates to None.

In [ ]:
# Set start and end dates
# Adjust these to suit your case, set to None to get everything
start_date = '1932-05-01'
end_date = '1932-10-31'

# Start harvesting the PDFs!
harvest_pdfs(issues, start_date=start_date, end_date=end_date)
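
Once the harvest has finished, you can check how many PDFs were actually saved. A simple sketch using pathlib:

In [ ]:
# Count the PDF files saved to the output directory
len(list(Path('data', 'issues').glob('*.pdf')))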

Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.