Notebook

Download summaries and transcripts from oral histories

If oral histories have summaries or transcripts, they can be downloaded as text or PDF files using their nla.obj identifiers. See "Accessing data from digitised oral histories" in the Trove Data Guide for more details.

This notebook downloads all the available transcripts and summaries from digitised oral histories available in Trove. It uses a pre-harvested dataset of oral histories to obtain a list of nla.obj identifiers. It then constructs a download URL from each identifier and downloads the file.

If you're using data from the oral histories in Trove, you should read the section on licensing of oral histories in the Trove Data Guide.

In [ ]:

import re
import time
from pathlib import Path

import pandas as pd
import requests_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

In [ ]:

# Retry policy: up to 5 attempts with backoff when the server answers 502, 503 or 504
retries = Retry(status_forcelist=[502, 503, 504], backoff_factor=1, total=5)

# Cached session shared by the rest of the notebook, so repeated runs can reuse saved responses
# NOTE(review): timeout=60 is handed to CachedSession as-is -- confirm whether it
# applies to HTTP requests or to the cache backend.
s = requests_cache.CachedSession(timeout=60)

# Apply the retry policy to both plain and TLS connections (one adapter per scheme)
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))

In [ ]:

def download_transcripts(output_dir="transcripts", max=None):
    """Download oral history transcripts and summaries as text files.

    Loads the pre-harvested oral histories dataset, keeps the records flagged
    as having a summary or a transcript, builds a download url from the
    nla.obj identifier found in each record's ``fulltext_url``, and saves the
    response as ``<output_dir>/<nla.obj id>.txt``.

    Args:
        output_dir: Directory to save the text files in. It is created
            (along with any missing parent directories) if it doesn't exist.
        max: Maximum number of records to process, counted from the start of
            the filtered dataset. ``None`` processes every record. (The name
            shadows the builtin but is kept so existing calls keep working.)

    Records without an nla.obj identifier, and downloads that return an HTTP
    error status, are reported and skipped rather than saved to disk.
    """
    # Create a directory to save the transcripts
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Load the pre-harvested dataset
    df = pd.read_csv(
        "https://github.com/GLAM-Workbench/trove-oral-histories-data/raw/main/trove-oral-histories.csv",
        keep_default_na=False,
    )

    # Filter to records that have either a transcript or summary (or both)
    transcripts = df.loc[(df["summary"] == 1) | (df["transcript"] == 1)]

    # Compile once, outside the loop
    obj_id_pattern = re.compile(r"nla\.obj-\d+")

    # Loop through a list of fulltext_url values from the filtered dataset
    for ts in tqdm(transcripts["fulltext_url"].to_list()[:max]):
        # Extract the nla.obj id from the fulltext url
        match = obj_id_pattern.search(ts)
        if match is None:
            # keep_default_na=False means empty cells arrive as "" -- nothing to download
            tqdm.write(f"Skipping record without an nla.obj id: {ts!r}")
            continue
        ts_id = match.group(0)

        # Construct a download url
        ts_url = f"https://nla.gov.au/tarkine/listen/download/transcript/{ts_id}"

        # Download the text file
        response = s.get(ts_url)

        if response.ok:
            # Explicit encoding so the result doesn't depend on the platform default
            Path(output_path, f"{ts_id}.txt").write_text(
                response.text, encoding="utf-8"
            )
        else:
            # Don't save an error page as if it were a transcript
            tqdm.write(f"Skipping {ts_id}: HTTP {response.status_code}")

        # Pause if necessary (only when the server was actually contacted)
        if not getattr(response, "from_cache", False):
            time.sleep(0.5)

In [ ]:

# Run with the defaults: no limit on the number of records, files saved in "transcripts"
download_transcripts()

In [ ]:

# TESTING -- PLEASE IGNORE
# Smoke test: run with the session's cache disabled and process only the
# first 10 records of the filtered dataset.

with s.cache_disabled():
    download_transcripts(max=10)

Created by Tim Sherratt for the GLAM Workbench.