If oral histories have summaries or transcripts, they can be downloaded as text or PDF files using their nla.obj
identifiers. See Accessing data from digitised oral histories in the Trove Data Guide for more details.
This notebook downloads all the available transcripts and summaries from digitised oral histories in Trove. It uses a pre-harvested dataset of oral histories to obtain a list of nla.obj
identifiers. It then constructs a download URL from each identifier and downloads the file.
If you're using data from the oral histories in Trove, you should read the section on licensing of oral histories in the Trove Data Guide.
import re
import time
from pathlib import Path
import pandas as pd
import requests_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Retry policy: up to 5 attempts with backoff when the server answers 502, 503 or 504.
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[502, 503, 504],
)

# Cached session used for all downloads below.
# NOTE(review): timeout=60 is handed to CachedSession itself -- confirm whether it
# applies to HTTP requests or to the cache backend.
s = requests_cache.CachedSession(timeout=60)

# Attach the retry policy to both plain and TLS connections.
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
def download_transcripts(output_dir="transcripts", max=None):
    """Download transcript/summary text for digitised oral histories in Trove.

    Loads the pre-harvested oral histories dataset, keeps the records flagged
    as having a summary or a transcript, and saves the text returned by the
    download url of each record to ``<output_dir>/<nla.obj id>.txt``.

    Records whose ``fulltext_url`` contains no ``nla.obj`` identifier, and
    downloads that come back with an HTTP error status, are skipped so that
    error pages are never saved as transcripts.

    Args:
        output_dir: Directory to save the text files in. It is created,
            along with any missing parent directories, if it doesn't exist.
        max: Maximum number of records to process; ``None`` processes all of
            them. The name shadows the builtin, but is kept so that existing
            calls such as ``download_transcripts(max=10)`` keep working.
    """
    # Create a directory to save the transcripts
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    # Load the pre-harvested dataset
    df = pd.read_csv(
        "https://github.com/GLAM-Workbench/trove-oral-histories-data/raw/main/trove-oral-histories.csv",
        keep_default_na=False,
    )
    # Filter to records that have either a transcript or summary (or both)
    transcripts = df.loc[(df["summary"] == 1) | (df["transcript"] == 1)]
    # Loop through a list of fulltext_url values from the filtered dataset
    for ts in tqdm(transcripts["fulltext_url"].to_list()[:max]):
        # Extract the nla.obj id from the fulltext url; blank or malformed
        # values have no id, so there is nothing to download for them
        id_match = re.search(r"nla\.obj-\d+", str(ts))
        if id_match is None:
            continue
        ts_id = id_match.group(0)
        # Construct a download url
        ts_url = f"https://nla.gov.au/tarkine/listen/download/transcript/{ts_id}"
        # Download the text file
        response = s.get(ts_url)
        if response.ok:
            # Write with an explicit encoding so the result doesn't depend on
            # the platform's default
            Path(output_path, f"{ts_id}.txt").write_text(
                response.text, encoding="utf-8"
            )
        else:
            # Report the failure instead of saving the error page as a transcript
            tqdm.write(f"Skipped {ts_id}: HTTP {response.status_code}")
        # Pause if the request actually went to the server
        if not getattr(response, "from_cache", False):
            time.sleep(0.5)
# Download every available transcript and summary into the default "transcripts" directory
download_transcripts()
# TESTING -- PLEASE IGNORE
# Smoke test: with the session cache switched off, process at most ten records.
with s.cache_disabled():
    download_transcripts(max=10)
Created by Tim Sherratt for the GLAM Workbench.