Harvest all the XML transcripts from the PM Transcripts site.
If you don't want to harvest them all yourself, I've created a repository containing all the XML files, a CSV-formatted index, and aggregated text and zip files for each prime minister.
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urlparse
import re
from tqdm.auto import tqdm
import time
# This page lists all the XML files
TRANSCRIPTS_URL = 'https://pmtranscripts.pmc.gov.au/transcripts.xml'
# Query endpoint that returns a single transcript's XML by numeric id
QUERY_URL = 'https://pmtranscripts.pmc.gov.au/query?transcript={}'

# Make sure the output directory exists before we start writing into it
os.makedirs('transcripts', exist_ok=True)

# Get the list of XML files (timeout so a stalled connection can't hang forever)
response = requests.get(TRANSCRIPTS_URL, timeout=60)
# Turn the XML into Soup
soup = BeautifulSoup(response.text, 'lxml')

# Loop through all the <uri> elements, saving each transcript in turn.
# Files already on disk are skipped, so an interrupted harvest can be resumed.
for uri in tqdm(soup.find_all('uri')):
    filename = os.path.basename(urlparse(uri.string).path)
    filepath = os.path.join('transcripts', '{}.xml'.format(filename))
    if not os.path.exists(filepath):
        # Extract the numeric transcript id from names like 'transcript-12345'
        match = re.search(r'transcript-(\d+)', filename)
        if match is None:
            # Link doesn't look like a transcript -- nothing to download
            continue
        transcript = requests.get(QUERY_URL.format(match.group(1)), timeout=60)
        # Force UTF-8 so .text decodes consistently regardless of the
        # server's declared charset
        transcript.encoding = 'utf-8'
        with open(filepath, 'wb') as xml_file:
            xml_file.write(transcript.text.encode('utf-8'))
        # Be polite to the server -- but only pause when we actually
        # downloaded something (the original slept on every iteration,
        # even for files that were already saved)
        time.sleep(0.2)