This notebook analyses Commonwealth Hansard XML files from the wragge/hansard-xml GitHub repository. Give it a year (between 1901 and 1980) and a house (either 'hofreps' or 'senate'), and it will download all the proceedings for that year and house, extract some basic data about debates and speeches, and provide the results as a dataframe for exploration.
import requests
import requests_cache
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
import arrow
import pandas as pd
import altair as alt
# Shared HTTP session whose responses are cached by requests_cache.
s = requests_cache.CachedSession()

# Retry up to 5 times, with backoff, when the server answers 502, 503 or 504.
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[502, 503, 504],
)

# Apply the retry policy to both secure and plain HTTP requests.
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))
Note that the GitHub API only allows 60 unauthenticated requests per hour, so it's a good idea to cache things. Requests to download the files themselves aren't included in the API tally. If you need more requests, you'll need to use authentication.
# Base URL of the GitHub contents API for the wragge/hansard-xml repository.
# summarise_year() appends '/{house}/{year}' to list one year's files.
API_URL = 'https://api.github.com/repos/wragge/hansard-xml/contents'
Set the year and house you're interested in.
# Year of proceedings to analyse, given as a string.
year = '1901' # 1901 to 1980
# House whose proceedings are analysed.
house = 'hofreps' # hofreps or senate
def count_words(para):
    '''
    Return the number of whitespace-separated words in an element's text.

    Every stripped string found in the element is split on whitespace and
    the resulting pieces are tallied.
    '''
    return sum(len(text.split()) for text in para.stripped_strings)
def get_paras(section):
    '''
    Total the words held in an element's para-type containers.

    Only 'para', 'quote' and 'list' elements that are direct children of
    the section are counted.
    '''
    containers = section.find_all(['para', 'quote', 'list'], recursive=False)
    return sum(count_words(container) for container in containers)
def get_words_in_speech(start, speech):
    '''
    Get the top-level containers in a speech and find the total number of words across them all.

    Words are counted in the opening 'talk.start' element, in the speech
    itself, and in each 'continue' element that is a direct child of the
    speech (together with that continuation's own 'talk.start', if any).

    Parameters:
        start: the speech's 'talk.start' element.
        speech: the speech (or question, answer, interjection) element.

    Returns:
        The total word count as an int.
    '''
    words = get_paras(start) + get_paras(speech)
    for cont in speech.find_all('continue', recursive=False):
        cont_start = cont.find('talk.start', recursive=False)
        # find() returns None when a continuation has no 'talk.start' child;
        # skip it rather than abort the whole run with an AttributeError.
        if cont_start is not None:
            words += get_paras(cont_start)
        words += get_paras(cont)
    return words
def get_interjections(speech):
    '''
    Describe each interjection that is a direct child of a speech.

    Returns a list of dicts, one per interjection, recording its position
    within the speech, the speaker's name and id, the element type and the
    number of words spoken.
    '''
    rows = []
    interjections = speech.find_all('interjection', recursive=False)
    for position, interjection in enumerate(interjections):
        opening = interjection.find('talk.start', recursive=False)
        talker = opening.find('talker')
        rows.append({
            'interjection_idx': position,
            'speaker': talker.find('name', role='metadata').string,
            'id': talker.find('name.id').string,
            'type': interjection.name,
            'words': get_words_in_speech(opening, interjection),
        })
    return rows
def get_speeches(debate):
    '''
    Describe the speeches, questions and answers directly inside a debate
    (or subdebate), together with any interjections they contain.

    Returns a list of dicts. Each speech row is followed by the rows for
    its interjections, which carry the same 'speech_idx' as the speech.
    '''
    rows = []
    talks = debate.find_all(['speech', 'question', 'answer'], recursive=False)
    for position, speech in enumerate(talks):
        opening = speech.find('talk.start', recursive=False)
        talker = opening.find('talker')
        rows.append({
            'speech_idx': position,
            'speaker': talker.find('name', role='metadata').string,
            'id': talker.find('name.id').string,
            'type': speech.name,
            'words': get_words_in_speech(opening, speech),
        })
        # Interjections live inside the speech, so link them back to it.
        for interjection in get_interjections(speech):
            interjection['speech_idx'] = position
            rows.append(interjection)
    return rows
def get_subdebates(debate):
    '''
    Collect the speeches from every 'subdebate.1' directly inside a debate.

    Each returned row is labelled with the subdebate's title and its
    position within the debate.
    '''
    collected = []
    subdebates = debate.find_all('subdebate.1', recursive=False)
    for position, sub in enumerate(subdebates):
        title = sub.subdebateinfo.title.string
        for row in get_speeches(sub):
            # Stamp the subdebate details onto each speech row.
            row.update({'subdebate_title': title, 'subdebate_idx': position})
            collected.append(row)
    return collected
def get_debates(soup):
    '''
    Collect the speeches from all the debates in a day's proceedings.

    Each returned row is labelled with the sitting date taken from the
    session header, plus the title, type and position of its debate.
    '''
    date = soup.find('session.header').date.string
    collected = []
    for position, debate in enumerate(soup.find_all('debate')):
        info = debate.debateinfo
        shared = {
            'date': date,
            'debate_title': info.title.string,
            'debate_type': info.type.string,
            'debate_idx': position,
        }
        # Speeches inside subdebates come first, then those directly in the debate.
        rows = get_subdebates(debate) + get_speeches(debate)
        for row in rows:
            row.update(shared)
        collected.extend(rows)
    return collected
def summarise_year(year, house):
    '''
    Get each day's proceedings for the supplied year/house and extract information about debates and speeches.

    Parameters:
        year: the year to harvest, used as a path segment of the API url.
        house: the house to harvest ('hofreps' or 'senate').

    Returns:
        A dataframe with one row per speech or interjection.

    Raises:
        requests.HTTPError: if the file listing or a file download returns
        an HTTP error status (for example when the API rate limit is hit).
    '''
    response = s.get(f'{API_URL}/{house}/{year}')
    # On an error the API sends back a JSON object rather than a list of
    # files, which would otherwise surface as an obscure TypeError below.
    response.raise_for_status()
    files = [f for f in response.json() if f['type'] == 'file']
    speeches = []
    for f in tqdm(files):
        response = s.get(f['download_url'])
        # Don't try to parse an error page as a day's proceedings.
        response.raise_for_status()
        # NOTE(review): no parser is named here, so BeautifulSoup chooses
        # one itself -- confirm which parser the analysis expects.
        soup = BeautifulSoup(response.text)
        speeches += get_debates(soup)
    return pd.DataFrame(speeches)
# Harvest the selected year and house into a dataframe of speeches and interjections.
df = summarise_year(year=year, house=house)
HBox(children=(FloatProgress(value=0.0, max=113.0), HTML(value='')))
# Preview the first few rows of the harvested data.
df.head()
speech_idx | speaker | id | type | words | subdebate_title | subdebate_idx | date | debate_title | debate_type | debate_idx | interjection_idx | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | MACDONALD-PATERSON, Thomas | KIQ | speech | 318 | HIS EXCELLENCY THE GOVER | 0.0 | 1901-05-09 | QUESTION | Questions | 9 | NaN |
1 | 1 | BRADDON, Edward | JRR | speech | 178 | HIS EXCELLENCY THE GOVER | 0.0 | 1901-05-09 | QUESTION | Questions | 9 | NaN |
2 | 2 | SMITH, Arthur | KTT | speech | 693 | HIS EXCELLENCY THE GOVER | 0.0 | 1901-05-09 | QUESTION | Questions | 9 | NaN |
3 | 2 | CHAPMAN, Austin | JX7 | interjection | 9 | HIS EXCELLENCY THE GOVER | 0.0 | 1901-05-09 | QUESTION | Questions | 9 | 0.0 |
4 | 3 | CAMERON, Donald Norman | JUJ | speech | 98 | HIS EXCELLENCY THE GOVER | 0.0 | 1901-05-09 | QUESTION | Questions | 9 | NaN |
# The 20 speakers who made the most speeches.
df.loc[df['type'] == 'speech', 'speaker'].value_counts().head(20)
BARTON, Edmund 439 KINGSTON, Charles 303 MCMILLAN, William 215 DEAKIN, Alfred 204 CONROY, Alfred 180 PIESSE, Frederick 166 THOMSON, Dugald 153 WATSON, John Christian 150 REID, George 146 ISAACS, Isaac 146 GLYNN, Patrick 140 SPEAKER, Mr 140 CROUCH, Richard 136 O'MALLEY, King 119 MCCAY, James 118 MCEACHARN, Malcolm 115 MAUGER, Samuel 109 LYNE, William 108 POYNTON, Alexander 108 TURNER, George 107 Name: speaker, dtype: int64
# The 20 speakers who made the most interjections.
df.loc[df['type'] == 'interjection', 'speaker'].value_counts().head(20)
KINGSTON, Charles 1257 DEAKIN, Alfred 1097 BARTON, Edmund 1001 TURNER, George 906 REID, George 801 MCMILLAN, William 775 MAUGER, Samuel 604 LYNE, William 551 WATSON, John Christian 550 COOK, Joseph 536 HIGGINS, Henry 535 ISAACS, Isaac 482 MCEACHARN, Malcolm 429 THOMSON, Dugald 391 CONROY, Alfred 355 MCCAY, James 355 FORREST, John 332 SOLOMON, Vaiben 321 POYNTON, Alexander 300 MCDONALD, Charles 284 Name: speaker, dtype: int64
# Total words per speaker, showing the 20 largest totals.
(
    df.groupby('speaker')['words']
    .sum()
    .reset_index()
    .sort_values('words', ascending=False)
    .head(20)
)
speaker | words | |
---|---|---|
2 | BARTON, Edmund | 201547 |
65 | REID, George | 140732 |
55 | MCMILLAN, William | 138382 |
41 | KINGSTON, Charles | 132851 |
74 | SPEAKER, Mr | 128840 |
78 | THOMSON, Dugald | 112445 |
18 | DEAKIN, Alfred | 104408 |
82 | WATSON, John Christian | 99848 |
49 | MCCAY, James | 98219 |
12 | CONROY, Alfred | 97755 |
80 | TURNER, George | 94780 |
21 | EDWARDS, George | 93070 |
39 | ISAACS, Isaac | 91439 |
35 | HIGGINS, Henry | 90842 |
64 | QUICK, John | 88777 |
14 | COOK, Joseph | 88317 |
62 | PIESSE, Frederick | 86988 |
73 | SOLOMON, Vaiben | 86977 |
29 | GLYNN, Patrick | 83018 |
84 | WILKS, William | 81424 |
Note that there's variation in the way debate titles were recorded, and in the OCR results, so this sort of grouping isn't always going to work. To get something more accurate, you'd have to do some normalisation of debate titles first.
# Total words per debate title, showing the 20 largest totals.
(
    df.groupby(['debate_title'])['words']
    .sum()
    .reset_index()
    .sort_values('words', ascending=False)
    .head(20)
)
debate_title | words | |
---|---|---|
111 | QUESTION | 1084980 |
74 | MOTION OF CENSURE | 488836 |
96 | POST AND TELEGRAPH BILL | 334188 |
31 | CUSTOMS BILL | 303111 |
58 | IMMIGRATION RESTRICTION BILL | 301900 |
109 | PUBLIC SERVICE BILL | 260357 |
135 | TARIFF | 174766 |
35 | DEFENCE BILL | 136801 |
129 | SUPPLY BILL | 92487 |
81 | PACIFIC ISLANDS LABOURERS BILL | 86791 |
24 | COMMONWEALTH PUBLIC SERVICE BILL | 86225 |
38 | DISTILLATION BILL | 83656 |
5 | ADJOURNMENT | 79587 |
18 | BRITISH NEW GUINEA | 64175 |
60 | INTER-STATE COMMISSION BILL | 50156 |
55 | GOVERNOR-GENERAL'S SPEECH | 47909 |
125 | STATEMENT SHOWING THE AMOUNTS RECEIVE!) BV WHI... | 44751 |
19 | BUDGET | 43095 |
105 | PROPERTY FOR PUBLIC PURPOSES ACQUISITION BILL | 34402 |
0 | ACTS INTERPRETATION BILL | 33693 |
I've only included words in speeches with identified speakers (including interjections), so some procedural content might not be included in the totals.
# Total words spoken on each sitting day.
words_per_day = df.groupby(['date'])['words'].sum().reset_index()

# Bar chart of the daily totals, with the date and total shown on hover.
alt.Chart(words_per_day).mark_bar(size=2).encode(
    x=alt.X('date:T'),
    y=alt.Y('words:Q'),
    tooltip=[alt.Tooltip('date:T'), alt.Tooltip('words:Q')],
).properties(width=700)
# Most common subdebate titles among rows flagged as questions in any of
# three ways: by debate type, by debate title, or by element type.
df.loc[
    df['debate_type'].eq('Questions')
    | df['debate_title'].eq('QUESTION')
    | df['type'].eq('question'),
    'subdebate_title',
].value_counts().head(20)
TARIFF 1161 THE TARIFF 887 THE GOVERNOR-GENERAL'S SPEECH 415 MOTION OFCENSURE 347 G OVERNOR - GENERAL'S SPEECH 277 WEAVERS' PRICES AT THE ANTIPODES 259 SUPPLY 232 GOVERNOR-GENERAL'S SPEECH 219 EDMUND BARTON 193 WESTERN AUSTRALIAN MONEYORDER OFFICERS 167 EMOLUMENTS OF MINISTERS 166 JOHN JOSEPH EA STICK 125 THEGOVERNOR-GENERAL'S SPEECH 64 OLD-AGE PENSIONS 63 WAYS AND MEANS 59 RATE OF WAGE : HOURS OF LABOUR 41 FEDERAL CAPITAL SITE 33 THIRD SCHEDULE 32 ADDITIONAL SITTING DAY 27 DEPARTMENT OF AGRICULTURE 24 Name: subdebate_title, dtype: int64
Created by Tim Sherratt for the GLAM Workbench.