According to Archway:
A function is an area of government activity or responsibility, which is undertaken by an agency (or agencies) over a length of time. A function may consist of a single activity such as teacher registration or a group of related activities e.g. maritime safety.
Functions provide an alternative way of finding relevant series and records — zooming out from the records to focus on the government activities you're interested in. Functions also provide an interesting data point to analyse and visualise.
This notebook lets you download the functions used by Archway as a dataset.
According to Archives New Zealand's copyright page, all material on their website is subject to Crown Copyright unless otherwise indicated. Any Crown Copyright material on the site 'may be copied, printed or downloaded or reproduced in any number of copies and in any format or medium'.
If you're not familiar with Jupyter notebooks like this one, here are a few basic tips.
Click on a code cell and hit Shift+Enter to execute the code.
# Import the modules we'll need
# Yes this is a code cell, hit Shift+Enter to run it!
import requests
from bs4 import BeautifulSoup
from tinydb import TinyDB, Query
import re
import time
import os
import json
import ipywidgets as widgets
import pandas as pd
from IPython.display import display, HTML, FileLink
from tqdm import tqdm_notebook
# Make sure the data directory exists -- the harvest database and the
# exported files are all written to it
os.makedirs('data', exist_ok=True)
# Create a session to capture cookies etc. It is shared by every request
# made to Archway in this notebook.
s = requests.Session()
Archway doesn't provide an API, so we have to scrape the data from web pages. This is messy and prone to breakage...
Things are further complicated by Archway's use of sessions and forms.
# Utility functions
def get_string(element):
    '''
    Return the stripped text of an element, or an empty string if it has none.

    The element's single `string` is used when it is available. If that fails
    (for instance because the element wraps other elements so `string` is None,
    or the element itself is None) the first of the element's `strings` is
    used instead.
    '''
    try:
        return element.string.strip()
    except AttributeError:
        # No single string to strip -- fall back to the first nested string
        pass
    try:
        return list(element.strings)[0].strip()
    except (AttributeError, IndexError):
        # Nothing usable at all
        return ''
def clean_date(date):
    '''
    Clean a date string and return either a year or an empty string.

    The year is the first group of four digits in the string that is followed
    by a word boundary, returned as a string.

    An empty string is returned whenever no such year is present. This covers
    the 'current' and 'unknown' placeholders, and also empty or otherwise
    unparseable values, which previously raised an AttributeError.
    '''
    match = re.search(r'(\d{4})\b', date)
    return match.group(1) if match else ''
# -------------------------------------------
# Functions to handle the embedded forms
def prepare_search():
    '''
    Gather the cookies and session details needed to search for functions.

    Requests the home page and the advanced search page using the shared
    session, then loads the function search page and returns the search
    parameters embedded in it (see get_page_params).
    '''
    # The responses to these requests aren't used -- they're made only for
    # their effect on the session
    warm_up_urls = (
        'https://www.archway.archives.govt.nz/',
        'https://www.archway.archives.govt.nz/CallAdvancedSearch.do',
    )
    for url in warm_up_urls:
        s.get(url)
    search_page = s.get('https://www.archway.archives.govt.nz/FunctionAdvancedSearch.do')
    return get_page_params(BeautifulSoup(search_page.text, 'lxml'))
def get_page_params(soup, page=1):
    '''
    Build the parameters for a results page request.

    Collects the name and value of every hidden input whose name contains
    'searchResultsContainer', then sets the page number to request.
    Returns the parameters as a dict.
    '''
    hidden_inputs = soup.find_all('input', {'name': re.compile('searchResultsContainer'), 'type': 'hidden'})
    params = {field['name']: field['value'] for field in hidden_inputs}
    # Set after the hidden fields so it replaces any page number among them
    params['searchResultsContainer.page'] = page
    return params
def get_function_page_params(soup):
    '''
    Get the form's hidden parameters, so we can submit them to get the next page.

    Returns a dict mapping the name of each hidden input to its value.
    Hidden inputs without a name are skipped, and a hidden input without a
    value attribute is given an empty string, rather than raising a KeyError.
    '''
    params = {}
    for element in soup.find_all('input', type='hidden'):
        name = element.get('name')
        if not name:
            # Nothing to submit this field under
            continue
        params[name] = element.get('value', '')
    return params
# -------------------------------------------
# Find and scrape elements
def find_next_list(heading):
    '''
    Return the first ul that follows a heading, or None if there isn't one.

    The pages have no hierarchy, ids, or classes to hook on to, so we start
    from the heading above the list we want and walk through the heading's
    following siblings until we reach a ul. The search is abandoned if we
    first reach an element whose string is 'None'.
    '''
    for sibling in heading.next_siblings:
        try:
            if sibling.name == 'ul':
                return sibling
            reached_none = sibling.string == 'None'
        except AttributeError:
            # Probably a string rather than an element
            continue
        if reached_none:
            return None
    return None
def find_next_table(heading):
    '''
    Return the first table that follows a heading, or None if there isn't one.

    There's no html hierarchies, ids, or classes to make scraping nice and neat.
    As a result we just have to find the heading above the table we want,
    then work through the heading's sibling elements until we find a table.
    The search is abandoned if an element whose string is 'None' is reached first.

    None is also returned when heading itself is None, so the result of a
    failed search for the heading can be passed straight in.
    '''
    if heading is None:
        return None
    for el in heading.next_siblings:
        try:
            if el.name == 'table':
                return el
            elif el.string == 'None':
                break
        except AttributeError:
            # Probably a string
            pass
    return None
def get_entities(soup, pattern, date_type='range'):
    '''
    Extract data from tables of related entities.

    pattern is a regular expression matching the identifiers of the entities
    wanted. It is used to find the entity links and to pull the identifier
    out of each link's href.

    The use of dates varies depending on the entity and its relationship to the function.
    Eg: agencies have a date range indicating when they were responsible for the function,
    while a preceding function has a date prior to which it was used.
    date_type expects one of:
    - 'range'
    - 'prior'
    - 'following'
    Any other value means no dates are added.

    Returns a list of dicts, one per entity.
    '''
    code_pattern = r'ViewEntity\.do\?code=({})'.format(pattern)
    links = soup.find_all('a', href=re.compile('ViewEntity.do'), string=re.compile(r'^{}'.format(pattern)))
    entities = []
    for link in links:
        entity = {'id': re.search(code_pattern, link['href']).group(1)}
        # The element two levels above the link supplies the cells for this entity
        cells = link.parent.parent.find_all('td', recursive=False)
        entity['entity'] = get_string(cells[1].table.find_all('tr')[1].td)
        if date_type == 'range':
            start = cells[2].string.strip()
            end = cells[3].string.strip()
            entity['start_date_str'] = start
            entity['end_date_str'] = end
            entity['start_date'] = clean_date(start)
            entity['end_date'] = clean_date(end)
        elif date_type == 'prior':
            prior_to = cells[2].string.strip()
            entity['prior_to_date_str'] = prior_to
            entity['prior_to_date'] = clean_date(prior_to)
        elif date_type == 'following':
            following = cells[2].string.strip()
            entity['following_date_str'] = following
            entity['following_date'] = clean_date(following)
        entities.append(entity)
    return entities
# -------------------------------------------
# Get info from the initial function details page
def get_alt_names(soup):
    '''
    Get the alternative names listed on a function details page.

    Looks for the 'AlsoKnownAs.gif' heading image and reads the table that
    follows it, skipping the table's first row. Returns a list of dicts with
    'name' and 'year' keys. The list is empty if the heading or its table
    can't be found.
    '''
    alt_names = []
    heading = soup.find('img', src=re.compile('AlsoKnownAs.gif'))
    if not heading:
        return alt_names
    table = find_next_table(heading)
    if table is None:
        # The heading is there, but no table follows it
        return alt_names
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 2:
            # Not a name/year row
            continue
        alt_names.append({'name': get_string(cells[0]), 'year': get_string(cells[1])})
    return alt_names
def get_mandates(soup, function):
    '''
    Add the legislation listed on a function details page to the function.

    For each of CREATED, ABOLISHED, and RELEVANT, finds the matching heading
    and the list that follows it, and saves the text of the list items under
    'legislation_created', 'legislation_abolished', or 'legislation_relevant'.
    If the heading or the list is missing an empty list is saved.
    Returns the updated function.
    '''
    for mandate in ['CREATED', 'ABOLISHED', 'RELEVANT']:
        key = 'legislation_{}'.format(mandate.lower())
        heading = soup.find('strong', string=re.compile(mandate))
        ul = find_next_list(heading) if heading else None
        function[key] = [get_string(li) for li in ul.find_all('li')] if ul else []
    return function
# -------------------------------------------
# Get info from linked function detail pages
def get_jurisdictions(params, function):
    '''
    Add the jurisdictions a function is part of to the function.

    params are the hidden form parameters from the function details page.
    A copy is submitted with the 'partOfButton' coordinates added, and the
    entities found in the response are saved under 'part_of_jurisdiction'.
    Returns the updated function.
    '''
    # Add these parameters to simulate a click on the tab
    new_params = params.copy()
    new_params['partOfButton.x'] = 10
    new_params['partOfButton.y'] = 10
    response = s.post('https://www.archway.archives.govt.nz/FullFunction.do', params=new_params)
    # Name the parser explicitly, as the other pages do, instead of leaving
    # BeautifulSoup to pick one
    soup = BeautifulSoup(response.text, 'lxml')
    function['part_of_jurisdiction'] = get_entities(soup, r'J\d+')
    return function
def get_related_functions(params, function):
    '''
    Add the functions that came before and after a function to the function.

    params are the hidden form parameters from the function details page.
    A copy is submitted with the 'beforeAfterButton' coordinates added.
    The entities in the table under the PRIOR heading are saved as
    'functions_prior', and those under the FOLLOWING heading as
    'functions_following'. An empty list is saved if a heading or its table
    can't be found. Returns the updated function.
    '''
    # Add these parameters to simulate a click on the tab
    new_params = params.copy()
    new_params['beforeAfterButton.x'] = 10
    new_params['beforeAfterButton.y'] = 10
    response = s.post('https://www.archway.archives.govt.nz/FullFunction.do', params=new_params)
    # Name the parser explicitly, as the other pages do, instead of leaving
    # BeautifulSoup to pick one
    soup = BeautifulSoup(response.text, 'lxml')
    for relation in ['PRIOR', 'FOLLOWING']:
        key = 'functions_{}'.format(relation.lower())
        heading = soup.find('strong', string=re.compile(relation))
        # The heading may be missing, so only look for a table if we found it
        table = find_next_table(heading) if heading else None
        if table:
            function[key] = get_entities(table, r'F\d+', relation.lower())
        else:
            function[key] = []
    return function
def get_agencies(params, function):
    '''
    Add the agencies related to a function to the function.

    params are the hidden form parameters from the function details page.
    A copy is submitted with the 'whoDidItButton' coordinates added.
    The entities in the table under the RESPONSIBLE heading are saved as
    'agencies_responsible', and those under the EXERCISING heading as
    'agencies_exercising'. An empty list is saved if a heading or its table
    can't be found. Returns the updated function.
    '''
    # Add these parameters to simulate a click on the tab
    new_params = params.copy()
    new_params['whoDidItButton.x'] = 10
    new_params['whoDidItButton.y'] = 10
    response = s.post('https://www.archway.archives.govt.nz/FullFunction.do', params=new_params)
    # Name the parser explicitly, as the other pages do, instead of leaving
    # BeautifulSoup to pick one
    soup = BeautifulSoup(response.text, 'lxml')
    # There are two types of related agencies: responsible and exercising.
    for relation in ['RESPONSIBLE', 'EXERCISING']:
        key = 'agencies_{}'.format(relation.lower())
        # Find the heading in the page
        heading = soup.find('strong', string=re.compile(relation))
        # Find the next table on the page -- the heading may be missing,
        # so only look if we found it
        table = find_next_table(heading) if heading else None
        if table:
            function[key] = get_entities(table, 'A[A-Z]+')
        else:
            function[key] = []
    return function
# -------------------------------------------
# Process the pages
def get_function_details(function):
    '''
    Add the information from a function's details page to the function.

    Loads the details page for function['id'], adds the legislation and
    alternative names found there, then adds the jurisdictions, related
    functions, and agencies. Returns the updated function.
    '''
    details_url = 'https://www.archway.archives.govt.nz/ViewEntity.do?code={}'.format(function['id'])
    page = BeautifulSoup(s.get(details_url).text, 'lxml')
    function = get_mandates(page, function)
    function['alternative_names'] = get_alt_names(page)
    # The related entities tabs on a function details page actually link to separate pages.
    # Here we get the details from each tab and add them to the function data.
    tab_params = get_function_page_params(page)
    for add_tab_details in (get_jurisdictions, get_related_functions, get_agencies):
        function = add_tab_details(tab_params, function)
    return function
def process_page(soup):
    '''
    Harvest all the functions listed on a page of search results.

    Each result link whose text starts with a function identifier gives us
    the function's id, term (lower-cased), and start and end dates. The rest
    comes from get_function_details. Returns a list of function dicts.
    '''
    results = []
    links = soup.find_all('a', href=re.compile('ViewEntity.do'), string=re.compile(r'^F\d+'))
    for link in tqdm_notebook(links, leave=False, unit='result', desc='Per page'):
        function_id = re.search(r'ViewEntity\.do\?code=([F\d]+)', link['href']).group(1)
        # The element two levels above the link supplies the cells for this result
        cells = link.parent.parent.find_all('td', recursive=False)
        term = cells[1].table.find_all('tr')[1].td.string.strip().lower()
        start = cells[2].string.strip()
        end = cells[3].string.strip()
        function = {
            'id': function_id,
            'term': term,
            'start_date_str': start,
            'end_date_str': end,
            'start_date': clean_date(start),
            'end_date': clean_date(end),
        }
        function = get_function_details(function)
        # Pause briefly between functions
        time.sleep(0.2)
        results.append(function)
    return results
def harvest_functions(page=1):
    '''
    Loop through each results page and harvest the function details.
    To resume a harvest you can supply a page parameter.

    Harvested functions are saved to 'data/db-functions.json'. Harvesting
    continues until the database holds as many records as the search reports,
    or until a page yields no results.
    '''
    db = TinyDB('data/db-functions.json')
    Result = Query()
    params = prepare_search()
    # prepare_search() always asks for page 1, so apply the requested page here.
    # Without this the page parameter has no effect on the first request.
    params['searchResultsContainer.page'] = page
    total_results = int(params['searchResultsContainer.totalResultSize'])
    functions_url = 'https://www.archway.archives.govt.nz/FunctionAdvancedSearchResults.do'
    with tqdm_notebook(total=total_results, desc='Functions', unit='function') as pbar:
        while len(db.all()) < total_results:
            response = s.post(functions_url, params=params)
            soup = BeautifulSoup(response.text, 'lxml')
            results = process_page(soup)
            if not results:
                # Nothing was added, so the loop condition can never change --
                # stop rather than request pages forever
                print('No results found on page {} -- stopping harvest.'.format(page))
                break
            # Handle duplicates in the case of a restart/reharvest
            for result in results:
                db.upsert(result, Result.id == result['id'])
            page += 1
            params = get_page_params(soup, page)
            pbar.update(len(results))
This might take a while...
# Start harvesting from the first page of results.
# To resume an interrupted harvest, supply a page number instead.
harvest_functions()
def save_json():
    '''
    Write every record in the harvest database to 'data/functions.json'
    and display a link to the file.
    '''
    db = TinyDB('data/db-functions.json')
    with open('data/functions.json', 'w') as json_file:
        json.dump(db.all(), json_file)
    display(FileLink('data/functions.json'))
def save_csv():
    '''
    Write the id, term, start_date, and end_date of every record in the
    harvest database to 'data/functions.csv' and display a link to the file.
    '''
    db = TinyDB('data/db-functions.json')
    columns = ['id', 'term', 'start_date', 'end_date']
    subset = pd.DataFrame(db.all())[columns]
    subset.to_csv('data/functions.csv', index=False)
    display(FileLink('data/functions.csv'))
# Export the harvested functions: the complete records as JSON,
# and a subset of fields as CSV
save_json()
save_csv()
Look for keyword strings in the term names and return any records that match.
This searches the JSON dump of the harvested functions.
def find_term(b):
    '''
    Display the harvested functions whose term contains the search keyword.

    Called when the search button is clicked; b is the button and isn't used.
    Clears any earlier results, loads 'data/functions.json', and displays each
    matching record in the output area as a heading followed by its JSON.
    Matching ignores case.
    '''
    out.clear_output()
    # Terms are lower-cased when they're harvested, so a keyword typed with
    # capitals would never match unless we lower-case it too
    search_term = keyword.value.lower()
    # Load the JSON file
    with open('data/functions.json', 'r') as json_file:
        data = json.load(json_file)
    results = [f for f in data if search_term in f['term'].lower()]
    for result in results:
        with out:
            display(HTML('<b>{}</b>'.format(result['term'])))
            print(json.dumps(result, indent=4))
# Output area that find_term clears and displays its results in
out = widgets.Output()
# Text box for the keyword to look for in the function terms
keyword = widgets.Text(
placeholder='Enter search term',
description='Search term:',
disabled=False
)
# Button that starts the search
button = widgets.Button(
description='Display functions',
disabled=False,
button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
tooltip='Click to view matching functions',
icon=''
)
# Call find_term whenever the button is clicked
button.on_click(find_term)
# Display the text box and button together in an HBox, followed by the output area
display(widgets.HBox([keyword, button]))
display(out)
HBox(children=(Text(value='', description='Search term:', placeholder='Enter search term'), Button(button_styl…
Output()