#!/usr/bin/env python # coding: utf-8 # # Harvesting functions from Archway # # According to Archway: # # > A function is an area of government activity or responsibility, which is undertaken by an agency (or agencies) over a length of time. A function may consist of a single activity such as teacher registration or a group of related activities e.g. maritime safety. # # Functions provide an alternative way of finding relevant series and records — zooming out from the records to focus on the government activities you're interested in. Functions also provide an interesting data point to analyse and visualise. # # This notebook lets you download the functions used by Archway as a dataset. # # According to [Archives New Zealand's copyright page](http://archives.govt.nz/copyright-disclaimer-and-privacy), all material on their website is subject to Crown Copyright unless otherwise indicated. Any Crown Copyright material on the site 'may be copied, printed or downloaded or reproduced in any number of copies and in any format or medium'. # ## Using this notebook # # If you're not familiar with [Jupyter notebooks](http://jupyter.org/) like this one, here's a few basic tips. # # * You can run real live code. Look below for code cells – they have boxes around them and code inside (d'uh). # * Click on a code cell to edit it. # * Once you've clicked on a code cell, it's ready to run. Just hit `Shift+Enter` to execute the code. # * Start at the top of the page, running each code cell in turn – this will ensure that all the necessary modules, functions, and variables are setup and ready to use. # * While a code cell is running it'll display an asterix in the square brackets next to the cell. Once it's finished, the asterix will turn into a number. # ## Setting things up # In[2]: # Import the modules we'll need # Yes this is a code cell, hit Shift+Enter to run it! import requests from bs4 import BeautifulSoup from tinydb import TinyDB, Query import re import time import os import json import ipywidgets as widgets import pandas as pd from IPython.display import display, HTML, FileLink from tqdm import tqdm_notebook # Make sure the data directory exists os.makedirs('data', exist_ok=True) # Create a session to capture cookies etc s = requests.Session() # ## The harvesting code # # Archway doesn't provide an API, so we have to scrape the data from web pages. This is messy and prone to breakage... # # Things are further complicated by Archway's use of sessions and forms. # In[295]: # Utility functions def get_string(element): ''' Get strings out of elements. If the contents includes other elements and the string method returns None, then it returns the first string. ''' try: string = element.string.strip() except AttributeError: try: string = list(element.strings)[0].strip() except (AttributeError, IndexError): string = '' return string def clean_date(date): '''Clean a date string and return either a year of an empty string''' if date == 'current' or date == 'unknown': year = '' else: year = re.search(r'(\d{4})\b', date).group(1) return year # ------------------------------------------- # Functions to handle the embedded forms def prepare_search(): ''' Gathering the cookies and session details... ''' r1 = s.get('https://www.archway.archives.govt.nz/') r2 = s.get('https://www.archway.archives.govt.nz/CallAdvancedSearch.do') r3 = s.get('https://www.archway.archives.govt.nz/FunctionAdvancedSearch.do') soup = BeautifulSoup(r3.text, 'lxml') params = get_page_params(soup) return params def get_page_params(soup, page=1): ''' Get the embedded search details (the hidden fields) in a results page to feed to the next page request. ''' params = {} elements = soup.find_all('input', {'name': re.compile('searchResultsContainer'), 'type': 'hidden'}) for element in elements: # print(element) params[element['name']] = element['value'] params['searchResultsContainer.page'] = page return params def get_function_page_params(soup): ''' Get the form's hidden parameters, so we can submit them to get the next page. ''' params = {} elements = soup.find_all('input', type='hidden') for element in elements: params[element['name']] = element['value'] return params # ------------------------------------------- # Find and scrape elements def find_next_list(heading): ''' There's no html hierarchies, ids, or classes to make scraping nice and neat. As a result we just have to find the heading above the ul we want, then work through the heading's sibling elements until we find a ul. ''' for el in heading.next_siblings: try: if el.name == 'ul': return el elif el.string == 'None': break except AttributeError: # Probably a string pass return None def find_next_table(heading): ''' There's no html hierarchies, ids, or classes to make scraping nice and neat. As a result we just have to find the heading above the table we want, then work through the heading's sibling elements until we find a table. ''' for el in heading.next_siblings: try: if el.name == 'table': return el elif el.string == 'None': break except AttributeError: # Probably a string pass return None def get_entities(soup, pattern, date_type='range'): ''' Extract data from tables of related entities. The use of dates varies depending on the entity and its relationship to the function. Eg: agencies have a date range indicating when they were responsible for the function, while a preceding function have a date prior to which it was used. date_type expects one of: - 'range' - 'prior' - 'following' ''' links = soup.find_all('a', href=re.compile('ViewEntity.do'), string=re.compile(r'^{}'.format(pattern))) entities = [] for link in links: entity_id = re.search(r'ViewEntity\.do\?code=({})'.format(pattern), link['href']).group(1) entity = {'id': entity_id} cells = link.parent.parent.find_all('td', recursive=False) entity['entity'] = get_string(cells[1].table.find_all('tr')[1].td) if date_type == 'range': entity['start_date_str'] = cells[2].string.strip() entity['end_date_str'] = cells[3].string.strip() entity['start_date'] = clean_date(cells[2].string.strip()) entity['end_date'] = clean_date(cells[3].string.strip()) elif date_type == 'prior': entity['prior_to_date_str'] = cells[2].string.strip() entity['prior_to_date'] = clean_date(cells[2].string.strip()) elif date_type == 'following': entity['following_date_str'] = cells[2].string.strip() entity['following_date'] = clean_date(cells[2].string.strip()) entities.append(entity) return entities # ------------------------------------------- # Get info from the initial function details page def get_alt_names(soup): alt_names = [] heading = soup.find('img', src=re.compile('AlsoKnownAs.gif')) if heading: table = find_next_table(heading) rows = table.find_all('tr') for row in rows[1:]: cells = row.find_all('td') name = get_string(cells[0]) year = get_string(cells[1]) alt_names.append({'name': name, 'year': year}) return alt_names def get_mandates(soup, function): for mandate in ['CREATED', 'ABOLISHED', 'RELEVANT']: heading = soup.find('strong', string=re.compile(mandate)) if heading: ul = find_next_list(heading) items = [] if ul: for li in ul.find_all('li'): items.append(get_string(li)) function['legislation_{}'.format(mandate.lower())] = items else: function['legislation_{}'.format(mandate.lower())] = [] return function # ------------------------------------------- # Get info from linked function detail pages def get_jurisdictions(params, function): new_params = params.copy() new_params['partOfButton.x'] = 10 new_params['partOfButton.y'] = 10 response = s.post('https://www.archway.archives.govt.nz/FullFunction.do', params=new_params) soup = BeautifulSoup(response.text) function['part_of_jurisdiction'] = get_entities(soup, 'J\d+') return function def get_related_functions(params, function): new_params = params.copy() new_params['beforeAfterButton.x'] = 10 new_params['beforeAfterButton.y'] = 10 response = s.post('https://www.archway.archives.govt.nz/FullFunction.do', params=new_params) soup = BeautifulSoup(response.text) for relation in ['PRIOR', 'FOLLOWING']: heading = soup.find('strong', string=re.compile(relation)) table = find_next_table(heading) if table: function['functions_{}'.format(relation.lower())] = get_entities(table, 'F\d+', relation.lower()) else: function['functions_{}'.format(relation.lower())] = [] return function def get_agencies(params, function): # Add these parameters to simulate a click on the tab new_params = params.copy() new_params['whoDidItButton.x'] = 10 new_params['whoDidItButton.y'] = 10 response = s.post('https://www.archway.archives.govt.nz/FullFunction.do', params=new_params) soup = BeautifulSoup(response.text) # There are two types of related agencies: responsible and exercising. for relation in ['RESPONSIBLE', 'EXERCISING']: # Find the heading in the page heading = soup.find('strong', string=re.compile(relation)) # Find the next table on the page table = find_next_table(heading) if table: function['agencies_{}'.format(relation.lower())] = get_entities(table, 'A[A-Z]+') else: function['agencies_{}'.format(relation.lower())] = [] return function # ------------------------------------------- # Process the pages def get_function_details(function): # print(function['id']) function_url = 'https://www.archway.archives.govt.nz/ViewEntity.do?code={}'.format(function['id']) response = s.get(function_url) soup = BeautifulSoup(response.text, 'lxml') function = get_mandates(soup, function) function['alternative_names'] = get_alt_names(soup) # The related entities tabs on a function details page actually link to separate pages. # Here we get the details from each tab and add it to our the function data. params = get_function_page_params(soup) function = get_jurisdictions(params, function) function = get_related_functions(params, function) function = get_agencies(params, function) return function def process_page(soup): results = [] links = soup.find_all('a', href=re.compile('ViewEntity.do'), string=re.compile(r'^F\d+')) for link in tqdm_notebook(links, leave=False, unit='result', desc='Per page'): function_id = re.search(r'ViewEntity\.do\?code=([F\d]+)', link['href']).group(1) function = {'id': function_id} cells = link.parent.parent.find_all('td', recursive=False) function['term'] = cells[1].table.find_all('tr')[1].td.string.strip().lower() function['start_date_str'] = cells[2].string.strip() function['end_date_str'] = cells[3].string.strip() function['start_date'] = clean_date(cells[2].string.strip()) function['end_date'] = clean_date(cells[3].string.strip()) function = get_function_details(function) # print(function) time.sleep(0.2) results.append(function) return results def harvest_functions(page=1): ''' Loop through each results page and harvest the function details. To resume a harvest you can supply a page parameter. ''' db = TinyDB('data/db-functions.json') Result = Query() params = prepare_search() total_results = int(params['searchResultsContainer.totalResultSize']) functions_url = 'https://www.archway.archives.govt.nz/FunctionAdvancedSearchResults.do' with tqdm_notebook(total=total_results, desc='Functions', unit='function') as pbar: while len(db.all()) < total_results: response = s.post(functions_url, params=params) soup = BeautifulSoup(response.text, 'lxml') results = process_page(soup) # Handle duplicates in the case of a restart/reharvest for result in results: db.upsert(result, Result.id == result['id']) page += 1 params = get_page_params(soup, page) pbar.update(len(results)) # ## Harvest the data # # This might take a while... # In[ ]: harvest_functions() # ## Save and download the results # In[302]: def save_json(): ''' Save the complete dataset to a JSON file. ''' db = TinyDB('data/db-functions.json') with open('data/functions.json', 'w') as json_file: json_file.write(json.dumps(db.all())) display(FileLink('data/functions.json')) def save_csv(): ''' Save a subset of fields to a CSV ''' db = TinyDB('data/db-functions.json') df = pd.DataFrame(db.all()) df = df[['id', 'term', 'start_date', 'end_date']] df.to_csv('data/functions.csv', index=False) display(FileLink('data/functions.csv')) # In[303]: save_json() save_csv() # ## Very simple search # # Look for keyword strings in the term names and return any records that match. # # This searches the JSON dump of the harvested functions. # In[3]: def find_term(b): out.clear_output() search_term = keyword.value # Load the JSON file with open('data/functions.json', 'r') as json_file: data = json.load(json_file) results = [f for f in data if search_term in f['term']] for result in results: with out: display(HTML('{}'.format(result['term']))) print(json.dumps(result, indent=4)) out = widgets.Output() keyword = widgets.Text( placeholder='Enter search term', description='Search term:', disabled=False ) button = widgets.Button( description='Display functions', disabled=False, button_style='primary', # 'success', 'info', 'warning', 'danger' or '' tooltip='Click to view matching functions', icon='' ) button.on_click(find_term) display(widgets.HBox([keyword, button])) display(out) # In[ ]: