Harvest indexes

This notebook harvests data from all of the NSW State Archives' online indexes, saving the data as a collection of easily downloadable CSV files.

Before you harvest the data, you need to get the details of all the indexes.

If you just want the data, my latest harvest of the indexes is available from this repository.

If you'd like to explore the harvested data, try the Index Explorer!

Import what we need

In [65]:
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm_notebook
import pandas as pd
from slugify import slugify
from urllib.parse import urlparse, parse_qs, urljoin
import time
import string
import os
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# By using requests_cache we don't have to redownload pages if a harvest fails part way through
import requests_cache

s = requests_cache.CachedSession()
retries = Retry(total=10, backoff_factor=1, status_forcelist=[ 502, 503, 504, 524 ])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
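
As a quick check (an addition, not part of the original harvest), you can confirm the cache is working: requests_cache adds a from_cache attribute to responses, so repeating a request should be answered from the local cache rather than the network.

In [ ]:
# A minimal sketch: the second request for the same url should be served from the cache
response = s.get('https://www.records.nsw.gov.au', timeout=60)
response = s.get('https://www.records.nsw.gov.au', timeout=60)
print(response.from_cache)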

Define some functions

In [85]:
def get_values(row, tag):
    '''
    Extracts values from the row of an index.
    Parameters:
        row - a row of an html table
        tag - html table cell tag, either td or th
    Returns:
        A list of values
    '''
    values = []
    
    # Look through all the cells with the specified tag
    for cell in row.find_all(tag):
        
        # Get the value of a cell, replacing any non-breaking spaces
        try:
            value = cell.string.replace(u'\xa0', '')
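        # If the cell is empty or contains nested tags, cell.string will be None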
        except AttributeError:
            value = cell.string
            
        # Add the value to the list
        values.append(value)
    return values
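
# A quick illustrative check of get_values (an addition, not part of the original notebook):
# parse a small HTML fragment and pull the header cell values out as a list
example_header = BeautifulSoup(
    '<table><thead><tr><th>Surname</th><th>Firstname</th></tr></thead></table>', 'lxml').find('tr')
assert get_values(example_header, 'th') == ['Surname', 'Firstname']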


def get_urls(row, columns):
    '''
    Checks to see if any of the values in a row have links.
    If they do, it adds the urls to a new column.
    Parameters:
        row - a row of an html table
        columns - a list of the index's column headings
    Returns:
        A dictionary of new column names ('<column> url') and their url values
    '''
    urls = {}
    
    # Loop through cells
    for i, cell in enumerate(row.find_all('td')):
        
        # Look for a link
        link = cell.find('a')
        
        # If there's a link and the corresponding column has a name
        if link and columns[i] is not None:
            
            # Get the path
            path = link['href']
            
            # Check that it's a full path
            if path != '/index_image/':
                
                # Turn the relative path into an absolute url
                url = urljoin('https://www.records.nsw.gov.au', path)
                
                # Create a new column name by appending 'url' to the current column
                column = '{} url'.format(columns[i])
                
                # Add to the dictionary
                urls[column] = url
    return urls
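
# A quick illustrative check of get_urls (an addition, not part of the original notebook):
# a cell containing a link should produce a new '<column> url' value holding an absolute url
example_row = BeautifulSoup(
    '<table><tbody><tr><td><a href="/searchhits_nocopy?id=9">Smith</a></td><td>1855</td></tr></tbody></table>',
    'lxml').find('tr')
assert get_urls(example_row, ['Surname', 'Year']) == {'Surname url': 'https://www.records.nsw.gov.au/searchhits_nocopy?id=9'}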
            


def get_total_pages(url):
    '''
    Get the total number of pages in an index.
    '''
    # Get the first page
    response = s.get(url, timeout=60)
    
    # Soupify
    soup = BeautifulSoup(response.text, 'lxml')
    
    # Find the link to the last page
    last_link = soup.find('a', title='Go to last page')
    
    # If there's no 'last page' link, assume the results fit on a single page (page 0)
    if last_link is None:
        return 0
    
    # Get the last page number from the link
    last_page = int(re.search(r'page=(\d+)', last_link['href']).group(1))
    return last_page


def harvest_index(index, start_page=0):
    '''
    Harvest all the available data from an online index.
    Parameters:
        index - a dictionary with 'title' and 'url' values
        start_page - the page number to start harvesting from (defaults to 0)
    Returns:
        A list of harvested rows; the actual fields depend on the index being harvested.
    '''
    title = index['title']
    url = index['url']
    rows = []
    
    # Get the total number of pages in the index
    total_pages = get_total_pages(url)
    
    # Split the index search url into the base url and query parameters
    base_url, query = url.split('?')
    
    # Parse the query parameters into dictionary form
    params = parse_qs(query, keep_blank_values=True)
    
    # Set the start page
    page = start_page
    
    # Column headings
    columns = None
    
    # Loop through all the pages in the index
    with tqdm_notebook(total=(total_pages - start_page), leave=False, desc='Pages:') as pbar:
        while page <= total_pages:
            
            # Set the page parameter to the current page
            params['page'] = page
            
            # Get a page of index search results & soupify
            response = s.get(base_url, params=params, timeout=120)
            soup = BeautifulSoup(response.text, 'lxml')
            
            # If we don't have the column headings yet, get them from the table header row
            if columns is None:
                header = soup.find('thead').find('tr')
                columns = get_values(header, 'th')
                
            # Get the rows
            table = soup.find('tbody')
            
            try:
                # Loop through all the rows
                for row in table.find_all('tr', recursive=False):
                    
                    # Get the values of a row and zip them up with the column headers to make a dictionary
                    values = dict(zip(columns, get_values(row, 'td')))
                    
                    # Get any urls as new columns and add them to the values dictionary
                    values.update(get_urls(row, columns))
                    
                    # Save the row values
                    rows.append(values)
                    
            # Sometimes this fails with an AttributeError
            # I assume we're getting a page with no results, but no explicit error
            # In this case, raise an HTTPError so the harvest stops rather than silently skipping pages
            except AttributeError:
                raise requests.HTTPError
            else:
                
                # Rest for a bit
                time.sleep(1)
                
                # Increment the page
                page += 1
                pbar.update(1)
    return rows


def save_csv(rows, title):
    '''
    Saves harvested index as a CSV formatted file.
    
    Parameters:
    
        rows - the data
        title - the name of the index
    '''
    
    # Convert the data into a DataFrame
    df = pd.DataFrame(rows)
    
    # Remove empty rows and columns
    df.dropna(axis=1, how='all', inplace=True)
    df.dropna(axis=0, how='all', inplace=True)
    
    # Make sure the output directory exists
    os.makedirs('csv', exist_ok=True)
    
    # Save as a CSV, using the slugified index title as a file name
    df.to_csv(os.path.join('csv', '{}.csv'.format(slugify(title))), index=False)


def harvest_all_indexes(start=0):
    '''
    Works through the list of indexes, harvesting the data for each index and saving it as a CSV file.
    Parameters:
        start - position in the list of indexes to start from (useful for resuming a failed harvest)
    '''
    
    # Open up the CSV file containing the index details
    indexes = pd.read_csv('indexes.csv')
    
    # Loop through the list of indexes 
    for index in tqdm_notebook(indexes.to_dict('records')[start:], desc='Indexes:'):
        
        # GET ALL THE DATA!
        rows = harvest_index(index)
        
        # Save the harvested data as a CSV, using the index name as a file title
        save_csv(rows, index['title'])
        
        # Clear the requests cache after each index
        s.cache.clear()
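
The harvesting functions above expect a file called indexes.csv containing the details of the indexes, with at least the 'title' and 'url' columns used by harvest_index(). Before starting a long harvest it's worth checking that the file is in place. A minimal sketch (not part of the original notebook):

In [ ]:
# Check that the list of indexes is available and has the expected columns
indexes = pd.read_csv('indexes.csv')
assert {'title', 'url'}.issubset(indexes.columns)
indexes.head()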

Get all the data!

In [ ]:
harvest_all_indexes()
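
If a long harvest fails part way through, you don't have to start from scratch: harvest_all_indexes() accepts a start parameter giving the position in the list of indexes to resume from, and harvest_index() accepts a start_page parameter. For example (the value below is just a placeholder):

In [ ]:
# Resume the harvest, skipping the first 10 indexes that have already been saved as CSVs;
# any pages still in the requests cache won't be downloaded again
harvest_all_indexes(start=10)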

Harvest a single index

In [93]:
index = {'title': 'Assisted Immigrants', 'url': 'https://www.records.nsw.gov.au/searchhits_nocopy?id=9&Surname=%25&Firstname=&Ship&Year=&Arriving&Remarks='}
rows = harvest_index(index)
df = pd.DataFrame(rows)
save_csv(rows, index['title'])
s.cache.clear()
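
To check the results, you can load the saved CSV back into a DataFrame. The path below just follows what save_csv() does, joining the 'csv' directory with the slugified index title:

In [ ]:
df = pd.read_csv(os.path.join('csv', '{}.csv'.format(slugify(index['title']))))
df.head()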

Created by Tim Sherratt.

Part of the GLAM Workbench project.