This notebook harvests data from all of the NSW State Archives' online indexes, saving the data as a collection of easily downloadable CSV files.
Before you harvest the data, you need to get the details of all the indexes.
If you just want the data, my latest harvest of the indexes is available from this repository.
If you'd like to explore the harvested data, try the Index Explorer!
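The harvesting code below reads the index details from an indexes.csv file, which needs at least a 'title' and a 'url' column for each index. A rough sketch of that structure (placeholder values only, not a real index):
# A minimal sketch of the structure harvest_all_indexes() expects in indexes.csv (placeholder values only)
import pandas as pd
pd.DataFrame([{'title': 'Example index', 'url': 'https://www.records.nsw.gov.au/searchhits_nocopy?id=0&Surname=%25'}])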
import requests
from bs4 import BeautifulSoup
import re
from tqdm import tqdm_notebook
import pandas as pd
from slugify import slugify
from urllib.parse import urlparse, parse_qs, urljoin
import time
import string
import os
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# By using requests_cache we don't have to redownload pages if a harvest fails part way through
import requests_cache
s = requests_cache.CachedSession()
retries = Retry(total=10, backoff_factor=1, status_forcelist=[ 502, 503, 504, 524 ])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
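As a quick, optional check that the cache is working (assuming the NSW State Archives site is reachable), a repeated request should be served from the local cache rather than the network:
# Optional check: the second identical request should come back from the cache
response = s.get('https://www.records.nsw.gov.au/', timeout=60)
response = s.get('https://www.records.nsw.gov.au/', timeout=60)
print(response.from_cache)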
def get_values(row, tag):
'''
Extracts values from the row of an index.
Parameters:
row - a row of an html table
tag - html table cell tag, either td or th
Returns:
A list of values
'''
values = []
# Look through all the cells with the specified tag
for cell in row.find_all(tag):
# Get the value of a cell, replacing any non-breaking spaces
try:
value = cell.string.replace(u'\xa0', '')
except AttributeError:
value = cell.string
# Add the value to the list
values.append(value)
return values
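A small, self-contained illustration of what get_values() returns (the markup here is made up, not real index HTML):
# Illustrative only: a hand-made row with a non-breaking space in one cell
test_row = BeautifulSoup('<table><tr><td>Smith</td><td>John\xa0</td></tr></table>', 'lxml').find('tr')
print(get_values(test_row, 'td'))
# ['Smith', 'John']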
def get_urls(row, columns):
'''
Checks to see if any of the values in a row have links.
If they do, it adds the urls as new columns.
'''
urls = {}
# Loop through cells
for i, cell in enumerate(row.find_all('td')):
# Look for a link
link = cell.find('a')
# If there's a link and the corresponding column has a name
if link and columns[i] is not None:
# Get the path
path = link['href']
# Skip the placeholder image link
if path != '/index_image/':
# Turn the relative path into an absolute url
url = urljoin('https://www.records.nsw.gov.au', path)
# Create a new column name by appending 'url' to the current column
column = '{} url'.format(columns[i])
# Add to the dictionary
urls[column] = url
return urls
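Similarly for get_urls(), a made-up row where the second cell links to a record page:
# Illustrative only: the '/record/123' path is a placeholder, not a real record
test_row = BeautifulSoup('<table><tr><td>Smith</td><td><a href="/record/123">view</a></td></tr></table>', 'lxml').find('tr')
print(get_urls(test_row, ['Surname', 'Record']))
# {'Record url': 'https://www.records.nsw.gov.au/record/123'}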
def get_total_pages(url):
'''
Get the total number of pages in an index.
'''
# Get the first page
response = s.get(url, timeout=60)
# Soupify
soup = BeautifulSoup(response.text, 'lxml')
# Find the link to the last page
last_link = soup.find('a', title='Go to last page')
# Get the last page number from the link
last_page = int(re.search(r'page=(\d+)', last_link['href']).group(1))
return last_page
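The pager parsing can be checked offline against a snippet that mimics the 'Go to last page' link (the href below is invented for illustration):
# Illustrative only: fake pager markup with the same structure get_total_pages() looks for
test_soup = BeautifulSoup('<a title="Go to last page" href="/searchhits_nocopy?id=9&page=99">Last</a>', 'lxml')
test_link = test_soup.find('a', title='Go to last page')
print(int(re.search(r'page=(\d+)', test_link['href']).group(1)))
# 99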
def harvest_index(index, start_page=0):
'''
Harvest all the available data from an online index.
Parameters:
index - a dictionary with 'title' and 'url' values
start_page - the page number to start harvesting from (defaults to 0)
Returns:
A list of harvested rows; the actual fields depend on the index being harvested.
'''
title = index['title']
url = index['url']
rows = []
# Get the total number of pages in the index
total_pages = get_total_pages(url)
# Split the index search url into the base url and query parameters
base_url, query = url.split('?')
# Parse the query parameters into dictionary form
params = parse_qs(query, keep_blank_values=True)
# Set the start page
page = start_page
# Column headings
columns = None
# Loop through all the pages in the index
with tqdm_notebook(total=(total_pages - start_page), leave=False, desc='Pages:') as pbar:
while page <= total_pages:
# Set the page parameter to the current page
params['page'] = page
# Get a page of index search results & soupify
response = s.get(base_url, params=params, timeout=120)
soup = BeautifulSoup(response.text, 'lxml')
# If we don't have the column headings yet, get them from the table header row
if columns is None:
header = soup.find('thead').find('tr')
columns = get_values(header, 'th')
# Get the rows
table = soup.find('tbody')
try:
# Loop through all the rows
for row in table.find_all('tr', recursive=False):
# Get the values of a row and zip them up with the column headers to make a dictionary
values = dict(zip(columns, get_values(row, 'td')))
# Get any urls as new columns and add them to the values dictionary
values.update(get_urls(row, columns))
# Save the row values
rows.append(values)
# Sometimes this fails with an AttributeError
# I assume we're getting a page with no results, but no explicit error
# In this case we'll try to trigger the retry mechanism
except AttributeError:
raise requests.HTTPError
else:
# Rest for a bit
time.sleep(1)
# Increment the page
page += 1
pbar.update(1)
return rows
def save_csv(rows, title):
'''
Saves harvested index as a CSV formatted file.
Parameters:
rows - the data
title - the name of the index
'''
# Convert the data into a DataFrame
df = pd.DataFrame(rows)
# Remove empty rows and columns
df.dropna(axis=1, how='all', inplace=True)
df.dropna(axis=0, how='all', inplace=True)
# Save as a CSV, using the slugified index title as a file name
df.to_csv(os.path.join('csv', '{}.csv'.format(slugify(title))), index=False)
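save_csv() assumes there's already a csv directory next to the notebook. If there isn't, create it first (a one-line guard using the same 'csv' folder name as above):
# Make sure the output directory used by save_csv() exists
os.makedirs('csv', exist_ok=True)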
def harvest_all_indexes(start=0):
'''
Works through the list of indexes, harvesting the data for each index and saving it as a CSV file.
'''
# Open up the CSV file containing the index details
indexes = pd.read_csv('indexes.csv')
# Loop through the list of indexes
for index in tqdm_notebook(indexes.to_dict('records')[start:], desc='Indexes:'):
# GET ALL THE DATA!
rows = harvest_index(index)
# Save the harvested data as a CSV, using the index title as the file name
save_csv(rows, index['title'])
# Clear the requests cache after each index
s.cache.clear()
harvest_all_indexes()
index = {'title': 'Assisted Immigrants', 'url': 'https://www.records.nsw.gov.au/searchhits_nocopy?id=9&Surname=%25&Firstname=&Ship&Year=&Arriving&Remarks='}
rows = harvest_index(index)
df = pd.DataFrame(rows)
save_csv(rows, index['title'])
s.cache.clear()