This notebook scrapes details of available indexes from the NSW State Archives A to Z list of online indexes. It saves the results as a CSV formatted file.
Once you've harvested the index details, you can use them to harvest the content of all the individual indexes.
Here's the indexes.csv I harvested in July 2019.
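If you just want to work with that previously harvested file, you can load it straight into pandas. This is a minimal sketch that assumes indexes.csv sits alongside the notebook.

import pandas as pd

# A quick look at a previously harvested indexes.csv
# (assumes the file is in the same directory as this notebook)
indexes_df = pd.read_csv('indexes.csv')
print(len(indexes_df), 'indexes')
print(indexes_df['status'].value_counts())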
The fields in the CSV file are:
- id – numeric index identifier
- title – index title (this is taken from the index search page; some indexes have different titles in the category listings)
- url – a search url that returns all the results in the index
- status – Not digitised / Fully digitised
- more_info_url – link with more information about the index

import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
from tqdm.auto import tqdm
import pandas as pd
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Create a requests session that automatically retries failed requests
s = requests.Session()
# Retry up to 5 times on server errors (502, 503, 504), backing off between attempts
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
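Every request below goes through this session, so transient server errors are retried automatically. A usage sketch — the timeout value is just an illustrative choice, not part of the original notebook:

# Requests made through `s` retry up to 5 times on 502/503/504 responses
response = s.get('https://www.records.nsw.gov.au', timeout=60)
print(response.status_code)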
def process_category(category):
'''
Process a category page, scraping the links to indexes & extracting basic data.
Parameters:
category - the index category to process
Returns:
A list of indexes.
'''
indexes = []
# Construct a url to the category
url = 'https://www.records.nsw.gov.au' + category
# Get the category page and soupify
response = s.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
# Find the rows containing index info and loop through them
for div in soup.find_all('div', class_='container'):
# Get the index search link
index = div.find('a', class_='form-submit')
# Try to extract the numeric index id from the link
try:
            index_id = re.search(r'\?id=(\d+)', index['href']).group(1)
except (AttributeError, TypeError):
pass
else:
# If we find an id, then grab some other data
# Get the digitisation status
status = div.find('a', href=re.compile(r'record-status')).string
# Get the link to more information
more_info = div.find('a', string=re.compile(r'More about the Index'))
# If there's no more info link, just use the category page
if more_info is None:
more_info_url = url
# If there is a more info link, turn it into a url
else:
more_info_url = urljoin('https://www.records.nsw.gov.au', more_info['href'])
# Add this index to the list
indexes.append({'id': index_id, 'status': status, 'more_info_url': more_info_url})
return indexes
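As a rough usage sketch, you can call process_category() on a single category path. The 'convicts' path below is illustrative only and may not match a current category on the site.

# Illustrative only – the category path is an assumption
sample = process_category('/archives/collections-and-research/guides-and-indexes/convicts/indexes')
for index in sample[:5]:
    print(index['id'], index['status'], index['more_info_url'])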
def get_indexes():
'''
Process each of the categories on the A-Z page, scraping the links & extracting the index data.
Returns:
A list of indexes.
'''
indexes = []
# Some indexes appear in more than one category, so we'll keep track of what we've seen.
seen = []
# Get the A-Z page & turn it into soup
response = s.get('https://www.records.nsw.gov.au/archives/collections-and-research/guides-and-indexes/indexes-a-z')
    soup = BeautifulSoup(response.text, 'html.parser')
# Get all the links that go to an index category
    links = soup.find_all('a', href=re.compile(r'/archives/collections-and-research/guides-and-indexes/[a-z\-]+/indexes'))
# Loop through the links
    for link in tqdm(links, desc='Links:'):
# If we haven't seen this url before, we'll add it to the seen list
if link['href'] not in seen:
seen.append(link['href'])
# Get all the indexes from the category link
indexes += process_category(link['href'])
# Make sure we have no duplicates
indexes = [i for n, i in enumerate(indexes) if i not in indexes[n + 1:]]
return indexes
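The deduplication step above uses a list comprehension rather than set() because each index is a dict, and dicts aren't hashable. A standalone sketch of the same idiom — note that it keeps the last occurrence of each duplicate:

# Keep a dict only if it doesn't reappear later in the list
records = [{'id': '1'}, {'id': '2'}, {'id': '1'}]
deduped = [r for n, r in enumerate(records) if r not in records[n + 1:]]
print(deduped)  # [{'id': '2'}, {'id': '1'}]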
def make_index_list():
'''
Get the title and search url for each index.
Returns:
A list of all indexes with the following columns
- id
- title
- url (search url)
- status (is it digitised?)
- more_info_url (link to more info)
'''
# Get all the indexes from the A-Z & category pages
indexes = get_indexes()
    # Loop through the indexes
    for index in tqdm(indexes, desc='Indexes:'):
        # Here we work out the search url we'll need to harvest all the data from an index
# First we get the index page (which includes the search form)
response = s.get('https://www.records.nsw.gov.au/search_form', params={'id': index['id']})
        soup = BeautifulSoup(response.text, 'html.parser')
# Get the title of the index
index['title'] = soup.find('h1').string
# Find the search form
form = soup.find(id='records-online-index-search-form')
# Get all the input fields from the form
inputs = form.find_all('input')
# This is the payload that we'll save the form parameters to
data = {}
# Loop through the input fields
for i, field in enumerate(inputs):
            # To get all the records in an index, we search for '%'
# If this is the first field, set its value to %
if i == 0:
data[field['name']] = '%'
# Otherwise just keep default values
else:
data[field['name']] = field['value']
# Submit the form data
form_response = s.post('https://www.records.nsw.gov.au/search_form', params={'id': index['id']}, data=data)
# Save the form submission url
index['url'] = form_response.url
return indexes
# Harvest index details
indexes = make_index_list()
# Convert to a Pandas dataframe
df = pd.DataFrame(indexes)
# Peek inside
df.head()
|   | id | more_info_url | status | title | url |
|---|----|---------------|--------|-------|-----|
| 0 | 47 | https://www.records.nsw.gov.au/archives/collec... | Not digitised | Index on Occupants on Aboriginal Reserves, 187... | https://www.records.nsw.gov.au/searchhits_noco... |
| 1 | 91 | https://www.records.nsw.gov.au/archives/collec... | Not digitised | Botanic Gardens and Government Domains Employe... | https://www.records.nsw.gov.au/searchhits_noco... |
| 2 | 9 | https://www.records.nsw.gov.au/archives/collec... | Fully digitised | Assisted Immigrants | https://www.records.nsw.gov.au/searchhits_noco... |
| 3 | 55 | https://www.records.nsw.gov.au/archives/collec... | Not digitised | Index to Miscellaneous Immigrants | https://www.records.nsw.gov.au/searchhits_noco... |
| 4 | 43 | https://www.records.nsw.gov.au/archives/collec... | Not digitised | Index to the Unassisted Arrivals NSW 1842-1855 | https://www.records.nsw.gov.au/searchhits_noco... |
df['status'].value_counts()
Not digitised      56
Fully digitised     8
Name: status, dtype: int64
# Save as a CSV file
df.to_csv('indexes.csv', index=False)
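From here you can reload the CSV whenever you need it — for example, to pull out the fully digitised indexes, or to feed the search urls into a harvest of the individual indexes as mentioned above. A minimal sketch:

# Reload the harvested details and list the fully digitised indexes
df = pd.read_csv('indexes.csv')
digitised = df.loc[df['status'] == 'Fully digitised']
print(digitised[['id', 'title', 'url']])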