Get details of indexes

This notebook scrapes details of available indexes from the NSW State Archives A to Z list of online indexes. It saves the results as a CSV formatted file.

Once you've harvested the index details, you can use them to harvest the content of all the individual indexes.

Here's the indexes.csv I harvested in July 2019.

The fields in the CSV file are:

  • id – numeric index identifier
  • title – index title (taken from the index search page; some indexes have different titles in the category listings)
  • url – a search url that returns all the results in the index
  • status – Not digitised / Fully digitised
  • more_info_url – link with more information about the index
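Once you have a CSV in this shape, it's easy to load and group with Python's standard csv module. Here's a minimal sketch; the sample rows and example.com URLs below are invented for illustration, not taken from the real harvest:

```python
import csv
import io

# A couple of rows in the same shape as indexes.csv (values are illustrative).
sample = io.StringIO(
    "id,title,url,status,more_info_url\n"
    "9,Assisted Immigrants,https://example.com/search?id=9,Fully digitised,https://example.com/guide\n"
    "47,Index on Occupants on Aboriginal Reserves,https://example.com/search?id=47,Not digitised,https://example.com/guide\n"
)

rows = list(csv.DictReader(sample))

# Group index ids by digitisation status
by_status = {}
for row in rows:
    by_status.setdefault(row['status'], []).append(row['id'])

print(by_status)  # {'Fully digitised': ['9'], 'Not digitised': ['47']}
```

In practice you'd pass `open('indexes.csv')` instead of the StringIO sample.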

Import what we need

In [52]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
from tqdm import tqdm_notebook
import pandas as pd
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
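The `backoff_factor` above controls how long urllib3 waits between retries. As a rough illustration of the documented formula, `backoff_factor * (2 ** (retry_number - 1))` — this is a sketch of the growth pattern, not a call into urllib3 itself:

```python
# Approximate sleep (in seconds) before each of 5 retries with backoff_factor=1.
backoff_factor = 1

delays = [backoff_factor * (2 ** (n - 1)) for n in range(1, 6)]
print(delays)  # [1, 2, 4, 8, 16]
```

So with `total=5` and `backoff_factor=1`, a persistently failing request gives up after roughly half a minute of waiting.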

Define our functions

In [57]:
def process_category(category):
    '''
    Process a category page, scraping the links to indexes & extracting basic data.

    Parameters:
        category – the index category to process

    Returns:
        A list of indexes.
    '''
    indexes = []
    # Construct a url to the category
    url = '' + category
    # Get the category page and soupify
    response = s.get(url)
    soup = BeautifulSoup(response.text)
    # Find the rows containing index info and loop through them
    for div in soup.find_all('div', class_='container'):
        # Get the index search link
        index = div.find('a', class_='form-submit')
        # Try to extract the numeric index id from the link
        try:
            index_id = re.search(r'\?id=(\d+)', index['href']).group(1)
        except (AttributeError, TypeError):
            # No index id in this row, so skip it
            pass
        else:
            # If we find an id, then grab some other data
            # Get the digitisation status
            status = div.find('a', href=re.compile(r'record-status')).string
            # Get the link to more information
            more_info = div.find('a', string=re.compile(r'More about the Index'))
            # If there's no more info link, just use the category page
            if more_info is None:
                more_info_url = url
            # If there is a more info link, turn it into a url
            else:
                more_info_url = urljoin('', more_info['href'])
            # Add this index to the list
            indexes.append({'id': index_id, 'status': status, 'more_info_url': more_info_url})
    return indexes

def get_indexes():
    '''
    Process each of the categories on the A-Z page, scraping the links & extracting the index data.

    Returns:
        A list of indexes.
    '''
    indexes = []
    # Some indexes appear in more than one category, so we'll keep track of what we've seen.
    seen = []
    # Get the A-Z page & turn it into soup
    response = s.get('')
    soup = BeautifulSoup(response.text)
    # Get all the links that go to an index category
    links = soup.find_all('a', href=re.compile(r'/archives/collections-and-research/guides-and-indexes/[a-z\-]+/indexes'))
    # Loop through the links
    for link in tqdm_notebook(links, desc='Links:'):
        # If we haven't seen this url before, we'll add it to the seen list
        if link['href'] not in seen:
            seen.append(link['href'])
            # Get all the indexes from the category link
            indexes += process_category(link['href'])
    # Make sure we have no duplicates
    indexes = [i for n, i in enumerate(indexes) if i not in indexes[n + 1:]]
    return indexes
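The deduplication line above uses a list comprehension rather than `set()` because the indexes are dicts, which aren't hashable. Note that it keeps the *last* occurrence of each duplicate. A tiny self-contained demonstration of the same idiom, with made-up values:

```python
# Dicts can't go in a set, so dedupe by checking the rest of the list.
indexes = [{'id': '1'}, {'id': '2'}, {'id': '1'}]

deduped = [i for n, i in enumerate(indexes) if i not in indexes[n + 1:]]
print(deduped)  # [{'id': '2'}, {'id': '1'}]
```

An item is kept only if no identical dict appears later in the list, which is why the first `{'id': '1'}` is dropped.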

def make_index_list():
    '''
    Get the title and search url for each index.

    Returns:
        A list of all indexes with the following columns:
            - id
            - title
            - url (search url)
            - status (is it digitised?)
            - more_info_url (link to more info)
    '''
    # Get all the indexes from the A-Z & category pages
    indexes = get_indexes()
    # Loop through the indexes
    for index in tqdm_notebook(indexes, desc='Indexes:'):
        # What we're doing here is trying to formulate the url we'll need to harvest all the data from an index
        # First we get the index page (which includes the search form)
        response = s.get('', params={'id': index['id']})
        soup = BeautifulSoup(response.text)
        # Get the title of the index
        index['title'] = soup.find('h1').string
        # Find the search form
        form = soup.find(id='records-online-index-search-form')
        # Get all the input fields from the form
        inputs = form.find_all('input')
        # This is the payload that we'll save the form parameters to
        data = {}
        # Loop through the input fields
        for i, field in enumerate(inputs):
            # To get all the records in an index, we search for '%'
            # If this is the first field, set its value to %
            if i == 0:
                data[field['name']] = '%'
            # Otherwise just keep default values
            else:
                data[field['name']] = field['value']
        # Submit the form data
        form_response = s.post('', params={'id': index['id']}, data=data)
        # Save the form submission url
        index['url'] = form_response.url
    return indexes
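The heart of `make_index_list()` is the wildcard trick: take the form's input fields, set the first one to `%` so the search matches every record, and keep the defaults for the rest. The same idea can be sketched with only the standard library's `html.parser`; the form snippet and field names below are invented for illustration:

```python
from html.parser import HTMLParser

# A made-up search form, loosely modelled on the one the notebook scrapes.
FORM = '''
<form id="records-online-index-search-form">
  <input name="surname" value="">
  <input name="firstname" value="">
  <input name="id" value="9" type="hidden">
</form>
'''

class InputCollector(HTMLParser):
    '''Collect the attributes of every <input> tag in a page.'''
    def __init__(self):
        super().__init__()
        self.inputs = []

    def handle_starttag(self, tag, attrs):
        if tag == 'input':
            self.inputs.append(dict(attrs))

parser = InputCollector()
parser.feed(FORM)

# Build the payload: '%' in the first field, defaults everywhere else.
data = {}
for i, field in enumerate(parser.inputs):
    data[field['name']] = '%' if i == 0 else field['value']

print(data)  # {'surname': '%', 'firstname': '', 'id': '9'}
```

POSTing a payload like this (as the notebook does with `requests`) returns a results page whose url captures the complete search, which is exactly what gets saved as the index's `url` field.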

Harvest the index details

In [ ]:
# Harvest index details
indexes = make_index_list()

Convert to a dataframe and save as a CSV

In [60]:
# Convert to a Pandas dataframe
df = pd.DataFrame(indexes)

# Peek inside
df.head()
id more_info_url status title url
0 47 Not digitised Index on Occupants on Aboriginal Reserves, 187...
1 91 Not digitised Botanic Gardens and Government Domains Employe...
2 9 Fully digitised Assisted Immigrants
3 55 Not digitised Index to Miscellaneous Immigrants
4 43 Not digitised Index to the Unassisted Arrivals NSW 1842-1855
In [63]:
# How many indexes are digitised?
df['status'].value_counts()
Not digitised      56
Fully digitised     8
Name: status, dtype: int64
In [61]:
# Save as a CSV file
df.to_csv('indexes.csv', index=False)

Created by Tim Sherratt.

Part of the GLAM Workbench project.