This notebook scrapes data from the 'Newly scanned records' section of RecordSearch, creating a list of recently digitised files. I ran this code on 27 March 2021 to generate a dataset containing files that had been digitised in the previous month.
The 'Newly scanned records' section only displays a month's worth of additions. However, I've modified the code below to create a 'git scraper' that uses GitHub Actions to run the harvester every Sunday, saving a list of the files digitised in the previous week into a public repository. Over time, this should build up a more complete record of the digitisation process.
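The scheduled workflow just needs to run the harvester and commit the results. Here's a minimal sketch of a weekly entry point, assuming the functions defined below (the filename pattern is my own invention, and you'd set the 'date added' option to 'w' to get a week's worth of results):
# Hypothetical weekly entry point for the git scraper -- a scheduled
# GitHub Actions workflow runs this script and commits the resulting CSV.
# Assumes harvest_recently_digitised() (defined below) with the
# ddlDateAdded option set to "w" (past week) rather than "m" (past month).
records = harvest_recently_digitised()
df = pd.DataFrame(records)
df.to_csv(
    Path("data", f'weekly-digitised-{arrow.now().format("YYYYMMDD")}.csv'),
    index=False,
)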
It took me a while to figure out how pagination works on the 'Newly scanned records' page. As you can see below, it's a matter of adding inputs to the main navigation form that mimic a click on the page navigation buttons. Screen scraping is such fun... 😬
import re
import time
from pathlib import Path
import altair as alt
import arrow
import mechanicalsoup
import pandas as pd
from recordsearch_data_scraper.scrapers import RSSeries
from tqdm.auto import tqdm
def initialise_browser():
"""
This is necessary to get an active session in RS.
"""
browser = mechanicalsoup.StatefulBrowser()
browser.open("https://recordsearch.naa.gov.au/scripts/Logon.asp?N=guest")
# As of Jan 2023 these lines don't seem necessary and cause a LinkNotFound error
# browser.select_form('form[id="t"]')
# browser.submit_selected()
return browser
def get_date_digitised(result):
"""
Generate a formatted date from the date digitised string (eg 'Digitised 1 days ago').
It does this by getting today's date then subtracting the interval.
It's possible this might not always be accurate...
"""
# Get the string describing when the record was digitised
when_digitised = result.find(
"div", class_="card-footer card-footer-list"
).span.string.strip()
# Extract out the time interval and unit
interval, unit = re.search(
r"^Digitised (\d+) (minutes|hours|days) ago", when_digitised
).groups()
# Subtract interval from today's date
if unit == "minutes":
date_digitised = arrow.now("Australia/Sydney").shift(minutes=-(int(interval)))
elif unit == "days":
date_digitised = arrow.now("Australia/Sydney").shift(days=-(int(interval)))
elif unit == "hours":
date_digitised = arrow.now("Australia/Sydney").shift(hours=-(int(interval)))
# ISO format the result
return date_digitised.format("YYYY-MM-DD")
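To see how this works, we can feed it a made-up fragment that mimics the structure of a result (the HTML below is purely for illustration):
from bs4 import BeautifulSoup

# A fabricated fragment with the same structure get_date_digitised() expects
html = '<li><div class="card-footer card-footer-list"><span>Digitised 3 days ago</span></div></li>'
result = BeautifulSoup(html, "html.parser").li
get_date_digitised(result)  # eg '2021-03-24' if run on 27 March 2021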
def get_records_from_page(page, pbar):
"""
Scrapes item metadata from the list of results.
"""
records = []
# Get the list of results
results = page.find_all("li", class_="soda_list")
# Loop through the results, extracting the metadata
for result in results:
record = {}
record["title"] = result.img["title"]
record["item_id"] = (
result.find("dt", string="Item ID:")
.find_next_sibling("dd")
.a.string.strip()
)
record["series"] = (
result.find("dt", string="Series:").find_next_sibling("dd").a.string.strip()
)
record["control_symbol"] = (
result.find("dt", string=re.compile("Control symbol:"))
.find_next_sibling("dd")
.string.strip()
)
record["date_range"] = re.sub(
r"\s+",
" ",
result.find("dt", string=re.compile("Date range:"))
.find_next_sibling("dd")
.string.strip(),
)
record["date_digitised"] = get_date_digitised(result)
records.append(record)
pbar.update(len(records))
return records
def get_number_of_results(page):
"""
Get the start, end, and total number of results from the current page of results.
"""
result_summary = page.find(
"label", id="ContentPlaceHolderSNR_lblTopPaging"
).string.strip()
start, end, total = re.search(r"(\d+) to (\d+) of (\d+)", result_summary).groups()
return (start, end, total)
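For example (the wording of the summary string below is a guess, but only the 'x to y of z' part matters to the regex):
import re

# A sample results summary in the 'x to y of z' format the regex expects
summary = "Displaying 1 to 200 of 22972"
re.search(r"(\d+) to (\d+) of (\d+)", summary).groups()  # ('1', '200', '22972')
Note that the captured groups are strings, which is why the harvest function below converts total with int() for tqdm, while end != total can still be compared directly as strings.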
def harvest_recently_digitised():
records = []
# Get a browser with all RecordSearch's session stuff ready
browser = initialise_browser()
# Open the recently digitised page
browser.open(
"https://recordsearch.naa.gov.au/SearchNRetrieve/Interface/ListingReports/NewlyScannedList.aspx"
)
# CONFIGURE THE RESULTS FORM
browser.select_form('form[id="formSNRMaster"]')
# Get 200 results per page
browser["ctl00$ContentPlaceHolderSNR$ddlResultsPerPage"] = "200"
# Get results from the past month. Other options are 'w' (week) and 'f' (fortnight).
browser["ctl00$ContentPlaceHolderSNR$ddlDateAdded"] = "m"
# Set display to list view
# Setting these mimics a click on the List View button
browser.form.set("ctl00$ContentPlaceHolderSNR$btn_viewList.x", "11", force=True)
browser.form.set("ctl00$ContentPlaceHolderSNR$btn_viewList.y", "9", force=True)
browser.submit_selected()
# PROCESS RESULTS
# Get the total number of results
start, end, total = get_number_of_results(browser.page)
with tqdm(total=int(total)) as pbar:
# Process first page of results
records += get_records_from_page(browser.page, pbar)
# Loop through the rest of the results set
while end != total:
browser.select_form('form[id="formSNRMaster"]')
            # Setting these and submitting the form retrieves the next page of results
            # Basically they mimic a click on the page navigation buttons --
            # the values are the x/y click coordinates ASP.NET expects
            # from an image button
browser.form.set(
"ctl00$ContentPlaceHolderSNR$listPagerTop$ctl00$ctl02.x",
"10",
force=True,
)
browser.form.set(
"ctl00$ContentPlaceHolderSNR$listPagerTop$ctl00$ctl02.y",
"10",
force=True,
)
browser.submit_selected()
start, end, total = get_number_of_results(browser.page)
records += get_records_from_page(browser.page, pbar)
time.sleep(1)
return records
records = harvest_recently_digitised()
df_records = pd.DataFrame(records)
df_records.head()
The dataset only includes the series identifiers. To make it a bit more useful, we can retrieve the title of each series and add this to the dataset.
First we extract a list of unique series identifiers from the dataset, then loop through it, grabbing the series details using my RecordSearch tools library.
series_titles = []
# Loop through the list of series ids
for s in tqdm(list(df_records["series"].unique())):
# Get the summary details from each series
    # Note that this includes more information than just the title,
    # which could be added to the dataset if you wanted (eg location)
details = RSSeries(
s, include_number_digitised=False, include_access_status=False
).data
# Add the titles and ids to a new list
try:
series_titles.append({"series": s, "series_title": details["title"]})
except KeyError:
print(details)
Then we can convert the series titles into a dataframe and merge it with the records dataframe to create a new dataframe that includes the titles.
df_series = pd.DataFrame(series_titles)
# Merge the dataframes on the `series` column
df = df_records.merge(df_series, on="series")
df.head()
df.to_csv(
Path("data", f'recently-digitised-{arrow.now().format("YYYYMMDD")}.csv'),
index=False,
)
Let's get a list of the series that appear most often in the dataset.
# Reload previously harvested file if necessary
df = pd.read_csv("data/recently-digitised-20210327.csv")
series = df.value_counts(["series", "series_title"]).to_frame().reset_index()
series.columns = ["series", "series_title", "count"]
series[:25]
| | series | series_title | count |
| --- | --- | --- | --- |
| 0 | B884 | Citizen Military Forces Personnel Dossiers, 19... | 20382 |
| 1 | A9301 | RAAF Personnel files of Non-Commissioned Offic... | 1207 |
| 2 | B883 | Second Australian Imperial Force Personnel Dos... | 515 |
| 3 | A10605 | Personnel Occurrence Reports | 396 |
| 4 | A6135 | Photographic colour transparencies positives, ... | 226 |
| 5 | D4881 | Alien registration cards, alphabetical series | 66 |
| 6 | MP367/1 | General correspondence files | 48 |
| 7 | A9300 | RAAF Officers Personnel files, 1921-1948 | 47 |
| 8 | A12372 | RAAF Personnel files - All Ranks [Main corresp... | 40 |
| 9 | BP5/2 | Drawings of inventions for letters patent, sin... | 37 |
| 10 | B78 | Alien registration documents | 36 |
| 11 | A2478 | Non-British European migrant selection documents | 34 |
| 12 | MP84/1 | Correspondence files, multiple number series | 30 |
| 13 | BP371/1 | Correspondence registration booklets and cards | 26 |
| 14 | A705 | Correspondence files, multiple number (Melbour... | 25 |
| 15 | A471 | Courts-Martial files [including war crimes tri... | 23 |
| 16 | J3111 | Queensland post office history files, alphabet... | 23 |
| 17 | J3109 | Historic photographic collection assembled by ... | 19 |
| 18 | BP8/1 | Mail service (contract) files, either annual s... | 19 |
| 19 | A13860 | Medical Documents - Army (Department of Defenc... | 19 |
| 20 | SP908/1 | Application for Registration of Aliens (other ... | 18 |
| 21 | A446 | Correspondence files, annual single number ser... | 18 |
| 22 | J539 | Correspondence files, multiple number series. | 16 |
| 23 | A1877 | British migrants - Selection documents for fre... | 14 |
| 24 | J26 | Medical case files, single number series with ... | 13 |
You can see that most of the files come from just four series containing military service records. This reflects the NAA's current digitisation priorities.
Let's go to the other end of the dataset and look at the series that appear 20 or fewer times.
series.loc[series["count"] < 21]
| | series | series_title | count |
| --- | --- | --- | --- |
| 17 | J3109 | Historic photographic collection assembled by ... | 19 |
| 18 | BP8/1 | Mail service (contract) files, either annual s... | 19 |
| 19 | A13860 | Medical Documents - Army (Department of Defenc... | 19 |
| 20 | SP908/1 | Application for Registration of Aliens (other ... | 18 |
| 21 | A446 | Correspondence files, annual single number ser... | 18 |
| ... | ... | ... | ... |
| 369 | BP190/4 | 'RT' series rifle range tenure correspondence ... | 1 |
| 370 | BP242/1 | Correspondence files relating to national secu... | 1 |
| 371 | BP25/1 | Alien registration papers, alphabetical series... | 1 |
| 372 | BP460/3 | Main Trust files annual single number series | 1 |
| 373 | C424 | General correspondence files, annual single nu... | 1 |

357 rows × 3 columns
series.loc[series["count"] == 1]
| | series | series_title | count |
| --- | --- | --- | --- |
| 164 | ST1233/1 | Investigation files, single number series with... | 1 |
| 165 | K26 | Personal case files, single number series with... | 1 |
| 166 | J992 | Mail Service files, North Queensland, single n... | 1 |
| 167 | K60 | Personal case files, single number with 'M' an... | 1 |
| 168 | K269 | Inward passenger manifests for ships and aircr... | 1 |
| ... | ... | ... | ... |
| 369 | BP190/4 | 'RT' series rifle range tenure correspondence ... | 1 |
| 370 | BP242/1 | Correspondence files relating to national secu... | 1 |
| 371 | BP25/1 | Alien registration papers, alphabetical series... | 1 |
| 372 | BP460/3 | Main Trust files annual single number series | 1 |
| 373 | C424 | General correspondence files, annual single nu... | 1 |

210 rows × 3 columns
So 357 of the 375 series (that's 95%) appear 20 or fewer times. That's a classic 'long tail', and presumably reflects the diversity of interests that fuel 'digitisation on demand' requests. But this really needs more analysis.
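We can quickly check those numbers against the dataframe:
# How big is the long tail?
total_series = len(series)
long_tail = len(series.loc[series["count"] <= 20])
print(f"{long_tail} of {total_series} series ({long_tail / total_series:.0%}) appear 20 or fewer times")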
We can visualise the long tail by using a logarithmic scale to display the count. The chart uses a 'symlog' scale, which is linear near zero, so that series with a count of one still get a visible bar. You'll see that most series have only one digitised file (mouse over the bars for series details).
alt.Chart(series).mark_bar(size=2).encode(
x=alt.X("series", sort="-y", axis=alt.Axis(labels=False, ticks=False)),
y=alt.Y("count", scale=alt.Scale(type="symlog")),
tooltip=["series", "series_title", "count"],
).properties(width=800)
Once I've accumulated a longer record of digitisation it'll be interesting to see how things change over time. It would also be possible to use my RecordSearch Tools to find out how many pages there are in each digitised file.
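For example, something like this might work, using RSItem from the same library (a sketch only; I'm assuming the item data includes a digitised_pages field):
from recordsearch_data_scraper.scrapers import RSItem

def get_number_of_pages(item_id):
    # Get the item's summary details -- assumes these include
    # a 'digitised_pages' field for digitised files
    details = RSItem(item_id).data
    return details.get("digitised_pages")

# This would make one request per item, so you'd want to throttle it
# df["pages"] = df["item_id"].apply(get_number_of_pages)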
Of course, you could do other things with this data, such as setting up an RSS feed for updates, or creating a Twitter bot that shares recently-digitised files. Hmmm...