The National Archives of Australia's RecordSearch database includes some information about files that we're not allowed to see. These files have been through the access examination process and ended up with an access status of 'closed'. You can read about my efforts to extract and interpret this data in Inside Story.
While you can search by access status in RecordSearch, you can't explore the reasons, so if you want to dig any deeper you need to harvest the data. This notebook shows you how.
The code used in this notebook is similar to that in harvesting items from a search. The only real difference is that full item records are harvested by default, and the access reasons are processed to separate and normalise munged-together values.
This notebook uses the RecordSearch Data Scraper to do most of the work. Note that the RecordSearch Data Scraper caches results to improve efficiency. This also makes it easy to resume a failed harvest. If you want to completely refresh a harvest, then delete the cache_db.sqlite
file to start from scratch.
import json
import re
import time
from datetime import datetime
from pathlib import Path
import pandas as pd
from IPython.display import FileLink, display
from recordsearch_data_scraper.scrapers import RSItemSearch
from tqdm.auto import tqdm
# Regular expressions to match against the reasons in RS to normalise them.
# Each entry pairs a canonical label with a pattern. The character classes
# (eg [abcd\(\)]*) soak up any earlier exemption references that RecordSearch
# has munged onto the front of the one we're looking for, eg '33(1)(a)(b)'.
EXCEPTIONS = [
    ["33(1)(a)", r"33\(1\)\(a\)"],
    ["33(1)(b)", r"33\(1\)[a\(\)]*\(b\)"],
    ["33(1)(c)", r"33\(1\)[ab\(\)]*\(c\)"],
    ["33(1)(d)", r"33\(1\)[abc\(\)]*\(d\)"],
    ["33(1)(e)(i)", r"33\(1\)[abcd\(\)]*\(e\)\(i\)"],
    ["33(1)(e)(ii)", r"33\(1\)[abcd\(\)]*\(e\)\(ii\)"],
    ["33(1)(e)(iii)", r"33\(1\)[abcd\(\)]*\(e\)\(iii\)"],
    ["33(1)(f)(i)", r"33\(1\)[abcdei\(\)]*\(f\)\(i\)"],
    ["33(1)(f)(ii)", r"33\(1\)[abcdei\(\)]*\(f\)\(ii\)"],
    ["33(1)(f)(iii)", r"33\(1\)[abcdei\(\)]*\(f\)\(iii\)"],
    # BUG FIX: this pattern previously ended in '\(g\)*' (zero or more),
    # so it matched any '33(1)...' value even when '(g)' wasn't present,
    # falsely tagging items with 33(1)(g). '(g)' is now required.
    ["33(1)(g)", r"33\(1\)[abcdefi\(\)]*\(g\)"],
    ["33(1)(h)", r"33\(1\)[abcdefgi\(\)]*\(h\)"],
    ["33(1)(j)", r"33\(1\)[abcdefghi\(\)]*\(j\)"],
    ["33(2)(a)", r"33\(2\)\(a\)"],
    ["33(2)(b)", r"33\(2\)[a\(\)]*\(b\)"],
    ["33(3)(a)(i)", r"33\(3\)\(a\)\(i\)"],
    ["33(3)(a)(ii)", r"33\(3\)\(a\)(\(i\))?\(ii\)"],
    ["33(3)(b)", r"33\(3\)[ai\(\) &]*\(b\)"],
    ["Closed period", r"Closed period.*"],
]

# Compile the patterns once, rather than on every reason of every item.
_COMPILED_EXCEPTIONS = [(label, re.compile(pattern)) for label, pattern in EXCEPTIONS]


def normalise_reasons(items):
    """
    Separate and normalise the access decision reasons of each item.

    The 'access_decision_reasons' field can munge together multiple reasons
    in a single string, so each value is matched against the EXCEPTIONS
    patterns and every canonical reason found is saved to a new 'reasons'
    list on the item. Values that match no pattern are kept as-is.

    Parameters:
        items: list of item dicts, each with an 'access_decision_reasons'
            list of strings.

    Returns:
        The same list, with a 'reasons' list added to each item (items are
        modified in place).

    Raises:
        KeyError: if an item has no 'access_decision_reasons' field; the
            offending item is printed first to aid debugging.
    """
    for item in items:
        item["reasons"] = []
        try:
            for reason in item["access_decision_reasons"]:
                matched = False
                # Save every canonical reason we can find in the raw value.
                for exception, pattern in _COMPILED_EXCEPTIONS:
                    if pattern.match(reason):
                        item["reasons"].append(exception)
                        matched = True
                if not matched:
                    # If nothing matches, just save the original value.
                    item["reasons"].append(reason)
        except KeyError:
            print(item)
            raise
    return items
# Harvest every closed item, normalising the access reasons as we go.
items = []
search = RSItemSearch(record_detail="full", access="Closed")
with tqdm(total=search.total_results) as pbar:
    while True:
        data = search.get_results()
        results = data["results"]
        # An empty page of results means the harvest is finished.
        if not results:
            break
        items += normalise_reasons(results)
        pbar.update(len(results))
        # Pause briefly between requests to go easy on the server.
        time.sleep(0.5)
def save_harvest(search, items):
    """
    Save the results of a harvest to disk.

    Creates a date-stamped directory under 'harvests' (named from the search
    parameters) and writes three files into it: metadata.json (details of
    the harvest), results.jsonl (one JSON object per item), and results.csv
    (a flattened version of the items). Download links for each file are
    displayed in the notebook.

    Parameters:
        search: the search object used for the harvest, providing params,
            kwargs, and total_results.
        items: list of harvested item dicts.

    Returns:
        The path of the directory the files were saved in, as a string.
    """
    combined = {**search.params, **search.kwargs}
    today = datetime.now()
    # Build a directory slug from the search parameters, skipping anything
    # unset and options that don't affect what was harvested.
    ignored = ("results_per_page", "sort")
    search_param_str = "_".join(
        sorted(
            f"{k}_{v}"
            for k, v in combined.items()
            if v is not None and k not in ignored
        )
    )
    data_dir = Path("harvests", f'{today.strftime("%Y%m%d")}_{search_param_str}')
    data_dir.mkdir(exist_ok=True, parents=True)
    # Record what was harvested, and when, alongside the data itself.
    metadata = {
        "date_harvested": today.isoformat(),
        "search_params": search.params,
        "search_kwargs": search.kwargs,
        "total_results": search.total_results,
        "total_harvested": len(items),
    }
    with Path(data_dir, "metadata.json").open("w") as md_file:
        json.dump(metadata, md_file)
    # One JSON object per line keeps the full records easy to stream later.
    with Path(data_dir, "results.jsonl").open("w") as data_file:
        data_file.writelines(json.dumps(item) + "\n" for item in items)
    # A flattened CSV version for spreadsheet-friendly exploration.
    pd.json_normalize(items).to_csv(Path(data_dir, "results.csv"), index=False)
    for filename in ["metadata.json", "results.jsonl", "results.csv"]:
        display(FileLink(Path(data_dir, filename)))
    return str(data_dir)


save_harvest(search, items)
Created by Tim Sherratt for the GLAM Workbench. Support me by becoming a GitHub sponsor!