If you ever need to find a file in the National Archives of Australia that contains a specific numbered cable from the Department of Foreign Affairs, this is the tool for you!
Just give it a cable number and it will look in the series listed below for a file that might contain the cable. For each possible match it returns a link to the file as well as a bit of information about it.
This tool works because many of the files in these series include the first and last numbered cable in the file title. So all it does is look at the numbers in each file title to see if the cable you're interested in falls somewhere between them. It's simple, but it's not something you can do in RecordSearch.
It's far from perfect because the way the file titles are constructed is not always consistent, but it's quicker than looking through all the file titles manually.
Series searched: A11785, A11786, A3195, A3196, A6364, A6366.
Let me know if you'd like additional series added. If you want to refresh the series data from RecordSearch, just delete the cables_data.json
file before running a search. The tool will then reharvest all the data.
import json
import re
from copy import deepcopy
import ipywidgets as widgets
from IPython.display import HTML, display
from recordsearch_data_scraper.scrapers import RSItemSearch
# National Archives of Australia series whose file titles are scanned for
# cable number ranges. Identifiers must not carry stray whitespace: they are
# compared for equality against the "series" field of each harvested record.
series = ["A11785", "A11786", "A3195", "A3196", "A6364", "A6366"]
def get_total_files(series):
    """
    Return, as an int, the total number of results reported by a
    RecordSearch item search on the given series.
    """
    search = RSItemSearch(series=series, sort=5, digitised=False)
    total = search.total_results
    return int(total)
def get_files(series):
    """
    Collect the file details that RecordSearch returns for one series.

    Results are requested repeatedly from the same search object and
    accumulated until a request comes back empty.
    """
    # NOTE(review): assumes each get_results() call advances to the next
    # page of results -- confirm against the scraper's documentation.
    search = RSItemSearch(series=series, sort=5)
    harvested = []
    while True:
        page = search.get_results()
        harvested.extend(page)
        if not page:
            break
    return harvested
def refresh_data():
    """
    Re-harvest every listed series, cache the combined records in
    cables_data.json, and return them.
    """
    harvested = [item for series_id in series for item in get_files(series_id)]
    with open("cables_data.json", "w") as cache_file:
        json.dump(harvested, cache_file)
    return harvested
def load_data():
    """
    Return the harvested records.

    The cached cables_data.json file is used when it can be read and parsed;
    if it is missing or is not valid JSON, the data is harvested afresh.
    """
    try:
        with open("cables_data.json", "r") as cache_file:
            return json.load(cache_file)
    except (FileNotFoundError, json.JSONDecodeError):
        return refresh_data()
def check_year(r, year):
    """
    Report whether a year falls within a record's contents date range.

    Args:
        r: A harvested record. Its ``contents_dates`` entry is read for
            ``start_date`` and ``end_date`` values whose first four
            characters are taken as the year.
        year: The year to test, as an int or a numeric string.

    Returns:
        True when start year <= year <= end year. False otherwise, including
        when the record lacks usable dates or ``year`` is not a number.
    """
    try:
        target = int(year)
        start = int(r["contents_dates"]["start_date"][:4])
        end = int(r["contents_dates"]["end_date"][:4])
    except (TypeError, KeyError, ValueError):
        # Missing/None dates, empty or non-numeric date strings, or a
        # non-numeric year typed by the user: treat as "no match" rather
        # than crashing the whole search.
        return False
    return start <= target <= end
def _cable_numbers_in_title(title):
    """Return the candidate cable numbers (as digit strings) found in a file title."""
    # Start conservatively, looking for O or I in front of numbers
    numbers = re.findall(r"[OI]{1}\.{0,1}\s*?(\d+)", title)
    if not numbers:
        # If that didn't work find all numbers
        numbers = re.findall(r"\d+", title)
    if len(numbers) > 2:
        # If there are too many numbers, exclude ones that look like years
        numbers = [n for n in numbers if not re.search(r"^19[1-9]{1}\d{1}$", n)]
    return numbers


def find_cable(cable, series=None, year=None):
    """
    Find files whose title suggests they contain the given numbered cable.

    A file is a candidate when exactly two numbers can be extracted from its
    title and the cable number lies between them (inclusive). Each candidate
    is rendered as HTML in the ``display_results`` output widget.

    Args:
        cable: The cable number as text, optionally prefixed with O, I or 0
            (and a full stop), e.g. ``"O.1234"``.
        series: If given, only files from this series are considered.
        year: If given, only files whose contents dates span this year are
            considered.

    Returns:
        None. Results (or a "Not a number" message) are written to the
        output widget.
    """
    display_results.clear_output()
    match = re.search(r"[OI0]{0,1}\.{0,1}\s*?(\d+)", cable or "")
    if match is None:
        # Nothing to search for: report it where the user will see it and
        # stop, rather than falling through with cable_num undefined.
        with display_results:
            print("Not a number")
        return
    cable_num = int(match.group(1))
    # Load pre harvested data
    candidates = load_data()
    # The filters build new lists and nothing below mutates the records, so
    # no copy of the harvested data is needed.
    if series:
        candidates = [r for r in candidates if r["series"] == series]
    if year:
        candidates = [r for r in candidates if check_year(r, year) is True]
    for result in candidates:
        numbers = _cable_numbers_in_title(result["title"])
        # Only titles that yield exactly a first and last number are usable
        if len(numbers) != 2:
            continue
        if not int(numbers[0]) <= cable_num <= int(numbers[1]):
            continue
        # Display the details of each candidate
        html = '<p><b>NAA: <a href="http://www.naa.gov.au/cgi-bin/Search?O=I&Number={}">{}, {}</a></b>'.format(
            result["identifier"], result["series"], result["control_symbol"]
        )
        html += "<br>{}".format(result["title"])
        html += "<br>{}".format(result["contents_dates"]["date_str"])
        if result["digitised_status"] is True:
            html += "<br>Digitised: {} pages".format(result["digitised_pages"])
        html += "</p>"
        with display_results:
            display(HTML(html))
def run_query(b):
    """
    Click handler: search for the cable using the current widget values.

    The argument supplied by the click callback is not used.
    """
    cable_text = cable.value
    chosen_series = series_select.value
    chosen_year = year.value
    find_cable(cable_text, series=chosen_series, year=chosen_year)
# All the widgety things
# "All" (no series filter) goes in front of the individual series. It is
# prepended rather than assigned to index 0 so the first series in the list
# remains selectable.
series_options = [("All", None)] + [(s, s) for s in series]
series_select = widgets.Dropdown(options=series_options, description="Series:")
year = widgets.Text(
    value=None, placeholder="filter by year, eg 1940", description="Year:"
)
cable = widgets.Text(value=None, placeholder="enter cable number", description="Cable:")
# Search results are rendered into this output area
display_results = widgets.Output(layout=widgets.Layout(margin="40px 0 0 0"))
button = widgets.Button(
    description="Find files!",
    button_style="primary",
    layout=widgets.Layout(margin="20px 0 0 0"),
)
button.on_click(run_query)
display(HTML("<h3>Find files containing this numbered cable</h3>"))
display(
    widgets.VBox(
        [
            cable,
            widgets.HTML(
                "<p><b>Filter by series and/or year to reduce the number of results</b></p>"
            ),
            series_select,
            year,
            button,
            display_results,
        ]
    )
)
VBox(children=(Text(value='', description='Cable:', placeholder='enter cable number'), HTML(value='<p><b>Filte…