This notebook brings together annual harvests of files with an access status of 'closed', scraped from the NAA's RecordSearch database. The data files are here:
The current code used to harvest 'closed' files is in this notebook. Previous versions can be found in this repository.
from pathlib import Path
import altair as alt
import pandas as pd
harvests = {
"2015": "closed-20160101.csv",
"2016": "closed-20170109.csv",
"2017": "closed-20180101.csv",
"2018": "closed-20190101.csv",
"2019": "closed-20200101.csv",
"2020": "closed-20210101.csv",
"2021": "closed-20220101.csv",
}
# Load all the data into a single dataframe
dfs = []
for year, data_file in harvests.items():
df_year = pd.read_csv(
Path("data", data_file),
parse_dates=[
"contents_start_date",
"contents_end_date",
"access_decision_date",
],
keep_default_na=False,
)
df_year["harvested_year"] = year
dfs.append(df_year)
df = pd.concat(dfs)
df.head()
identifier | series | control_symbol | title | series_title | contents_date_str | contents_start_date | contents_end_date | access_status | access_decision_date | reasons | harvested_year | location | access_decision_date_str | digitised_status | digitised_pages | access_decision_reasons | retrieved | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12332 | A1 | 1911/21007 | Salvatore Pagano Naturalization Issued to Immi... | Correspondence files, annual single number ser... | 1961 - 1961 | 1961-01-01 00:00:00 | 1961-01-01 00:00:00 | Closed | 1981-07-28 00:00:00 | Pre Access Recorder | 2015 | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 15403 | A1 | 1913/6809 | Meeting of Commonwealth Literary Fund (Missing... | Correspondence files, annual single number ser... | 1913 - 1913 | 1913-01-01 00:00:00 | 1913-01-01 00:00:00 | Closed | 1981-09-28 00:00:00 | Pre Access Recorder | 2015 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 33093 | A1 | 1915/11532 | Wilhelm CA Simonsen - Naturalization Issued to... | Correspondence files, annual single number ser... | 1961 - 1961 | 1961-01-01 00:00:00 | 1961-01-01 00:00:00 | Closed | 1981-12-03 00:00:00 | Pre Access Recorder | 2015 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 46663 | A2 | 1907/554 | Report of Conference of Statisticians (File Co... | Correspondence files, annual single number series | 1904 - 1920 | 1904-01-01 00:00:00 | 1920-01-01 00:00:00 | Closed | 1973-06-20 00:00:00 | Pre Access Recorder | 2015 | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 47046 | A2 | 1915/346 | Rossino - Mario | Correspondence files, annual single number series | 1915 - 1915 | 1915-01-01 00:00:00 | 1915-01-01 00:00:00 | Closed | 1973-06-28 00:00:00 | Pre Access Recorder | 2015 | NaN | NaN | NaN | NaN | NaN | NaN |
year_counts = df["harvested_year"].value_counts().to_frame().reset_index()
year_counts.columns = ["year", "count"]
year_counts.sort_values(by="year")
year | count | |
---|---|---|
0 | 2015 | 14370 |
6 | 2016 | 10750 |
4 | 2017 | 11189 |
1 | 2018 | 11953 |
2 | 2019 | 11867 |
5 | 2020 | 11140 |
3 | 2021 | 11377 |
alt.Chart(year_counts).mark_bar(point=True).encode(
x=alt.X("year:O", title="Year end"),
y=alt.Y("count:Q", title="Number of closed files"),
color=alt.Color("year", legend=None),
tooltip=["year:O", "count:Q"],
).properties(width=300)
df_reasons = df.copy()
df_reasons["reason"] = df_reasons["reasons"].str.split("|")
df_reasons = df_reasons.explode("reason")
df_reasons["reason"].replace("", "No reason", inplace=True)
unique_reasons = sorted(list(df_reasons["reason"].unique()))
unique_reasons
['33(1)(a)', '33(1)(b)', '33(1)(c)', '33(1)(d)', '33(1)(e)(i)', '33(1)(e)(ii)', '33(1)(e)(iii)', '33(1)(f)(i)', '33(1)(f)(ii)', '33(1)(f)(iii)', '33(1)(g)', '33(1)(h)', '33(1)(j)', '33(2)(a)', '33(2)(b)', '33(3)(a)(i)', '33(3)(a)(ii)', '33(3)(b)', 'Cabinet notebooks', 'Closed period', 'Court records', 'Destroyed', 'MAKE YOUR SELECTION', 'NRF', 'No reason', 'Non Cwlth-depositor', 'Non Cwlth-no appeal', 'Parliament Class A', 'Pre Access Recorder', 'Withheld pending adv', "['33(1)(a)', '33(1)(b)', '33(1)(c)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(b)', '33(1)(d)', '33(1)(g)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(b)', '33(1)(d)', '33(1)(g)']", "['33(1)(a)', '33(1)(b)', '33(1)(d)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(b)', '33(1)(d)']", "['33(1)(a)', '33(1)(b)', '33(1)(e)(ii)', '33(1)(g)']", "['33(1)(a)', '33(1)(b)', '33(1)(e)(ii)']", "['33(1)(a)', '33(1)(b)', '33(1)(e)(iii)']", "['33(1)(a)', '33(1)(b)', '33(1)(g)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(b)', 'Closed period', 'Withheld pending adv']", "['33(1)(a)', '33(1)(b)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(b)']", "['33(1)(a)', '33(1)(c)']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(i)', '33(1)(g)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(i)', '33(1)(g)']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(i)']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(ii)', '33(1)(e)(iii)', 'NRF']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(ii)', '33(1)(e)(iii)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(ii)', '33(1)(e)(iii)']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(ii)', '33(1)(g)', 'Closed period', 'Withheld pending adv']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(ii)', '33(1)(g)']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(iii)', '33(1)(g)']", "['33(1)(a)', '33(1)(d)', '33(1)(e)(iii)']", "['33(1)(a)', '33(1)(d)', '33(1)(g)', '33(1)(e)(i)']", "['33(1)(a)', '33(1)(d)', '33(1)(g)', '33(1)(e)(ii)']", "['33(1)(a)', '33(1)(d)', '33(1)(g)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(d)', '33(1)(g)']", "['33(1)(a)', '33(1)(d)']", "['33(1)(a)', '33(1)(e)(ii)', '33(1)(g)']", "['33(1)(a)', '33(1)(e)(ii)']", "['33(1)(a)', '33(1)(e)(iii)']", "['33(1)(a)', '33(1)(f)(iii)']", "['33(1)(a)', '33(1)(g)', 'Withheld pending adv']", "['33(1)(a)', '33(1)(g)']", "['33(1)(a)', '33(1)(j)', '33(2)(a)', '33(2)(b)']", "['33(1)(a)', 'Closed period', 'Withheld pending adv']", "['33(1)(a)', 'Withheld pending adv']", "['33(1)(a)']", "['33(1)(b)', '33(1)(d)', '33(1)(e)(ii)']", "['33(1)(b)', '33(1)(e)(ii)']", "['33(1)(b)', '33(1)(e)(iii)']", "['33(1)(b)', '33(1)(g)']", "['33(1)(b)', 'Withheld pending adv']", "['33(1)(b)']", "['33(1)(c)']", "['33(1)(d)', '33(1)(e)(ii)', '33(1)(g)']", "['33(1)(d)', '33(1)(e)(ii)']", "['33(1)(d)', '33(1)(e)(iii)', '33(1)(g)']", "['33(1)(d)', '33(1)(g)', '33(1)(j)', '33(1)(f)(ii)']", "['33(1)(d)', '33(1)(g)', '33(1)(j)', 'Closed period']", "['33(1)(d)', '33(1)(g)', '33(1)(j)']", "['33(1)(d)', '33(1)(g)', 'Closed period']", "['33(1)(d)', '33(1)(g)']", "['33(1)(d)']", "['33(1)(e)(i)', '33(1)(e)(ii)', '33(1)(g)']", "['33(1)(e)(i)', '33(1)(f)(i)', '33(1)(g)']", "['33(1)(e)(i)', '33(1)(g)']", "['33(1)(e)(ii)', '33(1)(g)', '33(1)(j)']", "['33(1)(e)(ii)', '33(1)(g)']", "['33(1)(e)(ii)']", "['33(1)(e)(iii)', '33(1)(f)(iii)']", "['33(1)(e)(iii)', '33(1)(g)']", "['33(1)(e)(iii)', 'Withheld pending adv']", "['33(1)(e)(iii)']", "['33(1)(f)(i)']", "['33(1)(f)(ii)']", "['33(1)(f)(iii)', '33(1)(g)']", "['33(1)(f)(iii)']", "['33(1)(g)', '33(1)(j)', 'Closed period']", "['33(1)(g)', '33(1)(j)']", "['33(1)(g)', 'Closed period']", "['33(1)(g)', 'MAKE YOUR SELECTION']", "['33(1)(g)', 'Non Cwlth-depositor']", "['33(1)(g)', 'Non Cwlth-no appeal']", "['33(1)(g)', 'Parliament Class A']", "['33(1)(g)']", "['33(1)(h)']", "['33(1)(j)']", "['33(2)(a)', '33(2)(b)', '33(3)(a)(i)', '33(3)(a)(ii)', '33(3)(b)']", "['33(2)(a)', '33(2)(b)', 'Withheld pending adv']", "['33(2)(a)', '33(2)(b)']", "['33(3)(a)(i)', '33(3)(a)(ii)', '33(3)(b)']", "['33(3)(a)(i)', '33(3)(b)', '33(3)(a)(ii)', '33(3)(b)']", "['33(3)(a)(ii)', '33(3)(b)', 'MAKE YOUR SELECTION']", "['33(3)(a)(ii)', '33(3)(b)']", "['Closed period', 'Non Cwlth-no appeal']", "['Closed period', 'Withheld pending adv']", "['Closed period']", "['Court records']", "['Destroyed']", "['MAKE YOUR SELECTION', 'Withheld pending adv']", "['MAKE YOUR SELECTION']", "['NRF']", "['Non Cwlth-depositor']", "['Non Cwlth-no appeal']", "['Parliament Class A']", "['Withheld pending adv']", '[]']
harvest_reasons_counts = (
df_reasons.groupby(by=["harvested_year", "reason"]).size().reset_index()
)
harvest_reasons_counts.columns = ["year", "reason", "count"]
alt.Chart(harvest_reasons_counts).mark_bar().encode(
x=alt.X("year:O", title=None),
y=alt.Y("count:Q", title="Number of files"),
color=alt.Color("year:N", legend=None),
facet=alt.Facet(
"reason:O", align="each", columns=5, title="Reason for being closed"
),
tooltip=["year:O", "reason:N", "count:Q"],
).properties(height=200).resolve_scale(x="independent")
Select a reason from the dropdown list to examine change over time.
input_dropdown = alt.binding_select(
options=[None] + unique_reasons, labels=["All"] + unique_reasons
)
selection = alt.selection_single(fields=["reason"], bind=input_dropdown, name="Select")
alt.Chart(harvest_reasons_counts).mark_bar().encode(
x=alt.X("year:O", title=None),
y=alt.Y("count:Q", title="Number of files"),
color=alt.Color("year:N", legend=None),
column=alt.Column("reason:N", title="Reason for being closed"),
tooltip=["year:O", "reason:N", "count:Q"],
).add_selection(selection).transform_filter(selection).properties(
height=200
).resolve_scale(
x="independent"
)