#!/usr/bin/env python # coding: utf-8 # # Comparing harvests of closed files # # This notebook brings together annual harvests of files with an access status of 'closed', scraped from the NAA's RecordSearch database. The data files are here: # # * [2015](data/closed-20160101.csv) (harvested 1 January 2016) # * [2016](data/closed-20170109.csv) (harvested 9 January 2017) # * [2017](data/closed-20180101.csv) (harvested 1 January 2018) # * [2018](data/closed-20190101.csv) (harvested 1 January 2019) # * [2019](data/closed-20200101.csv) (harvested 1 January 2020) # * [2020](data/closed-20210101.csv) (harvested 1 January 2021) # # The current code used to harvest 'closed' files is in [this notebook](harvest_closed_files.ipynb). Previous versions can be found in [this repository](https://github.com/wragge/closed_access). # In[11]: from pathlib import Path import altair as alt import pandas as pd # In[12]: harvests = { "2015": "closed-20160101.csv", "2016": "closed-20170109.csv", "2017": "closed-20180101.csv", "2018": "closed-20190101.csv", "2019": "closed-20200101.csv", "2020": "closed-20210101.csv", "2021": "closed-20220101.csv", } # In[13]: # Load all the data into a single dataframe dfs = [] for year, data_file in harvests.items(): df_year = pd.read_csv( Path("data", data_file), parse_dates=[ "contents_start_date", "contents_end_date", "access_decision_date", ], keep_default_na=False, ) df_year["harvested_year"] = year dfs.append(df_year) df = pd.concat(dfs) df.head() # ## Number of closed files in each harvest # In[14]: year_counts = df["harvested_year"].value_counts().to_frame().reset_index() year_counts.columns = ["year", "count"] year_counts.sort_values(by="year") # In[15]: alt.Chart(year_counts).mark_bar(point=True).encode( x=alt.X("year:O", title="Year end"), y=alt.Y("count:Q", title="Number of closed files"), color=alt.Color("year", legend=None), tooltip=["year:O", "count:Q"], ).properties(width=300) # ## Find the number of times each reason is cited in the annual harvests # In[16]: df_reasons = df.copy() df_reasons["reason"] = df_reasons["reasons"].str.split("|") df_reasons = df_reasons.explode("reason") df_reasons["reason"].replace("", "No reason", inplace=True) # In[17]: unique_reasons = sorted(list(df_reasons["reason"].unique())) unique_reasons # In[18]: harvest_reasons_counts = ( df_reasons.groupby(by=["harvested_year", "reason"]).size().reset_index() ) harvest_reasons_counts.columns = ["year", "reason", "count"] # ## Visualise the number of times each reason is cited # In[19]: alt.Chart(harvest_reasons_counts).mark_bar().encode( x=alt.X("year:O", title=None), y=alt.Y("count:Q", title="Number of files"), color=alt.Color("year:N", legend=None), facet=alt.Facet( "reason:O", align="each", columns=5, title="Reason for being closed" ), tooltip=["year:O", "reason:N", "count:Q"], ).properties(height=200).resolve_scale(x="independent") # ## Focus on a specific reason # # Select a reason from the dropdown list to examine change over time. # In[20]: input_dropdown = alt.binding_select( options=[None] + unique_reasons, labels=["All"] + unique_reasons ) selection = alt.selection_single(fields=["reason"], bind=input_dropdown, name="Select") alt.Chart(harvest_reasons_counts).mark_bar().encode( x=alt.X("year:O", title=None), y=alt.Y("count:Q", title="Number of files"), color=alt.Color("year:N", legend=None), column=alt.Column("reason:N", title="Reason for being closed"), tooltip=["year:O", "reason:N", "count:Q"], ).add_selection(selection).transform_filter(selection).properties( height=200 ).resolve_scale( x="independent" ) # In[ ]: