#!/usr/bin/env python
# coding: utf-8

# # Comparing harvests of closed files
# 
# This notebook brings together annual harvests of files with an access status of 'closed', scraped from the NAA's RecordSearch database. The data files are here:
# 
# * [2015](data/closed-20160101.csv) (harvested 1 January 2016)
# * [2016](data/closed-20170109.csv) (harvested 9 January 2017)
# * [2017](data/closed-20180101.csv) (harvested 1 January 2018)
# * [2018](data/closed-20190101.csv) (harvested 1 January 2019)
# * [2019](data/closed-20200101.csv) (harvested 1 January 2020)
# * [2020](data/closed-20210101.csv) (harvested 1 January 2021)
# 
# The current code used to harvest 'closed' files is in [this notebook](harvest_closed_files.ipynb). Previous versions can be found in [this repository](https://github.com/wragge/closed_access).

# In[11]:


from pathlib import Path

import altair as alt
import pandas as pd


# In[12]:


harvests = {
    "2015": "closed-20160101.csv",
    "2016": "closed-20170109.csv",
    "2017": "closed-20180101.csv",
    "2018": "closed-20190101.csv",
    "2019": "closed-20200101.csv",
    "2020": "closed-20210101.csv",
    "2021": "closed-20220101.csv",
}


# In[13]:


# Load all the data into a single dataframe
dfs = []
for year, data_file in harvests.items():
    df_year = pd.read_csv(
        Path("data", data_file),
        parse_dates=[
            "contents_start_date",
            "contents_end_date",
            "access_decision_date",
        ],
        keep_default_na=False,
    )
    df_year["harvested_year"] = year
    dfs.append(df_year)
df = pd.concat(dfs)
df.head()


# ## Number of closed files in each harvest

# In[14]:


year_counts = df["harvested_year"].value_counts().to_frame().reset_index()
year_counts.columns = ["year", "count"]
year_counts.sort_values(by="year")


# In[15]:


alt.Chart(year_counts).mark_bar(point=True).encode(
    x=alt.X("year:O", title="Year end"),
    y=alt.Y("count:Q", title="Number of closed files"),
    color=alt.Color("year", legend=None),
    tooltip=["year:O", "count:Q"],
).properties(width=300)


# ## Find the number of times each reason is cited in the annual harvests

# In[16]:


df_reasons = df.copy()
df_reasons["reason"] = df_reasons["reasons"].str.split("|")
df_reasons = df_reasons.explode("reason")
df_reasons["reason"].replace("", "No reason", inplace=True)


# In[17]:


unique_reasons = sorted(list(df_reasons["reason"].unique()))
unique_reasons


# In[18]:


harvest_reasons_counts = (
    df_reasons.groupby(by=["harvested_year", "reason"]).size().reset_index()
)
harvest_reasons_counts.columns = ["year", "reason", "count"]


# ## Visualise the number of times each reason is cited

# In[19]:


alt.Chart(harvest_reasons_counts).mark_bar().encode(
    x=alt.X("year:O", title=None),
    y=alt.Y("count:Q", title="Number of files"),
    color=alt.Color("year:N", legend=None),
    facet=alt.Facet(
        "reason:O", align="each", columns=5, title="Reason for being closed"
    ),
    tooltip=["year:O", "reason:N", "count:Q"],
).properties(height=200).resolve_scale(x="independent")


# ## Focus on a specific reason
# 
# Select a reason from the dropdown list to examine change over time.

# In[20]:


input_dropdown = alt.binding_select(
    options=[None] + unique_reasons, labels=["All"] + unique_reasons
)
selection = alt.selection_single(fields=["reason"], bind=input_dropdown, name="Select")

alt.Chart(harvest_reasons_counts).mark_bar().encode(
    x=alt.X("year:O", title=None),
    y=alt.Y("count:Q", title="Number of files"),
    color=alt.Color("year:N", legend=None),
    column=alt.Column("reason:N", title="Reason for being closed"),
    tooltip=["year:O", "reason:N", "count:Q"],
).add_selection(selection).transform_filter(selection).properties(
    height=200
).resolve_scale(
    x="independent"
)


# In[ ]: