#!/usr/bin/env python
# coding: utf-8

# # Finding unpublished works that might be entering the public domain on 1 January 2019
# 
# Changes to Australian copyright legislation mean that many unpublished resources will be entering the public domain on 1 January 2019. This notebook attempts to harvest the details of some of these resources from Trove.
# 
# As with most things involving copyright, there's no real way to be certain what will be entering the public domain. The main problem is that if there's no known author then the copyright period depends on if and when the work was 'made public'. Add to that general issues around the accuracy and completeness of the metadata and all I can really do is create a list of *some* of the things which could *potentially* be entering the public domain based on the available metadata.
# 
# The basic methodology is:
# 
# * Search in Trove's 'Diaries, letters, archives' zone for ['Unpublished' Australian materials](https://trove.nla.gov.au/collection/result?l-format=Unpublished&q=&l-australian=y)
# * For each record check to see if there are any listed creators.
# * If there are creators, look to see if they have a death date and if that date is before 1949
# * If all creators died before 1949 then save the item metadata
# * If there are no creators, look to see if the creation date of the item is before 1949, if so save the metadata
# 
# If you just want the data, here's a [CSV file you can download](unpublished_works_entering_pd_20181006.csv). Look below for a preview.
# 
# If you want to play with the data a bit, [here's another notebook](Exploring-unpublished-works-entering-public-domain.ipynb) with a few ideas.
# 
# For more information on the changes see the NSLA guide to [Preparing for copyright term changes in 2019](https://www.nsla.org.au/sites/default/files/documents/nsla.copyright-preparing-changes-2019.pdf).

# In[ ]:


import datetime
import os
import re
import time

import pandas as pd
import requests_cache
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

s = requests_cache.CachedSession()
retries = Retry(total=10, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))


# In[ ]:


get_ipython().run_cell_magic('capture', '', '# Load variables from the .env file if it exists\n# Use %%capture to suppress messages\n%load_ext dotenv\n%dotenv\n')


# In[ ]:


# Insert your Trove API key
API_KEY = "YOUR API KEY"

if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")


# ## Harvest the data

# In[ ]:


api_url = "http://api.trove.nla.gov.au/v2/result"
# Insert your Trove API key where indicated
params = {
    "q": " ",
    "zone": "collection",
    "encoding": "json",
    "l-format": "Unpublished",
    "l-australian": "y",
    "include": "holdings",
    "key": API_KEY,
    "n": "100",
    "bulkHarvest": "true",
}


# In[ ]:


# How many things are we processing?
response = s.get(api_url, params=params)
data = response.json()
total = int(data["response"]["zone"][0]["records"]["total"])
print(total)


# In[ ]:


def check_creators(creators):
    """
    Make sure all creators have a death date before 1949.
    """
    opening = False
    count = 0
    for creator in creators:
        year = get_latest_year(creator)
        if year and int(year) < 1949:
            count += 1
    if len(creators) == count:
        opening = True
    return opening


def check_date(issued):
    """
    Check if the latest issued date is before 1949.
    """
    opening = False
    year = get_latest_year(issued)
    if year and int(year) < 1949:
        opening = True
    return opening


def get_latest_year(value):
    """
    Get a year from the end of a string.
    """
    try:
        year = re.search(r"\b(\d{4})$", value).group(1)
    except (AttributeError, TypeError):
        year = None
    return year


# In[ ]:


def harvest(max_records=None):
    items = []
    start = "*"

    with tqdm(total=total) as pbar:
        while start:
            params["s"] = start
            response = s.get(api_url, params=params)
            # print(response.url)
            data = response.json()
            for record in data["response"]["zone"][0]["records"]["work"]:
                opening = False
                creators = record.get("contributor")
                issued = record.get("issued")
                if creators:
                    opening = check_creators(creators)
                elif issued:
                    opening = check_date(str(issued))
                if opening:
                    try:
                        creator = " | ".join(creators)
                    except TypeError:
                        creator = creators
                    try:
                        nuc = record["holding"][0]["nuc"]
                    except KeyError:
                        nuc = None
                    item = {
                        "id": record["id"],
                        "title": record["title"],
                        "creator": creator,
                        "date": issued,
                        "trove_url": record["troveUrl"],
                        "nuc": nuc,
                    }
                    items.append(item)
            try:
                start = data["response"]["zone"][0]["records"]["nextStart"]
            except KeyError:
                start = None
            pbar.update(100)
            # Stop iteration once max number of records inspected (mainly for testing)
            if max_records and pbar.n >= max_records:
                break
            if not response.from_cache:
                time.sleep(0.2)
    return items


# In[ ]:


items = harvest()


# ## Convert the results to a dataframe and have a look inside

# In[ ]:


df = pd.DataFrame(items)
df.head()


# In[ ]:


# How many items are there?
df.shape[0]


# ## Save the results as a CSV file

# In[ ]:


date_str = datetime.datetime.now().strftime("%Y%m%d")
csv_file = "unpublished_works_entering_pd_{}.csv".format(date_str)
df.to_csv(csv_file, index=False)
# Make a download link
display(HTML('<a target="_blank" href="{}">Download CSV file</a>'.format(csv_file)))


# In[ ]:


# Ignore -- this is just used for testing in development
if os.getenv("GW_STATUS") == "dev":
    fa_test = harvest(200)
    df_test = pd.DataFrame(fa_test)
    assert not df_test.empty


# ----
# 
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge).