#!/usr/bin/env python
# coding: utf-8

# # Finding unpublished works that might be entering the public domain on 1 January 2019
#
# Changes to Australian copyright legislation mean that many unpublished resources will be entering the public domain on 1 January 2019. This notebook attempts to harvest the details of some of these resources from Trove.
#
# As with most things involving copyright, there's no real way to be certain what will be entering the public domain. The main problem is that if there's no known author, the copyright period depends on whether and when the work was 'made public'. Add to that general issues around the accuracy and completeness of the metadata, and all I can really do is create a list of *some* of the things which could *potentially* be entering the public domain based on the available metadata.
#
# The basic methodology is:
#
# * Search in Trove's 'Diaries, letters, archives' zone for ['Unpublished' Australian materials](https://trove.nla.gov.au/collection/result?l-format=Unpublished&q=&l-australian=y).
# * For each record, check to see if there are any listed creators.
# * If there are creators, look to see if they have a death date and if that date is before 1949.
# * If all creators died before 1949, save the item metadata.
# * If there are no creators, look to see if the creation date of the item is before 1949; if so, save the metadata.
#
# If you just want the data, here's a [CSV file you can download](unpublished_works_entering_pd_20181006.csv). Look below for a preview.
#
# If you want to play with the data a bit, [here's another notebook](Exploring-unpublished-works-entering-public-domain.ipynb) with a few ideas.
#
# For more information on the changes, see the NSLA guide to [Preparing for copyright term changes in 2019](https://www.nsla.org.au/sites/default/files/documents/nsla.copyright-preparing-changes-2019.pdf).

# In[ ]:


import datetime
import os
import re
import time

import pandas as pd
import requests_cache
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm

# Use a cached session and retry failed requests
s = requests_cache.CachedSession()
retries = Retry(total=10, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))


# In[ ]:


get_ipython().run_cell_magic('capture', '', '# Load variables from the .env file if it exists\n# Use %%capture to suppress messages\n%load_ext dotenv\n%dotenv\n')


# In[ ]:


# Insert your Trove API key
API_KEY = "YOUR API KEY"

# Use an API key from the environment if one has been set
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")


# ## Harvest the data

# In[ ]:


api_url = "http://api.trove.nla.gov.au/v2/result"

# Search parameters -- the API key defined above is included
params = {
    "q": " ",
    "zone": "collection",
    "encoding": "json",
    "l-format": "Unpublished",
    "l-australian": "y",
    "include": "holdings",
    "key": API_KEY,
    "n": "100",
    "bulkHarvest": "true",
}


# In[ ]:


# How many things are we processing?
response = s.get(api_url, params=params)
data = response.json()
total = int(data["response"]["zone"][0]["records"]["total"])
print(total)


# In[ ]:


def check_creators(creators):
    """
    Make sure all creators have a death date before 1949.
    """
    opening = False
    count = 0
    for creator in creators:
        year = get_latest_year(creator)
        if year and int(year) < 1949:
            count += 1
    if len(creators) == count:
        opening = True
    return opening


def check_date(issued):
    """
    Check if the latest issued date is before 1949.
    """
    opening = False
    year = get_latest_year(issued)
    if year and int(year) < 1949:
        opening = True
    return opening


def get_latest_year(value):
    """
    Get a year from the end of a string.
    """
    try:
        year = re.search(r"\b(\d{4})$", value).group(1)
    except (AttributeError, TypeError):
        year = None
    return year
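

# As a quick sanity check (not part of the harvest itself), the cell below runs the helper functions above against a few made-up contributor and date strings that mimic typical Trove metadata values.

# In[ ]:


# These example values are invented for illustration -- they're not taken from Trove.
assert get_latest_year("Smith, John, 1850-1923") == "1923"
assert get_latest_year("no trailing year here") is None

# Flagged for opening: every creator has a death date before 1949
assert check_creators(["Smith, John, 1850-1923", "Jones, Mary, 1870-1940"])

# Not flagged: one creator died after 1949
assert not check_creators(["Smith, John, 1850-1923", "Brown, Alice, 1900-1980"])

# For issued date ranges, only the final year is checked
assert check_date("1901-1935")
assert not check_date("1901-1960")
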
""" opening = False year = get_latest_year(issued) if year and int(year) < 1949: opening = True return opening def get_latest_year(value): """ Get a year from the end of a string. """ try: year = re.search(r"\b(\d{4})$", value).group(1) except (AttributeError, TypeError): year = None return year # In[ ]: def harvest(max_records=None): items = [] start = "*" with tqdm(total=total) as pbar: while start: params["s"] = start response = s.get(api_url, params=params) # print(response.url) data = response.json() for record in data["response"]["zone"][0]["records"]["work"]: opening = False creators = record.get("contributor") issued = record.get("issued") if creators: opening = check_creators(creators) elif issued: opening = check_date(str(issued)) if opening: try: creator = " | ".join(creators) except TypeError: creator = creators try: nuc = record["holding"][0]["nuc"] except KeyError: nuc = None item = { "id": record["id"], "title": record["title"], "creator": creator, "date": issued, "trove_url": record["troveUrl"], "nuc": nuc, } items.append(item) try: start = data["response"]["zone"][0]["records"]["nextStart"] except KeyError: start = None pbar.update(100) # Stop iteration once max number of records inspected (mainly for testing) if max_records and pbar.n >= max_records: break if not response.from_cache: time.sleep(0.2) return items # In[ ]: items = harvest() # ## Convert the results to a dataframe and have a look inside # In[ ]: df = pd.DataFrame(items) df.head() # In[ ]: # How many items are there? df.shape[0] # ## Save the results as a CSV file # In[ ]: date_str = datetime.datetime.now().strftime("%Y%m%d") csv_file = "unpublished_works_entering_pd_{}.csv".format(date_str) df.to_csv(csv_file, index=False) # Make a download link display(HTML('Download CSV file'.format(csv_file))) # In[ ]: # Ignore -- this is just used for testing in development if os.getenv("GW_STATUS") == "dev": fa_test = harvest(200) df_test = pd.DataFrame(fa_test) assert not df_test.empty # ---- # # Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge).