Changes to Australian copyright legislation mean that many unpublished resources will be entering the public domain on 1 January 2019. This notebook attempts to harvest the details of some of these resources from Trove.
As with most things involving copyright, there's no real way to be certain what will be entering the public domain. The main problem is that if there's no known author, then the copyright period depends on whether and when the work was 'made public'. Add to that general issues around the accuracy and completeness of the metadata, and all I can really do is create a list of some of the things which could potentially be entering the public domain based on the available metadata.
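The basic methodology is:
- Search Trove's `collection` zone for Australian works with the format 'Unpublished'.
- If a record lists creators, keep it only if every creator has a death year before 1949.
- If a record lists no creators but has a date, keep it if the latest year in that date is before 1949.
- Save the details of the remaining records to a CSV file.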
If you just want the data, here's a CSV file you can download. Look below for a preview.
If you want to play with the data a bit, here's another notebook with a few ideas.
For more information on the changes see the NSLA guide to Preparing for copyright term changes in 2019.
import datetime
import os
import re
import time
import pandas as pd
import requests_cache
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Session used for every API call; responses come back with a `from_cache`
# flag, which the harvest loop uses to decide whether to pause between requests.
s = requests_cache.CachedSession()
# Retry policy: up to 10 attempts with backoff when the server answers with
# one of the listed 5xx status codes.
retries = Retry(total=10, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504])
# Apply the same retry policy to plain and TLS connections.
for scheme in ("http://", "https://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))
%%capture
# Load environment variables (such as TROVE_API_KEY, read further down) from a .env file if it exists
# %%capture is used to suppress the messages these magics would otherwise print
%load_ext dotenv
%dotenv
# Insert your Trove API key in place of the placeholder string.
# A non-empty TROVE_API_KEY environment variable takes precedence over it.
API_KEY = os.getenv("TROVE_API_KEY") or "YOUR API KEY"
# Trove API search endpoint. HTTPS is used so that the API key, which is sent
# as a query-string parameter, does not cross the network in clear text.
api_url = "https://api.trove.nla.gov.au/v2/result"
# Search settings sent with every request; the API key comes from API_KEY.
# The query is a single space, so results are narrowed only by the zone and
# the l- facets (assumes Trove treats a blank query as "match everything" --
# confirm against the API documentation).
params = {
    "q": " ",
    "zone": "collection",
    "encoding": "json",
    "l-format": "Unpublished",
    "l-australian": "y",
    "include": "holdings",
    "key": API_KEY,
    "n": "100",
    "bulkHarvest": "true",
}
# How many things are we processing?
response = s.get(api_url, params=params)
# Stop with a clear HTTP error (for example a rejected API key) instead of
# failing later while decoding or indexing an error response.
response.raise_for_status()
data = response.json()
# The total is converted with int() so it can be used as the progress bar size.
total = int(data["response"]["zone"][0]["records"]["total"])
print(total)
def check_creators(creators, cutoff_year=1949):
    """
    Make sure all creators have a death date before the cutoff year.

    The death date is taken to be a four-digit year at the very end of the
    creator string (as found by get_latest_year). A creator without a
    trailing year fails the check.

    Parameters:
        creators: a list of creator strings. A single bare string is treated
            as one creator, rather than being iterated character by character.
        cutoff_year: creators must end with a year strictly less than this
            value. Defaults to 1949.

    Returns:
        True if every creator ends with a year before cutoff_year (an empty
        list also returns True), otherwise False.
    """
    if isinstance(creators, str):
        creators = [creators]
    for creator in creators:
        year = get_latest_year(creator)
        # One creator with no year, or a year on/after the cutoff, is enough
        # to rule the work out.
        if not year or int(year) >= cutoff_year:
            return False
    return True
def check_date(issued, cutoff_year=1949):
    """
    Check if the latest issued date is before the cutoff year.

    Parameters:
        issued: the issued date as a string. Only a four-digit year at the
            very end of the string is considered (see get_latest_year).
        cutoff_year: the year must be strictly less than this value.
            Defaults to 1949.

    Returns:
        True if a trailing year was found and it is before cutoff_year,
        otherwise False.
    """
    year = get_latest_year(issued)
    return bool(year) and int(year) < cutoff_year
def get_latest_year(value):
    """
    Return the four-digit year that ends a string.

    The year is returned as a string. None is returned when the value is not
    a string or does not end with a standalone four-digit number.
    """
    if not isinstance(value, str):
        return None
    found = re.search(r"\b(\d{4})$", value)
    return found.group(1) if found else None
def harvest(max_records=None):
    """
    Harvest details of unpublished works that may be entering the public domain.

    Pages through the search results by following the "nextStart" cursor.
    A work is kept when all of its creators pass check_creators or, if it
    lists no creators, when its issued date passes check_date.

    Parameters:
        max_records: stop once at least this many records have been
            inspected (mainly for testing). None (or 0) harvests everything.

    Returns:
        A list of dicts with the keys id, title, creator, date, trove_url
        and nuc.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    items = []
    start = "*"
    with tqdm(total=total) as pbar:
        while start:
            # Build the query for this page without writing the cursor into
            # the shared module-level params dict.
            response = s.get(api_url, params=dict(params, s=start))
            response.raise_for_status()
            data = response.json()
            records = data["response"]["zone"][0]["records"]
            # Tolerate a page that carries no "work" list.
            works = records.get("work", [])
            for record in works:
                opening = False
                creators = record.get("contributor")
                # Treat a lone string as a single creator so it is neither
                # checked nor joined character by character.
                if isinstance(creators, str):
                    creators = [creators]
                issued = record.get("issued")
                if creators:
                    opening = check_creators(creators)
                elif issued:
                    opening = check_date(str(issued))
                if opening:
                    try:
                        creator = " | ".join(creators)
                    except TypeError:
                        # No creators listed: the record qualified on its date.
                        creator = creators
                    try:
                        nuc = record["holding"][0]["nuc"]
                    except (KeyError, IndexError):
                        # No holdings, an empty holdings list, or no NUC code.
                        nuc = None
                    items.append(
                        {
                            "id": record["id"],
                            "title": record["title"],
                            "creator": creator,
                            "date": issued,
                            "trove_url": record["troveUrl"],
                            "nuc": nuc,
                        }
                    )
            # A missing cursor means this was the last page.
            start = records.get("nextStart")
            # Advance by the number of records actually inspected.
            pbar.update(len(works))
            # Stop iteration once max number of records inspected (mainly for testing)
            if max_records and pbar.n >= max_records:
                break
            # Only pause after a request that actually went to the server.
            if not response.from_cache:
                time.sleep(0.2)
    return items
# Run the harvest with no record limit and load the results into a dataframe
items = harvest()
df = pd.DataFrame(items)
# Preview the first rows (shown as output when this is the last expression in a notebook cell)
df.head()
# How many items are there?
df.shape[0]
# Name the CSV file after the current date (YYYYMMDD) and write the dataframe
# to it without the index column.
harvested_at = datetime.datetime.now()
date_str = harvested_at.strftime("%Y%m%d")
csv_file = "unpublished_works_entering_pd_{}.csv".format(date_str)
df.to_csv(csv_file, index=False)
# Make a download link
download_link = HTML('<a target="_blank" href="{}">Download CSV file</a>'.format(csv_file))
display(download_link)
# Ignore -- this is just used for testing in development
# When GW_STATUS is "dev", run a short harvest (stopping after about 200
# records) and check that it produced at least some rows.
running_in_dev = os.environ.get("GW_STATUS") == "dev"
if running_in_dev:
    fa_test = harvest(200)
    df_test = pd.DataFrame(fa_test)
    assert not df_test.empty
Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.