Version 2.1 of the Trove API introduced a new rights
index that you can use to limit your search results to records that include one of the licences and rights statements listed on this page. We can also use this index to build a picture of which rights statements are currently being used, and by whom. Let's give it a try...
The method used here is to run a separate search in the picture zone for each combination of contributor and rights statement. So for every organisation that contributes records to Trove, we'll find out the number of image records that include each rights statement.
Problems:
Searching by contributor saves us having to harvest all the images, but it has a major problem. Sometimes Trove will group multiple versions of a picture held by different organisations as a single work. Rights information is saved in the version metadata, but searches only return works. So if one organisation has assigned a rights statement to a version of the image, it will look like all the organisations whose images are grouped together with it as a work are using that rights statement. I don't think this will make a huge difference to the results, but it will be something to look out for. The only way around this is to harvest everything and expand the versions out into separate records.
The rights
index doesn't currently seem to include information on out of copyright images, unless they've actually been marked using the 'Public Domain' statement by the institution. Common statements such as 'Out of copyright', 'No known copyright restrictions', or 'Copyright expired' return no results. So there are many more open images than are currently reported by the rights index.
import os
import time
import pandas as pd
import requests_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.notebook import tqdm
# Create a session that will automatically retry on server errors.
# CachedSession stores responses on disk, so re-running the harvest
# re-uses earlier results instead of hitting the API again.
s = requests_cache.CachedSession()
# Retry up to 5 times on transient gateway errors, with exponential backoff
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
%%capture
# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv
# These are all the licence/rights statements recognised by Trove
# Copied from https://help.nla.gov.au/trove/becoming-partner/for-content-partners/licensing-reuse
# Each string is used verbatim in a rights:"..." search below, so the values
# must match Trove's rights index exactly.
licences = [
    # 'Free' -- openly reusable
    "Free/CC Public Domain",
    "Free/CC BY",
    "Free/CC0",
    "Free/RS NKC",
    "Free/RS Noc-US",
    # 'Free with conditions' -- reuse allowed subject to licence terms
    "Free with conditions/CC BY-ND",
    "Free with conditions/CC BY-SA",
    "Free with conditions/CC BY-NC",
    "Free with conditions/CC BY-NC-ND",
    "Free with conditions/CC BY-NC-SA",
    "Free with conditions/RS NoC-NC",
    "Free with conditions/InC-NC",
    "Free with conditions/InC-EDU",
    # 'Restricted' -- in copyright, or copyright status unclear
    "Restricted/RS InC",
    "Restricted/RS InC-OW-EU",
    "Restricted/RS InC-RUU",
    "Restricted/RS CNE",
    "Restricted/RS UND",
    "Restricted/NoC-CR",
    "Restricted/NoC-OKLR",
]
# Insert your Trove API key (or set the TROVE_API_KEY environment variable).
# `or` keeps the placeholder when the variable is unset *or* empty, matching
# the original truthiness check while avoiding a double os.getenv() call.
API_KEY = os.getenv("TROVE_API_KEY") or "YOUR API KEY"
def save_summary(contributors, record, parent=None):
    """
    Extract basic data from contributor record, and traverse any child records.

    Appends one summary dict per record to `contributors` (children are
    appended before their parent). The full_name value combines the parent
    and child names.
    """
    entry = {"id": record["id"], "name": record["name"]}
    if parent is not None:
        # We arrived here recursively -- prefix with the parent's full name
        entry["parent_id"] = parent["id"]
        entry["full_name"] = "{} / {}".format(parent["full_name"], record["name"])
    elif "parent" in record:
        # Top-level record that carries its own parent reference
        entry["parent_id"] = record["parent"]["id"]
        entry["full_name"] = "{} / {}".format(record["parent"]["value"], record["name"])
    else:
        entry["full_name"] = record["name"]
    # Recurse into any nested child contributors before saving this record
    if "children" in record:
        for child in record["children"]["contributor"]:
            save_summary(contributors, child, entry)
    contributors.append(entry)
def get_contributors():
    """
    Get a list of contributors from the Trove API.

    Requests the full contributor tree in one call and flattens all of the
    nested records into a single list of summary dicts via save_summary().
    """
    params = {"key": API_KEY, "encoding": "json", "reclevel": "full"}
    response = s.get(
        "https://api.trove.nla.gov.au/v2/contributor/",
        params=params,
        timeout=60,
    )
    records = response.json()["response"]["contributor"]
    contributors = []
    for record in records:
        save_summary(contributors, record)
    return contributors
def contributor_has_results(contrib, params, additional_query):
    """
    Check whether a query returns any results for this contributor.

    Parameters:
        contrib: dict with at least an "id" key (the contributor's NUC id)
        params: dict of base API parameters; note its "q" key is set in place
        additional_query: extra query string to append, or None

    Returns:
        True if the search finds at least one record, otherwise False.
    """
    query = f'nuc:"{contrib["id"]}"'
    # Add any extra queries
    if additional_query:
        query += f" {additional_query}"
    params["q"] = query
    response = s.get(
        "https://api.trove.nla.gov.au/v2/result", params=params, timeout=60
    )
    data = response.json()
    total = int(data["response"]["zone"][0]["records"]["total"])
    # Return an explicit boolean -- the original fell through and returned
    # None when there were no results, which worked only by truthiness.
    return total > 0
def licence_counts_by_institution(additional_query=None):
    """
    Loop through contributors and licences to harvest data about the number of times each licence is used.

    Parameters:
        additional_query: optional string of extra Trove query terms
            (e.g. 'NOT format:"Book"') appended to every search.

    Returns:
        A list of dicts -- one per contributor that has at least one matching
        record -- containing the contributor summary fields plus one count
        column per rights statement.
    """
    contributors = get_contributors()
    licence_counts = []
    # n=0 because we only need the total count from each search, not records
    params = {"key": API_KEY, "encoding": "json", "zone": "picture", "n": 0}
    for contrib in tqdm(contributors):
        # If there are no results for this contributor then there's no point checking for licences
        # This should save a bit of time
        if contributor_has_results(contrib, params, additional_query):
            contrib_row = contrib.copy()
            # Only search for nuc ids that start with a letter
            # NOTE(review): contributors with non-letter ids still get a row
            # appended below, just without any licence counts -- confirm intended.
            if contrib["id"][0].isalpha():
                for licence in licences:
                    # Construct query using nuc id and licence
                    query = f'nuc:"{contrib["id"]}" rights:"{licence}"'
                    # Add any extra queries
                    if additional_query:
                        query += f" {additional_query}"
                    params["q"] = query
                    response = s.get(
                        "https://api.trove.nla.gov.au/v2/result",
                        params=params,
                        timeout=60,
                    )
                    data = response.json()
                    total = data["response"]["zone"][0]["records"]["total"]
                    contrib_row[licence] = int(total)
                    # Only pause after requests that actually hit the API,
                    # not ones served from the local cache
                    if not response.from_cache:
                        time.sleep(0.2)
            # print(contrib_row)
            licence_counts.append(contrib_row)
    return licence_counts
# Harvest licence counts for picture-zone records, excluding books
licence_counts_not_books = licence_counts_by_institution('NOT format:"Book"')
df = pd.DataFrame(licence_counts_not_books)
# Fill empty totals with zeros & make them all integers
df[licences] = df[licences].fillna(0).astype(int)
# Check the overall distribution of rights statements
df.sum(numeric_only=True)
Free/CC Public Domain 269877 Free/CC BY 221770 Free/CC0 940 Free/RS NKC 2134 Free/RS Noc-US 0 Free with conditions/CC BY-ND 0 Free with conditions/CC BY-SA 11994 Free with conditions/CC BY-NC 22332 Free with conditions/CC BY-NC-ND 22901 Free with conditions/CC BY-NC-SA 109934 Free with conditions/RS NoC-NC 0 Free with conditions/InC-NC 0 Free with conditions/InC-EDU 4466 Restricted/RS InC 13963 Restricted/RS InC-OW-EU 0 Restricted/RS InC-RUU 1 Restricted/RS CNE 9209 Restricted/RS UND 400 Restricted/NoC-CR 0 Restricted/NoC-OKLR 0 dtype: int64
# Keep just the identifying columns and the licence counts
df_final = df[["id", "full_name"] + licences]
# Drop contributors whose licence counts sum to zero
df_final = df_final[df_final.sum(axis=1, numeric_only=True) != 0]
# Drop licence columns that contain nothing but zeros
df_final = df_final.loc[:, df_final.any()]
# Sort by name and save as CSV
df_final.sort_values("full_name").to_csv("rights-on-images.csv", index=False)
See the results here:
Some GLAM institutions apply restrictive licences to digitised versions of out-of-copyright images. Under Australian copyright law, photographs created before 1955 are out of copyright, so we can adjust our query and look to see what sorts of rights statements are attached to them.
# Photographs created before 1955 are out of copyright under Australian law,
# so restrict the harvest to photographs dated up to 1954
licence_counts_out_of_copyright = licence_counts_by_institution(
    "format:Photograph date:[* TO 1954]"
)
df2 = pd.DataFrame(licence_counts_out_of_copyright)
# Fill empty totals with zeros & make them all integers
df2[licences] = df2[licences].fillna(0).astype(int)
# Check the overall distribution of rights statements
df2.sum(numeric_only=True)
Free/CC Public Domain 2424 Free/CC BY 45229 Free/CC0 406 Free/RS NKC 1583 Free/RS Noc-US 0 Free with conditions/CC BY-ND 0 Free with conditions/CC BY-SA 805 Free with conditions/CC BY-NC 81 Free with conditions/CC BY-NC-ND 830 Free with conditions/CC BY-NC-SA 1145 Free with conditions/RS NoC-NC 0 Free with conditions/InC-NC 0 Free with conditions/InC-EDU 2 Restricted/RS InC 120 Restricted/RS InC-OW-EU 0 Restricted/RS InC-RUU 0 Restricted/RS CNE 393 Restricted/RS UND 1 Restricted/NoC-CR 0 Restricted/NoC-OKLR 0 dtype: int64
# Keep just the identifying columns and the licence counts
df2_final = df2[["id", "full_name"] + licences]
# Drop contributors whose licence counts sum to zero
df2_final = df2_final[df2_final.sum(axis=1, numeric_only=True) != 0]
# Drop licence columns that contain nothing but zeros
df2_final = df2_final.loc[:, df2_final.any()]
# Sort by name and save as CSV
df2_final.sort_values("full_name").to_csv(
    "rights-on-out-of-copyright-photos.csv", index=False
)
See the results here:
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.