Because of problems with duplicate records in data.gov.au, I no longer use this notebook to harvest details of GLAM datasets. See the *GLAM data from government portals* notebook instead.
This is a quick attempt to harvest datasets published by GLAM institutions using the new data.gov.au API.
To create the list of organisations, I searched the organisations on the data.gov.au site for 'library', 'archives', 'records', and 'museum'. I noticed that Queensland State Archives isn't included as an organisation, even though it's used as a tag, so I added it in as a query. There are inconsistencies in the way organisations are listed, so it's possible I've missed some.
import pandas as pd
import requests
from IPython.display import FileLink, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
# Configure an HTTP session that transparently retries transient server
# errors (502/503/504) up to five times with exponential backoff.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for scheme in ("http://", "https://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))
# Endpoint for the (new, at time of writing) data.gov.au search API.
api_url = "https://data.gov.au/api/v0/search/datasets"
# GLAM organisations found by searching data.gov.au's organisation listing
# for 'library', 'archives', 'records', and 'museum'. Each name is passed to
# the search API as a `publisher` parameter. NOTE(review): organisation names
# on the portal are inconsistent, so this list may be incomplete.
organisations = [
"NSW State Archives",
"National Archives of Australia",
"Libraries Tasmania",
"State Records",
"State Records Office of Western Australia",
"State Library of Victoria",
"State Library of NSW",
"Mount Gambier Library",
"National Library of Australia",
"State Library of Queensland",
"State Library of Western Australia",
"State Library of South Australia",
"State Library of New South Wales",
"Western Australian Museum",
"South Australian Museum",
"Museum of Applied Arts and Sciences",
"Tasmanian Museum and Art Gallery",
"History Trust of South Australia",
"Australian Institute of Aboriginal and Torres Strait Islander Studies (AIATSIS)",
"National Portrait Gallery",
"Australian Museum",
]
# No entries under organisations — these publishers have no organisation
# record on data.gov.au, so they are harvested via full-text queries instead
# (the quoted phrase forces an exact-phrase search).
queries = ['"Queensland State Archives"', "PROV Public Record Office"]
def safe_get(dct, *keys):
    """Walk a nested dict along *keys*, returning None if any step fails.

    Uses the EAFP idiom: a missing key (KeyError) or a non-subscriptable
    intermediate value (TypeError) yields None instead of raising, which
    makes it safe to probe optional fields in API responses.
    """
    node = dct
    for key in keys:
        try:
            node = node[key]
        except (KeyError, TypeError):
            return None
    return node
def process_dataset(dataset, query=None):
    """Flatten one dataset record into a list of per-distribution dicts.

    Parameters
    ----------
    dataset : dict
        A dataset record from the data.gov.au search API. Must contain a
        "distributions" list (may be empty, in which case [] is returned).
    query : str, optional
        The search query that found this dataset. When supplied, its
        surrounding double quotes are stripped and it is used as the
        publisher name (for publishers with no organisation record on the
        portal); otherwise the publisher is read from the record itself.

    Returns
    -------
    list of dict
        One dict per distribution, combining dataset-level metadata
        (title, publisher, dates, etc.) with file-level metadata
        (download URL, format, licence, etc.).
    """
    # The publisher and the dataset-level fields are identical for every
    # distribution, so compute them once rather than on each iteration.
    if query:
        publisher = query.strip('"')
    else:
        publisher = safe_get(dataset, "publisher", "name")
    dataset_fields = {
        "dataset_title": safe_get(dataset, "title"),
        "publisher": publisher,
        "dataset_issued": safe_get(dataset, "issued"),
        "dataset_modified": safe_get(dataset, "modified"),
        "dataset_description": safe_get(dataset, "description"),
        "source": safe_get(dataset, "catalog"),
        "info_url": safe_get(dataset, "landingPage"),
        "start_date": safe_get(dataset, "temporal", "start", "date"),
        "end_date": safe_get(dataset, "temporal", "end", "date"),
    }
    datafiles = []
    for dist in dataset["distributions"]:
        # Copy the shared fields, then append this distribution's details.
        datafile = dict(dataset_fields)
        datafile.update(
            {
                "file_title": safe_get(dist, "title"),
                "download_url": safe_get(dist, "downloadURL"),
                "format": safe_get(dist, "format"),
                "file_description": safe_get(dist, "description"),
                "file_issued": safe_get(dist, "issued"),
                "file_modified": safe_get(dist, "modified"),
                "licence": safe_get(dist, "license", "name"),
            }
        )
        datafiles.append(datafile)
    return datafiles
def harvest_datasets():
    """Harvest GLAM datafile metadata from the data.gov.au search API.

    Queries the API once per organisation (as a `publisher` filter) and once
    per free-text query (for publishers with no organisation record), then
    flattens every returned dataset into per-file records.

    Returns
    -------
    list of dict
        One metadata dict per distribution (see process_dataset).

    Raises
    ------
    requests.HTTPError
        If the API returns an error status (after the session's retries
        are exhausted).
    """

    def _get_datasets(params):
        # One page per request; the API caps results, so ask for the
        # maximum of 100. Fail loudly on HTTP errors rather than trying
        # to parse an error body as search results.
        response = s.get(api_url, params=params)
        print(response.url)
        response.raise_for_status()
        return response.json()["dataSets"]

    datafiles = []
    for organisation in organisations:
        for dataset in _get_datasets({"publisher": organisation, "limit": 100}):
            datafiles += process_dataset(dataset)
    for query in queries:
        for dataset in _get_datasets({"query": query, "limit": 100}):
            # Pass the query through so it can be used as the publisher name.
            datafiles += process_dataset(dataset, query=query)
    return datafiles
# Run the harvest and load the flattened records into a DataFrame.
datafiles = harvest_datasets()
df = pd.DataFrame(datafiles)
# Preview the harvested records (notebook cell output).
df.head()
df.shape
# Summarise the harvest by file format, licence, and publisher.
df["format"].value_counts()
df["licence"].value_counts()
df["publisher"].value_counts()
# Save the complete results (all file formats) and display a download link.
df.to_csv("glam_datasets_all_formats_from_datagovau.csv", index=False)
display(FileLink("glam_datasets_all_formats_from_datagovau.csv"))
# Filter down to CSV-formatted datafiles and save those separately.
csvs = df.loc[df["format"] == "CSV"]
csvs.shape
csvs["publisher"].value_counts()
csvs.to_csv("glam_datasets_csvs_from_datagovau.csv", index=False)
display(FileLink("glam_datasets_csvs_from_datagovau.csv"))
# Write results to a markdown file: one section per publisher, one
# sub-section per dataset, one bullet per datafile.
orgs = df.sort_values(by=["publisher", "dataset_title", "dataset_modified"]).groupby(
    "publisher"
)
# Specify UTF-8 explicitly — harvested titles/descriptions can contain
# non-ASCII characters, which would raise UnicodeEncodeError on platforms
# whose default text encoding is not UTF-8 (e.g. Windows cp1252).
with open("glam_datasets_from_datagovau.md", "w", encoding="utf-8") as md_file:
    for org, group in orgs:
        # Print a table-of-contents entry linking to the section anchor.
        print("* [{}](#{})".format(org, slugify(org)))
        md_file.write("\n## {}\n".format(org))
        for dataset, files in group.groupby(["dataset_title", "info_url"]):
            # `dataset` is a (title, info_url) tuple from the two-key groupby.
            md_file.write("\n### [{}]({})\n".format(dataset[0], dataset[1]))
            for row in files.itertuples():
                md_file.write(
                    "* [{}]({}) ({}, {})\n".format(
                        row.file_title, row.download_url, row.format, row.file_issued
                    )
                )