#!/usr/bin/env python
# coding: utf-8

# # Metadata for Trove digitised works
#
# In poking around to try and find a way of automating the download of OCR text from Trove's digitised books, I discovered that there's lots of useful metadata embedded in the web page of a digitised work. Most of this metadata isn't available through the Trove API.
#
# The works I'm talking about are things like digitised books and journal issues that have an `nla.obj` identifier in their URL. For some examples, you can [search in the book zone](https://trove.nla.gov.au/book/result?q=%22nla.obj%22&l-availability=y) for records containing `nla.obj`.
#
# As you can see from the code below, it's pretty easy to extract the data from the HTML of the web page – a single regular expression does the job.

# In[22]:


import requests
import re
import json
from IPython.display import display


# This is the URL of the work we're going to extract metadata from. You could change this to any other `nla.obj` link.

# In[13]:


trove_digitised_url = 'https://nla.gov.au/nla.obj-362059651/'


# We just have to load the HTML page and then do a regex search for the embedded JSON string. Then we parse the string back into JSON and display it.
#
# Note the detailed information under `children` – it includes the individual pages of a work.

# In[21]:


# Get the HTML page
response = requests.get(trove_digitised_url)

# Search for the JSON string using a regex
try:
    work_data = re.search(r'var work = JSON\.parse\(JSON\.stringify\((\{.*\})', response.text).group(1)
except AttributeError:
    # Just in case it's not there...
    work_data = '{}'
    print('No data found!')

# Display the JSON
display(json.loads(work_data))
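
# As a quick illustration of what you might do with this metadata, the sketch below pulls out the identifiers of the individual pages. It assumes the parsed metadata has a `children` dict containing a `page` list, and that each page entry includes a `pid` value – those field names are my assumptions based on the output displayed above, so check them against what you actually see before relying on them.

# In[ ]:


# A minimal sketch – the field names ('children', 'page', 'pid') are assumptions,
# check them against the metadata displayed above.
metadata = json.loads(work_data)
pages = metadata.get('children', {}).get('page', [])
print(f'This work has {len(pages)} pages')

# List the identifier (pid) of each page, if present
for page in pages:
    print(page.get('pid'))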