#!/usr/bin/env python
# coding: utf-8

# # Trove Newspaper & Gazette Harvester
#
# Download large quantities of digitised newspaper and gazette articles from [Trove](https://trove.nla.gov.au/newspaper/) with this simplified, web-based interface to the [Trove Harvester](https://github.com/wragge/troveharvester) command-line tool.

# In[ ]:


# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.


# In[ ]:


import os
import shutil
from pathlib import Path

import ipywidgets as widgets
from IPython.display import HTML, display
from requests.exceptions import HTTPError
from trove_newspaper_harvester.core import (
    Harvester,
    NoQueryError,
    get_harvest,
    prepare_query,
)


# In[ ]:


get_ipython().run_cell_magic('capture', '', '# Load environment variables if available\n%load_ext dotenv\n%dotenv\n')


# In[ ]:


def start_harvest(b):
    """Run a harvest using the current widget values and report the outcome.

    Registered as the click handler of ``start_button``. All feedback
    (error messages, harvester output, and the final download link) is
    written to the ``out`` output widget, which is cleared first.

    Args:
        b: The button instance supplied by ``Button.on_click``; unused.

    Raises:
        HTTPError: Re-raised when creating the harvester fails with any
            status other than 403.
    """
    out.clear_output()
    query_params = prepare_query(query=query_url.value)
    try:
        harvester = Harvester(
            query_params=query_params,
            key=api_key.value,
            text=text.value,
            pdf=pdf.value,
            image=image.value,
        )
    except HTTPError as e:
        # A 403 is reported to the user as an API key problem; anything
        # else is unexpected, so let it propagate.
        if e.response.status_code == 403:
            with out:
                print("The request could not be authorised, check your API key.")
        else:
            raise
    except NoQueryError:
        with out:
            print("No query parameters found, check your query url.")
    else:
        # Run the harvest inside the output widget so anything the
        # harvester prints shows up in the app.
        with out:
            harvester.harvest()
            if harvester.harvested > 0:
                # NOTE(review): get_harvest() is called without arguments;
                # presumably it returns the directory of the harvest that
                # just finished -- confirm against the library docs.
                harvest = get_harvest()
                harvester.save_csv()
                # Remove results.ndjson (and its crate entry) before the
                # directory is zipped, now that the CSV has been saved.
                Path(harvest, "results.ndjson").unlink()
                harvester.remove_ndjson_from_crate()
                shutil.make_archive(harvest, "zip", harvest)
                # Emit a real anchor so the promised download *link*
                # appears, rather than just the archive's file name.
                zip_name = f"{harvest}.zip"
                display(
                    HTML(
                        f'Download results: <a href="{zip_name}" download>{zip_name}</a>'
                    )
                )
            else:
                print("No results were harvested, check your query url.")


# In[ ]:


# All the UI stuff

# Somewhere to put the results
out = widgets.Output(layout=widgets.Layout(padding="40px"))

api_key = widgets.Text(
    placeholder="Enter your Trove API key", description="API key:", disabled=False
)

query_url = widgets.Text(
    placeholder="Enter the url of your search",
    description="Query url:",
    disabled=False,
    layout=widgets.Layout(width="100%"),
)

text = widgets.Checkbox(value=False, description="Save full text", disabled=False)
pdf = widgets.Checkbox(value=False, description="Save PDFs", disabled=False)
image = widgets.Checkbox(
    value=False, description="Save articles as images", disabled=False
)

start_button = widgets.Button(
    description="Start harvest",
    disabled=False,
    button_style="primary",  # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Start harvest",
    icon="",
)
start_button.on_click(start_harvest)


# ## Enter your Trove API key
#
# The harvester gets its data from the Trove API. To use the API [you need a key](https://trove.nla.gov.au/about/create-something/using-api) — the process is quick, painless, and free. Once you have a key, paste it in below.

# In[ ]:


display(api_key)


# ## Enter your search query
#
# Use the [Trove web interface](https://trove.nla.gov.au/newspaper/) to construct your search. Remember that the harvester will get **all** of the matched results, not just the first 2,000 you see in the web interface. Once you're happy with your search, just copy the url and paste it below.

# In[ ]:


display(query_url)


# ## Set harvest options
#
# By default the harvester only saves the metadata (date, page, title, newspaper etc) from the search results. If you want to save the full text content of each article, just check the 'Save full text' box. You can also save JPEG and PDF copies of every article by checking the 'Save articles as images' or 'Save PDFs' boxes, but note that this will slow down your harvest and generate large download files.

# In[ ]:


display(text)
display(pdf)
display(image)


# In[ ]:


display(start_button)
display(out)


# Once your harvest is complete a link will appear to download the results as a single, zipped file. See the GLAM Workbench for more information about the [contents and format of the results folder](https://glam-workbench.net/trove-harvester/#your-harvested-data).

# In[ ]:


# TESTING
# Only runs in a development environment that also supplies an API key:
# pre-fills the form with a small sample query and clicks the button.
dev_api_key = os.getenv("TROVE_API_KEY")
if os.getenv("GW_STATUS") == "dev" and dev_api_key:
    api_key.value = dev_api_key
    query_url.value = "https://trove.nla.gov.au/search/category/newspapers?keyword=wragge%201902&l-artType=newspapers&l-state=Queensland&l-title=840"
    start_button.click()


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org) ([@wragge](https://twitter.com/wragge)) for the [GLAM Workbench](https://github.com/glam-workbench/).
# Support this project by [becoming a GitHub sponsor](https://github.com/sponsors/wragge?o=esb).