#!/usr/bin/env python
# coding: utf-8

# Notebook export: search the Internet Archive for matching volumes, then
# download every page that contains an illustration (detected via the
# blockType="Picture" entries in each volume's Abbyy OCR file).

# In[ ]:

# Import the IA Python library
import internetarchive as ia

# Replace placeholder strings with your IA credentials (leaving the quote marks)
ia_email = "YOUR_EMAIL_HERE"
ia_password = "YOUR_PASSWORD_HERE"

# add these credentials to the API's configuration object
ia.configure(ia_email, ia_password)


# In[ ]:

# the requests library installed through conda
import requests

# a few other imports from the Python standard library
import gzip
import os
import sys
import xml.etree.ElementTree as ET


# In[ ]:

# sample search (should yield two results)
query = "peter parley date:[1825 TO 1830] mediatype:texts"
vol_ids = [result['identifier'] for result in ia.search_items(query)]
vol_ids


# In[ ]:

# define a function for downloading pictures from a given IA volume
def ia_picture_download(item_id, out_dir=None):
    """
    :param item_id: unique Internet Archive volume identifier
    :param out_dir: destination for images; if None, no download
        Note: if supplied, out_dir must be inside an existing directory and
        the caller must have write permissions in that directory
    :rtype: list of pages with one or more blockType=Picture in Abbyy OCR
        data, or None if the volume has no Abbyy file
    """
    print("[{}] Starting processing".format(item_id))

    # Use command-line client to see available metadata formats:
    # `ia metadata formats VOLUME_ID`
    # for this lesson, only the Abbyy file is needed
    returned_files = list(ia.get_files(item_id, formats=["Abbyy GZ"]))

    # make sure something got returned
    if len(returned_files) > 0:
        abbyy_file = returned_files[0].name
    else:
        print("[{}] Could not get Abbyy file".format(item_id))
        return None

    # download the abbyy file to CWD
    ia.download(item_id, formats=["Abbyy GZ"], ignore_existing=True,
                destdir=os.getcwd(), no_directory=True)

    # collect the (0-based) indices of pages with at least one picture block
    img_pages = []
    with gzip.open(abbyy_file) as fp:
        tree = ET.parse(fp)
        document = tree.getroot()
        for i, page in enumerate(document):
            for block in page:
                try:
                    if block.attrib['blockType'] == 'Picture':
                        img_pages.append(i)
                        break  # one picture block is enough; move to next page
                except KeyError:
                    continue  # block has no blockType attribute

    # 0 is not a valid page for making GET requests to IA,
    # yet sometimes it's in the zipped Abbyy file
    img_pages = [page for page in img_pages if page > 0]

    # track for download progress report
    total_pages = len(img_pages)

    # OCR files are huge, so just delete once we have pagelist
    os.remove(abbyy_file)

    # if out_dir is not None, then also download page images
    if out_dir:
        # return if folder already exists (reasonable inference that
        # volume already processed)
        if os.path.isdir(out_dir):
            print("[{}] Directory already exists.".format(item_id))
            return img_pages

        # otherwise, create folder to put the images
        print("[{}] Making directory {}".format(item_id, out_dir))
        os.makedirs(out_dir)

        # https://iiif.archivelab.org/iiif/documentation
        urls = ["https://iiif.archivelab.org/iiif/{}${}/full/full/0/default.jpg".format(item_id, page)
                for page in img_pages]

        # no direct page download through API, DIY
        # BUG FIX: the original `zip(range(1, total_pages), img_pages, urls)`
        # produced only total_pages - 1 tuples, silently skipping the last
        # picture page, and its `i+1` progress report started at "2/N".
        # enumerate(..., start=1) visits every page and counts from 1.
        for i, (page, url) in enumerate(zip(img_pages, urls), start=1):
            rsp = requests.get(url, allow_redirects=True)
            if rsp.status_code == 200:
                print("[{}] Downloading page {} ({}/{})".format(item_id, page, i, total_pages))
                with open(os.path.join(out_dir, str(page) + ".jpg"), "wb") as fp:
                    fp.write(rsp.content)

    # return list of pages with 1+ picture blocks
    return img_pages


# In[ ]:

# loop over our search results and call the function
for item_id in vol_ids:
    destination = os.path.join("items", "internetarchive", item_id)
    img_pages = ia_picture_download(item_id, out_dir=destination)