Notebook

In [ ]:

# Standard library: used to read credentials from the environment
import os

# Import the HT Data API wrapper
from hathitrust_api import DataAPI

# Credentials are read from the HT_ACCESS_KEY / HT_SECRET_KEY environment
# variables when they are set, so real keys never have to be saved in the
# notebook. If the variables are not set, the placeholder strings are used:
# replace them with your HT credentials (leaving the quote marks).
ht_access_key = os.environ.get("HT_ACCESS_KEY", "YOUR_ACCESS_KEY_HERE")
ht_secret_key = os.environ.get("HT_SECRET_KEY", "YOUR_SECRET_KEY_HERE")

# instantiate the Data API connection object
data_api = DataAPI(ht_access_key, ht_secret_key)

In [ ]:

# assorted imports from Python standard library
import json
import os
import time

In [ ]:

# Path to the JSON metadata file downloaded from HT (HathiTrust).
# It is a relative path, so it is resolved against the current working directory.
metadata_path = "554050894-1535834127.json"

In [ ]:

# the preferred syntax for opening/closing files in Python: the "with" block
# closes the file automatically. JSON text is UTF-8 (RFC 8259), so the
# encoding is given explicitly instead of relying on the platform default.
with open(metadata_path, "r", encoding="utf-8") as fp:
    data = json.load(fp)

# the last line in a cell is always executed and its return value displayed
data.keys()

In [ ]:

# the 'gathers' field is what actually contains the list of volumes in the collection;
# as the last line of the cell, its value is displayed
data['gathers']

In [ ]:

# pull the 'htitem_id' out of every entry under 'gathers',
# keeping only the volume ids
vol_ids = [
    volume['htitem_id']
    for volume in data['gathers']
]

# display the resulting list
vol_ids

In [ ]:

def ht_picture_download(item_id, out_dir=None, delay=2):
    """
    Find the pages of a HathiTrust volume that carry the IMAGE_ON_PAGE
    feature and, optionally, download the image of each such page.

    Uses the module-level ``data_api`` connection object.

    :param item_id: unique HathiTrust volume identifier
    :param out_dir: destination for images; if None (or empty), no download
    :param delay: seconds to pause after each image request (default 2)

    Note: if supplied, out_dir is created by this function (together with
    any missing parent directories), so the caller needs write permission
    where it is created. If out_dir already exists, the volume is assumed
    to have been processed already and nothing is downloaded; remove the
    directory to force a fresh download.

    A failure on an individual page is reported and skipped; it does not
    abort the remaining pages. A KeyError/IndexError propagates if the
    metadata lacks the 'htd:seqmap' / 'htd:seq' structure.

    :rtype list of pages with IMAGE_ON_PAGE feature
    """

    print("[{}] Starting processing".format(item_id))

    # metadata from API in json format (different than HT collection metadata)
    meta = json.loads(data_api.getmeta(item_id, json=True))

    # sequence gets us each page of the PDF in order, with any
    # additional information that might be available for it
    sequence = meta['htd:seqmap'][0]['htd:seq']

    # list of pages with pictures (empty to start)
    img_pages = []

    # Pages that cannot be inspected are skipped:
    #   KeyError / TypeError - no usable 'htd:pfeat' (or 'pseq') entry
    #   ValueError           - 'pseq' is a string that int() cannot parse
    for page in sequence:
        try:
            if 'IMAGE_ON_PAGE' in page['htd:pfeat']:
                img_pages.append(int(page['pseq']))
        except (KeyError, TypeError, ValueError):
            continue

    # without a destination there is nothing to download
    if not out_dir:
        return img_pages

    # return if folder already exists (reasonable inference that volume already processed)
    if os.path.isdir(out_dir):
        print("[{}] Directory already exists.".format(item_id))
        return img_pages

    # otherwise, create folder to put the images
    print("[{}] Making directory {}".format(item_id, out_dir))
    os.makedirs(out_dir)

    # track for download progress report
    total_pages = len(img_pages)

    for count, page in enumerate(img_pages, start=1):
        # simple status message
        print("[{}] Downloading page {} ({}/{})".format(item_id, page, count, total_pages))

        try:
            img = data_api.getpageimage(item_id, page)

            img_out = os.path.join(out_dir, str(page) + ".jpg")

            # write out the image
            with open(img_out, 'wb') as fp:
                fp.write(img)

        except Exception as e:
            # best effort: report the failure and carry on with the next page
            print("[{}] Error downloading page {}: {}".format(item_id, page,e))

        # to avoid exceeding the allowed API usage, we take a quick break
        # before requesting the next image. This sits outside the try block
        # so the pause also happens after a failed request.
        time.sleep(delay)

    # return the list of image pages
    return img_pages

In [ ]:

# every volume gets its own folder under items/hathitrust
base_dir = os.path.join("items", "hathitrust")

# walk through the volumes in our collection, downloading each one's picture pages
for item_id in vol_ids:
    destination = os.path.join(base_dir, item_id)
    ht_picture_download(item_id, out_dir=destination)