def ht_picture_download(item_id, out_dir=None):
    """
    Find the pages of a volume flagged IMAGE_ON_PAGE and optionally
    download each of those page images.

    Relies on the module-level ``data_api`` client for both the volume
    metadata and the page images.

    :param item_id: unique HathiTrust volume identifier
    :param out_dir: destination for images; if None (or empty), no download.
        Note: if supplied, out_dir must NOT already exist. It is created
        here with os.makedirs. An existing directory is taken to mean the
        volume was already processed, so nothing is downloaded.
    :rtype: list of int page sequence numbers with the IMAGE_ON_PAGE feature
    """
    print("[{}] Starting processing".format(item_id))

    # metadata from API in json format (different than HT collection metadata)
    meta = json.loads(data_api.getmeta(item_id, json=True))

    # sequence gets us each page of the PDF in order, with any
    # additional information that might be available for it
    sequence = meta['htd:seqmap'][0]['htd:seq']

    # list of pages with pictures (empty to start)
    img_pages = []

    for page in sequence:
        # Skip pages that have no "pfeat" entry (KeyError), whose entry is
        # not a container (TypeError), or whose sequence number is not
        # numeric: int() raises ValueError for a non-numeric string and
        # TypeError for None.
        try:
            if 'IMAGE_ON_PAGE' in page['htd:pfeat']:
                img_pages.append(int(page['pseq']))
        except (KeyError, TypeError, ValueError):
            continue

    # metadata-only mode: nothing to download
    if not out_dir:
        return img_pages

    # return if folder already exists (reasonable inference that the
    # volume was already processed)
    if os.path.isdir(out_dir):
        print("[{}] Directory already exists.".format(item_id))
        return img_pages

    # otherwise, create folder to put the images
    print("[{}] Making directory {}".format(item_id, out_dir))
    os.makedirs(out_dir)

    # track for download progress report
    total_pages = len(img_pages)

    for i, page in enumerate(img_pages):
        # simple status message
        print("[{}] Downloading page {} ({}/{})".format(item_id, page, i+1, total_pages))
        try:
            img = data_api.getpageimage(item_id, page)
            img_out = os.path.join(out_dir, str(page) + ".jpg")
            # write out the image
            with open(img_out, 'wb') as fp:
                fp.write(img)
        except Exception as e:
            # best effort: report the failure and move on to the next page
            print("[{}] Error downloading page {}: {}".format(item_id, page,e))
        else:
            # to avoid exceeding the allowed API usage, we take a quick
            # two-second break before requesting the next image
            time.sleep(2)

    # return the list of image pages
    return img_pages