This notebook has been superseded by a brand new, super duper notebook that explores in much more detail how to harvest items from a search in RecordSearch. I've left this one here so as not to break any links, but please don't use it.
import time
import csv
import os
import math
import string
import requests
import pandas as pd
from slugify import slugify
# from PIL import Image, ImageOps
from requests import ConnectionError
from recordsearch_tools.utilities import retry
from recordsearch_tools.client import RSSearchClient, RSSeriesClient
from tinydb import TinyDB, Query
try:
    from io import BytesIO
except ImportError:
    # Python 2 fallback -- alias StringIO to the BytesIO name used below.
    # (The original import bound it as StringIO, leaving BytesIO undefined
    # on Python 2.)
    from StringIO import StringIO as BytesIO
from IPython.display import Image as DImage
from IPython.core.display import HTML
# Make sure data directory exists
os.makedirs('data/images', exist_ok=True)
# What series do you want to harvest?
# Insert the series id between the quotes.
series = 'A821'
output_dir = 'data'
class SeriesHarvester():
    """Harvest item metadata (and optionally digitised page images) for a
    single RecordSearch series, saving the results into a TinyDB database.

    Parameters:
        series: RecordSearch series identifier, eg 'A821'.
        control: optional control symbol prefix (eg 'A*') used to split up
            harvests of series with more than 20,000 items.
    """
    def __init__(self, series, control=None):
        self.series = series
        self.control = control
        self.total_pages = None
        self.pages_complete = 0
        self.client = RSSearchClient()
        self.prepare_harvest()
        # '/' isn't safe in filenames, so swap it for '-'
        self.db = TinyDB('data/db-{}.json'.format(self.series.replace('/', '-')))
        self.items = self.db.table('items')
        self.images = self.db.table('images')
    def get_total(self):
        """Return the total number of results for the prepared search."""
        return self.client.total_results
    def prepare_harvest(self):
        """Run an initial search to find the total number of results/pages."""
        if self.control:
            self.client.search(series=self.series, control=self.control)
        else:
            self.client.search(series=self.series)
        total_results = self.client.total_results
        print('{} items'.format(total_results))
        # ceil avoids requesting a trailing empty page when the total is an
        # exact multiple of the page size (floor(x/y) + 1 over-counted).
        self.total_pages = math.ceil(int(total_results) / self.client.results_per_page)
        print(self.total_pages)
    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def start_harvest(self, page=None):
        """Harvest every results page, upserting items into the db.

        Restartable: when no page is supplied it resumes after the last
        completed page, so a retry after a dropped connection picks up
        where it left off.
        """
        Record = Query()
        if not page:
            page = self.pages_complete + 1
        while self.pages_complete < self.total_pages:
            if self.control:
                response = self.client.search(series=self.series, page=page, control=self.control, sort='9')
            else:
                response = self.client.search(series=self.series, page=page, sort='9')
            for result in response['results']:
                # Use the item identifier as the unique key so re-harvests
                # update rather than duplicate records.
                self.items.upsert(result, Record.identifier == result['identifier'])
            self.pages_complete += 1
            page += 1
            print('{} pages complete'.format(self.pages_complete))
            # Be nice to the RecordSearch server
            time.sleep(1)
    @retry(ConnectionError, tries=20, delay=10, backoff=1)
    def harvest_images(self):
        """Download page images for every digitised item in the series.

        Images are saved under data/images/ and basic metadata (id, page
        number, dimensions) is recorded in the 'images' table.
        """
        # Pillow is needed to read image dimensions; imported here (rather
        # than at the top of the file, where it was commented out) so the
        # metadata harvester still works without Pillow installed.
        from PIL import Image
        Record = Query()
        items = self.items.search(Record.digitised_status == True)
        headers = {'User-Agent': 'Mozilla/5.0'}
        for item in items:
            directory = os.path.join('data', 'images', '{}/{}-[{}]'.format(self.series.replace('/', '-'), item['control_symbol'].replace('/', '-').replace(' ', '-'), item['identifier']))
            if not os.path.exists(directory):
                os.makedirs(directory)
            for page in range(1, item['digitised_pages'] + 1):
                filename = '{}/{}-p{}.jpg'.format(directory, item['identifier'], page)
                print('{}, p. {}'.format(item['identifier'], page))
                if not os.path.exists(filename):
                    img_url = 'http://recordsearch.naa.gov.au/NaaMedia/ShowImage.asp?B={}&S={}&T=P'.format(item['identifier'], page)
                    # NOTE(review): verify=False disables SSL certificate
                    # checks -- presumably a workaround for the NAA server;
                    # confirm before removing.
                    response = requests.get(img_url, headers=headers, stream=True, verify=False)
                    response.raise_for_status()
                    try:
                        image = Image.open(BytesIO(response.content))
                    except IOError:
                        # Not a valid image: skip it. (The original code
                        # fell through and recorded stale or undefined
                        # width/height values here.)
                        print('Not an image')
                    else:
                        width, height = image.size
                        image.save(filename)
                        image_meta = {
                            'image_id': '{}-{}'.format(item['identifier'], page),
                            'identifier': item['identifier'],
                            'page': page,
                            'width': width,
                            'height': height
                        }
                        self.images.upsert(image_meta, Record.image_id == image_meta['image_id'])
                        print('Image saved')
                    del response
                    time.sleep(1)
def harvest_series(series):
    """Harvest all item metadata for the given series."""
    harvester = SeriesHarvester(series=series)
    harvester.start_harvest()
def harvest_images(series):
    """Download the digitised page images for every item in the given series."""
    harvester = SeriesHarvester(series=series)
    harvester.harvest_images()
# Harvest the metadata!
harvest_series(series)
# Harvest digitised pages
harvest_images(series)
# Let's see how many items we've harvested.
# Replace '/' the same way SeriesHarvester does when it creates the db,
# otherwise series ids containing a slash point at the wrong filename.
db = TinyDB('data/db-{}.json'.format(series.replace('/', '-')))
items = db.table('items')
len(items)
def convert_to_df(series):
    '''
    Get the series data from TinyDB and return it as a Pandas dataframe.
    Also flattens the date dictionary, and does a bit of ordering.
    '''
    # Load the series db -- mirror the '/' replacement used when the db
    # was created by SeriesHarvester
    db = TinyDB('data/db-{}.json'.format(series.replace('/', '-')))
    items = db.table('items')
    # Let's convert the database into a simple list
    item_list = [i for i in items]
    # Now let's turn that list into a Pandas Dataframe
    df = pd.DataFrame(item_list)
    # The 'contents_dates' column is a dictionary, we need to flatten this
    # out so we can easily work with the values.
    # (Series.iteritems was removed in pandas 2.0 -- use items() instead.)
    df = pd.concat([df, pd.DataFrame((d for idx, d in df['contents_dates'].items()))], axis=1)
    # Delete the old date field
    del df['contents_dates']
    # Rename column
    df.rename({'date_str': 'contents_dates'}, axis=1, inplace=True)
    # Put columns in preferred order
    df = df[['identifier', 'series', 'control_symbol', 'title', 'contents_dates', 'start_date', 'end_date', 'access_status', 'location', 'digitised_status', 'digitised_pages']]
    # sort_values returns a new frame; the original discarded the result,
    # so the rows were never actually sorted
    df = df.sort_values(['identifier'])
    return df
def save_as_csv(series):
    '''
    Converts harvested data in TinyDB to a CSV file, via a Pandas dataframe.
    '''
    csv_file = 'data/{}.csv'.format(series.replace('/', '-'))
    convert_to_df(series).to_csv(csv_file, index=False)
# Save the harvested metadata as a CSV file in the data directory
save_as_csv(series)
Once you've saved a harvest as a CSV file, you can download it from the workbench data directory.
def harvest_large_series(series, control_range=None):
    '''
    RecordSearch will not return more than 20,000 results.
    If a series has more than 20,000 items you'll need to break it up.
    The easiest way to do this is to add a param for control_symbol.
    This function breaks a series harvest down into a series of harvests --
    using each letter and number with a wildcard as the control_symbol parameter.
    This should be enough to harvest most large series, but in some cases
    you might need to supply a custom list of control_symbol prefixes.
    '''
    if not control_range:
        # 'A*'..'Z*' followed by '0*'..'9*'
        control_range = [c + '*' for c in string.ascii_uppercase + string.digits]
    for control in control_range:
        print(control)
        h = SeriesHarvester(series=series, control=control)
        h.start_harvest()
# Harvest a large series using the default control range
harvest_large_series('B13')
# For series like A1 that use the year as the control symbol prefix, this range should work.
# '2*'-'9*' catches non-year prefixes, '10*'-'18*' covers the 1800s, and
# '190*'-'199*' covers the 1900s. (The original range started the last
# group at 1, omitting '190*' and so skipping 1900-1909 prefixes.)
control_range = [str(num) + '*' for num in range(2, 10)] + ['1{}*'.format(num) for num in range(0, 9)] + ['19{}*'.format(num) for num in range(0, 10)]
# Use custom range to harvest a large series
harvest_large_series('A1', control_range)
# Harvest a batch of series in one go.
# NOTE(review): these go through the plain harvester -- any series with
# more than 20,000 items would need harvest_large_series instead.
series_list = ['A6119', 'A6122', 'A6126', 'A9626', 'A6335', 'B2836', 'A8703', 'A13828', 'A6281', 'A6285', 'A6283', 'A6282', 'A6126', 'A9106', 'A9108', 'A9105', 'A12694', 'D1902', 'D1915']
for series in series_list:
    harvest_series(series)