The Ghent University Library provides open access and open data programmes to enhance access to research. They produce high-resolution scans of historic documents, print journals and promote open access for academic publications.
This notebook shows how to explore the repository: how to read a metadata record, obtain the full text and create a CSV dataset.
The content used in this notebook is based on La Russie illustrée, a periodical with 15 volumes and 748 issues. The digital content can be retrieved here.
Additional information about the collection is accessible here.
import requests, csv
import json
import pandas as pd
import matplotlib.pyplot as plt
In this section, we select the item that we want to use by providing its manifest URI.
# URI of the IIIF collection whose metadata is fetched here.
manifestUrl = 'https://adore.ugent.be/IIIF/collections/RUG01-001643403'
# A timeout keeps the notebook from hanging forever on an unresponsive server,
# and raise_for_status() surfaces HTTP errors right here instead of as a
# confusing JSON decoding failure further down.
responseManifest = requests.get(manifestUrl, timeout=30)
responseManifest.raise_for_status()
# Show the URL of the response that was received.
print(responseManifest.url)
# Decode the JSON body of the collection response into a Python object.
m = json.loads(responseManifest.text)
# Print the collection-level descriptive fields, one per line.
for field in ('label', 'attribution'):
    print(field + ':' + m[field])
# List the '@id' of every entry under the collection's 'manifests' key.
for entry in m['manifests']:
    print(entry['@id'])
# Harvest one CSV row per entry in the collection's 'manifests' list.
# The file is opened in a ``with`` block so that it is flushed and closed
# before anything reads it back. newline='' is what the csv module expects for
# files it writes, and the explicit UTF-8 encoding keeps non-ASCII text intact
# regardless of the platform's default encoding.
with open('gent_records.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_out = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_out.writerow(['title', 'label', 'date', 'thumbnail', 'publisher', 'attribution', 'provenance', 'manifestItemUrl'])

    for i in m['manifests']:
        # These three come from optional metadata entries, so reset them for
        # every item; the remaining columns are always reassigned below.
        title = publisher = provenance = ''

        manifestItemUrl = i['@id']
        # Fail fast on network problems instead of hanging or parsing an
        # error page as JSON.
        responseManifestItem = requests.get(manifestItemUrl, timeout=30)
        responseManifestItem.raise_for_status()

        # retrieving the metadata
        manifestItem = json.loads(responseManifestItem.text)

        date = manifestItem['navDate']
        attribution = manifestItem['attribution']
        label = manifestItem['label']
        thumbnail = manifestItem['thumbnail']['@id']

        for metadata in manifestItem['metadata']:
            if metadata['label'] == 'Title' and not title:  # keep only the first title
                title = metadata['value']
            elif metadata['label'] == 'Publisher':
                publisher = metadata['value']
            elif metadata['label'] == 'Provenance':
                provenance = metadata['value']

        # Progress output: one line per harvested item.
        print(label + " " + thumbnail)
        csv_out.writerow([title, label, date, thumbnail, publisher, attribution, provenance, manifestItemUrl])
# Read the local CSV file of harvested records into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='gent_records.csv')
# Bare expression: shown as the cell output when run in a notebook.
df
# How many images? Count the non-missing values in the 'thumbnail' column.
df.loc[:, 'thumbnail'].count()
This chart shows the number of resources by year.
# Derive a 'year' column from the values in the 'date' column.
date_index = pd.DatetimeIndex(df['date'])
df['year'] = date_index.year

# Draw a bar chart with one bar per distinct year; the bar height is the
# number of rows that fall in that year.
year_counts = df['year'].value_counts()
ax = year_counts.plot.bar(figsize=(14, 8), title="Number of resources per date")
ax.set_xlabel("Dates")
ax.set_ylabel("Resources")
plt.show()
Once we have queried the repository and we have the metadata as a CSV file, let's show the results as a thumbnail gallery.
from IPython.display import HTML, Image
def _src_from_data(data):
    """Build a ``data:`` URI from image bytes for use as an HTML img source.

    The bytes are wrapped in an IPython ``Image`` and every mapping yielded by
    its ``_repr_mimebundle_()`` is scanned; the first entry whose MIME type
    starts with ``image/`` is formatted as ``data:<mimetype>;base64,<value>``.
    Returns ``None`` when no such entry is found.
    """
    bundle_parts = Image(data=data)._repr_mimebundle_()
    image_entries = (
        (mimetype, payload)
        for part in bundle_parts
        for mimetype, payload in part.items()
        if mimetype.startswith('image/')
    )
    first_match = next(image_entries, None)
    if first_match is None:
        return None
    mimetype, payload = first_match
    return f'data:{mimetype};base64,{payload}'
def gallery(images, row_height='auto'):
    """Shows a set of images in a gallery that flexes with the width of the notebook.

    Parameters
    ----------
    images: list of str or bytes
        URLs or bytes of images to display. Bytes are converted to a
        ``data:`` URI via ``_src_from_data``; anything else is used as the
        image source as-is.

    row_height: str
        CSS height value to assign to all images. Set to 'auto' by default to show images
        with their native dimensions. Set to a value like '250px' to make all rows
        in the gallery equal height.

    Returns
    -------
    The object produced by ``HTML(data=...)`` for a flex container holding one
    ``<figure>`` per image.
    """
    # Local import keeps this function self-contained.
    from html import escape

    figures = []
    for image in images:
        if isinstance(image, bytes):
            src = _src_from_data(image)
        else:
            src = image
        # Escape the source before placing it inside the quoted attribute so a
        # URL containing '"', '<' or '&' cannot break out of the markup.
        safe_src = escape(str(src), quote=True)
        figures.append(f'''
<figure style="margin: 5px !important;">
<img src="{safe_src}" style="height: {row_height}">
</figure>
''')
    figures_html = ''.join(figures)
    return HTML(data=f'''
<div style="display: flex; flex-flow: row wrap; text-align: center;">
{figures_html}
</div>
''')
# Render every value of the 'thumbnail' column as a gallery with 150px-high images.
gallery(images=df['thumbnail'], row_height='150px')