Created in October-December 2022 for the National Library of Scotland's Data Foundry by Gustavo Candela, National Librarian’s Research Fellowship in Digital Scholarship 2022-23
This dataset represents the descriptive metadata from the Moving Image Archive catalogue, which is Scotland’s national collection of moving images.
Import the libraries required to extract the information from MARCXML to a CSV file:
import pymarc, re, csv
from pymarc import parse_xml_to_array
To extract the metadata we'll mainly use Pymarc, a Python 3 library for working with bibliographic data encoded in MARC21. The metadata will be stored in a CSV (comma-separated values) text file.
*Note: If you'd like to reuse this code for other MARC datasets, you may have to adapt it to retrieve additional and/or different MARC fields, depending on how the metadata is defined.*
# Convert the Moving Image Archive MARCXML export into a flat CSV file.
#
# Every MARC record becomes one CSV row. Repeatable fields that are packed
# into a single cell are joined with ' -- '. All look-ups go through
# ``get_fields`` / ``get_subfields`` (which return empty lists for missing
# data) rather than ``record[tag][code]``, which -- depending on the pymarc
# release -- returns None or raises KeyError when a tag or subfield is absent.

MARCXML_PATH = '../data/movingImageArchive/Moving-Image-Archive-dataset-MARC.xml'
CSV_PATH = '../data/output/movingImageArchive.csv'

# Separator used when several values share one CSV cell.
VALUE_SEPARATOR = ' -- '

# Column order of the output file; _record_to_row() must follow it.
CSV_HEADER = [
    'title', 'author', 'authorOrganisation', 'author720', 'place_publication',
    'date', 'extent', 'credits', 'subjects',
    'summary', 'details', 'link', 'geographicNames',
    'contentType', 'mediaType', 'carrierType', 'generalNote', 'thumbnail',
]


def _first_subfield(field, code):
    """Return the first ``code`` subfield of ``field``, or '' if there is none."""
    if field is None:
        return ''
    values = field.get_subfields(code)
    return values[0] if values else ''


def _first_field(record, tag):
    """Return the first ``tag`` field of ``record``, or None if there is none."""
    fields = record.get_fields(tag)
    return fields[0] if fields else None


def _join_first_subfields(fields, clean=False):
    """Join the first $a of each field with VALUE_SEPARATOR.

    With ``clean=True`` each value is stripped and embedded newlines are
    replaced by spaces. Fields without a (non-empty) $a are skipped.
    """
    values = []
    for field in fields:
        value = _first_subfield(field, 'a')
        if clean:
            value = value.strip().replace("\n", " ")
        if value:
            values.append(value)
    return VALUE_SEPARATOR.join(values)


def _term_with_source(record, tag):
    """Return '<$a> -- <$2>' for the last ``tag`` field (336/337/338)."""
    result = ''
    for field in record.get_fields(tag):
        parts = (_first_subfield(field, 'a'), _first_subfield(field, '2'))
        result = VALUE_SEPARATOR.join(part for part in parts if part)
    return result


def _record_to_row(record):
    """Extract the CSV_HEADER columns from one pymarc record as a list of str."""
    # 245: title proper ($a) plus remainder of title ($b).
    title_field = _first_field(record, '245')
    title_parts = (_first_subfield(title_field, 'a'),
                   _first_subfield(title_field, 'b'))
    title = " ".join(part for part in title_parts if part).strip()

    # 264: place of publication from the first field; date from the last
    # field that carries a $c, without its trailing full stop.
    place_publication = _first_subfield(_first_field(record, '264'), 'a')
    date = ''
    for field in record.get_fields('264'):
        value = _first_subfield(field, 'c')
        if value:
            date = value
    if date.endswith('.'):
        date = date[:-1]

    # 300: physical description -- extent ($a) and other details ($b).
    # A missing subfield leaves the cell empty instead of writing "[]".
    # TODO cleaning
    extent = details = ''
    for field in record.get_fields('300'):
        extent = _first_subfield(field, 'a') or extent
        details = _first_subfield(field, 'b') or details

    # 500: general notes. All $a values are joined into one string; the
    # previous code wrote the Python list repr of the last field's values.
    notes = []
    for field in record.get_fields('500'):
        notes.extend(field.get_subfields('a'))
    general_note = VALUE_SEPARATOR.join(notes)

    # 653: uncontrolled index terms, split by second indicator
    # ('0' -> subjects, '5' -> geographic names).
    index_terms = record.get_fields('653')
    subjects = _join_first_subfields(
        [f for f in index_terms if f.indicator2 == '0'])
    geographic_names = _join_first_subfields(
        [f for f in index_terms if f.indicator2 == '5'])

    return [
        title,
        _join_first_subfields(record.get_fields('700'), clean=True),  # author
        _join_first_subfields(record.get_fields('710'), clean=True),  # authorOrganisation
        _join_first_subfields(record.get_fields('720'), clean=True),  # author720
        place_publication,
        date,
        extent,
        _join_first_subfields(record.get_fields('508'), clean=True),  # credits
        subjects,
        _first_subfield(_first_field(record, '520'), 'a').strip(),    # summary
        details,
        _first_subfield(_first_field(record, '856'), 'u'),            # link
        geographic_names,
        _term_with_source(record, '336'),                             # contentType
        _term_with_source(record, '337'),                             # mediaType
        _term_with_source(record, '338'),                             # carrierType
        general_note,
        _first_subfield(_first_field(record, '859'), 'u'),            # thumbnail
    ]


# Parse first so a broken input file does not truncate an existing CSV.
# Passing the path (instead of an open handle) lets the parser manage the
# file itself, so no handle is left unclosed.
records = parse_xml_to_array(MARCXML_PATH)

# newline='' is required by the csv module to avoid blank rows on Windows;
# an explicit encoding keeps the output independent of the system locale.
with open(CSV_PATH, 'w', newline='', encoding='utf-8') as csv_file:
    csv_output = csv.writer(csv_file, delimiter=',', quotechar='"',
                            quoting=csv.QUOTE_MINIMAL)
    csv_output.writerow(CSV_HEADER)
    for record in records:
        csv_output.writerow(_record_to_row(record))