#!/usr/bin/env python
# coding: utf-8

# # Data quality assessment of the Moving Image Archive dataset
#
# Created in October-December 2022 for the National Library of Scotland's Data Foundry by [Gustavo Candela, National Librarian’s Research Fellowship in Digital Scholarship 2022-23](https://data.nls.uk/projects/the-national-librarians-research-fellowship-in-digital-scholarship-2022-23/)

# ### About the Moving Image Archive Dataset
#
# This dataset represents the descriptive metadata from the Moving Image Archive catalogue, which is Scotland’s national collection of moving images.
#
# - Data format: metadata available as MARCXML and Dublin Core
# - Data source: https://data.nls.uk/data/metadata-collections/moving-image-archive/

# ### Table of contents
#
# - [Preparation](#Preparation)
# - [Loading the dataset](#Loading-the-dataset)
# - [Using the website to assess the data](#Using-the-website-to-assess-the-data)
# - [Let's explore the subjects](#Let's-explore-the-subjects)
# - [Let's explore the authors](#Let's-explore-the-authors)

# ### Citations
#
# - Candela, G., Sáez, M. D., Escobar, P., & Marco-Such, M. (2022). Reusing digital collections from GLAM institutions. Journal of Information Science, 48(2), 251–267. https://doi.org/10.1177/0165551520950246

# ### Preparation
#
# Import the libraries required to query the RDF dataset

# In[1]:


from rdflib import Graph


# ### Loading the dataset

# In[2]:


# Create a Graph from the enriched Turtle export of the dataset.
#
# NOTE(review): the queries below use the `dc:` and `schema:` prefixes without
# declaring them. rdflib resolves undeclared prefixes from the graph's
# namespace bindings, so they are expected to be bound once this file has been
# parsed -- confirm against the prefixes declared in the Turtle file.
g = Graph().parse("../rdf/datasetEnriched.ttl", format="turtle")


# ### Using the website to assess the data
#
# According to the [Moving Image Archive website](https://movingimage.nls.uk/search?subject=37), the current total number of records associated with the subject Agriculture is 410.
#
#
# The command line `grep` can be used to identify text in the dataset. It provides several parameters to configure the instruction such as `-n4` that retrieves 4 lines per occurrence.
#
# For example, `grep` can be used in order to identify the number of videos related to a subject (e.g., Agriculture). However, note that the word may appear in several fields, not only in the MARC 653 field.
#
# We can retrieve the number of occurrences using the command line `wc -l`

# ### Let's explore the subjects
#
# #### Let's check how many property dc:subject contains the text "Agriculture"

# In[3]:


print('##### check Agriculture subject:')

# For every dc:subject value mentioning "Agriculture", count the resources
# carrying it, most frequent value first.
q = """
SELECT ?subject (COUNT(?s) as ?count)
WHERE { ?s dc:subject ?subject .
        FILTER regex(str(?subject), "Agriculture") }
GROUP BY ?subject
ORDER BY DESC(?count)
"""

# Apply the query to the graph and iterate through results
for r in g.query(q):
    print(r["subject"], r["count"])


# #### Let's check the number of items containing the subject Gaelic

# In[4]:


print('##### total number of items containing a property dc:subject with the text Gaelic:')

# Count items, not matching subject values: an item with several subjects that
# mention "Gaelic" must only be counted once, hence COUNT(DISTINCT ?s).
# str(?subject) keeps the filter consistent with the Agriculture query above
# and lets it match subjects that are not plain string literals.
q = """
SELECT (COUNT(DISTINCT ?s) as ?total)
WHERE { ?s dc:subject ?subject .
        FILTER regex(str(?subject), "Gaelic") }
"""

# Apply the query to the graph and iterate through results
for r in g.query(q):
    print(r["total"])


# ### Let's explore the authors
#
# #### How many videos are associated with each author?

# In[6]:


# Authors linked to more than 100 distinct resources, most prolific first.
q = """
SELECT ?author (COUNT(distinct ?s) as ?count)
WHERE { ?s schema:author ?author }
GROUP BY ?author
HAVING (count(distinct ?s) > 100)
ORDER BY DESC(?count)
"""

# Materialise the result so the top author can be reused by the next cells.
prolific_authors = list(g.query(q))
for r in prolific_authors:
    print(r["author"], r["count"])


# #### Let's check how many resources are linked to a particular author

# In[7]:


# NOTE(review): the IRI of the author inspected by the next two queries did not
# survive in this copy of the notebook (the queries read `schema:author }` with
# no object). Until the intended IRI is restored, the most prolific author from
# the previous query is used as a stand-in. To inspect a specific author, set
# `author` to the corresponding rdflib term instead.
author = prolific_authors[0]["author"] if prolific_authors else None
print('##### author under inspection:', author)

# ?author is supplied through initBindings rather than pasted into the query
# text, so the query stays valid whatever characters the term contains.
q = """
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT (count(distinct ?s) as ?total)
WHERE { ?s schema:author ?author }
"""

# Apply the query to the graph and iterate through results
if author is None:
    print('No author with more than 100 resources was found.')
else:
    for r in g.query(q, initBindings={"author": author}):
        print(r["total"])


# In[8]:


# Names of the resources linked to the same author.
q = """
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?name
WHERE { ?s schema:author ?author .
        ?s schema:name ?name }
"""

# Apply the query to the graph and iterate through results
if author is None:
    print('No author with more than 100 resources was found.')
else:
    for r in g.query(q, initBindings={"author": author}):
        print(r["name"])


# In[ ]: