#!/usr/bin/env python
# coding: utf-8

# # Facets in DigitalNZ
#
# This notebook examines what data is available via facets in DigitalNZ.

# ## Import what we need

# In[6]:


import requests
import requests_cache
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import FileLinks, display
from pathlib import Path

# Cached session: repeated identical API calls are served from the local
# cache rather than re-hitting the DigitalNZ API.
s = requests_cache.CachedSession()


# In[ ]:


API_KEY = '[YOUR API KEY]'
API_URL = 'http://api.digitalnz.org/v3/records.json'


# ## Define some functions

# In[8]:


def get_records(params):
    '''
    Get records from a search using the supplied parameters.

    Parameters:
    * `params` – dict of query parameters passed straight to the records API

    Returns the decoded JSON response as a dict.
    '''
    response = s.get(API_URL, params=params)
    return response.json()


def check_facet(facet):
    '''
    Get values for the specified facet, return the total number of values & records,
    and save the complete set of values and counts as a CSV.

    Parameters:
    * `facet` – name of the facet to examine, eg 'category'

    Returns a dict with keys 'facet', 'num_values' and 'num_records'
    (just 'facet' if the name isn't a valid facet).
    '''
    facet_data = []
    # per_page=0 means we get no records, only the facet counts.
    # 350 is the maximum number of facet values returned per request.
    params = {
        'facets': [facet],
        'api_key': API_KEY,
        'per_page': 0,
        'facets_per_page': 350
    }
    data = get_records(params)
    try:
        facets = data['search']['facets'][facet]
    except KeyError:
        # The API response has no entry for this facet name.
        print('Not a facet!')
        facet_data = {'facet': facet}
    else:
        # If there are 350 facet values we probably hit the per-request cap,
        # so page through and harvest them all.
        if len(facets) == 350:
            facets = harvest_facet_values(facet)
        # Convert the facet data to a dataframe
        df = pd.DataFrame.from_dict(facets, orient='index').reset_index()
        df.columns = ['value', 'count']
        # Make sure the output directory exists before trying to save into it.
        Path('facets').mkdir(exist_ok=True)
        # Save all the values and counts as a CSV
        df.to_csv(Path('facets', f'{facet}.csv'), index=False)
        # Display summary details
        print(f'Number of values: {df.shape[0]:,}')
        print(f'Number of records: {df["count"].sum():,}')
        # Return summary details
        facet_data = {'facet': facet, 'num_values': df.shape[0], 'num_records': df['count'].sum()}
    return facet_data


def harvest_facet_values(facet, **kwargs):
    '''
    Harvest all the available values for the given facet.

    Parameters:
    * `facet` – name of the facet to harvest
    * `kwargs` – optional search filters; 'text' is passed as-is, any other
      key is sent as an `and[key][]` filter parameter

    Returns a dict mapping facet value -> record count.
    '''
    facets = {}
    more = True
    page = 1
    params = {
        'api_key': API_KEY,
        'per_page': 0,
        'facets': facet,
        'facets_per_page': 350,
    }
    for k, v in kwargs.items():
        if k == 'text':
            params[k] = v
        else:
            params[f'and[{k}][]'] = v
    with tqdm(leave=False) as pbar:
        # Keep requesting pages of facet values until an empty page comes back.
        while more:
            params['facets_page'] = page
            data = get_records(params)
            page_values = data['search']['facets'][facet]
            if page_values:
                facets.update(page_values)
                # Update by the number of values actually received, so the
                # progress bar isn't over-counted on the final partial page.
                pbar.update(len(page_values))
                page += 1
            else:
                more = False
    return facets


# ## Collect facet data

# The API docs say that the following facets are available via the API: `category`, `display_collection`, `creator`, `placename`, `year`, `decade`, `century`, `language`, `content_partner`, `rights`, `collection`. However, `display_collection` isn't available. It's also worth noting that the `collection` facet corresponds to the `collection_title` field.
#
# After a bit of poking around, I found that facets are also available for `usage`, `copyright`, `dc_type`, `format`, `subject`, and `primary_collection`.
#
# Let's gather values for each of the available facets.

# In[9]:


facets = [
    'category',
    'display_collection',
    'creator',
    'placename',
    'year',
    'decade',
    'century',
    'language',
    'content_partner',
    'rights',
    'collection',
    'usage',
    'copyright',
    'dc_type',
    'format',
    'subject',
    'primary_collection'
]

facet_data = []
for facet in facets:
    print(f'\n{facet}')
    facet_data.append(check_facet(facet))


# We now have a dataset that summarises the contents of each facet. If you look in the `facets` directory, you'll also find there's a CSV file containing all the values and counts for each facet.
#
# Let's look at the summary data.

# In[10]:


# Convert to a dataframe
df = pd.DataFrame(facet_data)
# Make sure counts are integers (invalid facets have no counts, hence the fillna)
df['num_values'] = df['num_values'].fillna(0.0).astype('int64')
df['num_records'] = df['num_records'].fillna(0.0).astype('int64')
df


# Let's save this dataset as a CSV.

# In[11]:


# Make sure the output directory exists before saving.
Path('facets').mkdir(exist_ok=True)
df.to_csv(Path('facets', 'facets.csv'), index=False)


# Let's list all the CSV files we've saved!
# In[12]:


display(FileLinks('facets', included_suffixes='.csv', recursive=False))


# ## Primary collections by Content Partner
#
# I'm not sure how strict the hierarchies are, but I'm assuming we should be able to connect content partners to collections.
#
# I've used the results of this to [visualise open collections](visualise_open_collections.ipynb) in DigitalNZ.

# In[17]:


partners = pd.read_csv(Path('facets', 'content_partner.csv'))


# In[ ]:


dfs = []
# For each content partner, harvest the primary_collection facet filtered
# to that partner, and tag the results with the partner's name.
for partner_row in partners.itertuples():
    partner_name = partner_row.value
    collection_counts = harvest_facet_values('primary_collection', content_partner=partner_name)
    partner_df = pd.DataFrame.from_dict(collection_counts, orient='index').reset_index()
    partner_df.columns = ['primary_collection', 'count']
    partner_df['content_partner'] = partner_name
    dfs.append(partner_df)


# In[15]:


# Combine the per-partner dataframes, put the columns in a sensible order,
# and sort for easy browsing.
df_collections = pd.concat(dfs)
df_collections = df_collections[['content_partner', 'primary_collection', 'count']].sort_values(by=['content_partner', 'primary_collection'])


# In[16]:


df_collections.to_csv(Path('facets', 'collections_by_partner.csv'), index=False)


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).