#!/usr/bin/env python
# coding: utf-8

# # Getting some top-level data from the DigitalNZ API
#
# This notebook pokes around at the top level of DigitalNZ, mainly using facets.
#
# See the [API documentation](https://digitalnz.org/developers/api-docs-v3) for more detailed information.
#
#

# If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!

# #

# Some tips: #

#

#
# In[1]:


import requests
import pandas as pd
import altair as alt
from IPython.display import display, HTML


# [Get yourself an API key](https://digitalnz.org/developers/getting-started) and paste it between the quotes below.

# In[ ]:


api_key = '[YOUR API KEY]'
print('Your API key is: {}'.format(api_key))


# In[3]:


# Base url for queries
# NOTE(review): this is a plain-http URL, so the api_key is sent unencrypted --
# confirm whether the https endpoint can be used instead.
api_search_url = 'http://api.digitalnz.org/v3/records.json'

# Seconds to wait for the API before giving up, so a stalled connection
# cannot hang a cell forever.
REQUEST_TIMEOUT = 60


# Set up the query params (we'll change these later)
# Let's start with an empty text query to look at everything
def set_params():
    '''
    Build the starting query parameters.

    Returns:
        dict: the module-level ``api_key`` plus an empty ``text`` query.
    '''
    params = {
        'api_key': api_key,
        'text': ''
    }
    return params


# In[4]:


def get_data(params):
    '''
    Retrieve an API query and extract the JSON payload.

    Parameters:
        params (dict): query-string parameters sent to ``api_search_url``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``REQUEST_TIMEOUT``.
    '''
    response = requests.get(api_search_url, params=params, timeout=REQUEST_TIMEOUT)
    # Stop here with the HTTP status instead of handing an error payload
    # to the cells below as if it were a search result.
    response.raise_for_status()
    return response.json()


def harvest_facets(params, facet_name):
    '''
    Collect every value of a facet by requesting page after page.

    Parameters:
        params (dict): query parameters; ``facets`` and ``facets_per_page``
            must already be set. ``facets_page`` is reset to 1 and then
            incremented in place, so on return it holds the number of the
            first empty page.
        facet_name (str): key to read under ``data['search']['facets']``.

    Returns:
        dict: the facet pages merged with ``dict.update``, in the order received.
    '''
    harvested = {}
    params['facets_page'] = 1
    while True:
        data = get_data(params)
        facets = data['search']['facets'][facet_name]
        # An empty page means we have run off the end of the facet list
        if not facets:
            break
        harvested.update(facets)
        params['facets_page'] += 1
    return harvested


# ## Hello world!

# In[5]:


# How many items are there?
params = set_params()
data = get_data(params)
print(' There are {:,} items'.format(data['search']['result_count']))


# ## Items by century

# In[6]:


params['facets'] = 'century'
data = get_data(params)


# In[7]:


centuries = data['search']['facets']['century']
centuries_df = pd.Series(centuries).to_frame().reset_index()
centuries_df.columns = ['century', 'count']
centuries_df


# In[8]:


# Same data twice: linear scale on the left, log scale on the right
c1 = alt.Chart(centuries_df).mark_bar().encode(
    x='century:O',
    y='count:Q',
    tooltip=alt.Tooltip('count', format=',')
)

c2 = alt.Chart(centuries_df).mark_bar().encode(
    x='century:O',
    y=alt.Y('count:Q', scale=alt.Scale(type='log')),
    tooltip=alt.Tooltip('count', format=',')
)

c1 | c2


# ## Items by decade

# In[9]:


params['facets'] = 'decade'
params['facets_per_page'] = 25
data = get_data(params)


# In[10]:


decades = data['search']['facets']['decade']
decades_df = pd.Series(decades).to_frame().reset_index()
decades_df.columns = ['decade', 'count']
decades_df.head()


# In[11]:


alt.Chart(decades_df).mark_bar().encode(
    x='decade:O',
    y='count:Q',
    tooltip=alt.Tooltip('count', format=',')
)


# ## Top 25 collections

# In[12]:


params['facets'] = 'display_collection'
# Ask for 26 because the first row is dropped from the chart below
params['facets_per_page'] = 26
data = get_data(params)


# In[13]:


# Note that the facet is called 'primary_collection' in the results!
collections = data['search']['facets']['primary_collection']
collections_df = pd.Series(collections).to_frame().reset_index()
collections_df.columns = ['collection', 'count']
collections_df.head()


# Papers Past is so much bigger than anything else, let's exclude it from the chart.

# In[14]:


alt.Chart(collections_df[1:]).mark_bar().encode(
    x=alt.X('count:Q'),
    y=alt.Y('collection:N'),
    tooltip=alt.Tooltip('count', format=',')
)


# ## Create a dataset of all collections

# In[15]:


params['facets'] = 'display_collection'
params['facets_per_page'] = 100
# The facet is requested as 'display_collection' but returned as 'primary_collection'
all_collections = harvest_facets(params, 'primary_collection')


# In[16]:


all_collections_df = pd.Series(all_collections).to_frame().reset_index()
all_collections_df.columns = ['collection', 'count']
all_collections_df.head()


# In[17]:


all_collections_df.to_csv('digitalnz_collections.csv', index=False)
# Link to the file just written; bare text would give the reader nothing to click
display(HTML('<a href="digitalnz_collections.csv" download>Download CSV file</a>'))


# ## Top 25 newspapers in Papers Past

# In[18]:


params['facets'] = 'collection'
params['and[display_collection][]'] = 'Papers Past'
params['facets_per_page'] = 26
params['facets_page'] = 1
data = get_data(params)


# In[19]:


newspapers = data['search']['facets']['collection']
newspapers_df = pd.Series(newspapers).to_frame().reset_index()
newspapers_df.columns = ['newspaper', 'count']
newspapers_df.head()


# In[20]:


# NOTE(review): the first row is skipped, as in the collections chart --
# presumably it is an aggregate 'Papers Past' entry; confirm against the data.
alt.Chart(newspapers_df[1:]).mark_bar().encode(
    x=alt.X('count:Q'),
    y=alt.Y('newspaper:N'),
    tooltip=alt.Tooltip('count', format=',')
)


# ## All newspapers in Papers Past

# In[21]:


params['facets'] = 'collection'
params['and[display_collection][]'] = 'Papers Past'
params['facets_per_page'] = 100
all_newspapers = harvest_facets(params, 'collection')


# In[22]:


all_newspapers_df = pd.Series(all_newspapers).to_frame().reset_index()
all_newspapers_df.columns = ['newspaper', 'count']
all_newspapers_df.head()


# In[23]:


# NOTE(review): the first row is left out of the CSV, mirroring the chart above -- confirm this is intended.
all_newspapers_df[1:].to_csv('paperspast_newspapers.csv', index=False)
# Link to the file just written; bare text would give the reader nothing to click
display(HTML('<a href="paperspast_newspapers.csv" download>Download CSV file</a>'))


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).