#!/usr/bin/env python
# coding: utf-8

# # Facets in DigitalNZ
#
# This notebook examines what data is available via facets in DigitalNZ.

# ## Import what we need

# In[6]:


import requests
import requests_cache
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import FileLinks, display
from pathlib import Path

# Cached session: repeated identical API calls are served from the local
# cache rather than re-hitting the DigitalNZ API.
s = requests_cache.CachedSession()


# In[ ]:


API_KEY = '[YOUR API KEY]'
API_URL = 'http://api.digitalnz.org/v3/records.json'


# ## Define some functions

# In[8]:


def get_records(params):
    '''
    Get records from a search using the supplied parameters.

    Parameters:
    * `params` – dict of query parameters passed straight to the records API

    Returns the decoded JSON response as a dict.
    '''
    response = s.get(API_URL, params=params)
    return response.json()


def check_facet(facet):
    '''
    Get values for the specified facet, return the total number of values & records,
    and save the complete set of values and counts as a CSV.

    Parameters:
    * `facet` – name of the facet to examine, eg 'category'

    Returns a dict with keys 'facet', 'num_values' and 'num_records'
    (just 'facet' if the name isn't a valid facet).
    '''
    facet_data = []
    # per_page=0 means we get no records, only the facet counts.
    # 350 is the maximum number of facet values returned per request.
    params = {
        'facets': [facet],
        'api_key': API_KEY,
        'per_page': 0,
        'facets_per_page': 350
    }
    data = get_records(params)
    try:
        facets = data['search']['facets'][facet]
    except KeyError:
        # The API response has no entry for this facet name.
        print('Not a facet!')
        facet_data = {'facet': facet}
    else:
        # If there are 350 facet values we probably hit the per-request cap,
        # so page through and harvest them all.
        if len(facets) == 350:
            facets = harvest_facet_values(facet)
        # Convert the facet data to a dataframe
        df = pd.DataFrame.from_dict(facets, orient='index').reset_index()
        df.columns = ['value', 'count']
        # Make sure the output directory exists before trying to save into it.
        Path('facets').mkdir(exist_ok=True)
        # Save all the values and counts as a CSV
        df.to_csv(Path('facets', f'{facet}.csv'), index=False)
        # Display summary details
        print(f'Number of values: {df.shape[0]:,}')
        print(f'Number of records: {df["count"].sum():,}')
        # Return summary details
        facet_data = {'facet': facet, 'num_values': df.shape[0], 'num_records': df['count'].sum()}
    return facet_data


def harvest_facet_values(facet, **kwargs):
    '''
    Harvest all the available values for the given facet.

    Parameters:
    * `facet` – name of the facet to harvest
    * `kwargs` – optional search filters; 'text' is passed as-is, any other
      key is sent as an `and[key][]` filter parameter

    Returns a dict mapping facet value -> record count.
    '''
    facets = {}
    more = True
    page = 1
    params = {
        'api_key': API_KEY,
        'per_page': 0,
        'facets': facet,
        'facets_per_page': 350,
    }
    for k, v in kwargs.items():
        if k == 'text':
            params[k] = v
        else:
            params[f'and[{k}][]'] = v
    with tqdm(leave=False) as pbar:
        # Keep requesting pages of facet values until an empty page comes back.
        while more:
            params['facets_page'] = page
            data = get_records(params)
            page_values = data['search']['facets'][facet]
            if page_values:
                facets.update(page_values)
                # Update by the number of values actually received, so the
                # progress bar isn't over-counted on the final partial page.
                pbar.update(len(page_values))
                page += 1
            else:
                more = False
    return facets


# ## Collect facet data

# The API docs say that the following facets are available via the API: `category`, `display_collection`, `creator`, `placename`, `year`, `decade`, `century`, `language`, `content_partner`, `rights`, `collection`. However, `display_collection` isn't available. It's also worth noting that the `collection` facet corresponds to the `collection_title` field.
#
# After a bit of poking around, I found that facets are also available for `usage`, `copyright`, `dc_type`, `format`, `subject`, and `primary_collection`.
#
# Let's gather values for each of the available facets.

# In[9]:


facets = [
    'category',
    'display_collection',
    'creator',
    'placename',
    'year',
    'decade',
    'century',
    'language',
    'content_partner',
    'rights',
    'collection',
    'usage',
    'copyright',
    'dc_type',
    'format',
    'subject',
    'primary_collection'
]

facet_data = []
for facet in facets:
    print(f'\n{facet}')
    facet_data.append(check_facet(facet))


# We now have a dataset that summarises the contents of each facet. If you look in the `facets` directory, you'll also find there's a CSV file containing all the values and counts for each facet.
#
# Let's look at the summary data.

# In[10]:


# Convert to a dataframe
df = pd.DataFrame(facet_data)
# Make sure counts are integers (invalid facets have no counts, hence the fillna)
df['num_values'] = df['num_values'].fillna(0.0).astype('int64')
df['num_records'] = df['num_records'].fillna(0.0).astype('int64')
df


# Let's save this dataset as a CSV.

# In[11]:


# Make sure the output directory exists before saving.
Path('facets').mkdir(exist_ok=True)
df.to_csv(Path('facets', 'facets.csv'), index=False)


# Let's list all the CSV files we've saved!
# In[12]:


display(FileLinks('facets', included_suffixes='.csv', recursive=False))


# ## Primary collections by Content Partner
#
# I'm not sure how strict the hierarchies are, but I'm assuming we should be able to connect content partners to collections.
#
# I've used the results of this to [visualise open collections](visualise_open_collections.ipynb) in DigitalNZ.

# In[17]:


partners = pd.read_csv(Path('facets', 'content_partner.csv'))


# In[ ]:


dfs = []
# For each content partner, harvest the primary_collection facet filtered
# to that partner, and tag the results with the partner's name.
for partner_row in partners.itertuples():
    partner_name = partner_row.value
    collection_counts = harvest_facet_values('primary_collection', content_partner=partner_name)
    partner_df = pd.DataFrame.from_dict(collection_counts, orient='index').reset_index()
    partner_df.columns = ['primary_collection', 'count']
    partner_df['content_partner'] = partner_name
    dfs.append(partner_df)


# In[15]:


# Combine the per-partner dataframes, put the columns in a sensible order,
# and sort for easy browsing.
df_collections = pd.concat(dfs)
df_collections = df_collections[['content_partner', 'primary_collection', 'count']].sort_values(by=['content_partner', 'primary_collection'])


# In[16]:


df_collections.to_csv(Path('facets', 'collections_by_partner.csv'), index=False)


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.net/). Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).