#!/usr/bin/env python
# coding: utf-8

# # Save a list of oral history collections
# 
# Oral histories are often grouped into collections or projects. The names of these collections are recorded as `series` values in the `isPartOf` metadata field (the `is_part_of` column of the harvested dataset).
# 
# This notebook extracts a list of series from a [pre-harvested dataset](https://github.com/GLAM-Workbench/trove-oral-histories-data/blob/main/trove-oral-histories.csv) of oral histories held by the NLA and described in Trove, and saves them to a text file. You can [download an example](https://github.com/GLAM-Workbench/trove-oral-histories-data/blob/main/trove-oral-history-series.txt) of the file created by this notebook from the [trove-oral-histories-data](https://github.com/GLAM-Workbench/trove-oral-histories-data) GitHub repository.
# 
# If you're using data from the oral histories in Trove, you should read the section on [licensing of oral histories](https://tdg.glam-workbench.net/other-digitised-resources/oral-histories/overview.html#licensing-of-oral-histories) in the Trove Data Guide.

# In[1]:


from pathlib import Path

import pandas as pd


# In[2]:


# Read the pre-harvested oral histories CSV straight from GitHub into a dataframe
dataset_url = "https://github.com/GLAM-Workbench/trove-oral-histories-data/raw/main/trove-oral-histories.csv"
df = pd.read_csv(dataset_url)

# A record's is_part_of field can hold several values joined by " | ",
# so break them apart and give each value its own row (dropping empty fields)
parts = df["is_part_of"].str.split(" | ", regex=False)
collections = parts.explode().dropna().reset_index()

# Keep only the values labelled as a series, without repeats
is_series = collections["is_part_of"].str.startswith("series")
series = list(collections.loc[is_series, "is_part_of"].unique())

# Remove the leading "series" label, strip trailing fullstops, and remove duplicates.
# The label is sliced off as a whole prefix: str.lstrip("series: ") would treat its
# argument as a *set of characters* and also eat the start of any series name that
# begins with one of those letters (e.g. "series: environment" -> "nvironment").
series_label = "series"
series = sorted(
    {
        (s[len(series_label):] if s.startswith(series_label) else s)
        .lstrip(": ")  # drop the colon/space separating the label from the name
        .strip()  # trim whitespace first so a fullstop followed by a space is still removed
        .rstrip(".")
        .strip()
        for s in series
    }
)


# In[3]:


# Save the list of series to a file, one name per line.
# The encoding is set explicitly so that names containing non-ASCII characters are
# written the same way on every platform, rather than depending on the locale default.
with Path("trove-oral-history-series.txt").open("w", encoding="utf-8") as text_file:
    text_file.writelines(f"{s}\n" for s in series)


# ----
# 
# Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.net/).