#!/usr/bin/env python
# coding: utf-8

# # Save a list of oral history collections
#
# Oral histories are often grouped into collections or projects. The names of these collections can be found as `series` in the `isPartOf` metadata field.
#
# This notebook extracts a list of series from a [pre-harvested dataset](https://github.com/GLAM-Workbench/trove-oral-histories-data/blob/main/trove-oral-histories.csv) of oral histories held by the NLA and described in Trove, and saves them to a text file. You can [download an example](https://github.com/GLAM-Workbench/trove-oral-histories-data/blob/main/trove-oral-history-series.txt) of the file created by this notebook from the [trove-oral-histories-data](https://github.com/GLAM-Workbench/trove-oral-histories-data) GitHub repository.
#
# If you're using data from the oral histories in Trove, you should read the section on [licensing of oral histories](https://tdg.glam-workbench.net/other-digitised-resources/oral-histories/overview.html#licensing-of-oral-histories) in the Trove Data Guide.

# In[1]:


from pathlib import Path

import pandas as pd

# In[2]:


# Location of the preharvested dataset
DATA_URL = "https://github.com/GLAM-Workbench/trove-oral-histories-data/raw/main/trove-oral-histories.csv"

# Label that marks an `is_part_of` value as a series name
SERIES_LABEL = "series"

# Name of the text file the list of series is saved to
OUTPUT_FILE = "trove-oral-history-series.txt"


def clean_series_name(value: str) -> str:
    """Return the bare series name from a value such as ``"series: Name."``.

    The literal ``series`` label is removed as a prefix, followed by any
    ``:`` / whitespace separator. Surrounding whitespace and trailing
    full stops are then stripped.
    """
    # Slice off the literal label. `str.lstrip("series: ")` would instead
    # strip a *set* of characters and eat into names starting with
    # s, e, r or i (e.g. "seniors" -> "niors").
    if value.startswith(SERIES_LABEL):
        value = value[len(SERIES_LABEL):]
    name = value.lstrip(": \t")
    # Strip whitespace first so a full stop followed by spaces is removed too
    return name.strip().rstrip(".").strip()


def extract_series(df: pd.DataFrame) -> list:
    """Return a sorted list of unique series names found in ``df["is_part_of"]``.

    Each cell may hold several values separated by ``" | "``; only values
    that start with ``series`` are kept. Missing values are ignored.
    """
    # Split the is_part_of field in case there are multiple values
    parts = (
        df["is_part_of"]
        .str.split(" | ", regex=False)
        .explode()
        .dropna()
        # explode() repeats index labels; renumber so boolean masking is unambiguous
        .reset_index(drop=True)
    )
    # Get the series values
    labelled = parts[parts.str.startswith(SERIES_LABEL)].unique()
    # Remove duplicates and strip labels and trailing fullstops
    return sorted({clean_series_name(value) for value in labelled})


def save_series(series, path=OUTPUT_FILE) -> None:
    """Write each series name to ``path``, one per line, encoded as UTF-8."""
    # Explicit encoding so names with non-ASCII characters are written
    # the same way on every platform
    with Path(path).open("w", encoding="utf-8") as text_file:
        text_file.writelines(name + "\n" for name in series)


# In[3]:


if __name__ == "__main__":
    # Load the preharvested dataset
    df = pd.read_csv(DATA_URL)

    # Get a list of series values
    series = extract_series(df)

    # Save the list of series to a file
    save_series(series)


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.net/).