Harvest agencies associated with all functions

This notebook loops through the list of functions that were extracted from the RecordSearch interface and saves basic details of the agencies responsible for each function. To keep down the file size and avoid too much duplication it doesn't include the full range of relationships that an agency might have. If you want the full agency data, use this notebook to harvest agencies associated with an individual function or hierarchy.

The JSON data file created has the following structure:

[
    {
        'term': FUNCTION NAME,
        'agencies': [
            {
                'identifier': AGENCY IDENTIFIER,
                'title': AGENCY NAME,
                'start_date': AGENCY START DATE (YYYY-MM-DD),
                'end_date': AGENCY END DATE (YYYY-MM-DD),
                'agency_status': TYPE/LEVEL OF AGENCY,
                'location': AGENCY LOCATION,
                'function_start_date': DATE AGENCY STARTED BEING RESPONSIBLE FOR THIS FUNCTION (YYYY-MM-DD),
                'function_end_date': DATE AGENCY STOPPED BEING RESPONSIBLE FOR THIS FUNCTION (YYYY-MM-DD),
            }
        ]
    }
]

Set up the harvesting code

In [1]:
import json
import time

from IPython.display import FileLink, clear_output, display
from recordsearch_data_scraper.scrapers import RSAgencySearch
from tinydb import Query, TinyDB
from tqdm.auto import tqdm
In [2]:
def harvest_agencies(function):
    """
    Collects every agency record returned by a search on a function term.

    Calls get_results() on the search repeatedly, accumulating each batch
    of results until an empty batch comes back, then returns the
    accumulated list.
    """
    search = RSAgencySearch(function=function, record_detail="full")
    harvested = []
    with tqdm(total=search.total_results) as pbar:
        while True:
            batch = search.get_results()["results"]
            # An empty batch means there is nothing more to collect
            if not batch:
                break
            harvested.extend(batch)
            pbar.update(len(batch))
            # Pause briefly before asking for the next batch
            time.sleep(0.5)
    return harvested


def get_children(function):
    """
    Returns the terms of all descendants of a function, depth-first.

    Each narrower term is listed immediately before its own descendants.
    A function without a "narrower" key yields an empty list.
    """
    if "narrower" not in function:
        return []
    terms = []
    for child in function["narrower"]:
        # The child's own term comes first, then everything beneath it
        terms.extend([child["term"], *get_children(child)])
    return terms


def load_functions():
    """
    Loads a pre-harvested JSON file containing functions data.

    Reads "data/functions.json", collects the term of every top-level
    function together with the terms of all its narrower functions, and
    returns them as a flat, sorted list without duplicates.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding
    with open("data/functions.json", "r", encoding="utf-8") as json_file:
        functions = json.load(json_file)
    # A set drops terms that appear in more than one place in the hierarchy
    terms = set()
    for function in functions:
        terms.add(function["term"])
        terms.update(get_children(function))
    return sorted(terms)


def get_function_dates(function, agency):
    """
    Get the dates an agency was responsible for a given function.

    Looks through agency["functions"] for the first entry whose identifier
    matches the function term, ignoring case, and returns a dict with
    'function_start_date' and 'function_end_date'. Returns an empty dict
    if the agency has no matching function.
    """
    # Normalise the search term once so the comparison is case-insensitive
    # on both sides (previously only the agency's identifier was lowercased,
    # so a term containing uppercase characters could never match)
    wanted = function.lower()
    # Loop through the functions associated with an agency
    for f in agency["functions"]:
        # Find the current function
        if f["identifier"].lower() == wanted:
            # Get the dates this agency was responsible for the current function
            return {
                "function_start_date": f["start_date"],
                "function_end_date": f["end_date"],
            }
    return {}


def get_all_agencies():
    """
    Sends function terms off to the harvester to get related agencies.

    For every term returned by load_functions(), harvests the related
    agencies, keeps a subset of each agency's fields plus the dates the
    agency was responsible for the function, and upserts the result into
    the TinyDB file "data/db_agencies_by_function", keyed on the term.
    """
    # The subset of agency fields that is kept, to limit the filesize
    keep_fields = [
        "identifier",
        "title",
        "start_date",
        "end_date",
        "agency_status",
        "location",
    ]
    clear_output()
    Record = Query()
    # Get a list of functions
    functions = load_functions()
    # Open the database as a context manager so it is closed when the
    # harvest finishes, even if a request fails part-way through
    with TinyDB("data/db_agencies_by_function") as db:
        # Loop through the list of functions
        for function in functions:
            clear_output()
            print(f'\nHarvesting "{function}"')
            # Fire up the harvester for this function
            results = harvest_agencies(function)
            agencies = []
            for a in results:
                # Keep the fields we want
                agency = {k: a[k] for k in keep_fields}
                # Add extra fields to show when the agency was responsible for this function
                agency.update(get_function_dates(function, a))
                agencies.append(agency)
            # Replace any record already saved for this term, or insert a new one
            db.upsert({"term": function, "agencies": agencies}, Record.term == function)

Start the harvest

In [ ]:
# Harvest agencies for every function; results are saved to data/db_agencies_by_function
get_all_agencies()

Save the results for download

In [4]:
def save_json():
    """
    Exports the harvested data to a JSON file and displays a download link.

    Reads every record from the TinyDB file "data/db_agencies_by_function",
    writes them to "data/agencies_by_function.json", and displays a
    FileLink to the saved file.
    """
    filename = "data/agencies_by_function.json"
    # Close the database as soon as the records have been read
    with TinyDB("data/db_agencies_by_function") as db:
        functions = db.all()
    with open(filename, "w", encoding="utf-8") as json_file:
        json.dump(functions, json_file, indent=4)
    # Show the link only once the file has been fully written and closed
    display(FileLink(filename))
In [5]:
# Write the harvested data to data/agencies_by_function.json and show a download link
save_json()

Created by Tim Sherratt as part of the GLAM Workbench.