In this notebook we'll import data about functions that we've harvested earlier and search for each of these functions in RecordSearch to see how many are actually used.
import json
import altair as alt
import pandas as pd
from recordsearch_data_scraper.scrapers import RSAgencySearch
from tqdm.auto import tqdm
# Load the JSON file we've already harvested.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding (JSON files are normally UTF-8).
with open("data/functions.json", "r", encoding="utf-8") as json_file:
    functions = json.load(json_file)
def get_children(function):
    """Return the terms of all descendants of ``function`` as a flat list.

    ``function`` is a mapping that may contain a "narrower" key holding a
    sequence of child mappings. Each child must have a "term" key and may
    itself have a "narrower" key. Terms are listed depth-first: each child's
    term is followed by the terms of its own descendants. The term of
    ``function`` itself is not included.
    """
    # Leaf node: nothing below this function
    if "narrower" not in function:
        return []
    descendants = []
    for child in function["narrower"]:
        # The child's own term first, then everything beneath it
        descendants.extend([child["term"], *get_children(child)])
    return descendants
# Flatten the hierarchy: collect each top-level term plus all of its descendants
functions_list = []
for function in functions:
    functions_list.append(function["term"])
    functions_list += get_children(function)
# Get rid of duplicates and keep the terms sorted. Storing the sorted list
# (rather than leaving an unordered set) means that anything iterating over
# functions_list later visits the terms in the same order on every run.
functions_list = sorted(set(functions_list))
# Display the sorted terms
functions_list
['accommodation services', 'acquisition', 'administrative decision appeal', 'administrative decision review', 'administrative law', 'administrative services', 'advertising standards', 'aged persons services', 'agricultural sciences', 'agriculture', 'air force', 'air force administration', 'air force commands', 'air operations', 'air safety', 'air transport', 'air transport safety', 'aircraft standards', 'airport services', 'airports', 'ambulance services', 'analytical services', 'animal and veterinary sciences', 'applications for native title', 'applied sciences', 'arbitration', 'archives administration', 'army', 'army administration', 'army commands', 'artifact export regulation', 'arts', 'arts development', 'arts funding', 'arts incentive schemes', 'arts promotion', 'associations and corporate law', 'atmospheric sciences', 'audit', 'australian capital territory', 'australian defence forces (adf)', 'banking', 'bankruptcy', 'biological sciences', 'botany', 'bounties', 'broadcasting', 'broadcasting standards', 'building', 'built environment', 'cabinet', 'call centre administration', 'carriage service providers', 'carrier licensing', 'censorship', 'censorship standards', 'census collection', 'ceremonial functions', 'chemical and pesticide regulation', 'child welfare', 'citizenship', 'civic infrastructure', 'civic management', 'civil engineering', 'civil law', 'climate information services', 'coastal surveillance', 'collection access', 'collection accessioning', 'collection acquisition', 'collection management', 'collection promotion', 'collection storage', 'colonial administration', 'commissions of inquiry', 'committees of inquiry', 'commonwealth state relations', 'communications', 'community health services', 'community policing', 'community protection', 'community services', 'community support', 'community transport', 'compensation schemes', 'conservation', 'conservation programs', 'construction', 'consular services', 'consumer affairs', 'copyright', 'copyright 
regulation', 'coronial law', 'corporate affairs', 'corrective services', 'counselling services', 'counterfeiting', 'courier services', 'court reporting', 'courts and tribunals', 'courts martial', 'criminal law', 'criminology', 'crown land administration', 'cultural affairs', 'cultural awards and scholarships', 'cultural festivals', 'cultural gifts programs', 'currency', 'curriculum development', 'customs', 'customs regulations', 'declaration of interests', 'defence', 'defence administration', 'defence coordination', 'defence estate management', 'defence force careers', 'defence forces', 'defence forces assistance', 'defence industries', 'defence intelligence', 'defence research', 'defence service home schemes', 'dental services', 'deportation', 'detention programs', 'development assistance programs', 'diplomatic missions', 'disability services', 'disaster recovery', 'disaster relief', 'driving licenses administration', 'early childhood education', 'earth sciences', 'education', 'education and training', 'election campaigning', 'electoral boundary assessment', 'electoral matters', 'electronic commerce', 'electronic postal services', 'emergency funding', 'emergency management', 'emergency services', 'employment', 'employment services', 'energy', 'energy resources', 'environment', 'environmental impact assessment', 'environmental monitoring', 'equipment licensing', 'equity programs', 'ethical compliance', 'exchange rates', 'excise', 'export regulation', 'exports and imports', 'expositions', 'external security', 'extraditions', 'family law', 'federal law', 'field force (army)', 'film production', 'finance management', 'financial assistance', 'financial matters', 'firefighting services', 'fiscal policy', 'fisheries regulation', 'fleet', 'flight regulation', 'foreign investment control', 'foreign policy', 'forensic analysis', 'forestry regulation', 'freight', 'freight movement regulation', 'genetics', 'goods and services', 'governance', 'government accommodation and 
catering', 'government media', 'government representation overseas', 'governor general', 'grants administration', 'health', 'health care', 'health insurance', 'health services', 'hearing services', 'historic memorials', 'historic relic protection', 'home savings schemes', 'horticulture', 'hospitals and clinics', 'house of representatives committees', 'housing', 'human rights', 'human rights obligations', 'hydrology', 'immigration', 'import regulation', 'income assessment', 'indigenous affairs', 'indigenous cultural heritage', 'indigenous enterprises', 'indigenous heritage conservation', 'indigenous land rights', 'indigenous settlements', 'industrial relations', 'industries', 'information management standards', 'information security', 'inspection services', 'insurance', 'intelligence', 'intelligence liaison', 'intelligence support', 'internal security', 'international affairs', 'international liaison', 'international relations', 'international security liaison', 'international trade agreements', 'international treaty participation', 'internees', 'interpreter services', 'interstate trade agreements', 'investigation', 'justice administration', 'juvenile justice', 'labour market programs', 'land transport', 'land use', 'land use planning', 'land use zoning', 'land valuation', 'language services', 'law enforcement', 'leasing', 'legal', 'legal aid', 'legal aid services', 'legal services', 'legislation', 'lighthouses', 'literature funding', 'litigation processes', 'loans', 'local laws and ordinances', 'logistics', 'logistics (air force)', 'logistics (army)', 'logistics (defence)', 'maintenance', 'marine and rural regulation', 'marine and rural support', 'marine life protection programs', 'marine science', 'maritime commands (navy)', 'maritime services', 'market regulation', 'marketing', 'mathematical sciences', 'media ownership regulation', 'mediation programs', 'medical aids regulation', 'medical and health sciences', 'medical research', 'medical research funding', 
'memorials', 'metals', 'meteorology', 'migrant accommodation services', 'migrant services', 'migrant settlements programs', 'migration', 'military operations', 'mineral exploration', 'mineral resources', 'mining', 'mobile telephone services', 'multicultural heritage promotion', 'multiculturalism', 'munitions', 'national events', 'national fitness', 'national heritage', 'national land use', 'national parks', 'national referral laboratory services', 'national service', 'native title claims', 'natural disasters', 'natural heritage protection', 'naturalisation assessment', 'navigation', 'navy', 'navy administration', 'navy commands', 'navy support', 'nursing services', 'occupational health and safety', 'oceanography', 'oceans governance', 'ombudsman', 'ordnance', 'overseas aid programs', 'overseas promotion', 'overseas student scholarship programs', 'parks', 'parliamentary chamber administration', 'parliamentary committees', 'parliamentary legislation', 'parliamentary matters', 'passenger entry control', 'passenger services', 'passport services', 'passports', 'pastoral', 'patent registration', 'patents and trademarks', 'pathology', 'peacekeeping forces', 'pensions and benefits', 'personnel', 'pharmaceuticals and medical aids', 'physical sciences', 'planning', 'police administration', 'police station', 'pollutant prevention programs', 'pollution emission control', 'population-based research', 'port authorities', 'port regulation', 'postal services', 'preschool education', 'presentation arrangements', 'preservation services', 'primary education', 'primary industries', 'prisoners of war', 'privacy guideline monitoring', 'property management', 'prosecution services', 'protective services', 'public borrowing', 'public service', 'public utilities', 'publishing', 'publishing and printing', 'quarantine', 'radio broadcasting', 'radio communication', 'rail harmonisation standards', 'rail land acquisition regulation', 'rail transport', 'rail transport safety', 'railway 
maintenance', 'rationing and price control', 'recordkeeping standards', 'records of the government', 'recreation', 'recruitment', 'refugee services', 'refugees', 'regional development', 'rehabilitation', 'removals', 'repatriation', 'repatriation hospitals', 'rescue coordination', 'research', 'research and development', 'resources', 'retail postal services', 'retirement income', 'revenue raising', 'road safety', 'road surface maintenance', 'road traffic regulation', 'road transport', 'road transport safety', 'royal commissions', 'rural community development', 'rural field day promotion', 'rural partnership programs', 'satellite communication', 'science', 'scientific research', 'sea safety', 'sea transport', 'seat of government', 'secondary education', 'secondary industries', 'security', 'security and intelligence', 'seismography', 'settlement negotiations', 'shipbuilding', 'social and economic research', 'social justice and equity', 'social welfare', 'space science', 'spatial information research', 'sport', 'standard setting', 'statistics', 'storage', 'strategic development', 'strategic policy', 'strategic support', 'student assistance', 'superannuation', 'supreme court law', 'supreme law', 'surveillance', 'surveillance, electronic', 'survey and mapping', 'tariff', 'tariff regulation', 'tariffs', 'taxation', 'taxation compliance', 'telecommunications', 'telephone services', 'television broadcasting', 'territory administration', 'tertiary education', 'tourism', 'tourism industry development', 'tourist event promotion', 'trade', 'trade development programs', 'trade expositions', 'trade practices', 'trade skills assessment', 'trade union training', 'trademark registration', 'training', 'training (air force)', 'training (army)', 'transport', 'transport and storage', 'transport infrastructure development', 'travel authorisation', 'travel missions', 'urban development', 'urban or regional development', 'valuation', 'vehicle registration', 'vehicle standards', "veterans' 
affairs", 'visas', 'viticulture', 'vocational training schemes', 'war memorials', 'wartime security', 'waste disposal', 'water conservation plans', 'water quality monitoring', 'water resources', 'water usage management', 'waterway management', 'weights and measures', 'works', 'world heritage listings', 'zoology']
In RecordSearch, functions are performed by agencies. So when you search for a function you get back a list of agencies. Here we'll loop through the list of functions and search for associated agencies.
# Search RecordSearch for the agencies associated with each function and
# record how many results each search returns
function_totals = []
for function in tqdm(functions_list):
    agencies = RSAgencySearch(function=function)
    # Get the total results from each search (replace None with 0).
    # The `or 0` guards against a missing total, so the "total" column
    # stays numeric for the comparisons and statistics below.
    total = agencies.total_results or 0
    function_totals.append({"function": function, "total": total})
# Create a DataFrame with the results
df = pd.DataFrame(function_totals)
# Display summary statistics for the totals
df.describe()
total | |
---|---|
count | 472.000000 |
mean | 27.118644 |
std | 52.554882 |
min | 0.000000 |
25% | 0.000000 |
50% | 1.000000 |
75% | 28.250000 |
max | 419.000000 |
So 75% of all functions have 28 or fewer associated agencies.
How many are actually used?
# How many functions are actually used, i.e. have at least one associated agency.
# `used` holds the per-column counts of the matching rows.
has_agencies = df["total"] > 0
used = df.loc[has_agencies].count()
print(used["total"])
243
# Share of all the harvested functions that have at least one associated agency
percent_used = used["function"] / len(functions_list)
message = "{:.1%} of the functions are used".format(percent_used)
print(message)
51.5% of the functions are used
# Most used function (every row tied for the highest total is shown)
highest_total = df["total"].max()
df.loc[df["total"] == highest_total]
function | total | |
---|---|---|
72 | employment | 419 |
# Top 20 by number of agencies
ranked = df.sort_values(by="total", ascending=False)
ranked.head(20)
function | total | |
---|---|---|
72 | employment | 419 |
340 | education | 294 |
226 | army commands | 286 |
214 | social welfare | 270 |
275 | indigenous affairs | 268 |
163 | training | 232 |
354 | housing | 220 |
203 | scientific research | 216 |
68 | migration | 199 |
417 | goods and services | 195 |
25 | customs | 183 |
36 | government representation overseas | 176 |
306 | community services | 175 |
207 | secondary industries | 173 |
359 | administrative law | 169 |
431 | broadcasting | 169 |
276 | logistics (army) | 166 |
466 | sea transport | 163 |
56 | health | 161 |
402 | air transport | 157 |
# Group the agency totals into bins of 10 to make the chart easier to read.
# The same bin definition is used for the x axis and for the tooltip.
agency_bins = alt.Bin(step=10)
alt.Chart(df).mark_bar().encode(
    x=alt.X("total:Q", bin=agency_bins, title="Number of associated agencies"),
    y=alt.Y("count()", title="Number of functions"),
    tooltip=[
        alt.Tooltip("total:Q", bin=agency_bins, title="Agencies"),
        alt.Tooltip("count()", title="Functions"),
    ],
)
Created by Tim Sherratt as part of the GLAM Workbench.