QueryPic is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.
This is a deconstructed, extended, and hackable version of QueryPic.
import math
import os
import time
from collections import OrderedDict
from operator import itemgetter # used for sorting
import altair as alt
import ipywidgets as widgets
import pandas as pd # makes manipulating the data easier
import requests
from IPython.display import HTML, FileLink, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
# Ensure the folder used for saved charts and CSV files is present
os.makedirs("data", exist_ok=True)

# Shared HTTP session: responses with a 502, 503 or 504 status are retried
# (up to 5 times, with backoff) rather than failing straight away
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s = requests.Session()
for scheme in ("http://", "https://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))
%%capture
# Load env variables
%load_ext dotenv
%dotenv
Get your own Trove API key and enter it below.
# Text box for the user's Trove API key; its value is read when a search is run
api_key = widgets.Text(
placeholder="Enter your Trove API key", description="API key:", disabled=False
)
display(api_key)
# Base parameters for every search request. get_data() works on a copy of
# this dictionary and adds the API key, query and any state/title filter.
params = {
"q": " ", # A space to search for everything
"facet": "year",
"zone": "newspaper",
# 'l-category': 'Article',
"encoding": "json",
"n": 0,
}
# Output area for progress messages and the finished chart
results = widgets.Output()
# Output area for the download link and the 'save chart' controls
save_data = widgets.Output()
# Combined results dataframe; assigned by get_data() and read by the
# plotting functions and save_as_csv()
df = None
def get_results(params):
    """
    Send a search request to the Trove API and return the decoded JSON.

    Parameters:
        params - dictionary of query string parameters for the request

    Returns:
        The API response body parsed from JSON

    Raises:
        An HTTP error (via raise_for_status) if the API answers with an error status
    """
    api_url = "https://api.trove.nla.gov.au/v2/result"
    reply = s.get(api_url, params=params, timeout=30)
    reply.raise_for_status()
    # print(reply.url) # This shows us the url that's sent to the API
    return reply.json()
def get_facets(data):
    """
    Pull the per-year counts out of the facets in a Trove API response.

    Only years that fall inside the range currently selected on the
    `date_range` slider (inclusive at both ends) are kept.

    Parameters:
        data - JSON formatted response data from Trove API

    Returns:
        A list of dictionaries containing: 'year', 'total_results'
    """
    year_counts = []
    try:
        terms = data["response"]["zone"][0]["facets"]["facet"]["term"]
        for term in terms:
            year = int(term["display"])
            if date_range.value[0] <= year <= date_range.value[1]:
                year_counts.append(
                    {"year": year, "total_results": int(term["count"])}
                )
        year_counts.sort(key=itemgetter("year"))
    except TypeError:
        # Presumably raised when the response carries no facet terms;
        # whatever has been gathered so far is returned as-is.
        pass
    return year_counts
def combine_totals(query_data, total_data):
    """
    Take facets data from the query search and a blank search (ie everything) for a decade and combine them.

    Rows are matched on 'year' rather than on their position in the two
    lists, because the two facet lists are not guaranteed to contain the
    same years (a query may have no facet entry for a year that the blank
    search does have).

    Parameters:
        query_data - JSON response data from a query search
        total_data - JSON response data from a blank search

    Returns:
        A list of dictionaries containing: 'year', 'total_results', 'total_articles'.
        'total_articles' is None if the blank search has no entry for that year.
    """
    # Lookup table: year -> number of articles found by the blank search
    totals_by_year = {
        row["year"]: row["total_results"] for row in get_facets(total_data)
    }
    combined_data = []
    for query_row in get_facets(query_data):
        query_row["total_articles"] = totals_by_year.get(query_row["year"])
        combined_data.append(query_row)
    return combined_data
def year_totals(params):
    """
    Generate a dataset for a search query.

    The search is run once per decade covered by the `date_range` slider.
    For each decade two requests are made: one with the query in
    params["q"], and one with a blank query (a single space) to get the
    total number of articles under the same filters.

    Parameters:
        params - dictionary of API parameters; params["q"] holds the search
                 query. The dictionary itself is left unmodified.

    Returns:
        A list of dictionaries sorted by year, each containing:
        'year', 'total_results', 'total_articles'
    """
    # Work on private copies so the caller's dictionary is never altered
    query_params = dict(params)
    total_params = dict(params, q=" ")
    start_decade = date_range.value[0] // 10
    end_decade = date_range.value[1] // 10 + 1
    totals = []
    with results:
        for decade in tqdm(range(start_decade, end_decade)):
            query_params["l-decade"] = decade
            total_params["l-decade"] = decade
            query_data = get_results(query_params)
            total_data = get_results(total_params)
            totals.extend(combine_totals(query_data, total_data))
    totals.sort(key=itemgetter("year"))
    return totals
# Slider for choosing the first and last year to chart. Its value is read by
# get_facets() to filter years and by year_totals() to work out which decades
# to request.
date_range = widgets.IntRangeSlider(
value=[1803, 1954],
min=1803,
max=2018,
step=1,
description="Date range:",
disabled=False,
continuous_update=False,
orientation="horizontal",
readout=True,
readout_format="0<4d",
layout=widgets.Layout(width="50%"),
)
display(date_range)
You can just add a single search query to see how the number of matching articles varies over time. But you can also compare frequencies between queries, states, and newspapers:
- cat vs dog
- swimmers in NSW, Victoria, and Queensland
- protectionism in The Age vs The Argus
queries = []
# Output area that lists the queries added so far
out = widgets.Output()


@out.capture()
def add_query(b):
    """
    Add the text in the query box to the list of queries to compare.

    Called when the 'Add query' button is clicked. The query is appended to
    `queries`, the text box is emptied, and a confirmation line is printed
    (captured by the `out` widget). Blank input is ignored so that an empty
    query is never registered.

    Parameters:
        b - the button that was clicked (unused)
    """
    text = query.value
    if not text.strip():
        # Nothing was typed; don't add an empty query
        return
    queries.append(text)
    query.value = ""
    print(f"Query {len(queries)}: {text}")
# Text box where a query is typed before being added to the list
query = widgets.Text(
placeholder="Enter your query then click the button to add",
disabled=False,
)
# Button that hands the text box contents to add_query()
query_button = widgets.Button(
description="Add query", disabled=False, tooltip="Click to add query", icon=""
)
query_button.on_click(add_query)
# Help text shown under the query box
query_tip = widgets.HTML(
value="A query can be anything you'd enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want."
)
def get_titles(b):
    """
    Load the list of newspaper titles from the Trove API into the `titles` selector.

    Called when the 'Load titles' button is clicked. The request goes
    through the shared retrying session `s`, in the same way as
    get_results(), and uses the key entered in the `api_key` widget.

    Parameters:
        b - the button that was clicked (unused)

    Raises:
        An HTTP error (via raise_for_status) if the API answers with an error status
    """
    response = s.get(
        "https://api.trove.nla.gov.au/v2/newspaper/titles",
        params={"encoding": "json", "key": api_key.value},
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()
    # Each option is labelled with the title; its value keeps both id and
    # title because get_data() needs the id for the search and the title
    # for the chart legend.
    title_list = [
        (t["title"], {"id": t["id"], "title": t["title"]})
        for t in data["response"]["records"]["newspaper"]
    ]
    title_list.sort(key=itemgetter(0))
    titles.options = OrderedDict(title_list)
# --- 'Compare newspapers' controls ---
# Query to run against each selected newspaper
title_query = widgets.Text(
placeholder="Enter your query",
description="Search for:",
disabled=False,
)
# List of newspapers to compare; the options are replaced by get_titles()
titles = widgets.SelectMultiple(
options=["Click on button to load titles"],
rows=10,
description="In:",
disabled=False,
layout=widgets.Layout(width="50%"),
)
# Button that fills the newspaper list via get_titles()
titles_button = widgets.Button(
description="Load titles",
disabled=False,
button_style="", # 'success', 'info', 'warning', 'danger' or ''
tooltip="Click to load titles",
icon="",
)
titles_button.on_click(get_titles)
titles_tip = widgets.HTML(
value="Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple newspapers to compare."
)
# --- 'Compare states' controls ---
# Query to run against each selected state
state_query = widgets.Text(
placeholder="Enter your query",
description="Search for:",
disabled=False,
)
# List of states to compare; each selected value is sent as the 'l-state'
# parameter by get_data()
states = widgets.SelectMultiple(
options=[
"ACT",
"New South Wales",
"Queensland",
"South Australia",
"Northern Territory",
"Tasmania",
"Victoria",
"Western Australia",
"National",
"International",
],
rows=10,
description="In:",
disabled=False,
layout=widgets.Layout(width="50%"),
)
states_tip = widgets.HTML(
value="Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple states to compare."
)
def plot_raw_results(width=700, height=400):
    """
    Build a line chart of the number of matching articles per year.

    The data comes from the global dataframe `df`; lines are coloured by
    its 'query' column.

    Parameters:
        width - chart width
        height - chart height

    Returns:
        An Altair chart
    """
    x_axis = alt.X("year:Q", axis=alt.Axis(format="c", title="Year"))
    y_axis = alt.Y(
        "total_results:Q",
        axis=alt.Axis(format=",d", title="Number of articles"),
    )
    line_colour = alt.Color("query", legend=alt.Legend(title=""))
    hover_details = [
        alt.Tooltip("query", title="Query:"),
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("total_results:Q", title="Articles", format=","),
    ]
    lines = alt.Chart(df).mark_line(point=True)
    encoded = lines.encode(
        x=x_axis, y=y_axis, color=line_colour, tooltip=hover_details
    )
    return encoded.properties(width=width, height=height)
def plot_relative_results(width=700, height=400):
    """
    Build a line chart of matching articles as a proportion of all articles per year.

    The data comes from the global dataframe `df`. The plotted value,
    'PercentOfTotal', is calculated inside the chart as
    total_results / total_articles.

    Parameters:
        width - chart width
        height - chart height

    Returns:
        An Altair chart
    """
    x_axis = alt.X("year:Q", axis=alt.Axis(format="c", title="Year"))
    y_axis = alt.Y(
        "PercentOfTotal:Q",
        axis=alt.Axis(format=".2%", title="Percentage of total articles"),
    )
    line_colour = alt.Color("query", legend=alt.Legend(title=""))
    hover_details = [
        alt.Tooltip("query", title="Query:"),
        alt.Tooltip("year:Q", title="Year"),
        alt.Tooltip("PercentOfTotal:Q", title="Articles", format=".2%"),
    ]
    lines = alt.Chart(df).mark_line(point=True)
    encoded = lines.encode(
        x=x_axis, y=y_axis, color=line_colour, tooltip=hover_details
    )
    sized = encoded.properties(width=width, height=height)
    return sized.transform_calculate(
        PercentOfTotal="datum.total_results / datum.total_articles"
    )
def clear_all(b):
    """
    Reset the form: empty every query box and selection, forget the queries
    added so far, and wipe the output areas.

    Parameters:
        b - the button that was clicked (unused)
    """
    # Forget the added queries and the log that lists them
    queries.clear()
    out.clear_output()
    # Empty the text boxes and deselect everything in the lists
    for text_box in (state_query, title_query):
        text_box.value = ""
    for selector in (states, titles):
        selector.value = []
    # Remove any chart, messages and download links
    for panel in (results, save_data):
        panel.clear_output()
def _fetch_trace(search_params, label, message):
    """
    Run one search and return its yearly totals as a dataframe.

    Parameters:
        search_params - API parameters for this search (a copy is passed on)
        label - value written to the 'query' column, used to tell the lines apart
        message - progress text shown in the results area before searching

    Returns:
        A dataframe built from the rows returned by year_totals(), with an
        added 'query' column
    """
    with results:
        display(HTML(message))
    trace = pd.DataFrame(year_totals(search_params.copy()))
    trace["query"] = label
    return trace


def get_data(b):
    """
    Run the searches for the active tab, then display the chart and download links.

    Depending on the selected tab this compares several queries, one query
    across several states, or one query across several newspapers. The
    combined results are stored in the global dataframe `df`, saved as a CSV
    file, and drawn as a 'raw' chart. If there is nothing to search for, or
    no search returns any rows, a 'No results!' message is shown instead and
    `df` is left unchanged.

    Parameters:
        b - the button that was clicked (unused)
    """
    global df
    results.clear_output()
    save_data.clear_output()
    q_params = params.copy()
    q_params["key"] = api_key.value
    traces = []
    if tab.selected_index == 0:
        # Compare queries
        for term in queries:
            q_params["q"] = term
            traces.append(
                _fetch_trace(q_params, term, "Searching for {}...".format(term))
            )
    elif tab.selected_index == 1:
        # Compare states
        q_params["q"] = state_query.value
        for state in states.value:
            q_params["l-state"] = state
            traces.append(
                _fetch_trace(q_params, state, "Searching in {}...".format(state))
            )
    elif tab.selected_index == 2:
        # Compare newspapers
        q_params["q"] = title_query.value
        for title in titles.value:
            q_params["l-title"] = title["id"]
            traces.append(
                _fetch_trace(
                    q_params,
                    title["title"],
                    "Searching in {}...".format(title["title"]),
                )
            )
    # Searches that returned no rows add nothing to the chart; dropping them
    # also lets the 'all searches were empty' case be reported properly.
    traces = [trace for trace in traces if not trace.empty]
    if not traces:
        with results:
            display(HTML("No results!"))
        return
    df = pd.concat(traces, ignore_index=True)
    # Reset the dropdown BEFORE clearing the results area: changing its value
    # can trigger change_chart(), which draws into `results`, and that
    # drawing should not end up next to the chart displayed below.
    chart_type.value = "raw"
    results.clear_output()
    chart = plot_raw_results()
    csv_file = save_as_csv()
    with results:
        display(chart_type)
        display(chart)
    with save_data:
        display(
            HTML(f'Download data: <a href="{csv_file}" download>{csv_file}</a>')
        )
        display(
            widgets.HBox([save_chart_button, save_chart_width, save_chart_height])
        )
def save_chart(b):
    """
    Save the current chart as an HTML file and display a link to it.

    The chart is redrawn at the size set in the width and height boxes,
    using whichever chart type is selected in the dropdown, and written to
    a timestamped file in the data directory.

    Parameters:
        b - the button that was clicked (unused)
    """
    if chart_type.value == "proportion":
        plot = plot_relative_results
    else:
        plot = plot_raw_results
    chart = plot(save_chart_width.value, save_chart_height.value)
    stamp = int(time.time())
    filename = "data/querypic-{}.html".format(stamp)
    chart.save(filename)
    with save_data:
        display(HTML("View HTML version:"), FileLink(filename))
def save_as_csv():
    """
    Write the global dataframe `df` to a timestamped CSV file in the data directory.

    Returns:
        The path of the file that was written
    """
    stamp = int(time.time())
    filename = "data/querypic-{}.csv".format(stamp)
    df.to_csv(filename, index=False)
    return filename
def change_chart(o):
    """
    Redraw the chart in the results area using the type selected in the dropdown.

    Parameters:
        o - the change notification passed by the widget (unused)
    """
    # wait=True keeps the old chart on screen until the new one is ready
    results.clear_output(wait=True)
    if chart_type.value == "proportion":
        plot = plot_relative_results
    else:
        plot = plot_raw_results
    chart = plot()
    with results:
        display(chart_type)
        display(chart)
# Dropdown for switching between raw counts and proportions
chart_type = widgets.Dropdown(
    options=[
        ("Raw number of results", "raw"),
        ("Proportion of total articles", "proportion"),
    ],
    value="raw",
)
# Only react to changes of the selected value. Without `names`, the handler
# is called for every trait that changes on the widget, so a single
# selection would redraw the chart more than once.
chart_type.observe(change_chart, names="value")
# Button that resets the form via clear_all()
clear_all_button = widgets.Button(
description="Clear all",
disabled=False,
button_style="", # 'success', 'info', 'warning', 'danger' or ''
tooltip="Clear current queries",
icon="",
)
# Button that runs the searches and draws the chart via get_data()
get_data_button = widgets.Button(
description="Create chart",
disabled=False,
button_style="primary", # 'success', 'info', 'warning', 'danger' or ''
tooltip="Create chart",
icon="",
)
# Button that writes the chart to an HTML file via save_chart()
save_chart_button = widgets.Button(
description="Save chart",
disabled=False,
button_style="primary", # 'success', 'info', 'warning', 'danger' or ''
tooltip="Save chart as HTML",
icon="",
)
# Size of the saved chart; the minimums match the default on-screen size
save_chart_width = widgets.BoundedIntText(
value=700, min=700, max=2000, step=100, description="Width", disabled=False
)
save_chart_height = widgets.BoundedIntText(
value=400, min=400, max=1500, step=100, description="Height", disabled=False
)
# Connect the buttons to their handlers
clear_all_button.on_click(clear_all)
get_data_button.on_click(get_data)
save_chart_button.on_click(save_chart)
# One tab per comparison mode. get_data() uses the index of the selected tab
# (0, 1, 2) to decide which search to run, so the order here matters.
tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])
tab2 = widgets.VBox([state_query, states, states_tip])
tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])
tab = widgets.Tab(children=[tab1, tab2, tab3])
tab.set_title(0, "Compare queries")
tab.set_title(1, "Compare states")
tab.set_title(2, "Compare newspapers")
# Show the whole interface: tabs, action buttons, then the two output areas
display(
widgets.VBox(
[tab, widgets.HBox([get_data_button, clear_all_button]), results, save_data]
)
)
# TESTING
# When running in the dev environment with an API key available, fill in the
# form and click the buttons automatically to exercise the whole notebook.
if os.getenv("GW_STATUS") == "dev":
    if os.getenv("TROVE_API_KEY"):
        api_key.value = os.getenv("TROVE_API_KEY")
        query.value = "cat"
        query_button.click()
        get_data_button.click()
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.