View in GLAM Workbench · View code
What does it mean when your search in Trove's digitised newspapers returns 3 million results? QueryPic helps you explore your search results by showing you how they change over time – aggregating the number of articles matching your query by day, month, or year.
# This notebook is designed to run in Voila
# If you can see the code, just select 'View > Open with Voila in new browser tab' from the menu.
%%capture
import os
import re
import time
from calendar import monthrange
from operator import itemgetter # used for sorting
import altair as alt
import arrow
import ipywidgets as widgets
import pandas as pd # makes manipulating the data easier
import requests_cache
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
from trove_query_parser.parser import parse_query
# Make sure data directory exists
os.makedirs("data", exist_ok=True)
# Create a session that will automatically retry on server errors.
# Responses are cached for an hour so repeated queries don't hit the API again.
s = requests_cache.CachedSession("querypic", expire_after=60 * 60)
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))
# CONFIG SO THAT ALTAIR HREFS OPEN IN A NEW TAB
def blank_href():
    # Vega-Embed loader option: chart links open in a new browser tab
    return {"usermeta": {"embedOptions": {"loader": {"target": "_blank"}}}}
# register the custom theme under a chosen name
alt.themes.register("blank_href", blank_href)
# enable the newly registered theme
alt.themes.enable("blank_href")
# Global state shared by the widget callbacks below:
# dfs - list of dataframes, one per visualised query
# queries - list of dicts describing each saved query
# unit - current time unit ('year', 'month', or 'day')
# shifted - True when a day-unit date range excludes its first day
dfs = []
queries = []
unit = None
shifted = False
%%capture
# Load environment variables if available
%load_ext dotenv
%dotenv
def get_results(params):
    """
    Get JSON response data from the Trove API.

    Parameters:
        params - dict of query parameters for the /result endpoint
    Returns:
        JSON formatted response data from Trove API
    """
    api_response = s.get(
        "https://api.trove.nla.gov.au/v2/result", params=params, timeout=30
    )
    api_response.raise_for_status()
    # display(api_response.url) # This shows us the url that's sent to the API
    payload = api_response.json()
    # Pause between uncached requests to be gentle on the API
    if not api_response.from_cache:
        time.sleep(0.2)
    return payload
def get_year_facets(data, start, end):
    """
    Loop through year facets in a Trove API response, saving terms and counts.

    Parameters:
        data - JSON formatted response data from Trove API
        start - earliest year (int) to include
        end - latest year (int) to include
    Returns:
        A dict mapping ISO dates ('YYYY-01-01') to article counts
    """
    dates = {}
    try:
        for term in data["response"]["zone"][0]["facets"]["facet"]["term"]:
            year = int(term["display"])
            # Only keep years that fall inside the requested range
            if start <= year <= end:
                dates[f'{term["display"]}-01-01'] = int(term["count"])
    except TypeError:
        # No facet terms returned -- the 'term' value is None rather than a list
        pass
    return dates
def get_month_facets(data, year, start, end):
    """
    Loop through month facets in a Trove API response, saving terms and counts.

    Parameters:
        data - JSON formatted response data from Trove API
        year - the year these month facets belong to
        start - earliest date (ISO string) to include
        end - latest date (ISO string) to include
    Returns:
        A dict mapping ISO dates ('YYYY-MM-01') to article counts
    """
    dates = {}
    try:
        facet_terms = data["response"]["zone"][0]["facets"]["facet"]["term"]
        for term in facet_terms:
            # 'search' holds the month number -- build an ISO date from it
            iso_date = f'{year}-{term["search"]:02}-01'
            date = arrow.get(iso_date)
            # Keep only months inside the requested date range
            if arrow.get(start) <= date <= arrow.get(end):
                dates[iso_date] = int(term["count"])
    except TypeError:
        # No facet terms returned -- the 'term' value is None rather than a list
        pass
    return dates
def combine_totals(query_data, total_data, start, end, unit):
    """
    Take facets data from the query search and a blank search (ie everything)
    and combine them over the same date range.

    Parameters:
        query_data - dict mapping ISO dates to counts from the query search
        total_data - dict mapping ISO dates to counts from a blank search
        start - start date (ISO string)
        end - end date (ISO string)
        unit - one of 'year', 'month', or 'day'
    Returns:
        A list of dicts, each containing: 'date', 'total_results', 'total_articles'
    """
    totals = []
    # These are for cases where a full datetime is provided
    if unit == "year":
        start = f"{start[:4]}-01-01"
    elif unit == "month":
        start = f"{start[:7]}-01"
    start_date = arrow.get(start)
    # A 'shifted' day-based range excludes its first day (see set_date_range)
    if shifted and unit == "day":
        start_date = start_date.shift(days=+1)
    end_date = arrow.get(end)
    while start_date <= end_date:
        # Format the date once per iteration and reuse it for both lookups
        date_key = start_date.format("YYYY-MM-DD")
        totals.append(
            {
                "date": date_key,
                # Dates with no facet data default to zero results
                "total_results": query_data.get(date_key, 0),
                "total_articles": total_data.get(date_key, 0),
            }
        )
        if unit == "year":
            start_date = start_date.shift(years=+1)
        elif unit == "month":
            start_date = start_date.shift(months=+1)
        elif unit == "day":
            start_date = start_date.shift(days=+1)
    return totals
def clean_params(params):
    """
    Remove unwanted facets from the query parameters to get total articles.

    Parameters:
        params - dict of API query parameters
    Returns:
        A new dict containing only the parameters needed for a 'blank' search
    """
    # Parameters that define the scope of the search (rather than its content)
    keep = {
        "l-decade",
        "l-year",
        "l-month",
        "l-title",
        "l-state",
        "key",
        "encoding",
        "q",
        "n",
        "zone",
        "facet",
    }
    # Build a filtered copy instead of copying then deleting keys in place
    return {k: v for k, v in params.items() if k in keep}
def year_totals(params):
    """
    Generate a dataset for a search query.

    Parameters:
        params: the API search parameters
    Returns:
        A list of dicts, each containing:
        - date
        - total_results
        - total_articles
    """
    global unit
    query_dates = {}
    total_dates = {}
    params_c = params.copy()
    q = params_c["q"]
    # Use the unit chosen in the dropdown, or work one out from the date range
    if choose_unit.value != "auto":
        unit = choose_unit.value
        start, end, _ = set_date_range(params_c)
    else:
        start, end, unit = set_date_range(params_c)
    start_year = int(start[:4])
    end_year = int(end[:4])
    with results:
        if unit == "year":
            # Harvest year facets one decade at a time
            start_decade = int(start[:3])
            end_decade = int(end[:3])
            for decade in tqdm(range(start_decade, end_decade + 1), leave=False):
                params_c["facet"] = "year"
                params_c["q"] = q
                params_c["l-decade"] = decade
                query_data = get_results(params_c)
                # Repeat the search with an empty query to get total articles
                params_cleaned = clean_params(params_c)
                params_cleaned["q"] = " "
                total_data = get_results(params_cleaned)
                query_dates.update(get_year_facets(query_data, start_year, end_year))
                total_dates.update(get_year_facets(total_data, start_year, end_year))
            totals = combine_totals(query_dates, total_dates, start, end, unit)
            totals.sort(key=itemgetter("date"))
        elif unit == "month":
            # Harvest month facets one year at a time
            for year in tqdm(range(start_year, end_year + 1), leave=False):
                params_c["q"] = q
                params_c["l-decade"] = str(year)[:3]
                params_c["l-year"] = year
                params_c["facet"] = "month"
                query_data = get_results(params_c)
                # Repeat the search with an empty query to get total articles
                params_cleaned = clean_params(params_c)
                params_cleaned["q"] = " "
                total_data = get_results(params_cleaned)
                query_dates.update(get_month_facets(query_data, year, start, end))
                total_dates.update(get_month_facets(total_data, year, start, end))
            totals = combine_totals(query_dates, total_dates, start, end, unit)
        elif unit == "day":
            # There's no day facet in the API, so query each day individually
            # and read the total from the record count
            totals = []
            start_date = arrow.get(start)
            # A 'shifted' range excludes its first day (see set_date_range)
            if shifted:
                start_date = start_date.shift(days=+1)
            end_date = arrow.get(end)
            with tqdm(total=(end_date - start_date).days + 1, leave=False) as pbar:
                while start_date <= end_date:
                    # Replace any existing date filter with one for this day
                    q = re.sub(r" date:\[.+\]", "", q)
                    from_date = start_date.shift(days=-1).format("YYYY-MM-DDT00:00:00")
                    to_date = start_date.format("YYYY-MM-DDT00:00:00")
                    q = q + f" date:[{from_date}Z TO {to_date}Z]"
                    params_c["q"] = q
                    query_data = get_results(params_c)
                    # A date-only query returns the total number of articles
                    params_cleaned = clean_params(params_c)
                    params_cleaned["q"] = f"date:[{from_date}Z TO {to_date}Z]"
                    total_data = get_results(params_cleaned)
                    totals.append(
                        {
                            "date": to_date,
                            "total_results": int(
                                query_data["response"]["zone"][0]["records"]["total"]
                            ),
                            "total_articles": int(
                                total_data["response"]["zone"][0]["records"]["total"]
                            ),
                        }
                    )
                    start_date = start_date.shift(days=+1)
                    pbar.update(1)
    return totals
def set_date_range(params):
    """
    Determine the date range from the query parameters,
    then use the date range to set the time unit.

    Returns:
        - start: start date (ISO format)
        - end: end date (ISO format)
        - unit: one of 'year', 'month', or 'day'
    """
    global shifted
    shifted = False
    if "l-month" in params:
        # A single month -- chart it day by day
        year = params["l-year"][0]
        month = params["l-month"][0]
        last_day = monthrange(int(year), int(month))[1]
        return f"{year}-{month}-01", f"{year}-{month}-{last_day}", "day"
    if "l-year" in params:
        # A single year -- chart it month by month
        year = params["l-year"][0]
        return year + "-01-01", year + "-12-31", "month"
    if "l-decade" in params:
        # A single decade -- chart it month by month
        decade = params["l-decade"][0]
        return decade + "0-01-01", decade + "9-12-31", "month"
    if "date:" in params["q"]:
        # An explicit date range in the query string
        date_range = re.search(r"date:\[(.+)\]", params["q"]).group(1)
        start_date, _, end_date = date_range.split()
        if len(start_date) > 4:
            # A full datetime range excludes its first day
            shifted = True
        days = (arrow.get(end_date) - arrow.get(start_date)).days
        if days > 2 * 3653:
            # More than 20 years
            return start_date[:10], end_date[:10], "year"
        if days == 0:
            # A single year
            end = (
                arrow.get(end_date[:10])
                .shift(years=+1)
                .shift(days=-1)
                .format("YYYY-MM-DD")
            )
            return start_date[:10], end, "month"
        if days < 94:
            # About three months or less
            return start_date[:10], end_date[:10], "day"
        return start_date[:10], end_date[:10], "month"
    # No date constraints -- chart everything year by year
    return "1803-01-01", arrow.now().format("YYYY-01-01"), "year"
def show_results(view="raw"):
    """
    Display the chart and the save data options.

    Parameters:
        view - either 'raw' or 'relative'
    """
    results.clear_output(wait=True)
    save_data.clear_output(wait=True)
    chart = make_chart(view=view)
    # Reset the chart-type dropdown without re-triggering its callback
    chart_type.unobserve(change_chart, "value")
    chart_type.value = "raw"
    chart_type.observe(change_chart, "value")
    # Save the harvested data so it can be downloaded
    csv_file = save_as_csv()
    with results:
        display(chart_type)
        display(chart)
    with save_data:
        display(
            widgets.HBox([save_chart_button, save_chart_width, save_chart_height]),
            layout=widgets.Layout(margin="50px 0 50px 0"),
        )
        display(HTML(f'Download data: <a href="{csv_file}" download>{csv_file}</a>'))
def make_chart(view, width=800, height=400):
    """
    Create the chart.

    Parameters:
        - view: either 'raw' or 'relative'
        - width: in pixels
        - height: in pixels
    Returns:
        An Altair chart combining the results plot and the query list
    """
    # Combine dfs into a single df
    df = pd.concat(dfs, ignore_index=True)
    # Define shared tooltips
    tooltip = [
        alt.Tooltip("id", title="query"),
        alt.Tooltip("total_results:Q", title="results", format=","),
        alt.Tooltip("PercentOfTotal:Q", title="proportion", format=".2%"),
    ]
    # Configure x & tooltips based on time unit
    if unit == "year":
        x = alt.X(
            "year(date):T", axis=alt.Axis(title="Year"), scale=alt.Scale(padding=10)
        )
        tooltip.insert(1, alt.Tooltip("year(date):T", title="year"))
    elif unit == "month":
        x = alt.X(
            "yearmonth(date):T",
            axis=alt.Axis(title="Month"),
            scale=alt.Scale(padding=10),
        )
        tooltip.insert(1, alt.Tooltip("yearmonth(date):T", title="month"))
    elif unit == "day":
        x = alt.X(
            "date:T",
            axis=alt.Axis(title="Date", format="%e %b %Y"),
            scale=alt.Scale(padding=10),
        )
        tooltip.insert(1, alt.Tooltip("date:T", title="date", format="%A, %e %b %Y"))
    # Configure y based on chart view type
    if view == "raw":
        y = alt.Y(
            "total_results:Q", axis=alt.Axis(format=",d", title="Number of articles")
        )
    elif view == "relative":
        y = alt.Y(
            "PercentOfTotal:Q",
            axis=alt.Axis(format=".2%", title="Percentage of total articles"),
        )
    # Create chart -- each point links to the matching search on Trove
    plot = (
        alt.Chart(df)
        .mark_line(point=True, interpolate="cardinal")
        .encode(
            x=x,
            y=y,
            tooltip=tooltip,
            color=alt.Color("id", legend=alt.Legend(title="")),
            href="url:N",
        )
        .properties(
            width=width,
            height=height,
            title={
                "text": "Trove Newspapers & Gazettes Search",
                "subtitle": f'Created by QueryPic: {arrow.now().format("D MMMM YYYY")}',
            },
        )
        .transform_calculate(
            PercentOfTotal="datum.total_results / datum.total_articles"
        )
    )
    # Create text chart listing queries
    query_list = list_queries()
    # Combine charts
    chart = (
        alt.vconcat(plot, query_list)
        .configure(padding=20)
        .configure_view(strokeWidth=0)
        .configure_title(fontSize=14)
    )
    return chart
def list_queries():
    """
    Creates a text-based chart that lists the saved queries as clickable links.
    """
    queries_df = pd.DataFrame(queries)
    # A dummy quantitative x axis pins the text to the left edge
    x_axis = alt.X("x:Q", title=None, axis=None, scale=alt.Scale(range=[0, 1]))
    y_axis = alt.Y(
        "id:O",
        title=None,
        axis=alt.Axis(labelFontWeight="bold", domain=False, grid=False),
    )
    text_chart = (
        alt.Chart(queries_df)
        .mark_text(align="left", dx=2, dy=1, baseline="middle")
        .encode(
            x=x_axis,
            y=y_axis,
            text="url:N",
            href="url",
            color=alt.value("blue"),
        )
    )
    return text_chart
def clear_all(b):
    """
    Clear all queries and results, returning the app to its initial state.
    """
    global dfs, queries
    dfs, queries = [], []
    query.value = ""
    results.clear_output()
    save_data.clear_output()
def clear_last(b):
    """
    Remove the most recent query from the chart.

    Parameters:
        b - the button widget that triggered this callback (unused)
    """
    global dfs, queries
    results.clear_output()
    save_data.clear_output()
    # Guard the pops so clicking the button with no queries doesn't raise
    if dfs:
        dfs.pop()
    if queries:
        queries.pop()
    if dfs:
        show_results()
def save_chart(b):
    """
    Save the chart as HTML for download.

    Parameters:
        b - the button widget that triggered this callback (unused)
    """
    width = save_chart_width.value
    height = save_chart_height.value
    if chart_type.value == "proportion":
        chart = make_chart("relative", width, height)
    else:
        chart = make_chart("raw", width, height)
    filename = f'data/querypic-{arrow.now().format("YYYYMMDDHHmmss")}.html'
    chart.save(filename)
    with save_data:
        # Link to the saved file (same pattern as the CSV link in show_results)
        display(
            HTML(
                f'Download HTML version: <a href="{filename}" download>{filename}</a>'
            )
        )
def save_as_csv():
    """
    Save harvested data as a CSV for download.

    Returns:
        The path of the saved CSV file
    """
    combined = pd.concat(dfs, ignore_index=True)
    csv_path = f'data/querypic-{arrow.now().format("YYYYMMDDHHmmss")}.csv'
    combined.to_csv(csv_path, index=False)
    return csv_path
def change_chart(o):
    """
    Redraw the chart when the user switches between chart views.
    """
    results.clear_output()
    view = "relative" if chart_type.value == "proportion" else "raw"
    chart = make_chart(view)
    with results:
        display(chart_type)
        display(chart)
def add_date_query(date):
    """
    Build a search url that limits results to a single day.
    """
    previous_day = arrow.get(date).shift(days=-1).format("YYYY-MM-DD")
    date_query = f"date:[{previous_day}T00:00:00Z TO {date}T00:00:00Z]"
    # Strip any existing date filter, then attach the new one to the keyword
    base_url = re.sub(r"\s*date:\[.+\]", "", query.value)
    return re.sub(r"(keyword=[^&]+)", r"\1 " + date_query, base_url)
def add_urls_to_df(df):
    """
    Add a Trove search url to each row of the harvested data.

    Parameters:
        df - dataframe with a 'date' column of ISO date strings
    Returns:
        The dataframe with an added 'url' column
    """
    url = re.sub(r"\s*date:\[.+\]", "", query.value)
    if unit == "year":
        df["url"] = df["date"].apply(lambda x: f"{url}&l-decade={x[:3]}&l-year={x[:4]}")
    elif unit == "month":
        # lstrip removes only leading zeros ('03' -> '3'); the previous
        # strip('0') also removed trailing zeros, turning October ('10') into '1'
        df["url"] = df["date"].apply(
            lambda x: f'{url}&l-decade={x[:3]}&l-year={x[:4]}&l-month={x[5:7].lstrip("0")}'
        )
    elif unit == "day":
        df["url"] = df["date"].apply(lambda x: add_date_query(x))
    return df
def get_data(b):
    """
    Assemble the data and prepare it for display.

    Parameters:
        b - the button widget that triggered this callback (unused)
    """
    global dfs, queries
    # Add current query to queries list
    queries.append(
        {
            "x": 0,
            "y": len(queries),
            "id": f"Query {len(queries) + 1}",
            "url": query.value,
            "params": query.value.split("?")[1],
        }
    )
    # Extract params from query
    params = parse_query(query.value)
    # Add extra params for API
    params["key"] = api_key.value
    params["encoding"] = "json"
    params["n"] = 1
    # Limit to newspapers if no specific zone set
    if "," in params["zone"]:
        params["zone"] = "newspaper"
    # Get the data
    totals = year_totals(params)
    # Convert to dataframe
    df = pd.DataFrame(totals)
    # Add urls to the data rows
    df = add_urls_to_df(df)
    # Add a query id to the dataframe
    df["id"] = f"Query {len(queries)}"
    # Add current df to list of dfs
    dfs.append(df)
    # Display the results
    show_results()
# CREATE WIDGETS
# Output areas for the chart and the download links
results = widgets.Output()
save_data = widgets.Output()
# Dropdown to switch between raw counts and proportions
chart_type = widgets.Dropdown(
    options=[
        ("Raw number of results", "raw"),
        ("Proportion of total articles", "proportion"),
    ],
    value="raw",
)
chart_type.observe(change_chart, "value")
api_key = widgets.Password(
    placeholder="Enter your Trove API key",
    description="API key:",
    disabled=False,
    value="",
)
query = widgets.Text(
    placeholder="Enter your search query",
    description="Query:",
    disabled=False,
    value="",
    layout=widgets.Layout(width="80%"),
)
# Dropdown to override the automatically chosen time unit
choose_unit = widgets.Dropdown(
    options=[
        ("Automatic", "auto"),
        ("Year", "year"),
        ("Month", "month"),
        ("Day", "day"),
    ],
    value="auto",
    description="Time unit:",
)
clear_last_button = widgets.Button(
    description="Remove last query",
    disabled=False,
    button_style="", # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Remove the last query",
    icon="",
)
clear_all_button = widgets.Button(
    description="Clear all queries",
    disabled=False,
    button_style="", # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Clear current queries",
    icon="",
)
get_data_button = widgets.Button(
    description="Visualise query",
    disabled=False,
    button_style="primary", # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Create chart from query",
    icon="",
)
save_chart_button = widgets.Button(
    description="Save chart as HTML",
    disabled=False,
    button_style="primary", # 'success', 'info', 'warning', 'danger' or ''
    tooltip="Save chart as HTML",
    icon="",
)
# Dimensions (in pixels) for the downloadable HTML chart
save_chart_width = widgets.BoundedIntText(
    value=700, min=700, max=2000, step=100, description="Width", disabled=False
)
save_chart_height = widgets.BoundedIntText(
    value=400, min=400, max=1500, step=100, description="Height", disabled=False
)
# Wire the buttons up to their callbacks
clear_all_button.on_click(clear_all)
clear_last_button.on_click(clear_last)
get_data_button.on_click(get_data)
save_chart_button.on_click(save_chart)
Get your own Trove API key and enter it below.
# Show the API key input
display(api_key)
Construct your search using the Trove web interface, then just copy and paste the url into the box below.
# Show the query url input
display(query)
QueryPic aggregates search results by time units – either 'year', 'month', or 'day'. If you choose 'Automatic' in the list below, QueryPic will choose a unit based on the date range of your query, trying to balance resolution and efficiency.
If you're not happy with the results, you can select your own time unit.
# Show the time-unit selector
display(choose_unit)
You can add as many queries as you want to a single chart.
# Show the action buttons with the chart and download areas below them
display(
    widgets.VBox(
        [
            widgets.HBox(
                [get_data_button, clear_last_button, clear_all_button],
                layout=widgets.Layout(margin="0 0 20px 0"),
            ),
            results,
            save_data,
        ]
    )
)
But what are you actually searching? For more ways of analysing Trove's digitised newspaper corpus, see the Trove newspapers section of the GLAM Workbench.
How does it work? For examples of how to use Trove's facets to construct high-level visualisations like these, see Visualise Trove newspaper searches over time.
Any problems? Feel free to ask questions in the GLAM Workbench section of OzGLAM Help.
# TESTING
# Auto-run a sample query when running in the GLAM Workbench dev environment
# (requires GW_STATUS=dev and a TROVE_API_KEY environment variable)
if os.getenv("GW_STATUS") == "dev" and os.getenv("TROVE_API_KEY"):
    api_key.value = os.getenv("TROVE_API_KEY")
    query.value = "https://trove.nla.gov.au/search/category/newspapers?keyword=cat"
    get_data_button.click()
Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.