QueryPic is a tool I created many years ago to visualise searches in Trove's digitised newspapers. It shows you the number of articles each year that match your query — instead of a page of search results, you see the complete result set. You can look for patterns and trends across time.
This is a deconstructed, extended, and hackable version of QueryPic.
import requests
from requests.exceptions import HTTPError, Timeout
import os
import ipywidgets as widgets
from operator import itemgetter # used for sorting
import pandas as pd # makes manipulating the data easier
import altair as alt
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from tqdm.auto import tqdm
from IPython.display import display, HTML, FileLink, clear_output
import math
from collections import OrderedDict
import time
# The 'data' folder must be present before anything is saved into it
os.makedirs('data', exist_ok=True)

# Shared HTTP session: a request answered with 502, 503 or 504 is retried
# (up to five times, backing off between attempts) before giving up
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
for scheme in ('http://', 'https://'):
    s.mount(scheme, HTTPAdapter(max_retries=retries))
Get your own Trove API key and enter it below.
# Text box where the user enters their Trove API key; its value is read
# whenever a request is sent to the API
api_key = widgets.Text(
    placeholder='Enter your Trove API key',
    description='API key:',
    disabled=False
)
display(api_key)

# Base parameters for every search; copied and extended before each request
params = {
    'q': ' ', # A space to search for everything
    'facet': 'year',
    'zone': 'newspaper',
    # 'l-category': 'Article',
    'encoding': 'json',
    'n': 0
}

# Output area for progress messages and charts
results = widgets.Output()

# Output area for download links and the chart-saving controls
save_data = widgets.Output()

# Combined results of the most recent search; assigned by get_data()
df = None
def get_results(params):
    '''
    Send a search to the Trove API and hand back the decoded JSON reply.

    Parameters:
        params - dictionary of query-string parameters for the request

    Returns:
        The response body parsed from JSON

    An error status from the API raises an exception via raise_for_status().
    '''
    api_url = 'https://api.trove.nla.gov.au/v2/result'
    reply = s.get(api_url, params=params, timeout=30)
    # Stop here on an error status rather than trying to parse the reply
    reply.raise_for_status()
    return reply.json()
def get_facets(data, year_range=None):
    '''
    Loop through the year facet in a Trove API response, saving years and counts.

    Parameters:
        data - JSON formatted response data from Trove API
        year_range - optional (first_year, last_year) pair, both inclusive.
            Defaults to the current value of the date_range slider.

    Returns:
        A list of dictionaries containing: 'year', 'total_results', sorted by year.
        The list is empty if the response doesn't contain the expected facet data.
    '''
    if year_range is None:
        year_range = date_range.value
    first_year, last_year = year_range[0], year_range[1]
    facets = []
    try:
        for term in data['response']['zone'][0]['facets']['facet']['term']:
            year = int(term['display'])
            if first_year <= year <= last_year:
                facets.append({'year': year, 'total_results': int(term['count'])})
    except (KeyError, IndexError, TypeError):
        # The response doesn't have the expected structure (for example the
        # zone or facet is missing) -- keep whatever was collected so far
        pass
    facets.sort(key=itemgetter('year'))
    return facets
def combine_totals(query_data, total_data):
    '''
    Take facets data from the query search and a blank search (ie everything) for a decade and combine them.

    Rows are matched on year rather than on list position, because the query
    search may have no entry for years that do appear in the blank search.

    Parameters:
        query_data - JSON response data from a query search
        total_data - JSON response data from a blank search

    Returns:
        A list of dictionaries containing: 'year', 'total_results', 'total_articles'.
        A year from the query search with no matching year in the blank search is
        left out, as there is no total to compare it with.
    '''
    query_rows = get_facets(query_data)
    totals_by_year = {row['year']: row['total_results'] for row in get_facets(total_data)}
    combined_data = []
    for query_row in query_rows:
        total_articles = totals_by_year.get(query_row['year'])
        if total_articles is None:
            continue
        query_row['total_articles'] = total_articles
        combined_data.append(query_row)
    return combined_data
def year_totals(params):
    '''
    Generate a dataset for a search query, requesting one decade at a time.

    For each decade covered by the date_range slider two requests are made: one
    with the supplied query, and one with a blank query (a single space) that
    keeps every other parameter the same, to get the total number of articles.

    Parameters:
        params - dictionary of Trove API parameters, with the search query in
            params['q']. The dictionary is not modified.

    Returns:
        A list of dictionaries containing: 'year', 'total_results',
        'total_articles', sorted by year.
    '''
    first_year, last_year = date_range.value
    decades = range(first_year // 10, last_year // 10 + 1)
    # Work on copies so the caller's dictionary keeps its query and gains no extra keys
    query_params = params.copy()
    total_params = params.copy()
    total_params['q'] = ' '
    totals = []
    # Send the progress bar to the results output area
    with results:
        for decade in tqdm(decades):
            query_params['l-decade'] = decade
            total_params['l-decade'] = decade
            query_data = get_results(query_params)
            total_data = get_results(total_params)
            totals.extend(combine_totals(query_data, total_data))
    totals.sort(key=itemgetter('year'))
    return totals
# Slider for choosing the first and last year to include; its value is read
# when results are harvested and filtered
date_range = widgets.IntRangeSlider(
    value=[1803, 1954],
    min=1803,
    max=2018,
    step=1,
    description='Date range:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='0<4d',
    layout=widgets.Layout(width='50%')
)
display(date_range)
You can just add a single search query to see how the number of matching articles varies over time. But you can also compare frequencies between queries, states, and newspapers:
- cat vs dog
- swimmers in NSW, Victoria, and Queensland
- protectionism in The Age vs The Argus
queries = []
# Output area listing the queries that have been added
out = widgets.Output()

@out.capture()
def add_query(b):
    '''
    Add the contents of the query box to the list of queries, then empty the box.

    Nothing happens if the box is empty, so an accidental click doesn't
    register a blank search.

    Parameters:
        b - the button that was clicked (unused)
    '''
    if not query.value:
        return
    queries.append(query.value)
    query.value = ''
    # Printed text is captured by the 'out' output area
    print('Query {}: {}'.format(len(queries), queries[-1]))
# Text box for typing a query to add to the comparison
query = widgets.Text(
    placeholder='Enter your query then click the button to add',
    disabled=False,
)

# Button that adds the contents of the text box to the list of queries
query_button = widgets.Button(
    description='Add query',
    disabled=False,
    tooltip='Click to add query',
    icon=''
)
query_button.on_click(add_query)

# Help text displayed with the query box
query_tip = widgets.HTML(value='A query can be anything you\'d enter in the Trove simple search box — from a single keyword to a complex boolean expression. Add as many queries as you want.')
def get_titles(b):
    '''
    Load the list of newspaper titles from the Trove API into the titles selector.

    Each option is labelled with the newspaper's title and carries a dictionary
    containing its 'id' and 'title'. Options are sorted by title.

    Parameters:
        b - the button that was clicked (unused)

    An error status from the API raises an exception via raise_for_status().
    '''
    title_params = {
        'encoding': 'json',
        'key': api_key.value
    }
    # Use the shared session (retries on server errors) over HTTPS, so the
    # API key isn't sent unencrypted and the request can't hang indefinitely
    response = s.get('https://api.trove.nla.gov.au/v2/newspaper/titles', params=title_params, timeout=30)
    response.raise_for_status()
    data = response.json()
    title_list = [
        (t['title'], {'id': t['id'], 'title': t['title']})
        for t in data['response']['records']['newspaper']
    ]
    title_list.sort(key=itemgetter(0))
    titles.options = OrderedDict(title_list)
# Query box for the newspaper comparison
title_query = widgets.Text(
    placeholder='Enter your query',
    description='Search for:',
    disabled=False,
)

# Newspaper selector; starts with a placeholder entry until get_titles()
# replaces the options
titles = widgets.SelectMultiple(
    options=['Click on button to load titles'],
    rows=10,
    description='In:',
    disabled=False,
    layout=widgets.Layout(width='50%')
)

# Button that loads the list of newspaper titles
titles_button = widgets.Button(
    description='Load titles',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click to load titles',
    icon=''
)
titles_button.on_click(get_titles)

# Help text displayed with the newspaper selector
titles_tip = widgets.HTML(value='Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple newspapers to compare.')

# Query box for the state comparison
state_query = widgets.Text(
    placeholder='Enter your query',
    description='Search for:',
    disabled=False,
)

# State selector; each selected value is sent as the 'l-state' parameter
states = widgets.SelectMultiple(
    options=[
        'ACT',
        'New South Wales',
        'Queensland',
        'South Australia',
        'Northern Territory',
        'Tasmania',
        'Victoria',
        'Western Australia',
        'National',
        'International'
    ],
    rows=10,
    description='In:',
    disabled=False,
    layout=widgets.Layout(width='50%')
)

# Help text displayed with the state selector
states_tip = widgets.HTML(value='Use <b>Shift</b> or <b>Cmd/Ctrl</b> to select multiple states to compare.')
def plot_raw_results(width=700, height=400):
    '''
    Build a line chart of the number of matching articles per year, with one
    coloured line for each value in the 'query' column of df.

    Parameters:
        width - chart width
        height - chart height

    Returns:
        An Altair chart
    '''
    lines = alt.Chart(df).mark_line(point=True)
    year_axis = alt.X('year:Q', axis=alt.Axis(format='c', title='Year'))
    count_axis = alt.Y('total_results:Q', axis=alt.Axis(format=',d', title='Number of articles'))
    series_colour = alt.Color('query', legend=alt.Legend(title=''))
    hover_details = [
        alt.Tooltip('query', title='Query:'),
        alt.Tooltip('year:Q', title='Year'),
        alt.Tooltip('total_results:Q', title='Articles', format=',')
    ]
    encoded = lines.encode(x=year_axis, y=count_axis, color=series_colour, tooltip=hover_details)
    return encoded.properties(width=width, height=height)
def plot_relative_results(width=700, height=400):
    '''
    Build a line chart showing, for each year, the matching articles as a
    proportion of all articles, with one coloured line for each value in the
    'query' column of df.

    Parameters:
        width - chart width
        height - chart height

    Returns:
        An Altair chart
    '''
    lines = alt.Chart(df).mark_line(point=True)
    year_axis = alt.X('year:Q', axis=alt.Axis(format='c', title='Year'))
    share_axis = alt.Y('PercentOfTotal:Q', axis=alt.Axis(format='.2%', title='Percentage of total articles'))
    series_colour = alt.Color('query', legend=alt.Legend(title=''))
    hover_details = [
        alt.Tooltip('query', title='Query:'),
        alt.Tooltip('year:Q', title='Year'),
        alt.Tooltip('PercentOfTotal:Q', title='Articles', format='.2%')
    ]
    encoded = lines.encode(x=year_axis, y=share_axis, color=series_colour, tooltip=hover_details)
    sized = encoded.properties(width=width, height=height)
    # PercentOfTotal is calculated by the chart itself from the two count columns
    return sized.transform_calculate(
        PercentOfTotal="datum.total_results / datum.total_articles"
    )
def clear_all(b):
    '''
    Reset the form: forget the added queries, deselect states and newspapers,
    empty the query boxes, and wipe every output area.

    Parameters:
        b - the button that was clicked (unused)
    '''
    queries.clear()
    for selector in (states, titles):
        selector.value = []
    for text_box in (state_query, title_query):
        text_box.value = ''
    for output_area in (out, results, save_data):
        output_area.clear_output()
def _selected_searches():
    '''
    Describe the searches requested on the currently selected tab.

    Returns:
        A list of (label, message, search_params) tuples -- the label used for
        the series in the chart, the progress message to display, and the
        parameters to send to the Trove API.
    '''
    base_params = params.copy()
    base_params['key'] = api_key.value
    searches = []
    if tab.selected_index == 0:
        # Compare queries: one series per query
        for query_text in queries:
            search_params = dict(base_params, q=query_text)
            searches.append((query_text, 'Searching for {}...'.format(query_text), search_params))
    elif tab.selected_index == 1:
        # Compare states: one query, one series per selected state
        for state in states.value:
            search_params = dict(base_params, q=state_query.value)
            search_params['l-state'] = state
            searches.append((state, 'Searching in {}...'.format(state), search_params))
    elif tab.selected_index == 2:
        # Compare newspapers: one query, one series per selected title
        for title in titles.value:
            search_params = dict(base_params, q=title_query.value)
            search_params['l-title'] = title['id']
            searches.append((title['title'], 'Searching in {}...'.format(title['title']), search_params))
    return searches

def get_data(b):
    '''
    Run the searches set up on the current tab, then display a chart of the
    results along with links for downloading the data and saving the chart.

    The combined results are stored in the global df, which the plotting and
    saving functions read.

    Parameters:
        b - the button that was clicked (unused)
    '''
    global df
    results.clear_output()
    save_data.clear_output()
    traces = []
    try:
        for label, message, search_params in _selected_searches():
            with results:
                display(HTML(message))
            df_totals = pd.DataFrame(year_totals(search_params))
            df_totals['query'] = label
            traces.append(df_totals)
    except requests.RequestException as err:
        # Only the error type is shown: the full message can include the
        # request url, which contains the API key
        with results:
            display(HTML('Unable to retrieve data from Trove ({}). Please check your API key and connection, then try again.'.format(type(err).__name__)))
        return
    combined = pd.concat(traces, ignore_index=True) if traces else None
    if combined is None or combined.empty:
        # Either nothing was searched for, or no search matched any articles
        with results:
            display(HTML('No results!'))
        return
    df = combined
    results.clear_output()
    chart = plot_raw_results()
    chart_type.value = 'raw'
    csv_file = save_as_csv()
    with results:
        display(chart_type)
        display(chart)
    with save_data:
        display(HTML(f'Download data: <a href="{csv_file}" download>{csv_file}</a>'))
        display(widgets.HBox([save_chart_button, save_chart_width, save_chart_height]))
def save_chart(b):
    '''
    Save the chart currently selected in the chart_type dropdown as an HTML
    file in the data directory, at the requested size, and display a link to it.

    Parameters:
        b - the button that was clicked (unused)
    '''
    if chart_type.value == 'proportion':
        make_chart = plot_relative_results
    else:
        make_chart = plot_raw_results
    chart = make_chart(save_chart_width.value, save_chart_height.value)
    # The current time keeps file names from clashing between saves
    timestamp = int(time.time())
    filename = 'data/querypic-{}.html'.format(timestamp)
    chart.save(filename)
    with save_data:
        display(HTML('View HTML version:'), FileLink(filename))
def save_as_csv():
    '''
    Write the current results (the global df) to a CSV file in the data directory.

    Returns:
        The path of the file that was written
    '''
    # The current time keeps file names from clashing between saves
    timestamp = int(time.time())
    filename = 'data/querypic-{}.csv'.format(timestamp)
    df.to_csv(filename, index=False)
    return filename
def change_chart(o):
    '''
    Redraw the chart in the results area using the type selected in the
    chart_type dropdown.

    Parameters:
        o - the change notification from the dropdown (unused)
    '''
    # wait=True holds off clearing until the replacement output arrives
    results.clear_output(wait=True)
    make_chart = plot_relative_results if chart_type.value == 'proportion' else plot_raw_results
    chart = make_chart()
    with results:
        display(chart_type)
        display(chart)
# Dropdown for switching between raw counts and proportions of total articles
chart_type = widgets.Dropdown(
    options=[('Raw number of results', 'raw'), ('Proportion of total articles', 'proportion')],
    value='raw'
)
# Only react to changes in the selected value. Without 'names' the handler is
# called for changes to any of the widget's traits, redrawing the chart more
# often than necessary.
chart_type.observe(change_chart, names='value')
# Button that resets the queries, selections and output areas
clear_all_button = widgets.Button(
    description='Clear all',
    disabled=False,
    button_style='',
    tooltip='Clear current queries',
    icon=''
)
clear_all_button.on_click(clear_all)

# Button that runs the searches and draws the chart
get_data_button = widgets.Button(
    description='Create chart',
    disabled=False,
    button_style='primary',
    tooltip='Create chart',
    icon=''
)
get_data_button.on_click(get_data)

# Button that saves the current chart as an HTML file
save_chart_button = widgets.Button(
    description='Save chart',
    disabled=False,
    button_style='primary',
    tooltip='Save chart as HTML',
    icon=''
)
save_chart_button.on_click(save_chart)

# Size of the saved chart
save_chart_width = widgets.BoundedIntText(
    value=700,
    min=700,
    max=2000,
    step=100,
    description='Width',
    disabled=False
)
save_chart_height = widgets.BoundedIntText(
    value=400,
    min=400,
    max=1500,
    step=100,
    description='Height',
    disabled=False
)

# One tab for each kind of comparison
tab1 = widgets.VBox([widgets.HBox([query, query_button]), query_tip, out])
tab2 = widgets.VBox([state_query, states, states_tip])
tab3 = widgets.VBox([title_query, widgets.HBox([titles, titles_button]), titles_tip])
tab = widgets.Tab(children=[tab1, tab2, tab3])
for position, label in enumerate(['Compare queries', 'Compare states', 'Compare newspapers']):
    tab.set_title(position, label)

# Put it all together: tabs, action buttons, then the two output areas
button_row = widgets.HBox([get_data_button, clear_all_button])
display(widgets.VBox([tab, button_row, results, save_data]))
Created by Tim Sherratt for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.