QueryPic helps you explore your search results in Papers Past by showing you how they change over time – aggregating the number of articles matching your query by year.
%%capture
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests_cache
import pandas as pd
import altair as alt
import arrow
import os
from IPython.display import display, HTML
import ipywidgets as widgets
# Make sure data directory exists (downloads are written under data/)
os.makedirs('data', exist_ok=True)
# Create a session that will automatically retry on server errors.
# Responses are also cached ('querypic' cache) for an hour, so repeated
# identical API requests don't hit the network again.
s = requests_cache.CachedSession('querypic', expire_after=60*60)
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
# Custom Altair theme so that chart hrefs open in a new browser tab
def blank_href():
    '''
    Theme function for alt.themes.register(): returns chart config whose
    only effect is to set the Vega-Embed loader target to '_blank'.
    '''
    embed_options = {'loader': {'target': '_blank'}}
    return {'usermeta': {'embedOptions': embed_options}}
# register the custom theme under a chosen name
alt.themes.register('blank_href', blank_href)
# enable the newly registered theme so all charts created below use it
alt.themes.enable('blank_href')
# Accumulated per-query dataframes; one entry per saved query
dfs = []
# Metadata for each saved query (id, url, position in the text chart)
queries = []
# Hard-coded list of Papers Past newspaper titles used to populate the
# newspaper dropdown ('All' means no collection filter)
titles = [
    'All',
    'Akaroa Mail and Banks Peninsula Advertiser',
    'Albertland Gazette',
    'Ashburton Guardian',
    'Auckland Star',
    'Bruce Herald',
    'Bush Advocate',
    'Clutha Leader',
    'Colonist',
    'Daily Southern Cross',
    'Daily Telegraph',
    'Ellesmere Guardian',
    'Evening Post',
    'Fair Play',
    'Feilding Star',
    'Grey River Argus',
    'Hawera & Normanby Star',
    "Hawke's Bay Herald",
    "Hawke's Bay Weekly Times",
    'Hutt News',
    'Inangahua Times',
    'Kai Tiaki',
    'Kaipara and Waitemata Echo',
    'Lyttelton Times',
    'Manawatu Herald',
    'Manawatu Standard',
    'Manawatu Times',
    'Marlborough Express',
    'Mataura Ensign',
    'NZ Truth',
    'Nelson Evening Mail',
    'New Zealand Advertiser and Bay of Islands Gazette',
    'New Zealand Colonist and Port Nicholson Advertiser',
    'New Zealand Free Lance',
    'New Zealand Gazette and Wellington Spectator',
    'New Zealand Illustrated Magazine',
    "New Zealand Spectator and Cook's Strait Guardian",
    'New Zealand Tablet',
    'New Zealander',
    'North Otago Times',
    'Northern Advocate',
    'Observer',
    'Ohinemuri Gazette',
    'Otago Daily Times',
    'Otago Witness',
    'Otautau Standard and Wallace County Chronicle',
    'Oxford Observer',
    'Papers Past',
    'Poverty Bay Herald',
    'Progress',
    'Rodney and Otamatea Times, Waitemata and Kaipara Gazette',
    'Southland Times',
    'Star',
    'Taranaki Daily News',
    'Taranaki Herald',
    'Te Aroha News',
    'Thames Star',
    'Timaru Herald',
    'Tuapeka Times',
    'Waiapu Church Gazette',
    'Waiapu Church Times',
    'Waikato Times',
    'Waimate Daily Advertiser',
    'Wairarapa Daily Times',
    'Wanganui Chronicle',
    'Wanganui Herald',
    'Wellington Independent',
    'West Coast Times'
]
# Default bounds for the date-range slider (hard-coded; can instead be
# fetched from the API via get_titles_and_years -- see commented code below)
start_year = 1839
end_year = 1945
def get_titles_and_years():
    '''
    Ask the DigitalNZ API for the collection and year facets of Papers
    Past, returning a sorted list of newspaper titles (with 'All'
    prepended) and a sorted list of years.

    Reads the api_key widget for authentication and uses the shared
    cached/retrying session `s`.
    '''
    params = {
        'api_key': api_key.value,
        'text': '',
        'and[display_collection][]': 'Papers Past',
        'facets': 'collection,year',
        'facets_per_page': 350
    }
    data = s.get('http://api.digitalnz.org/v3/records.json', params=params).json()
    facets = data['search']['facets']
    # Facet values arrive as dicts keyed by title/year; sorting a dict
    # yields its keys in order
    titles = ['All'] + sorted(facets['collection'])
    years = sorted(facets['year'])
    return titles, years
def get_data(query=''):
    '''
    Fetch per-year facet counts from the DigitalNZ API for the given
    search query (an empty query returns totals for all articles).

    Respects the newspaper dropdown: unless 'All' is selected, results
    are restricted to the chosen collection. Returns the parsed JSON
    response.
    '''
    params = {
        'api_key': api_key.value,
        'text': query,
        'and[display_collection][]': 'Papers Past',
        'facets': 'year',
        'facets_per_page': 350
    }
    # Restrict to a single newspaper unless 'All' is selected
    paper = select_newspaper.value
    if paper != 'All':
        params['and[collection][]'] = paper
    return s.get('http://api.digitalnz.org/v3/records.json', params=params).json()
def fill_year_gaps(df, min_year=None, max_year=None):
    '''
    Reindex the dataframe so every year between min_year and max_year is
    present, inserting NaN rows for years with no results.

    Parameters:
    df -- dataframe with an integer 'year' column
    min_year / max_year -- optional bounds; any bound not supplied is
        taken from the data itself

    Returns a (df, min_year, max_year) tuple. Note: the incoming df is
    mutated (set_index with inplace=True), matching the original usage.
    '''
    df.set_index('year', inplace=True)
    # Fill in missing bounds independently. Previously both bounds were
    # only derived when min_year was falsy, so a supplied min_year with
    # no max_year crashed with TypeError on range(min_year, None + 1).
    if min_year is None:
        min_year = int(df.index.min())
    if max_year is None:
        max_year = int(df.index.max())
    # range() is already ordered -- no sort needed
    idx = list(range(min_year, max_year + 1))
    df = df.reindex(idx).reset_index()
    return df, min_year, max_year
def create_year_df(data, col_name='total_articles', min_year=None, max_year=None):
    '''
    Convert the year facet data in a DigitalNZ API response into a
    dataframe with one row per year (gaps filled with NaN).

    Parameters:
    data -- parsed JSON response from the DigitalNZ records API
    col_name -- name for the counts column
    min_year / max_year -- optional bounds forwarded to fill_year_gaps()

    Returns a (df, min_year, max_year) tuple.
    '''
    years = data['search']['facets']['year']
    df = pd.Series(years).to_frame().reset_index()
    df.columns = ['year', col_name]
    # Facet keys are strings; make them proper integers
    df['year'] = df['year'].astype('int64')
    # Bug fix: min_year/max_year were accepted but never forwarded to
    # fill_year_gaps(), so caller-supplied bounds were silently ignored.
    df, min_year, max_year = fill_year_gaps(df, min_year, max_year)
    return df, min_year, max_year
def prepare_data(b):
    '''
    Click handler for the 'Create chart' button.

    Harvests per-year totals and per-year query results from the
    DigitalNZ API, merges them, filters to the selected date range,
    appends the result to the global dfs/queries lists, and redraws
    the chart. `b` is the button instance supplied by ipywidgets.
    '''
    global dfs, queries
    # Label identifying this query in the chart legend
    query_id = f'Query {len(queries) + 1}'
    # Human-followable url reproducing the query on the DigitalNZ site
    if select_newspaper.value == 'All':
        query_url = f'https://digitalnz.org/records?text={query.value}&i[primary_collection]=Papers%20Past'
    else:
        query_url = f'https://digitalnz.org/records?text={query.value}&i[primary_collection]=Papers%20Past&i[collection]={select_newspaper.value}'
    queries.append({'x': 0, 'y': len(queries), 'id': query_id, 'url': query_url})
    # Baseline: total number of articles per year (empty search)
    totals = get_data()
    totals_df, min_year, max_year = create_year_df(totals)
    # Number of articles per year matching the query
    years = get_data(query.value)
    try:
        years_df, _, _ = create_year_df(years, 'total_results')
    except ValueError:
        # create_year_df raises when the year facet data is empty
        with results:
            display('No results')
    else:
        # Left merge on the totals keeps every year, with NaN where the
        # query had no matches
        df = pd.merge(totals_df, years_df, how='left', on='year')
        df['query_id'] = query_id
        # Link each data point to a year-filtered version of the search
        df['url'] = df['year'].apply(lambda x: f'{query_url}&i[year]={x}')
        # Keep only the years inside the selected date range
        dfs.append(df.loc[(df['year'] >= int(date_range.value[0])) & (df['year'] <= int(date_range.value[1]))])
        show_results()
def show_results(view='raw'):
    '''
    Display the chart and the save data options.

    Resets the chart-type dropdown to 'raw', renders the chart into the
    results output, and offers the harvested data as a CSV download in
    the save_data output.
    '''
    results.clear_output(wait=True)
    save_data.clear_output(wait=True)
    chart = make_chart(view=view)
    # Detach the observer while resetting the dropdown -- otherwise
    # setting .value would fire change_chart and trigger a second redraw
    chart_type.unobserve(change_chart, 'value')
    chart_type.value = 'raw'
    chart_type.observe(change_chart, 'value')
    csv_file = save_as_csv()
    with results:
        display(chart_type)
        display(chart)
    with save_data:
        # NOTE(review): display() ignores the layout kwarg; presumably it
        # was meant for the HBox constructor -- confirm before moving it
        display(widgets.HBox([save_chart_button, save_chart_width, save_chart_height]), layout=widgets.Layout(margin='50px 0 50px 0'))
        display(HTML(f'Download data: <a href="{csv_file}" download>{csv_file}</a>'))
def list_queries():
    '''
    Build a text-only Altair chart listing the saved query urls, one row
    per query, rendered as clickable blue links.
    '''
    x_enc = alt.X('x:Q', title=None, axis=None, scale=alt.Scale(range=[0, 1]))
    y_axis = alt.Axis(labelFontWeight='bold', domain=False, grid=False)
    y_enc = alt.Y('id:O', title=None, axis=y_axis)
    text_marks = alt.Chart(pd.DataFrame(queries)).mark_text(
        align='left', dx=2, dy=1, baseline='middle'
    )
    return text_marks.encode(
        x=x_enc,
        y=y_enc,
        text='url:N',
        href='url',
        color=alt.value('blue')
    )
def make_chart(view, width=800, height=400):
    '''
    Build the combined Altair chart: a line chart of results over time
    stacked above a text chart listing the queries.

    Parameters:
    view -- 'raw' plots the number of results per year; any other value
        plots results as a percentage of all articles that year
    width / height -- dimensions of the line chart in pixels

    Returns the vconcat'ed Altair chart.
    '''
    df = pd.concat(dfs, ignore_index=True)
    if view == 'raw':
        y = alt.Y('total_results:Q', title='Number of results')
    else:
        y = alt.Y('ratio:Q', axis=alt.Axis(format='.2%'), title='Percentage of total articles')
    # Note: width/height are set via .properties() below; the previous
    # width=600 passed to alt.Chart() was always overridden and has been
    # removed.
    plot = alt.Chart(df).mark_line(point=True, interpolate='cardinal'
    ).transform_calculate(
        # Proportion of all articles in that year that match the query
        ratio='datum.total_results / datum.total_articles'
    ).encode(
        x = alt.X('year:O', title='Year'),
        y = y,
        color = alt.Color('query_id', legend=alt.Legend(title='Query')),
        tooltip = [
            alt.Tooltip('query_id:N', title='Query'),
            alt.Tooltip('year:O', title='Year'),
            alt.Tooltip('total_results', format=',', title='Number of results'),
            alt.Tooltip('ratio:Q', format='.2%', title='Percentage of articles')
        ],
        href='url:N'
    ).properties(
        height=height,
        width=width,
        title={
            'text': 'Papers Past Newspapers Search (via DigitalNZ)',
            'subtitle': f'Created by QueryPic: {arrow.now().format("D MMMM YYYY")}'
        }
    )
    # Create text chart listing queries
    query_list = list_queries()
    # Combine charts (dead commented-out bar-chart code removed)
    chart = alt.vconcat(plot, query_list).configure(padding=20
    ).configure_view(
        strokeWidth=0
    ).configure_title(
        fontSize=14
    )
    return chart
def change_chart(o):
    '''
    Observer for the chart_type dropdown: redraw the chart in the newly
    selected view. `o` is the change event supplied by ipywidgets.
    '''
    results.clear_output()
    view = 'relative' if chart_type.value == 'proportion' else 'raw'
    new_chart = make_chart(view)
    with results:
        display(chart_type)
        display(new_chart)
def clear_all(b):
    '''
    Reset the app: drop all stored queries and data, return the input
    widgets to their defaults, and blank both output areas.
    '''
    global dfs, queries
    dfs, queries = [], []
    query.value = ''
    select_newspaper.value = 'All'
    date_range.value = [start_year, end_year]
    for output_area in (results, save_data):
        output_area.clear_output()
def clear_last(b):
    '''
    Remove the most recent query from the chart.

    Drops the last entry from the global dfs/queries lists and redraws
    the chart if any queries remain.
    '''
    global dfs, queries
    results.clear_output()
    save_data.clear_output()
    # Guard the pops: previously clicking 'Remove last query' with no
    # queries saved raised IndexError from pop() on an empty list
    if dfs:
        dfs.pop()
    if queries:
        queries.pop()
    if dfs:
        show_results()
def save_chart(b):
    '''
    Save the chart as HTML for download.

    Renders the current view at the size requested via the width/height
    widgets, writes it to a timestamped file under data/, and displays
    a download link.
    '''
    width = save_chart_width.value
    height = save_chart_height.value
    if chart_type.value == 'proportion':
        chart = make_chart('relative', width, height)
    else:
        chart = make_chart('raw', width, height)
    filename = f'data/querypic-{arrow.now().format("YYYYMMDDHHmmss")}.html'
    chart.save(filename)
    with save_data:
        # Bug fix: the link previously contained literal placeholder text
        # instead of interpolating the saved filename
        display(HTML(f'Download HTML version: <a href="{filename}" download>{filename}</a>'))
def save_as_csv():
    '''
    Save harvested data as a CSV for download.

    Concatenates all the stored query dataframes and writes them to a
    timestamped file under data/, returning the file path.
    '''
    combined = pd.concat(dfs, ignore_index=True)
    csv_path = f'data/querypic-{arrow.now().format("YYYYMMDDHHmmss")}.csv'
    combined.to_csv(csv_path, index=False)
    return csv_path
# Alternative: derive the dropdown options and slider bounds from the API
# instead of the hard-coded lists above
#titles, years = get_titles_and_years()
#start_year = int(years[0])
#end_year = int(years[-1])

# --- Input widgets ---
# DigitalNZ API key (masked input)
api_key = widgets.Password(
    placeholder='Enter your DigitalNZ API key',
    description='API key:',
    disabled=False,
    value=''
)
# Search query text
query = widgets.Text(
    value='',
    placeholder='enter your search query',
    description='Keywords:',
    disabled=False
)
# Raw counts vs proportion-of-total view; redraws the chart on change
chart_type = widgets.Dropdown(
    options=[('Raw number of results', 'raw'), ('Proportion of total articles', 'proportion')],
    value='raw'
)
chart_type.observe(change_chart, 'value')
search_button = widgets.Button(
    description='Create chart',
    disabled=False,
    button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Create a chart',
    icon=''
)
# Limit searches to one newspaper (or 'All')
select_newspaper = widgets.Dropdown(
    description='Newspaper:',
    options=titles,
    value='All'
)
# Year range filter applied to harvested results
date_range = widgets.IntRangeSlider(
    value=[start_year, end_year],
    min=start_year,
    max=end_year,
    step=1,
    description='Date range:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='0<4d',  # zero-pad years to 4 digits
    layout=widgets.Layout(width='50%')
)
clear_last_button = widgets.Button(
    description='Remove last query',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Remove the last query',
    icon=''
)
clear_all_button = widgets.Button(
    description='Clear all queries',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Clear current queries',
    icon='',
)
save_chart_button = widgets.Button(
    description='Save chart as HTML',
    disabled=False,
    button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Save chart as HTML',
    icon=''
)
# Dimensions (pixels) used when saving the chart as HTML
save_chart_width = widgets.BoundedIntText(
    value=700,
    min=700,
    max=2000,
    step=100,
    description='Width',
    disabled=False
)
save_chart_height = widgets.BoundedIntText(
    value=400,
    min=400,
    max=1500,
    step=100,
    description='Height',
    disabled=False
)
# Output areas: chart display and save/download links
results = widgets.Output()
save_data = widgets.Output()

# --- Wire up button handlers ---
search_button.on_click(prepare_data)
clear_last_button.on_click(clear_last)
clear_all_button.on_click(clear_all)
save_chart_button.on_click(save_chart)

# --- Lay out and display the interface ---
display(api_key)
display(widgets.HBox([query, select_newspaper]))
display(date_range)
display(widgets.VBox([widgets.HBox([search_button, clear_last_button, clear_all_button], layout=widgets.Layout(margin='20px 0 20px 0')), results, save_data]))
Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.