QueryPic DigitalNZ

Visualise searches in Papers Past newspapers

QueryPic helps you explore your search results in Papers Past by showing you how they change over time – aggregating the number of articles matching your query by year.

  • Just enter keywords to search for, optionally limiting the results by newspaper or date
  • Combine multiple searches to compare changes in language, technology, or the impact of particular events.
  • Click on any point in a chart to view the results for that date in DigitalNZ.
  • Save your chart as an image or an HTML file.
In [ ]:
%%capture
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests_cache
import pandas as pd
import altair as alt
import arrow
import os
from IPython.display import display, HTML
import ipywidgets as widgets

# Make sure the data directory exists; harvested CSV/HTML files are saved here.
os.makedirs('data', exist_ok=True)

# Create a session that will automatically retry on server errors.
# Responses are cached on disk ('querypic' cache) for one hour so repeated
# queries don't hammer the DigitalNZ API.
s = requests_cache.CachedSession('querypic', expire_after=60*60)
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))

# CONFIG SO THAT ALTAIR HREFS OPEN IN A NEW TAB

def blank_href():
    """Altair theme whose embed options make chart hrefs open in a new tab."""
    embed_options = {'loader': {'target': '_blank'}}
    return {'usermeta': {'embedOptions': embed_options}}


# register the custom theme under a chosen name
alt.themes.register('blank_href', blank_href)

# enable the newly registered theme
alt.themes.enable('blank_href')
In [ ]:
dfs = []
queries = []
titles = [
    'All',
    'Akaroa Mail and Banks Peninsula Advertiser',
    'Albertland Gazette',
    'Ashburton Guardian',
    'Auckland Star',
    'Bruce Herald',
    'Bush Advocate',
    'Clutha Leader',
    'Colonist',
    'Daily Southern Cross',
    'Daily Telegraph',
    'Ellesmere Guardian',
    'Evening Post',
    'Fair Play',
    'Feilding Star',
    'Grey River Argus',
    'Hawera & Normanby Star',
    "Hawke's Bay Herald",
    "Hawke's Bay Weekly Times",
    'Hutt News',
    'Inangahua Times',
    'Kai Tiaki',
    'Kaipara and Waitemata Echo',
    'Lyttelton Times',
    'Manawatu Herald',
    'Manawatu Standard',
    'Manawatu Times',
    'Marlborough Express',
    'Mataura Ensign',
    'NZ Truth',
    'Nelson Evening Mail',
    'New Zealand Advertiser and Bay of Islands Gazette',
    'New Zealand Colonist and Port Nicholson Advertiser',
    'New Zealand Free Lance',
    'New Zealand Gazette and Wellington Spectator',
    'New Zealand Illustrated Magazine',
    "New Zealand Spectator and Cook's Strait Guardian",
    'New Zealand Tablet',
    'New Zealander',
    'North Otago Times',
    'Northern Advocate',
    'Observer',
    'Ohinemuri Gazette',
    'Otago Daily Times',
    'Otago Witness',
    'Otautau Standard and Wallace County Chronicle',
    'Oxford Observer',
    'Papers Past',
    'Poverty Bay Herald',
    'Progress',
    'Rodney and Otamatea Times, Waitemata and Kaipara Gazette',
    'Southland Times',
    'Star',
    'Taranaki Daily News',
    'Taranaki Herald',
    'Te Aroha News',
    'Thames Star',
    'Timaru Herald',
    'Tuapeka Times',
    'Waiapu Church Gazette',
    'Waiapu Church Times',
    'Waikato Times',
    'Waimate Daily Advertiser',
    'Wairarapa Daily Times',
    'Wanganui Chronicle',
    'Wanganui Herald',
    'Wellington Independent',
    'West Coast Times'
]

start_year = 1839
end_year = 1945
In [ ]:
def get_titles_and_years():
    '''
    Ask the DigitalNZ API for the collection and year facets of Papers Past.

    Returns a tuple (titles, years): the sorted newspaper titles with 'All'
    prepended, and the sorted list of years present in the collection.
    '''
    response = s.get(
        'http://api.digitalnz.org/v3/records.json',
        params={
            'api_key': api_key.value,
            'text': '',
            'and[display_collection][]': 'Papers Past',
            'facets': 'collection,year',
            'facets_per_page': 350,
        },
    )
    facets = response.json()['search']['facets']
    # Iterating a dict yields its keys, so sorted() gives sorted facet names.
    titles = ['All'] + sorted(facets['collection'])
    years = sorted(facets['year'])
    return titles, years

def get_data(query=''):
    '''
    Fetch the per-year facet counts for `query` from the DigitalNZ API,
    restricted to Papers Past and (optionally) the selected newspaper.

    Returns the parsed JSON response.
    '''
    params = {
        'api_key': api_key.value,
        'text': query,
        'and[display_collection][]': 'Papers Past',
        'facets': 'year',
        'facets_per_page': 350,
    }
    newspaper = select_newspaper.value
    if newspaper != 'All':
        # Limit the query to a single newspaper title.
        params['and[collection][]'] = newspaper
    response = s.get('http://api.digitalnz.org/v3/records.json', params=params)
    return response.json()

def fill_year_gaps(df, min_year=None, max_year=None):
    '''
    Reindex `df` on its 'year' column so every year between min_year and
    max_year has a row, inserting NaN rows for years with no data.

    Parameters:
        df: dataframe with a 'year' column (modified in place by set_index)
        min_year / max_year: optional bounds; any bound left as None is
            derived from the data.

    Returns (df, min_year, max_year) with the year index restored to a column.
    '''
    df.set_index('year', inplace=True)
    # BUG FIX: derive each missing bound independently and compare with
    # `is None`. Previously max_year was only computed when min_year was
    # falsy, so supplying min_year alone crashed in range(); a legitimate
    # bound of 0 was also treated as missing.
    if min_year is None:
        min_year = int(df.index.min())
    if max_year is None:
        max_year = int(df.index.max())
    # range() is already ordered — no need to sort.
    idx = list(range(min_year, max_year + 1))
    df = df.reindex(idx).reset_index()
    return df, min_year, max_year

def create_year_df(data, col_name='total_articles', min_year=None, max_year=None):
    '''
    Turn the 'year' facet of a DigitalNZ records response into a dataframe
    with one row per year and gaps filled with NaN.

    Parameters:
        data: parsed JSON response from the records endpoint
        col_name: name for the counts column
        min_year / max_year: optional year bounds passed to fill_year_gaps

    Returns (df, min_year, max_year).
    Raises ValueError (from the column assignment) when the facet is empty.
    '''
    year_facets = data['search']['facets']['year']
    df = pd.Series(year_facets).to_frame().reset_index()
    df.columns = ['year', col_name]
    df['year'] = df['year'].astype('int64')
    # BUG FIX: forward the caller-supplied bounds — they were previously
    # accepted but silently ignored.
    df, min_year, max_year = fill_year_gaps(df, min_year=min_year, max_year=max_year)
    return df, min_year, max_year

def prepare_data(b):
    '''
    Button handler: run the current query, merge its yearly counts with the
    totals for all articles, and add the result to the chart.
    '''
    global dfs, queries
    query_id = f'Query {len(queries) + 1}'
    # Build the link back to the equivalent search on digitalnz.org.
    query_url = f'https://digitalnz.org/records?text={query.value}&i[primary_collection]=Papers%20Past'
    if select_newspaper.value != 'All':
        query_url += f'&i[collection]={select_newspaper.value}'
    queries.append({'x': 0, 'y': len(queries), 'id': query_id, 'url': query_url})
    totals_df, min_year, max_year = create_year_df(get_data())
    year_data = get_data(query.value)
    try:
        years_df, _, _ = create_year_df(year_data, 'total_results')
    except ValueError:
        # An empty year facet means the search matched nothing.
        with results:
            display('No results')
    else:
        merged = pd.merge(totals_df, years_df, how='left', on='year')
        merged['query_id'] = query_id
        merged['url'] = merged['year'].apply(lambda y: f'{query_url}&i[year]={y}')
        start, end = int(date_range.value[0]), int(date_range.value[1])
        dfs.append(merged.loc[(merged['year'] >= start) & (merged['year'] <= end)])
        show_results()
    
def show_results(view='raw'):
    '''
    Render the chart plus the save/download controls into the output widgets.
    '''
    results.clear_output(wait=True)
    save_data.clear_output(wait=True)
    chart = make_chart(view=view)
    # Reset the view dropdown to 'raw' without triggering its change handler.
    chart_type.unobserve(change_chart, 'value')
    chart_type.value = 'raw'
    chart_type.observe(change_chart, 'value')
    csv_path = save_as_csv()
    with results:
        display(chart_type)
        display(chart)
    with save_data:
        save_controls = widgets.HBox([save_chart_button, save_chart_width, save_chart_height])
        display(save_controls, layout=widgets.Layout(margin='50px 0 50px 0'))
        display(HTML(f'Download data: <a href="{csv_path}" download>{csv_path}</a>'))
        
def list_queries():
    '''
    Creates a text-based chart that lists the saved queries.
    '''
    queries_df = pd.DataFrame(queries)
    base = alt.Chart(queries_df).mark_text(align='left', dx=2, dy=1, baseline='middle')
    chart = base.encode(
        x=alt.X('x:Q', title=None, axis=None, scale=alt.Scale(range=[0, 1])),
        y=alt.Y('id:O', title=None, axis=alt.Axis(labelFontWeight='bold', domain=False, grid=False)),
        text='url:N',
        href='url',
        color=alt.value('blue')
    )
    return chart
    
def make_chart(view, width=800, height=400):
    '''
    Build the combined Altair visualisation: a per-year line chart (raw
    counts when view == 'raw', otherwise the proportion of all articles)
    stacked above a text listing of the saved queries.
    '''
    combined = pd.concat(dfs, ignore_index=True)
    if view == 'raw':
        y_enc = alt.Y('total_results:Q', title='Number of results')
    else:
        y_enc = alt.Y('ratio:Q', axis=alt.Axis(format='.2%'), title='Percentage of total articles')
    plot = alt.Chart(combined, width=600).mark_line(
        point=True, interpolate='cardinal'
    ).transform_calculate(
        # Proportion of all articles that year that match the query.
        ratio='datum.total_results / datum.total_articles'
    ).encode(
        x=alt.X('year:O', title='Year'),
        y=y_enc,
        color=alt.Color('query_id', legend=alt.Legend(title='Query')),
        tooltip=[
            alt.Tooltip('query_id:N', title='Query'),
            alt.Tooltip('year:O', title='Year'),
            alt.Tooltip('total_results', format=',', title='Number of results'),
            alt.Tooltip('ratio:Q', format='.2%', title='Percentage of articles')
        ],
        href='url:N'
    ).properties(
        height=height,
        width=width,  # overrides the width set on the Chart constructor
        title={
            'text': 'Papers Past Newspapers Search (via DigitalNZ)',
            'subtitle': f'Created by QueryPic: {arrow.now().format("D MMMM YYYY")}'
        }
    )
    # Stack the plot above the text chart listing the queries.
    chart = alt.vconcat(plot, list_queries()).configure(
        padding=20
    ).configure_view(
        strokeWidth=0
    ).configure_title(
        fontSize=14
    )
    return chart
        
def change_chart(o):
    '''
    Redraw the chart when the user switches between the raw-count and
    proportional views.
    '''
    results.clear_output()
    view = 'relative' if chart_type.value == 'proportion' else 'raw'
    chart = make_chart(view)
    with results:
        display(chart_type)
        display(chart)
        
def clear_all(b):
    '''
    Reset the form and discard every saved query and its harvested data.
    '''
    global dfs, queries
    dfs, queries = [], []
    query.value = ''
    select_newspaper.value = 'All'
    date_range.value = [start_year, end_year]
    results.clear_output()
    save_data.clear_output()
    
def clear_last(b):
    '''
    Remove the most recent query from the chart.

    Safe to click when nothing has been charted: the pops are guarded.
    Note that `queries` and `dfs` can differ in length, because a query
    with no results is recorded in `queries` but never added to `dfs`.
    '''
    global dfs, queries
    results.clear_output()
    save_data.clear_output()
    # BUG FIX: the pops were previously unguarded, so clicking the button
    # with no saved queries (or after a 'No results' search left the two
    # lists out of sync) raised IndexError.
    if queries:
        queries.pop()
    if dfs:
        dfs.pop()
    if dfs:
        show_results()
        
def save_chart(b):
    '''
    Save the current chart as a standalone HTML file in the data directory
    and display a download link for it.
    '''
    width = save_chart_width.value
    height = save_chart_height.value
    if chart_type.value == 'proportion':
        chart = make_chart('relative', width, height)
    else:
        chart = make_chart('raw', width, height)
    filename = f'data/querypic-{arrow.now().format("YYYYMMDDHHmmss")}.html'
    chart.save(filename)
    with save_data:
        # BUG FIX: the link previously contained literal placeholder text
        # (and an unquoted href) instead of the saved file's path.
        display(HTML(f'Download HTML version: <a href="{filename}" download>{filename}</a>'))

def save_as_csv():
    '''
    Write the harvested data to a timestamped CSV in the data directory
    and return its path for the download link.
    '''
    combined = pd.concat(dfs, ignore_index=True)
    csv_path = f'data/querypic-{arrow.now().format("YYYYMMDDHHmmss")}.csv'
    combined.to_csv(csv_path, index=False)
    return csv_path
In [ ]:
# Alternative: fetch titles and year bounds live from the API instead of
# using the hard-coded lists above.
#titles, years = get_titles_and_years()
#start_year = int(years[0])
#end_year = int(years[-1])

# Password widget so the API key isn't displayed on screen.
api_key = widgets.Password(
    placeholder='Enter your DigitalNZ API key',
    description='API key:',
    disabled=False,
    value=''
)

# Search keywords entered by the user.
query = widgets.Text(
    value='',
    placeholder='enter your search query',
    description='Keywords:',
    disabled=False
)

# Switch between raw counts and proportion-of-total views.
chart_type = widgets.Dropdown(
    options=[('Raw number of results', 'raw'), ('Proportion of total articles', 'proportion')],
    value='raw'
)

chart_type.observe(change_chart, 'value')

search_button = widgets.Button(
        description='Create chart',
        disabled=False,
        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Create a chart',
        icon=''
    )

# Newspaper filter, populated from the `titles` list defined above.
select_newspaper = widgets.Dropdown(
    description='Newspaper:',
    options=titles,
    value='All'
)

# Year range used to trim the harvested data before charting.
date_range = widgets.IntRangeSlider(
    value=[start_year, end_year],
    min=start_year,
    max=end_year,
    step=1,
    description='Date range:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='0<4d',
    layout=widgets.Layout(width='50%')
)

clear_last_button = widgets.Button(
        description='Remove last query',
        disabled=False,
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Remove the last query',
        icon=''
    )
    
clear_all_button = widgets.Button(
        description='Clear all queries',
        disabled=False,
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Clear current queries',
        icon='',
        
    )

save_chart_button = widgets.Button(
        description='Save chart as HTML',
        disabled=False,
        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Save chart as HTML',
        icon=''
    )

# Dimensions (pixels) for the saved HTML chart.
save_chart_width = widgets.BoundedIntText(
    value=700,
    min=700,
    max=2000,
    step=100,
    description='Width',
    disabled=False
)

save_chart_height = widgets.BoundedIntText(
    value=400,
    min=400,
    max=1500,
    step=100,
    description='Height',
    disabled=False
)

# Output areas: charts go into `results`, download links into `save_data`.
results = widgets.Output()
save_data = widgets.Output()

# Wire the buttons to their handlers.
search_button.on_click(prepare_data)
clear_last_button.on_click(clear_last)
clear_all_button.on_click(clear_all)
save_chart_button.on_click(save_chart)

# Lay out the interface.
display(api_key)
display(widgets.HBox([query, select_newspaper]))
display(date_range)
display(widgets.VBox([widgets.HBox([search_button, clear_last_button, clear_all_button], layout=widgets.Layout(margin='20px 0 20px 0')), results, save_data]))

Created by Tim Sherratt for the GLAM Workbench. Support this project by becoming a GitHub sponsor.