Find when a piece of text appears in an archived web page

View in GitHub · View in GLAM Workbench

In [ ]:
# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.

This notebook helps you find when a particular piece of text appears in, or disappears from, a web page. Using Memento Timemaps, it gets a list of available captures from the selected web archive. It then searches each capture for the desired text, displaying the results.
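
A Timemap is just a machine-readable list of captures. As a minimal sketch (assuming a pywb-style endpoint that returns newline-delimited JSON, as the Australian Web Archive's does), you can fetch one directly:

    import json
    import requests

    # Each line of a pywb Timemap is a separate JSON object describing one capture
    response = requests.get("https://web.archive.org.au/awa/timemap/json/http://discontents.com.au/")
    captures = [json.loads(line) for line in response.text.splitlines()]
    print(captures[0]["timestamp"])  # a 14-digit YYYYMMDDHHmmss string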

You can select the direction in which the notebook searches:

  • First occurrence – find the first capture in which the text appears (start from the earliest capture and move forward in time)
  • Last occurrence – find the last capture in which the text appears (start from the most recent capture and move backwards in time)
  • All occurrences – find every match (search all captures from earliest to most recent)

If you select 'All occurrences', the notebook will also generate a simple chart showing how the number of matches changes over time.

By default, the notebook displays possible or 'fuzzy' matches as well as exact matches, but only exact matches are counted in the totals.

In [ ]:
import json
import os
import re

import altair as alt
import arrow
import ipywidgets as widgets
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzysearch import find_near_matches
from IPython.display import HTML, display

# Define CSS styles for highlighting exact ('x-match') and possible ('p-match') matches in the results
HTML(
    "<style>.x-match {background-color: #ccffcc;} .p-match {background-color: #ffffcc;}</style>"
)
In [ ]:
# Default list of repositories -- you could add to this
TIMEGATES = {
    "nla": "https://web.archive.org.au/awa/",
    "nlnz": "https://ndhadeliver.natlib.govt.nz/webarchive/",
    "bl": "https://www.webarchive.org.uk/wayback/archive/",
    "ia": "https://web.archive.org/web/",
}
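
# These prefixes get used in two ways below: with 'timemap/json/' appended to
# request a Timemap, and with a 14-digit timestamp plus 'id_/' to request the
# raw HTML of a single capture ('id_' asks Wayback-style systems for the
# original content, without replay banners or rewritten links). For example
# (illustrative url):
#   f"{TIMEGATES['nla']}timemap/json/http://example.com/"
#   f"{TIMEGATES['nla']}20090912180610id_/http://example.com/"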
In [ ]:
def get_html(url):
    """
    Get html from a capture url.
    """
    response = requests.get(url)
    # Sometimes the Mementos don't go to captures?!
    # Eg https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/
    try:
        # Check the redirected url points at a capture (ie it contains a 14-digit timestamp)
        re.search(r"/(\d{14})id_/", response.url).group(1)
    except AttributeError:
        return None
    return {"url": response.url, "html": response.content}


def format_date(url):
    """
    Extract timestamp from url and format in a human readable way.
    """
    timestamp = re.search(r"/(\d{14})id_/", url).group(1)
    return arrow.get(timestamp, "YYYYMMDDHHmmss").format("D MMMM YYYY")


def format_date_as_iso(url):
    """
    Extract timestamp from url and format as ISO.
    """
    timestamp = re.search(r"/(\d{14})id_/", url).group(1)
    return arrow.get(timestamp, "YYYYMMDDHHmmss").format("YYYY-MM-DD")
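

# For example, using the capture url mentioned in the comments above:
#   format_date("https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/")
#   -> '12 September 2009'
#   format_date_as_iso("https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/")
#   -> '2009-09-12'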


def convert_lists_to_dicts(results):
    """
    Converts IA style timemap (a JSON array of arrays) to a list of dictionaries.
    Renames keys to standardise IA with other Timemaps.
    """
    if results:
        keys = results[0]
        results_as_dicts = [dict(zip(keys, v)) for v in results[1:]]
    else:
        results_as_dicts = results
    # Rename keys
    for d in results_as_dicts:
        d["status"] = d.pop("statuscode")
        d["mime"] = d.pop("mimetype")
        d["url"] = d.pop("original")
    return results_as_dicts
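

# The IA-style Timemap uses its first row as a header. A small illustration of
# the conversion (the field values are made up for the example):
#   convert_lists_to_dicts([
#       ["timestamp", "original", "mimetype", "statuscode", "digest"],
#       ["20090912180610", "http://example.com/", "text/html", "200", "ABC123"],
#   ])
#   -> [{'timestamp': '20090912180610', 'digest': 'ABC123',
#        'status': '200', 'mime': 'text/html', 'url': 'http://example.com/'}]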


def get_capture_data_from_memento(url, request_type="head"):
    """
    For OpenWayback systems this can get some extra capture info to insert in Timemaps.
    """
    if request_type == "head":
        response = requests.head(url)
    else:
        response = requests.get(url)
    headers = response.headers
    length = headers.get("x-archive-orig-content-length")
    status = headers.get("x-archive-orig-status")
    status = status.split(" ")[0] if status else None
    mime = headers.get("x-archive-orig-content-type")
    mime = mime.split(";")[0] if mime else None
    return {"length": length, "status": status, "mime": mime}


def convert_link_to_json(results, enrich_data=False):
    """
    Converts link formatted Timemap to JSON.
    """
    data = []
    for line in results.splitlines():
        parts = line.split("; ")
        if len(parts) > 1:
            link_type = re.search(
                r'rel="(original|self|timegate|first memento|last memento|memento)"',
                parts[1],
            ).group(1)
            if link_type == "memento":
                link = parts[0].strip("<>")
                timestamp, original = re.search(r"/(\d{14})/(.*)$", link).groups()
                capture = {"timestamp": timestamp, "url": original}
                if enrich_data:
                    capture.update(get_capture_data_from_memento(link))
                data.append(capture)
    return data
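

# A link-format Timemap is a series of lines like the one below (illustrative --
# the exact fields vary between servers), from which the function above extracts
# each memento's timestamp and original url:
#   <https://ndhadeliver.natlib.govt.nz/webarchive/20090912180610/http://example.com/>; rel="memento"; datetime="Sat, 12 Sep 2009 18:06:10 GMT",
# convert_link_to_json() turns this into:
#   [{'timestamp': '20090912180610', 'url': 'http://example.com/'}]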


def get_timemap_as_json(timegate, url):
    """
    Get a Timemap then normalise results (if necessary) to return a list of dicts.
    """
    tg_url = f"{TIMEGATES[timegate]}timemap/json/{url}/"
    response = requests.get(tg_url)
    response_type = response.headers["content-type"]
    # pywb style Timemap
    if response_type == "text/x-ndjson":
        data = [json.loads(line) for line in response.text.splitlines()]
    # IA Wayback style Timemap
    elif response_type == "application/json":
        data = convert_lists_to_dicts(response.json())
    # Link style Timemap (OpenWayback)
    elif response_type in ["application/link-format", "text/html;charset=utf-8"]:
        data = convert_link_to_json(response.text)
    # Fall back to an empty list if the Timemap format isn't recognised
    else:
        data = []
    return data
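

# For example, fetching the Timemap for the page used in the automated test at
# the end of this notebook (assuming the NLA endpoint is reachable):
#   timemap = get_timemap_as_json("nla", "http://discontents.com.au/")
#   len(timemap) -> the number of captures
#   timemap[0]["timestamp"] -> the timestamp of the earliest capture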


def display_chart(matches):
    """
    Visualise matches over time.
    """
    df = pd.DataFrame(matches)
    chart = (
        alt.Chart(df)
        .mark_line(point=True)
        .encode(x="date:T", y="matches:Q", tooltip=["date:T", "matches:Q"])
    )
    with chart_display:
        display(chart)


def process_text(html):
    """
    Extract text from an HTML page and return it as a list of lines.
    Removes blank lines.
    """
    lines = [
        line
        for line in BeautifulSoup(html, "html.parser").get_text().splitlines()
        if not re.match(r"^\s*$", line)
    ]
    return lines


def format_date_link(url):
    """
    Extract date from url, format, and display as link.
    """
    date = format_date(url)
    return f'<a href="{url.replace("id_", "")}">{date}</a>'


def format_context(text, match):
    """
    Extract, markup, and format context around a match.
    """
    style = "p-match" if match.dist > 0 else "x-match"
    marked_up = f'{text[:match.start]}<span class="{style}">{text[match.start:match.end]}</span>{text[match.end:]}'
    # Keep roughly 40 characters of context either side of the match; the extra
    # 22 + 7 characters allow for the inserted <span class="..."> and </span> tags
    result_string = marked_up[max(0, match.start - 40) : match.end + 40 + 22 + 7]
    result_string = result_string[
        result_string.index(" ") : result_string.rindex(" ")
    ].strip()
    return f"...{result_string}..."


def search_page(capture_data, pattern):
    """
    Search for a text string in the html of a page.
    """
    found = 0
    text = BeautifulSoup(capture_data["html"], "html.parser").get_text()
    date = format_date_link(capture_data["url"])
    # Case-insensitive search, allowing at most one character's difference for fuzzy matches
    matches = find_near_matches(pattern.casefold(), text.casefold(), max_l_dist=1)
    if matches:
        results = f'<h4><a href="{capture_data["url"]}">{date}</a></h4><ul>'
        for match in matches:
            results += f"<li>'{format_context(text, match)}'</li>"
            if match.dist == 0:
                found += 1
        results += "</ul>"
        with out:
            display(HTML(results))
    return found
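

# fuzzysearch's find_near_matches() returns Match objects with start, end and
# dist attributes; max_l_dist=1 means a match can differ from the pattern by at
# most one character. A quick illustration:
#   find_near_matches("trove", "search trave for digitised newspapers", max_l_dist=1)
#   -> [Match(start=7, end=12, dist=1, ...)]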


def update_status(i, total_matches):
    """
    Display numbers of documents processed and matches found.
    """
    with status:
        status.clear_output(wait=True)
        display(HTML(f"Captures processed: {i + 1}"))
        display(HTML(f"Exact matches found: {total_matches}"))


def find_text(timegate, url, pattern, direction):
    """
    Get all captures for a page from a Timemap, then search for requested text in each page,
    aggregating the results.
    """
    total_matches = 0
    matches = []
    with out:
        key = '<b>Key</b><ul><li><span class="x-match">exact match</span></li><li><span class="p-match">possible match</span></li></ul>'
        display(HTML(key))
    timemap = get_timemap_as_json(timegate, url)
    if direction == "last":
        timemap.reverse()
    for i, capture in enumerate(timemap):
        capture_url = f'{TIMEGATES[timegate]}{capture["timestamp"]}id_/{capture["url"]}'
        if timegate == "nlnz" or (
            capture["digest"] != timemap[i - 1]["digest"] and capture["status"] == "200"
        ):
            capture_data = get_html(capture_url)
            if capture_data:
                found = search_page(capture_data, pattern)
                total_matches += found
                if found > 0:
                    matches.append(
                        {"date": format_date_as_iso(capture_url), "matches": found}
                    )
                    if direction in ["first", "last"]:
                        break
        update_status(i, total_matches)
    if direction in ["first", "last"]:
        update_status(i, total_matches)
    else:
        display_chart(matches)


def start(e):
    clear("e")
    find_text(
        repository.value, target_url.value, search_string.value, search_direction.value
    )


def clear(e):
    status.clear_output()
    chart_display.clear_output()
    out.clear_output()


out = widgets.Output()
status = widgets.Output()
chart_display = widgets.Output()

repository = widgets.Dropdown(
    options=[
        ("---", ""),
        ("UK Web Archive", "bl"),
        ("National Library of Australia", "nla"),
        ("National Library of New Zealand", "nlnz"),
        ("Internet Archive", "ia"),
    ],
    description="Archive:",
    disabled=False,
    value="",
)

search_direction = widgets.Dropdown(
    options=[
        ("First occurrence", "first"),
        ("Last occurrence", "last"),
        ("All occurrences", "all"),
    ],
    description="Find:",
    disabled=False,
    value="first",
)

target_url = widgets.Text(description="URL:")

search_string = widgets.Text(description="Search text:")

tc_button = widgets.Button(description="Find text", button_style="primary")
tc_button.on_click(start)
clear_button = widgets.Button(description="Clear all")
clear_button.on_click(clear)

display(widgets.HBox([repository, target_url], layout=widgets.Layout(padding="10px")))
display(
    widgets.HBox(
        [search_string, search_direction], layout=widgets.Layout(padding="10px")
    )
)
display(widgets.HBox([tc_button, clear_button], layout=widgets.Layout(padding="10px")))
display(status)
display(chart_display)
display(out)
In [ ]:
%%capture
%load_ext dotenv
%dotenv

# Insert some values for automated testing

if os.getenv("GW_STATUS") == "dev":
    target_url.value = "http://discontents.com.au/"
    repository.value = "nla"
    search_string.value = "Trove"
In [ ]:
# If values have been provided via url or above, then start automatically.
# Note that Voila widgets don't load immediately, hence the polling to
# make sure the start button exists.

if target_url.value and search_string.value:
    script = """
    <script type="text/javascript">
        function start() {
          if (document.querySelector("button")) {
            let button = document.querySelector("button.mod-primary");
            button.click();
          } else {
            setTimeout(start, 5);
          }
        }
    start();
    </script>"""
    display(HTML(script))

Created by Tim Sherratt for the GLAM Workbench. Support me by becoming a GitHub sponsor!

Work on this notebook was supported by the IIPC Discretionary Funding Programme 2019-2020