# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.
This notebook helps you find when a particular piece of text appears in, or disappears from, a web page. Using Memento Timemaps, it gets a list of available captures from the selected web archive. It then searches each capture for the desired text, displaying the results.
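You can select the direction in which the notebook searches: 'First occurrence' works forwards from the earliest capture and stops at the first capture containing an exact match, 'Last occurrence' works backwards from the most recent capture and stops at the first capture containing an exact match, and 'All occurrences' searches every capture.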
If you select 'All occurrences' the notebook will generate a simple chart showing how the number of matches changes over time.
By default, the notebook displays possible or 'fuzzy' matches as well as exact matches, but only exact matches are counted in the totals.
import json
import os
import re
import altair as alt
import arrow
import ipywidgets as widgets
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzysearch import find_near_matches
from IPython.display import HTML, display
# Inject the CSS classes used to highlight search results in the output:
# .x-match (light green background) marks exact matches and
# .p-match (light yellow background) marks possible ('fuzzy') matches.
HTML(
"<style>.x-match {background-color: #ccffcc;} .p-match {background-color: #ffffcc;}</style>"
)
# Base urls of the supported web archives, keyed by the short codes that the
# archive dropdown uses as its values. Timemap and capture urls are built by
# appending to these, so every entry ends with a slash.
# This is only a default list -- you could add to it.
TIMEGATES = {
    # National Library of Australia
    "nla": "https://web.archive.org.au/awa/",
    # National Library of New Zealand
    "nlnz": "https://ndhadeliver.natlib.govt.nz/webarchive/wayback/",
    # UK Web Archive
    "bl": "https://www.webarchive.org.uk/wayback/archive/",
    # Internet Archive
    "ia": "https://web.archive.org/web/",
    # UK Government Web Archive
    "ukgwa": "https://webarchive.nationalarchives.gov.uk/ukgwa/",
}
def get_html(url):
    """
    Request a capture url and return the url that was reached plus the raw html.

    Returns a dict with 'url' and 'html' keys, or None if the url that was
    reached doesn't contain a '<timestamp>id_' segment (ie isn't a capture).
    """
    response = requests.get(url)
    # Sometimes the Mementos don't go to captures?!
    # Eg https://web.archive.org.au/awa/20090912180610id_/http://www.discontents.com.au/
    reached_capture = re.search(r"/(\d{12}|\d{14})id_/", response.url) is not None
    if not reached_capture:
        return None
    return {"url": response.url, "html": response.content}
def format_date(url):
    """
    Extract the capture timestamp from a url and format it in a human readable
    way, eg '12 September 2009'.

    Both 14 digit (YYYYMMDDHHmmss) and 12 digit (YYYYMMDDHHmm) timestamps are
    accepted. Raises AttributeError if the url has no '<timestamp>id_' segment.
    """
    timestamp = re.search(r"/(\d{12}|\d{14})id_/", url).group(1)
    # The parse format below needs all 14 digits, so a 12 digit timestamp
    # (one without seconds) is padded with zeros instead of failing to parse.
    timestamp = timestamp.ljust(14, "0")
    return arrow.get(timestamp, "YYYYMMDDHHmmss").format("D MMMM YYYY")
def format_date_as_iso(url):
    """
    Extract the capture timestamp from a url and format the date as ISO
    (YYYY-MM-DD).

    Both 14 digit (YYYYMMDDHHmmss) and 12 digit (YYYYMMDDHHmm) timestamps are
    accepted. Raises AttributeError if the url has no '<timestamp>id_' segment.
    """
    timestamp = re.search(r"/(\d{12}|\d{14})id_/", url).group(1)
    # The parse format below needs all 14 digits, so a 12 digit timestamp
    # (one without seconds) is padded with zeros instead of failing to parse.
    timestamp = timestamp.ljust(14, "0")
    return arrow.get(timestamp, "YYYYMMDDHHmmss").format("YYYY-MM-DD")
def convert_lists_to_dicts(results):
    """
    Turn an IA style timemap (a JSON array of arrays, where the first array
    holds the column names) into a list of dictionaries.

    The 'statuscode', 'mimetype' and 'original' keys are renamed to 'status',
    'mime' and 'url' to standardise IA results with other Timemaps.
    An empty input is handed back unchanged.
    """
    # (old key, new key) pairs, applied in this order to each row
    renames = (("statuscode", "status"), ("mimetype", "mime"), ("original", "url"))
    if not results:
        rows = results
    else:
        header = results[0]
        rows = [dict(zip(header, record)) for record in results[1:]]
    for row in rows:
        for old_key, new_key in renames:
            row[new_key] = row.pop(old_key)
    return rows
def get_capture_data_from_memento(url, request_type="head"):
    """
    For OpenWayback systems this can get some extra capture info to insert in Timemaps.

    Makes a HEAD request when request_type is "head", otherwise a GET request,
    and reads the 'x-archive-orig-*' response headers. Returns a dict with
    'length', 'status' and 'mime' keys; a value is None when its header is absent.
    """
    fetch = requests.head if request_type == "head" else requests.get
    headers = fetch(url).headers

    def leading_part(value, separator):
        # Keep only what comes before the first separator, eg '200' from '200 OK'
        return value.split(separator)[0] if value else None

    return {
        "length": headers.get("x-archive-orig-content-length"),
        "status": leading_part(headers.get("x-archive-orig-status"), " "),
        "mime": leading_part(headers.get("x-archive-orig-content-type"), ";"),
    }
def convert_link_to_json(results, enrich_data=False):
    """
    Converts link formatted Timemap to a list of dicts.

    Each line whose second '; ' separated field is rel="memento" yields a dict
    with 'timestamp' and 'url' keys. If enrich_data is True, each capture is
    also requested to add the values from get_capture_data_from_memento().
    Lines that can't be interpreted are skipped rather than raising an error.
    """
    rel_pattern = re.compile(
        r'rel="(original|self|timegate|first memento|last memento|memento)"'
    )
    memento_pattern = re.compile(r"/(\d{12}|\d{14})/(.*)$")
    data = []
    for line in results.splitlines():
        parts = line.split("; ")
        if len(parts) < 2:
            continue
        rel = rel_pattern.search(parts[1])
        # Other rel values (eg rel="timemap") don't match the pattern at all,
        # so a missing match has to be treated as 'not a memento'.
        # NOTE(review): 'first memento' and 'last memento' links are not added
        # to the results -- confirm that leaving them out is intended.
        if rel is None or rel.group(1) != "memento":
            continue
        link = parts[0].strip("<>")
        located = memento_pattern.search(link)
        if located is None:
            # A memento link without a recognisable timestamp can't be used
            continue
        timestamp, original = located.groups()
        capture = {"timestamp": timestamp, "url": original}
        if enrich_data:
            capture.update(get_capture_data_from_memento(link))
        data.append(capture)
    return data
def get_timemap_as_json(timegate, url):
    """
    Get a Timemap then normalise results (if necessary) to return a list of dicts.

    timegate is one of the keys of TIMEGATES, url is the page to look up.
    The response is interpreted according to its content type. An empty list
    is returned when the content type is missing or not one of the known
    Timemap formats.
    """
    tg_url = f"{TIMEGATES[timegate]}timemap/json/{url}/"
    response = requests.get(tg_url)
    response_type = response.headers.get("content-type", "")
    # pywb style Timemap
    if response_type == "text/x-ndjson":
        # One JSON document per line; blank lines carry no capture
        return [
            json.loads(line) for line in response.text.splitlines() if line.strip()
        ]
    # IA Wayback style Timemap
    if response_type == "application/json":
        return convert_lists_to_dicts(response.json())
    # Link style Timemap (OpenWayback)
    if response_type in ["application/link-format", "text/html;charset=utf-8"]:
        return convert_link_to_json(response.text)
    # Anything else isn't a Timemap we know how to read, so there are no captures
    return []
def display_chart(matches):
    """
    Draw a line chart (with points) of the number of matches by date.

    matches is a list of dicts with 'date' and 'matches' keys. The chart is
    rendered in the chart_display output widget.
    """
    fields = ["date:T", "matches:Q"]
    base = alt.Chart(pd.DataFrame(matches))
    line = base.mark_line(point=True)
    chart = line.encode(x=fields[0], y=fields[1], tooltip=fields)
    with chart_display:
        display(chart)
def process_text(html):
    """
    Pull the text out of an HTML page and return its lines as a list,
    leaving out any line that is empty or contains only whitespace.
    """
    blank_line = re.compile(r"^\s*$")
    page_text = BeautifulSoup(html).get_text()
    return [
        candidate
        for candidate in page_text.splitlines()
        if blank_line.match(candidate) is None
    ]
def format_date_link(url):
    """
    Extract date from a capture url, format it, and return it as an html link.

    The link points to the capture url without the 'id_' modifier that
    follows the timestamp.
    """
    date = format_date(url)
    # Remove only the modifier attached to the timestamp. A plain
    # str.replace("id_", "") would also strip 'id_' from the archived page's
    # own url (eg '...?page_id_=3'), producing a broken link.
    link_url = re.sub(r"/(\d{12}|\d{14})id_/", r"/\1/", url, count=1)
    return f'<a href="{link_url}">{date}</a>'
def format_context(text, match):
    """
    Extract, markup, and format context around a match.

    text is the searched string; match supplies 'start', 'end' and 'dist'.
    Up to 40 characters either side of the match are included, and the match
    itself is wrapped in a span with class 'x-match' (dist of 0) or 'p-match'
    (dist above 0). The page text is HTML escaped. Returns the snippet
    surrounded by '...'.
    """
    from html import escape

    window = 40
    style = "p-match" if match.dist > 0 else "x-match"
    context_start = max(0, match.start - window)
    context_end = match.end + window
    before = text[context_start : match.start]
    matched = text[match.start : match.end]
    after = text[match.end : context_end]
    # Where the window cut into the text, drop the (probably partial) word at
    # the cut. Trimming is done on the plain text either side of the match, so
    # it can never land on the space inside the span tag, and it doesn't fail
    # when there is no space to trim to.
    if context_start > 0 and " " in before:
        before = before[before.index(" ") :]
    if context_end < len(text) and " " in after:
        after = after[: after.rindex(" ")]
    # The text comes from an archived web page, so escape it before it is
    # embedded in the html that gets displayed.
    result_string = (
        f'{escape(before, quote=False)}<span class="{style}">'
        f"{escape(matched, quote=False)}</span>{escape(after, quote=False)}"
    ).strip()
    return f"...{result_string}..."
def search_page(capture_data, pattern):
    """
    Search for a text string in the html of a page.

    capture_data is a dict with 'url' and 'html' keys. Matching ignores case
    and allows a Levenshtein distance of up to 1. If anything is found, the
    capture date and the context of every match are displayed in the 'out'
    widget. Returns the number of exact matches (distance of 0).
    """
    text = BeautifulSoup(capture_data["html"]).get_text()
    date_link = format_date_link(capture_data["url"])
    # NOTE(review): match offsets are positions in the casefolded text but are
    # applied to the original text. casefold() can change the length of a
    # string (eg 'ß' becomes 'ss'), which would shift the highlighted span --
    # confirm whether this matters for the pages being searched.
    matches = find_near_matches(pattern.casefold(), text.casefold(), max_l_dist=1)
    if not matches:
        return 0
    items = "".join(f"<li>'{format_context(text, match)}'</li>" for match in matches)
    # format_date_link() already returns a complete <a> element, so it is used
    # as the heading directly. Wrapping it in a second link produced nested
    # anchors, which is invalid html.
    results = f"<h4>{date_link}</h4><ul>{items}</ul>"
    with out:
        display(HTML(results))
    return sum(1 for match in matches if match.dist == 0)
def update_status(i, total_matches):
    """
    Refresh the status widget with the number of captures processed so far
    (i is the zero-based index of the latest capture) and the running total
    of exact matches.
    """
    messages = (
        f"Captures processed: {i + 1}",
        f"Exact matches found: {total_matches}",
    )
    with status:
        # wait=True delays the clearing until new output is available
        status.clear_output(wait=True)
        for message in messages:
            display(HTML(message))
def find_text(timegate, url, pattern, direction):
    """
    Get all captures for a page from a Timemap, then search for requested text in each page,
    aggregating the results.

    timegate is a key of TIMEGATES, url is the page to look up, pattern is the
    text to find. direction is 'first' (search from the start of the Timemap
    and stop at the first capture with an exact match), 'last' (the same, but
    searching the Timemap in reverse) or anything else (search every capture,
    then chart the exact matches found).
    """
    total_matches = 0
    matches = []
    stop_at_first_match = direction in ["first", "last"]
    with out:
        key = '<b>Key</b><ul><li><span class="x-match">exact match</span></li><li><span class="p-match">possible match</span></li></ul>'
        display(HTML(key))
    timemap = get_timemap_as_json(timegate, url)
    if direction == "last":
        timemap.reverse()
    # Keeps the final status update valid even if the Timemap is empty
    i = -1
    for i, capture in enumerate(timemap):
        capture_url = f'{TIMEGATES[timegate]}{capture["timestamp"]}id_/{capture["url"]}'
        # Except for nlnz, only fetch captures with a status of 200 whose
        # digest differs from the previous capture's. The first capture has
        # no predecessor, so it must not be compared with timemap[-1] (the
        # last capture) -- with a single capture that is the capture itself.
        if timegate == "nlnz" or (
            (i == 0 or capture["digest"] != timemap[i - 1]["digest"])
            and capture["status"] == "200"
        ):
            capture_data = get_html(capture_url)
            if capture_data:
                found = search_page(capture_data, pattern)
                total_matches += found
                if found > 0:
                    matches.append(
                        {"date": format_date_as_iso(capture_url), "matches": found}
                    )
                    if stop_at_first_match:
                        break
        update_status(i, total_matches)
    if stop_at_first_match:
        # Breaking out of the loop skips its status update, so do it here
        update_status(i, total_matches)
    else:
        display_chart(matches)
def start(e):
    """
    Click handler for the 'Find text' button: clear any previous results,
    then run the search using the current values of the form widgets.

    If the archive, url, or search text hasn't been supplied, a message is
    shown in the status widget and no search is run.
    """
    clear("e")
    required = (
        ("an archive", repository),
        ("a URL", target_url),
        ("some search text", search_string),
    )
    missing = [label for label, widget in required if not widget.value.strip()]
    if missing:
        # Without this check an unselected archive fails with a KeyError when
        # its (empty) value is looked up in TIMEGATES.
        with status:
            display(HTML(f"Please provide {', '.join(missing)}."))
        return
    find_text(
        repository.value, target_url.value, search_string.value, search_direction.value
    )
def clear(e):
    """
    Click handler for the 'Clear all' button: empty the status, chart, and
    results output widgets. The argument is not used.
    """
    for output_area in (status, chart_display, out):
        output_area.clear_output()
# Output areas: search results, progress/status messages, and the chart
out = widgets.Output()
status = widgets.Output()
chart_display = widgets.Output()

# Archive to search -- the values are keys of TIMEGATES ('' means not selected)
repository = widgets.Dropdown(
    description="Archive:",
    options=[
        ("---", ""),
        ("UK Web Archive", "bl"),
        ("UK Government Web Archive", "ukgwa"),
        ("National Library of Australia", "nla"),
        ("National Library of New Zealand", "nlnz"),
        ("Internet Archive", "ia"),
    ],
    value="",
    disabled=False,
)

# Which occurrences of the text to look for
search_direction = widgets.Dropdown(
    description="Find:",
    options=[
        ("First occurrence", "first"),
        ("Last occurrence", "last"),
        ("All occurrences", "all"),
    ],
    value="first",
    disabled=False,
)

target_url = widgets.Text(description="URL:")
search_string = widgets.Text(description="Search text:")

tc_button = widgets.Button(description="Find text", button_style="primary")
tc_button.on_click(start)
clear_button = widgets.Button(description="Clear all")
clear_button.on_click(clear)

# Lay the form out as three padded rows, followed by the output areas
for _row in (
    [repository, target_url],
    [search_string, search_direction],
    [tc_button, clear_button],
):
    display(widgets.HBox(_row, layout=widgets.Layout(padding="10px")))
for _output_area in (status, chart_display, out):
    display(_output_area)
%%capture
%load_ext dotenv
%dotenv
# Insert some values for automated testing
if os.getenv("GW_STATUS") == "dev":
    target_url.value = "http://discontents.com.au/"
    repository.value = "nla"
    search_string.value = "Trove"
# If values have been provided via url or above, then start automatically.
# Note that Voila widgets don't load immediately, hence the polling to
# make sure the start button exists.
if target_url.value and search_string.value:
    # The script polls for the primary ('Find text') button itself. Checking
    # for any button and then clicking the primary one fails with an error
    # (and stops polling) if another button is rendered first.
    script = """
<script type="text/javascript">
function start() {
    let button = document.querySelector("button.mod-primary");
    if (button) {
        button.click();
    } else {
        setTimeout(start, 5);
    }
}
start();
</script>"""
    display(HTML(script))
Created by Tim Sherratt for the GLAM Workbench. Support me by becoming a GitHub sponsor!
Work on this notebook was supported by the IIPC Discretionary Funding Programme 2019-2020.
The Web Archives section of the GLAM Workbench is sponsored by the British Library.