Get full page screenshots from archived web pages

In [ ]:

# This notebook is designed to run in Voila as an app (with the code hidden).
# To launch this notebook in Voila, just select 'View > Open with Voila in New Browser Tab'
# Your browser might ask for permission to open the new tab as a popup.

In [ ]:

%%capture
import base64
import io
import math
import os
import re
import time
from pathlib import Path
from urllib.parse import urlparse

import arrow
import geckodriver_autoinstaller
import ipywidgets as widgets
import PIL
import requests
import selenium
from IPython.display import HTML, display
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from slugify import slugify

# Runs once when this cell executes, before any webdriver.Firefox is created.
# NOTE(review): presumably this fetches/locates the geckodriver binary Selenium needs -- confirm against the package docs.
geckodriver_autoinstaller.install()

In [ ]:

# Base urls of the Timegates, keyed by the short archive codes used as values in
# the 'Archive' dropdown. query_timegate() appends the target url to these.
TIMEGATES = {
    "nla": "https://web.archive.org.au/awa/",
    "nlnz": "https://ndhadeliver.natlib.govt.nz/webarchive/",
    "bl": "https://www.webarchive.org.uk/wayback/archive/",
    "ia": "https://web.archive.org/web/",
    "ukgwa": "https://webarchive.nationalarchives.gov.uk/ukgwa/",
}

# Domains (with any 'www.' removed) whose Memento urls get `if_` added after the
# timestamp before a screenshot is taken.
wayback = ["web.archive.org"]
# Domains (with any 'www.' removed) that replay captures inside a frame, mapped to
# the frame reference handed to driver.switch_to.frame().
pywb = {
    "web.archive.org.au": "replayFrame",
    "webarchive.nla.gov.au": "replayFrame",
    "webarchive.org.uk": "replay_iframe",
    "ndhadeliver.natlib.govt.nz": "replayFrame",
    "webarchive.nationalarchives.gov.uk": "replay_iframe",
}

# HTML snippets, one per screenshot currently on display (oldest first).
html_output = []


def format_date_for_headers(iso_date, tz):
    """
    Build the value of an Accept-Datetime header from a date and a timezone.

    The ISO date (YYYY-MM-DD) is read as noon in the timezone `tz`, that moment
    is shifted to UTC, and the result is rendered in the form:
    Fri, 23 Mar 2007 01:00:00 GMT
    """
    noon_local = arrow.get(f"{iso_date} 12:00:00 {tz}", "YYYY-MM-DD HH:mm:ss ZZZ")
    stamp = noon_local.to("utc").format("ddd, DD MMM YYYY HH:mm:ss")
    return f"{stamp} GMT"


def format_date_from_timestamp(url):
    """
    Pull the capture timestamp out of a Memento url and format it for display.

    The timestamp is the 14 digit (YYYYMMDDHHmmss) or 12 digit (YYYYMMDDHHmm)
    path segment, optionally followed by `if_` / `mp_` modifiers. It is returned
    in the form '1 January 2015'.

    Raises AttributeError if the url doesn't contain such a timestamp.
    """
    timestamp = re.search(r"/(\d{14}|\d{12})(?:if_|mp_)*/", url).group(1)
    # 12 digit timestamps carry no seconds; pad them so they fit the 14 digit format.
    timestamp = timestamp.ljust(14, "0")
    return arrow.get(timestamp, "YYYYMMDDHHmmss").format("D MMMM YYYY")


def parse_links_from_headers(response):
    """
    Flatten the parsed 'Link' header of a response into a {relation: url} dict
    (eg original, timegate, timemap, and memento links).
    """
    urls_by_rel = {}
    for rel, link in response.links.items():
        urls_by_rel[rel] = link["url"]
    return urls_by_rel


def query_timegate(timegate, url, date=None, tz="Australia/Canberra"):
    """
    Send a request for `url` to one of the Timegates in TIMEGATES and return
    the links found in the response's 'Link' header as a {relation: url} dict.

    timegate -- key into TIMEGATES (eg 'nla', 'ia')
    url -- the url of the page we want a capture of
    date -- optional target date, sent as an Accept-Datetime header
    tz -- timezone used to interpret the date
    """
    if date:
        accept_date = date
    # BL, NLNZ & UKGWA don't seem to default to latest date if no date supplied
    elif timegate in ("bl", "nlnz", "ukgwa"):
        accept_date = arrow.utcnow().format("YYYY-MM-DD")
    else:
        accept_date = None
    headers = {}
    if accept_date is not None:
        headers["Accept-Datetime"] = format_date_for_headers(accept_date, tz)
    # Note that you don't get a timegate response if you leave off the trailing slash, but extras don't hurt!
    target = url if url.endswith("/") else f"{url}/"
    tg_url = f"{TIMEGATES[timegate]}{target}"
    # IA doesn't work with head, others don't work with get...
    send = requests.get if timegate == "ia" else requests.head
    response = send(tg_url, headers=headers)
    return parse_links_from_headers(response)


def get_memento(timegate, url, date):
    """
    Ask a Timegate for a capture of `url` and return the Memento's url.

    The 'memento' link is preferred; if it's missing the 'prev memento',
    'next memento' and 'last memento' links are tried in that order.

    Returns None if the Timegate response contains none of these links.
    """
    links = query_timegate(timegate, url, date) or {}
    # NLNZ doesn't always seem to return a Memento, so we'll build in some fuzziness
    for rel in ("memento", "prev memento", "next memento", "last memento"):
        if rel in links:
            return links[rel]
    # Covers both an empty response and one that only has non-memento links
    # (eg just 'original' / 'timemap').
    return None


def get_full_page_screenshot(url, save_width=200):
    """
    Gets a full page screenshot of the supplied url.
    By default resizes the screenshot to a maximum width of 200px.
    Provide a 'save_width' value to change this.

    The resized image is saved as a PNG in the 'screenshots' directory
    (created if necessary). Returns the path of the saved file, or None
    if the url isn't a Memento or no screenshot could be captured (in
    both cases a message is shown via show_error).

    Exceptions raised by the webdriver while loading the page are passed
    on to the caller; the browser is always shut down first.

    NOTE the webdriver sometimes fails for unknown reasons. Just try again.
    """
    domain = urlparse(url)[1].replace("www.", "")
    # Archives listed in `wayback` inject content into the page, so we use if_
    # to get the original page (with rewritten urls)
    if domain in wayback and "if_" not in url:
        url = re.sub(r"/(\d{14}|\d{12})/http", r"/\1if_/http", url)
    match = re.search(r"/(\d{14}|\d{12})(?:if_|mp_)*/https*://?(.+/)", url)
    if match is None:
        # There's something wrong with the link...
        show_error(f"{url} isn't a Memento – did you forget to select an archive?")
        return None
    date_str, site = match.groups()
    output_dir = Path("screenshots")
    output_dir.mkdir(parents=True, exist_ok=True)
    ss_file = Path(output_dir, f"{slugify(site)}-{date_str}-{save_width}.png")
    options = webdriver.FirefoxOptions()
    # Pass the command line flag rather than setting `options.headless`,
    # which recent Selenium releases no longer act on.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    ss = None
    try:
        driver.implicitly_wait(15)
        driver.get(url)
        # Give some time for everything to load
        time.sleep(30)
        driver.maximize_window()
        # UK and AU use pywb in framed replay mode, so we need to switch to the framed content
        if domain in pywb:
            try:
                driver.switch_to.frame(pywb[domain])
            except selenium.common.exceptions.NoSuchFrameException:
                # If we pass here we'll probably still get a ss, just not full page -- better than failing?
                pass
        # Try progressively more unusual root elements until one yields a screenshot
        for tag in ["body", "html", "frameset"]:
            try:
                elem = driver.find_element(By.TAG_NAME, tag)
                ss = elem.screenshot_as_base64
                break
            except (
                selenium.common.exceptions.NoSuchElementException,
                selenium.common.exceptions.WebDriverException,
            ):
                pass
    finally:
        # Always shut the browser down, even if loading the page failed,
        # so that failed attempts don't leave Firefox processes behind.
        driver.quit()
    if not ss:
        show_error(f"Couldn't get a screenshot of {url} – sorry...")
        return None
    img = Image.open(io.BytesIO(base64.b64decode(ss)))
    # Scale the height by the same factor as the width to keep the aspect ratio
    ratio = save_width / img.width
    (width, height) = (save_width, math.ceil(img.height * ratio))
    resized_img = img.resize((width, height), PIL.Image.Resampling.LANCZOS)
    resized_img.save(ss_file)
    return ss_file


def display_screenshot(ss_file, url):
    """
    Add a screenshot to the results and redisplay them.

    Builds an HTML snippet containing the capture date, a link to the
    Memento labelled with the archived page's url, the image itself and
    a download link, appends it to html_output, then renders all of the
    accumulated snippets in the `out` widget.

    ss_file -- path of the saved screenshot
    url -- url of the Memento that was captured
    """
    date = format_date_from_timestamp(url)
    match = re.search(r"/(\d{14}|\d{12})(?:mp_|if_|id_)*/(.*)$", url)
    # Group 1 is the timestamp; the archived page's own url is group 2.
    display_url = match.group(2) if match else url
    status.clear_output()
    html_output.append(
        f'<div style="float:left; margin-left: 20px;"><p><b>{date}</b><br><a href="{url.replace("if_/", "/")}">{display_url}</a></p><p><a href="{ss_file}"><img src="{ss_file}"></a><br><a href="{ss_file}">[Download]</a></p></div>'
    )
    with out:
        display((HTML("".join(html_output))))


def show_error(message=None):
    """
    Replace whatever is in the `status` widget with an error message.
    """
    text = f"Something went wrong – {message}"
    status.clear_output()
    with status:
        print(text)


def start(e):
    """
    Click handler for the 'Get screenshot' button.

    Works out which Memento to capture (via the selected archive's Timegate,
    or the target url itself if no archive is selected), takes the screenshot
    and displays it. Problems are reported in the `status` widget.
    """
    status.clear_output()
    out.clear_output(wait=True)
    with status:
        print("Generating screenshot...")
    # No archive selected means the target url is assumed to be a Memento already
    memento = (
        get_memento(repository.value, target_url.value, target_date.value)
        if repository.value
        else target_url.value
    )
    if not memento:
        show_error("couldn't find a Memento – sorry...")
        return
    try:
        ss_file = get_full_page_screenshot(memento, save_width=width.value)
        if ss_file:
            display_screenshot(ss_file, memento)
    except selenium.common.exceptions.WebDriverException:
        show_error(f"couldn't get a screenshot of {memento} – sorry...")


def clear(e):
    """
    Click handler for the 'Clear all' button: empties the status and
    results areas and forgets all the screenshots generated so far.
    """
    global html_output
    status.clear_output()
    out.clear_output()
    # Rebind the module-level list so the next screenshot starts a fresh display
    html_output = []


def clear_last(e):
    """
    Click handler for the 'Clear last' button: removes the most recently
    added screenshot and redisplays the remaining ones.

    Does nothing if there are no screenshots to remove.
    """
    if not html_output:
        # Nothing has been added (or everything was cleared), so there's
        # nothing to pop and nothing to redraw.
        return
    html_output.pop()
    out.clear_output(wait=True)
    with out:
        display((HTML("".join(html_output))))


# Archive selector -- the option values are the keys of TIMEGATES;
# the empty value means 'no archive selected'.
repository = widgets.Dropdown(
    options=[
        ("---", ""),
        ("UK Web Archive", "bl"),
        ("UK Government Web Archive", "ukgwa"),
        ("National Library of Australia", "nla"),
        ("National Library of New Zealand", "nlnz"),
        ("Internet Archive", "ia"),
    ],
    description="Archive:",
    disabled=False,
)

target_url = widgets.Text(description="Target URL:")

target_date = widgets.DatePicker(description="Target date: ", disabled=False)

# Width (in pixels) of the saved screenshot.
# The initial value has to sit within min/max, so start at the minimum.
width = widgets.IntSlider(
    value=200,
    min=200,
    max=1000,
    step=100,
    description="Width:",
    disabled=False,
    continuous_update=False,
    orientation="horizontal",
    readout=True,
    readout_format="d",
)

# `out` holds the screenshots, `status` holds progress and error messages
out = widgets.Output()
status = widgets.Output()
ss_button = widgets.Button(description="Get screenshot", button_style="primary")
ss_button.on_click(start)
clear_button = widgets.Button(description="Clear all")
clear_button.on_click(clear)
clear_last_button = widgets.Button(description="Clear last")
clear_last_button.on_click(clear_last)
# Usage notes shown above the form (every list item is explicitly closed
# so the markup is well formed)
note = """
    <ul class="browser-default">
    <li>Select a repository, and insert a url to generate a screenshot from the archive.</li>
    <li>If you include a date, it\'ll attempt to find the closest capture using Memento Timegates.</li>
    <li>If you don't include a date, it'll give you the most recent capture.</li>
    <li>If you already have the url of the exact capture you want, just put it in the 'Target url' box and leave 'Archive' and 'Target date' blank.</li>
    <li>You can add multiple screenshots to compare changes.</li>
    </ul>
    """
# Lay out the app: notes, the form (two columns), the buttons, then the
# status messages and the screenshots.
display(
    HTML(note),
    widgets.HBox(
        [widgets.VBox([repository, target_date]), widgets.VBox([target_url, width])],
        layout=widgets.Layout(padding="20px"),
    ),
    widgets.HBox([ss_button, clear_button, clear_last_button]),
    status,
    out,
)

In [ ]:

%%capture
%load_ext dotenv
%dotenv

In [ ]:

# Insert some values for automated testing

# Pre-fill the form only when the GW_STATUS environment variable is set to 'dev'
if os.environ.get("GW_STATUS") == "dev":
    repository.value = "nlnz"
    target_url.value = "http://digitalnz.org"
    target_date.value = arrow.get("2015-01-01").date()

In [ ]:

# If values have been provided via url or above, then start automatically.
# Note that Voila widgets don't load immediately, hence the polling to
# make sure the start button exists.

if target_url.value:
    # Poll for the primary ('Get screenshot') button itself, rather than for
    # any button, so we never try to click it before it has been rendered.
    script = """
    <script type="text/javascript">
        function start() {
          let button = document.querySelector("button.mod-primary");
          if (button) {
            button.click();
          } else {
            setTimeout(start, 5);
          }
        }
    start();
    </script>"""
    display(HTML(script))

Created by Tim Sherratt for the GLAM Workbench. Support me by becoming a GitHub sponsor!

Work on this notebook was supported by the IIPC Discretionary Funding Programme 2019-2020.

The Web Archives section of the GLAM Workbench is sponsored by the British Library.