# Compare two versions of an archived web page

# [View in GitHub](https://github.com/GLAM-Workbench/web-archives/blob/master/show_diffs.ipynb) · [View in GLAM Workbench](https://glam-workbench.net/web-archives/#compare-two-versions-of-an-archived-web-page)

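This notebook demonstrates a number of different ways of comparing versions of archived web pages. Just choose a repository, enter a URL, and select two dates to see comparisons based on summary statistics (file size and word counts), the internal and external links in each page, the cosine similarity of the text, line-by-line differences, and full-page screenshots.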

import base64
import io
import math
import os
import re
import time
from difflib import HtmlDiff
from pathlib import Path
from urllib.parse import parse_qs, quote, urlparse

import arrow
import geckodriver_autoinstaller
import ipywidgets as widgets
import jsons
import pandas as pd
import PIL
import requests
import selenium
import trafilatura as tf
from bs4 import BeautifulSoup
from IPython.display import HTML, display
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from sklearn.feature_extraction.text import TfidfVectorizer
from slugify import slugify

# Set up geckodriver up front; the screenshot code later in this file drives
# Firefox through Selenium (webdriver.Firefox).
# NOTE(review): presumably this downloads a matching geckodriver and puts it on
# the PATH when one isn't already available -- confirm against the
# geckodriver_autoinstaller documentation.
geckodriver_autoinstaller.install()

# Add styles for the diff
# NOTE(review): the string passed to HTML() is empty here, so no styles are
# actually injected -- the original <style> markup appears to have been lost
# and needs restoring for the diff table to be formatted as intended.
HTML(
    ""
)


")) display(df) # In[ ]: # STATISTICS def size_in_bytes(html): """ The HTML should already be a bytes string, so len should give us the number of bytes. """ return len(html) def number_of_words(text): """ Split text on whitespace and count resulting words. (Might include some punctuation as well.) """ try: words = len(text.split()) except AttributeError: words = 0 return words def get_summary_data(capture): """ Compile some summary statistics about a page. """ summary = { "File size (bytes)": size_in_bytes(capture["html"]), "Number of words (all text)": number_of_words(capture["text"]), "Number of words (main text)": number_of_words(capture["main_text"]), } return summary def get_summaries(page_data): """ Get summaries of all the saved pages. """ summaries = [] for capture in page_data: summaries.append(get_summary_data(capture)) return summaries def display_summaries(page_data): """ Display the summaries using Pandas. """ summaries = get_summaries(page_data) dates = get_dates(page_data) df = pd.DataFrame(summaries, index=dates) with stats_out: display(HTML("


")) # Include thousands separator display(df.head().style.format("{:,.0f}")) # In[ ]: # LINKS def link_is_local(site, href): """ Check to see if a link is internal or external by looking to see if it includes the current domain. """ # Relative urls will always be local of course if href.startswith("http") and site not in href.lower(): return False else: return True def get_site_from_url(url): # Get the current domain from the url site = re.search( r"(\d{12}|\d{14})(?:id_)*/https*://(?:.*@){0,1}(.*?)(?:\:\d*){0,1}/", url ).group(1) # Remove any wwws site = re.sub(r"^www\d*\.", "", site) return site def get_links_in_page(capture): """ Extract internal and external links from a html page. """ internal_links = [] external_links = [] site = get_site_from_url(capture["url"]) soup = BeautifulSoup(capture["html"]) links = soup.find_all("a") for link in links: try: href = link["href"] except KeyError: pass else: if link_is_local(site, href): if href not in internal_links: internal_links.append(href) else: if href not in external_links: external_links.append(href) return {"internal": internal_links, "external": external_links} def get_links(page_data): """ Extract link info from all saved pages. """ all_links = [] for capture in page_data: links = get_links_in_page(capture) all_links.append(links) return all_links def display_link_data(dates, all_links): """ Display the number of links in saved pages. """ totals = [] for links in all_links: totals.append( { "Total internal links": len(links["internal"]), "Total external links": len(links["external"]), } ) df = pd.DataFrame(totals, index=dates) display(df) def make_clickable(val): """ Make the value of a Pandas cell into a clickable link. """ return f'{val}' if val is not None else "" def list_external_links(dates, all_links): """ Display a list of external links using Pandas. 
""" # Put links into a dataframe, then transpose to make dates into columns df = pd.DataFrame([link["external"] for link in all_links], index=dates).T # Make links clickable and align left df_styler = df.style.format(make_clickable).set_properties(**{"text-align": "left"}) # Make the headers left aligned as well df_styler.set_table_styles([dict(selector="th", props=[("text-align", "left")])]) # Display without the index display(df_styler.hide(axis="index")) def display_links(page_data): """ Extract and display information about links in the saved pages. """ all_links = get_links(page_data) dates = get_dates(page_data) with links_out: display(HTML("


")) display_link_data(dates, all_links) display(HTML("

External links

")) list_external_links(dates, all_links) # In[ ]: # SIMILARITY def calculate_similarity(text1, text2): """ Calculate cosine similarity of two texts. """ try: tfidf = TfidfVectorizer(min_df=1).fit_transform([text1, text2]) except AttributeError: return None return (tfidf * tfidf.T).A[0][1] def calculate_similarities(page_data): """ Calculate cosine similarities for all the text, and the main text only, of the saved pages. """ similarities = { "All text": calculate_similarity(page_data[0]["text"], page_data[1]["text"]), "Main text": calculate_similarity( page_data[0]["main_text"], page_data[1]["main_text"] ), } return similarities def display_similarities(page_data): """ Display the similarity values. """ similarities = calculate_similarities(page_data) df = pd.DataFrame([similarities], index=["Cosine similarity"]).T with sim_out: display(HTML("

Cosine similarity

")) display(df) # In[ ]: # DIFFERENCES def process_text(capture, include="text"): """ Prepare extracted text for diffing, by splitting into lines, and removing any blank lines. """ if include == "text": lines = [ line.strip() for line in BeautifulSoup(capture["html"]).get_text().splitlines() if not re.match(r"^\s*$", line) ] # lines = capture['text'].splitlines() elif include == "main_text": lines = capture["main_text"].splitlines() else: lines = [line.decode() for line in capture["html"].splitlines()] return lines def format_date_link(url): date = format_date_from_timestamp(url) return f'{date}' def show_line_differences(page_data, include="text", context=True, numlines=0): """ Use difflib to show a side-by-side comparison of the text in two web pages. """ differ = HtmlDiff() doc1 = process_text(page_data[0], include=include) doc2 = process_text(page_data[1], include=include) date1 = format_date_link(page_data[0]["url"]) date2 = format_date_link(page_data[1]["url"]) html = differ.make_table( doc1, doc2, context=context, numlines=numlines, fromdesc=date1, todesc=date2 ) # Rewrite the table html to make the column widths work better html = html.replace( r'", "").replace("", "") with diff_out: display(HTML(html)) def display_diff(e): """ Update the diff display when the drop downs selection change. """ diff_out.clear_output(wait=True) which_text.observe(display_diff) what_context.observe(display_diff) with diff_out: display(HTML("

Differences by line

")) display(widgets.HBox([which_text, what_context])) show_line_differences( page_data, include=which_text.value, context=what_context.value ) which_text = widgets.Dropdown( options=[ ("All text", "text"), ("Main text", "main_text"), ("Complete html", "html"), ], description="Compare:", disabled=False, ) what_context = widgets.Dropdown( options=[("Just changes", True), ("Complete context", False)], description="Context:", disabled=False, ) # In[ ]: # SCREENSHOTS wayback = ["ndhadeliver.natlib.govt.nz", "web.archive.org"] pywb = { "web.archive.org.au": "replayFrame", "webarchive.nla.gov.au": "replayFrame", "webarchive.org.uk": "replay_iframe", "webarchive.nationalarchives.gov.uk" : "replay_iframe", } def get_full_page_screenshot(url, save_width=200): """ Gets a full page screenshot of the supplied url. By default resizes the screenshot to a maximum width of 200px. Provide a 'save_width' value to change this. NOTE the webdriver sometimes fails for unknown reasons. Just try again. """ domain = urlparse(url)[1].replace("www.", "") # NZ and IA inject content into the page, so we use if_ to get the original page (with rewritten urls) if domain in wayback and "if_" not in url: url = re.sub(r"/(\d{12}|\d{14})/http", r"/\1if_/http", url) try: date_str, site = re.search( r"/(\d{12}|\d{14})(?:if_|mp_)*/https*://(.+/)", url ).groups() except AttributeError: # There's something wrong with the link... 
# print(url) show_error(f"{url} isn't a Memento – did you forget to select an archive?") else: output_dir = Path("screenshots") output_dir.mkdir(parents=True, exist_ok=True) ss_file = Path(output_dir, f"{slugify(site)}-{date_str}-{save_width}.png") options = webdriver.FirefoxOptions() options.headless = True driver = webdriver.Firefox(options=options) driver.implicitly_wait(15) driver.get(url) # Give some time for everything to load time.sleep(30) driver.maximize_window() # UK and AU use pywb in framed replay mode, so we need to switch to the framed content if domain in pywb: try: driver.switch_to.frame(pywb[domain]) except selenium.common.exceptions.NoSuchFrameException: # If we pass here we'll probably still get a ss, just not full page -- better than failing? pass ss = None for tag in ["body", "html", "frameset"]: try: elem = driver.find_element(By.TAG_NAME, tag) ss = elem.screenshot_as_base64 break except ( selenium.common.exceptions.NoSuchElementException, selenium.common.exceptions.WebDriverException, ): pass driver.quit() if not ss: show_error(f"Couldn't get a screenshot of {url} – sorry...") else: img = Image.open(io.BytesIO(base64.b64decode(ss))) ratio = save_width / img.width (width, height) = (save_width, math.ceil(img.height * ratio)) resized_img = img.resize((width, height), PIL.Image.Resampling.LANCZOS) resized_img.save(ss_file) return ss_file status = widgets.Output() def display_screenshots(urls): html_output = [] with ss_out: display(HTML("


")) display(status) for url in urls: with status: print("Generating screenshot...") try: ss_file = get_full_page_screenshot(url, save_width=350) if ss_file: date = format_date_from_timestamp(url) try: display_url = re.search( r"/(\d{12}|\d{14})(?:mp_|if_|id_)*/(.*)$", url ).group(1) except AttributeError: display_url = url html_output.append( f'



' ) status.clear_output() ss_out.clear_output(wait=True) with ss_out: display(HTML("


")) display(status) display((HTML("".join(html_output)))) except selenium.common.exceptions.WebDriverException: show_error(f"couldn't get a screenshot of {url} – sorry...") def show_error(message=None): status.clear_output() with status: print(f"Something went wrong – {message}") # In[ ]: # USER INTERFACE page_data = [] TIMEGATES = { "nla": "https://web.archive.org.au/awa/", "nlnz": "https://ndhadeliver.natlib.govt.nz/webarchive/wayback/", "bl": "https://www.webarchive.org.uk/wayback/archive/", "ia": "https://web.archive.org/web/", "ukgwa": "https://webarchive.nationalarchives.gov.uk/ukgwa/" } def format_date_for_headers(iso_date, tz): """ Convert an ISO date (YYYY-MM-DD) to a datetime at noon in the specified timezone. Convert the datetime to UTC and format as required by Accet-Datetime headers: eg Fri, 23 Mar 2007 01:00:00 GMT """ local = arrow.get(f"{iso_date} 12:00:00 {tz}", "YYYY-MM-DD HH:mm:ss ZZZ") gmt = local.to("utc") return f'{gmt.format("ddd, DD MMM YYYY HH:mm:ss")} GMT' def format_date_from_timestamp(url): timestamp = re.search(r"/(\d{12}|\d{14})(?:if_|mp_|id_)*/", url).group(1) return arrow.get(timestamp, "YYYYMMDDHHmmss").format("D MMMM YYYY") def parse_links_from_headers(response): """ Extract original, timegate, timemap, and memento links from 'Link' header. """ links = response.links return {k: v["url"] for k, v in links.items()} def query_timegate(timegate, url, date=None, tz="Australia/Canberra"): """ Query the specified repository for a Memento. """ headers = {} if date: formatted_date = format_date_for_headers(date, tz) headers["Accept-Datetime"] = formatted_date # BL,NLNZ & UKGWA don't seem to default to latest date if no date supplied elif not date and timegate in ["bl", "nlnz", "ukgwa"]: formatted_date = format_date_for_headers( arrow.utcnow().format("YYYY-MM-DD"), tz ) headers["Accept-Datetime"] = formatted_date # Note that you don't get a timegate response if you leave off the trailing slash, but extras don't hurt! 
tg_url = ( f"{TIMEGATES[timegate]}{url}/" if not url.endswith("/") else f"{TIMEGATES[timegate]}{url}" ) # print(tg_url) # IA only works if redirects are followed -- this defaults to False with HEAD requests... if timegate == "ia": allow_redirects = True else: allow_redirects = False response = requests.head(tg_url, headers=headers, allow_redirects=allow_redirects) return parse_links_from_headers(response) def get_memento(timegate, url, date=None, tz="Australia/Canberra"): """ If there's no memento in the results, look for an alternative. """ links = query_timegate(timegate, url, date, tz) # NLNZ doesn't always seem to return a Memento, so we'll build in some fuzziness if links: if "memento" in links: memento = links["memento"] elif "prev memento" in links: memento = links["prev memento"] elif "next memento" in links: memento = links["next memento"] elif "last memento" in links: memento = links["last memento"] else: memento = None return memento def get_mementos(): mementos = [ get_memento(repository.value, target_url.value, first_date.value), get_memento(repository.value, target_url.value, second_date.value), ] return mementos def share_this(urls): binder_url = "https://mybinder.org/v2/gh/GLAM-Workbench/web-archives/master?urlpath=/voila/render/show_diffs.ipynb" parameter_string = quote(f"?url1={urls[0]}&url2={urls[1]}") share_url = f"{binder_url}{parameter_string}" with share_out: display(HTML(f'

Share this: {share_url}

Created by [Tim Sherratt](https://timsherratt.org) for the [GLAM Workbench](https://glam-workbench.github.io).

Work on this notebook was supported by the [IIPC Discretionary Funding Programme 2019-2020](http://netpreserve.org/projects/).