New to Jupyter notebooks? Try Using Jupyter notebooks for a quick introduction.
This notebook explores what we can learn by looking at all the captures of a single page over time.
Work in progress – this notebook isn't finished yet. Check back later for more...
import re
import altair as alt
import pandas as pd
import requests
def query_cdx(url, **kwargs):
    """
    Query the Wayback Machine CDX endpoint for a url and return the decoded JSON.

    Parameters:
        url: value sent as the `url` query parameter.
        **kwargs: any extra query parameters, passed through to the request as-is.

    Returns:
        The JSON-decoded body of the response.

    Raises:
        requests.HTTPError: if the server responds with an error status code.
        requests.Timeout: if the server fails to connect or respond in time.
    """
    # `url` and `output` are always set here, overriding any same-named kwarg
    params = {**kwargs, "url": url, "output": "json"}
    response = requests.get(
        "http://web.archive.org/cdx/search/cdx",
        params=params,
        headers={"User-Agent": ""},
        # requests waits forever by default; fail instead of hanging the notebook.
        # (connect timeout, read timeout) in seconds -- the read timeout is generous
        # because queries with many captures can be slow to start returning data.
        timeout=(10, 120),
    )
    response.raise_for_status()
    return response.json()
url = "http://nla.gov.au"
data = query_cdx(url)

# Convert to a dataframe
# The column names are in the first row
df = pd.DataFrame(data[1:], columns=data[0])

# Convert the timestamp string into a datetime object.
# The timestamps are digit-only strings (YYYYMMDDhhmmss), so state the format
# explicitly rather than leaving pandas to infer it.
df["date"] = pd.to_datetime(df["timestamp"], format="%Y%m%d%H%M%S")

# Put the captures in chronological order and renumber the index to match
df.sort_values(by="date", inplace=True, ignore_index=True)

# Convert the length from a string into an integer
df["length"] = df["length"].astype("int")
As noted in the notebook comparing the CDX API with Timemaps, there are a number of duplicate snapshots in the CDX results, so let's remove them.
# Report the number of rows before and after removing duplicates.
# Rows count as duplicates when all of the columns listed below match;
# the first occurrence of each is the one that's kept.
duplicate_keys = ["timestamp", "original", "digest", "statuscode", "mimetype"]

print(f"Before: {len(df)}")
df = df.drop_duplicates(subset=duplicate_keys, keep="first")
print(f"After: {len(df)}")
# Date of the earliest capture in the results
df["date"].min()
# Date of the most recent capture in the results
df["date"].max()
# Summary statistics (count, mean, min, max, quartiles) for the `length` column
df["length"].describe()
# Number of captures for each value in the `statuscode` column
df["statuscode"].value_counts()
# Number of captures for each value in the `mimetype` column
df["mimetype"].value_counts()
# Map each status code to a fixed colour so that related codes look alike.
# See https://altair-viz.github.io/user_guide/customization.html#customizing-colors
domain = ["-", "200", "301", "302", "404", "503"]
# grey for "-", green for ok, blues for redirects, reds for errors
range_ = ["#888888", "#39a035", "#5ba3cf", "#125ca4", "#e13128", "#b21218"]

status_scale = alt.Scale(domain=domain, range=range_)
status_colour = alt.Color("statuscode", scale=status_scale)

# Scatter plot of capture length over time, coloured by status code
points = alt.Chart(df).mark_point()
points.encode(
    x="date:T",
    y="length:Q",
    color=status_colour,
    tooltip=["date", "length", "statuscode"],
).properties(width=700, height=300)