#!/usr/bin/env python # coding: utf-8 # # Gathering historical data about the addition of newspaper titles to Trove # # The number of digitised newspapers available through Trove has increased dramatically since 2009. Understanding when newspapers were added is important for historiographical purposes, but there's no data about this available directly from Trove. This notebook uses web archives to extract lists of newspapers in Trove over time, and chart Trove's development. # # Trove has always provided a browseable list of digitised newspaper titles. The url and format of this list has changed over time, but it's possible to find captures of this page in the Internet Archive and extract the full list of titles. The pages are also captured in the Australian Web Archive, but the Wayback Machine has a more detailed record. # # The pages that I'm looking for are: # # * [http://trove.nla.gov.au/ndp/del/titles](https://web.archive.org/web/*/http://trove.nla.gov.au/ndp/del/titles) # * [https://trove.nla.gov.au/newspaper/about](https://web.archive.org/web/*/https://trove.nla.gov.au/newspaper/about) # # This notebook creates the following data files: # # * [trove_newspaper_titles_2009_2021.csv](https://github.com/GLAM-Workbench/trove-newspapers/blob/master/trove_newspaper_titles_2009_2021.csv) – complete dataset of captures and titles # * [trove_newspaper_titles_first_appearance_2009_2021.csv](https://github.com/GLAM-Workbench/trove-newspapers/blob/master/trove_newspaper_titles_first_appearance_2009_2021.csv) – filtered dataset, showing only the first appearance of each title / place / date range combination # # I've also created a [browseable list of titles](https://gist.github.com/wragge/7d80507c3e7957e271c572b8f664031a), showing when they first appeared in Trove. # In[1]: import json import re from pathlib import Path import altair as alt import arrow import pandas as pd import requests_cache from bs4 import BeautifulSoup from requests.adapters import HTTPAdapter from requests.packages.urllib3.util.retry import Retry from surt import surt s = requests_cache.CachedSession("archived_titles") retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504]) s.mount("https://", HTTPAdapter(max_retries=retries)) s.mount("http://", HTTPAdapter(max_retries=retries)) # ## Code for harvesting web archive captures # # We're using the Memento protocol to get a list of captures. See the [Web Archives section](https://glam-workbench.net/web-archives/) of the GLAM Workbench for more details. # In[4]: # The code in this cell is copied from notebooks in the Web Archives section of the GLAM Workbench (https://glam-workbench.net/web-archives/) # In particular see: https://glam-workbench.net/web-archives/#find-all-the-archived-versions-of-a-web-page # These are the repositories we'll be using TIMEGATES = { "awa": "https://web.archive.org.au/awa/", "nzwa": "https://ndhadeliver.natlib.govt.nz/webarchive/wayback/", "ukwa": "https://www.webarchive.org.uk/wayback/en/archive/", "ia": "https://web.archive.org/web/", } def convert_lists_to_dicts(results): """ Converts IA style timemap (a JSON array of arrays) to a list of dictionaries. Renames keys to standardise IA with other Timemaps. """ if results: keys = results[0] results_as_dicts = [dict(zip(keys, v)) for v in results[1:]] else: results_as_dicts = results for d in results_as_dicts: d["status"] = d.pop("statuscode") d["mime"] = d.pop("mimetype") d["url"] = d.pop("original") return results_as_dicts def get_capture_data_from_memento(url, request_type="head"): """ For OpenWayback systems this can get some extra capture info to insert into Timemaps. """ if request_type == "head": response = s.head(url) else: response = s.get(url) headers = response.headers length = headers.get("x-archive-orig-content-length") status = headers.get("x-archive-orig-status") status = status.split(" ")[0] if status else None mime = headers.get("x-archive-orig-content-type") mime = mime.split(";")[0] if mime else None return {"length": length, "status": status, "mime": mime} def convert_link_to_json(results, enrich_data=False): """ Converts link formatted Timemap to JSON. """ data = [] for line in results.splitlines(): parts = line.split("; ") if len(parts) > 1: link_type = re.search( r'rel="(original|self|timegate|first memento|last memento|memento)"', parts[1], ).group(1) if link_type == "memento": link = parts[0].strip("<>") timestamp, original = re.search(r"/(\d{14})/(.*)$", link).groups() capture = { "urlkey": surt(original), "timestamp": timestamp, "url": original, } if enrich_data: capture.update(get_capture_data_from_memento(link)) print(capture) data.append(capture) return data def get_timemap_as_json(timegate, url, enrich_data=False): """ Get a Timemap then normalise results (if necessary) to return a list of dicts. """ tg_url = f"{TIMEGATES[timegate]}timemap/json/{url}/" response = s.get(tg_url) response_type = response.headers["content-type"] if response_type == "text/x-ndjson": data = [json.loads(line) for line in response.text.splitlines()] elif response_type == "application/json": data = convert_lists_to_dicts(response.json()) elif response_type in ["application/link-format", "text/html;charset=utf-8"]: data = convert_link_to_json(response.text, enrich_data=enrich_data) return data # ## Harvest the title data from the Internet Archive # # This gets the web page captures from the Internet Archive, scrapes the list of titles from the page, then does a bit of normalisation of the title data. # In[6]: titles = [] # These are the pages that listed available titles. # There was a change in 2016 pages = [ {"url": "http://trove.nla.gov.au/ndp/del/titles", "path": "/ndp/del/title/"}, {"url": "https://trove.nla.gov.au/newspaper/about", "path": "/newspaper/title/"}, ] for page in pages: for capture in get_timemap_as_json("ia", page["url"]): if capture["status"] == "200": url = f'https://web.archive.org/web/{capture["timestamp"]}id_/{capture["url"]}' # print(url) capture_date = arrow.get(capture["timestamp"][:8], "YYYYMMDD").format( "YYYY-MM-DD" ) # print(capture_date) response = s.get(url) soup = BeautifulSoup(response.content) title_links = soup.find_all("a", href=re.compile(page["path"])) for title in title_links: # Get the title text full_title = title.get_text().strip() # Get the title id title_id = re.search(r"\/(\d+)\/?$", title["href"]).group(1) # Most of the code below is aimed at normalising the publication place and dates values to allow for easy grouping & deduplication brief_title = re.sub(r"\(.+\)\s*$", "", full_title).strip() try: details = re.search(r"\((.+)\)\s*$", full_title).group(1).split(":") except AttributeError: place = "" dates = "" else: try: place = details[0].strip() # Normalise states try: place = re.sub( r"(, )?([A-Za-z]+)[\.\s]*$", lambda match: f'{match.group(1) if match.group(1) else ""}{match.group(2).upper()}', place, ) except AttributeError: pass # Normalise dates dates = " - ".join( [d.strip() for d in details[1].strip().split("-")] ) except IndexError: place = "" dates = " - ".join( [d.strip() for d in details[0].strip().split("-")] ) titles.append( { "title_id": title_id, "full_title": full_title, "title": brief_title, "place": place, "dates": dates, "capture_date": capture_date, "capture_timestamp": capture["timestamp"], } ) # ## Convert the title data to a DataFrame for analysis # In[7]: df = pd.DataFrame(titles) # In[8]: df # In[9]: # Number of captures len(df["capture_timestamp"].unique()) # In[10]: # Number of days on which the pages were captured len(df["capture_date"].unique()) # Save this dataset as a CSV file. # In[11]: df.to_csv("trove_newspaper_titles_2009_2021.csv", index=False) # ## How did the number of titles change over time? # In[12]: # Drop duplicates in cases where there were mutiple captures on a single day captures_df = df.drop_duplicates(subset=["capture_date", "full_title"]) # Calculate totals per capture capture_totals = captures_df["capture_date"].value_counts().to_frame().reset_index() capture_totals.columns = ["capture_date", "total"] capture_totals # In[13]: alt.Chart(capture_totals).mark_line(point=True).encode( x=alt.X("capture_date:T", title="Date captured"), y=alt.Y("total:Q", title="Number of newspaper titles"), tooltip=[alt.Tooltip("capture_date:T", format="%e %b %Y"), "total:Q"], ).properties(width=700) # ## When did titles first appear? # # For historiographical purposes, its useful to know when a particular title first appeared in Trove. Here we'll only keep the first appearance of each title (or any subsequent changes to its date range / location). # In[14]: first_appearance = df.drop_duplicates(subset=["title", "place", "dates"]) # In[15]: first_appearance # Find when a particular newspaper first appeared. # In[16]: first_appearance.loc[first_appearance["title"] == "Canberra Times"] # Generate an alphabetical list for easy browsing. View the [results as a Gist](https://gist.github.com/wragge/7d80507c3e7957e271c572b8f664031a). # In[17]: with Path("titles_list.md").open("w") as titles_list: for title, group in first_appearance.groupby(["title", "title_id"]): places = " | ".join(group["place"].unique()) titles_list.write( f'

{title[0]} ({places})

' ) titles_list.write( group.sort_values(by="capture_date")[ ["capture_date", "dates", "place"] ].to_html(index=False) ) # Save this dataset to CSV. # In[18]: first_appearance.to_csv( "trove_newspaper_titles_first_appearance_2009_2021.csv", index=False ) # ---- # # Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/). # Support this project by becoming a [GitHub sponsor](https://github.com/sponsors/wragge?o=esb).