#!/usr/bin/env python
# coding: utf-8

# # National Archives Index Explorer
# 
# The National Archives provide a web based search interface for searching the index catalogues of various National Archives collections.
# 
# As well as a simple search box that does a free text search over all record columns (presumably?), we can also run advanced searches that can include reference and date limits.
# 
# Search results containing the index records for your search hits can be downloaded as a CSV file.
# 
# By searching for records associated with a particular collection tag / reference, we can obtain, and thence download, a copy of the collection's index records.
# 
# We can then load these records into our own database and search them using our own search tools, as well as annotating the records using things like named entity recognition.
# 
# So let's have a go at that...

# ## Obtaining the Index Data
# 
# Searching for index records associated with `HO-40-1` over the period `1810-15` leads us to a search results page with the URL:
# 
# `https://discovery.nationalarchives.gov.uk/results/r?_cr=HO%2040-1&_dss=range&_sd=1810&_ed=1815&_ro=any&_st=adv`
# 
# This HTTP GETs the URL `https://discovery.nationalarchives.gov.uk/results/r` with arguments:
# 
# - `_cr: 'HO 40-1'`
# - `_dss: 'range'`
# - `_sd: 1810`
# - `_ed: 1815`
# 
# (The URL also carries `_ro=any` and `_st=adv` arguments, but the code below gets by with just the four above.)
# 
# To download the data records, we then need to click a form button, rather than follow a web link.
# 
# We can automate this procedure by constructing the desired URL with appropriate arguments, ensuring the correct form download options are set, "clicking" the download button and capturing the response.

# In[1]:

# MechanicalSoup is a combination of a simple virtual browser (in the style of mechanize) and
# a web scraping package (BeautifulSoup)
import mechanicalsoup

# Define the URL of the search results and download page:

# In[2]:

url = 'https://discovery.nationalarchives.gov.uk/results/r'

# Specify the search limits around the collection we are interested in:

# In[3]:

params = {'_cr': 'HO 40-1', '_dss': 'range', '_sd': 1810, '_ed': 1815}

# Open the page with those parameters:

# In[4]:

browser = mechanicalsoup.StatefulBrowser()
browser.open(url, params=params)

# Configure the search form:

# In[5]:

browser.select_form('form[action="/search/download"]')
browser["expSize"] = "10"
#browser.get_current_form().print_summary()

# "Click" the download button:

# In[6]:

response = browser.submit_selected()

# Read the response into a *pandas* dataframe and preview the result, casting date fields into date format:

# In[7]:

# StringIO wraps a file pointer around a string
from io import StringIO

# pandas is a package for working with tabular datasets
import pandas as pd

# In[8]:

df = pd.read_csv(StringIO(response.text))

# Force the start and end date columns into a date format
df['Start Date'] = pd.to_datetime(df['Start Date'], errors='coerce', dayfirst=True)
df['End Date'] = pd.to_datetime(df['End Date'], errors='coerce', dayfirst=True)

df.head(3)

# ## Building Up A Larger Index
# 
# We can build up a larger index by extending our search, or by combining the downloads from multiple searches.
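# Before doing that, note that any of these downloaded frames can already be loaded into our own database and searched locally, as suggested in the introduction. The following cell is just a minimal sketch of one way to do that using SQLite; the database filename (`national_archives_index.db`), table name (`ho_index`) and search term are arbitrary examples, not part of the National Archives workflow.

# In[ ]:

# Sketch: persist the downloaded index to a local SQLite database and run a
# simple LIKE-based query over the Description column. Filenames, table name
# and search term below are illustrative only.
import sqlite3

conn = sqlite3.connect('national_archives_index.db')

# Write the dataframe to a table, replacing any previous copy
df.to_sql('ho_index', conn, if_exists='replace', index=False)

# A simple free-text-style search over the description text
query = """
SELECT ID, `Citable Reference`, Description
FROM ho_index
WHERE Description LIKE ?
"""
pd.read_sql(query, conn, params=['%Luddites%']).head()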
# Create a function to do the download of a single index:

# In[9]:

def get_index(reference, start=1810, end=1815, typ='ref'):
    """Download the index for a specified reference and convert it to a dataframe."""
    url = 'https://discovery.nationalarchives.gov.uk/results/r'

    params = {'_dss': 'range', '_sd': start, '_ed': end}
    if typ == 'search':
        params['_q'] = reference
    else:
        params['_cr'] = reference

    browser = mechanicalsoup.StatefulBrowser()
    browser.open(url, params=params)

    # No results
    if browser.get_current_page().find("div", {"class": "emphasis-block no-results"}):
        return pd.DataFrame()

    browser.select_form('form[action="/search/download"]')
    browser["expSize"] = "10"
    response = browser.submit_selected()

    _df = pd.read_csv(StringIO(response.text))

    # Force the start and end date columns into a date format
    _df['Start Date'] = pd.to_datetime(_df['Start Date'], errors='coerce', dayfirst=True)
    _df['End Date'] = pd.to_datetime(_df['End Date'], errors='coerce', dayfirst=True)

    return _df

# In[10]:

get_index('HO 42').head(3)

# Note that some searches against particular codes seem to be quite wide-ranging (rather than tightly scoped lookups *by reference*), and some responses also appear to contain transcripts in the `Description` field.

# In[45]:

# Pull out the first 500 characters of records longer than 2000 characters
[r[:500] for r in get_index('HO 42', typ='search')['Description'].to_list() if len(r) > 2000]

# We can now use that function to download and combine indexes for multiple references:

# In[11]:

search_references = ['HO 40-1', 'HO 40-2',
                     'HO 43-19', 'HO 43-20', 'HO 43-21',
                     'HO 42-110']

# In[41]:

df_combined = pd.DataFrame()

for reference in search_references:
    _df = get_index(reference)
    print(f'{reference}: {len(_df)}')
    # DataFrame.append is deprecated in recent pandas releases, so use pd.concat instead
    df_combined = pd.concat([df_combined, _df])

df_combined = df_combined.sort_values('Citable Reference').reset_index(drop=True)
df_combined.head()

# We can get a better view over the descriptions:

# In[13]:

df_combined['Description'].to_list()

# ## Extract Named Entities
# 
# The title field appears to be a subset of the description field (up to the first N characters).
# 
# We can parse named entities out of the description field to make searching the records easier.
# 
# The `spacy` natural language processing (NLP) package provides a named entity tagger that is good enough to get us started.

# In[14]:

import spacy

# In[15]:

# Install the package that provides the named entity model
#!python -m spacy download en_core_web_sm

# Here's an example of running the named entity tagger:

# In[16]:

nlp = spacy.load("en_core_web_sm")

TEST_STRING = "Joseph Radcliffe, wrote a letter to the Home Office on March 5th, 1812 about the Luddites."

doc = nlp(TEST_STRING)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

# *`GPE` is a "geo-political entity". There is also a related `NORP`: "nationalities or religious or political groups".* The numbers are the index values identifying the first and last character of the extracted string in the original string.
# 
# We can create a simple function to pull out the elements we want, returning a list of all elements extracted from a block of text.
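# Before writing that function, spaCy's bundled `displacy` visualiser gives a quick way to eyeball what the tagger has found in the example `doc` above. This is purely an optional check, and the `jupyter=True` flag assumes we are running in a notebook:

# In[ ]:

# Optional: render the entities found in the example doc inline in the notebook.
# Nothing downstream depends on this cell.
from spacy import displacy

displacy.render(doc, style="ent", jupyter=True)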
# In[17]:

def entity_rec(txt):
    """Extract entities from text and return a list of (entity text, entity type) tuples."""
    doc = nlp(txt)

    ents = []
    for ent in doc.ents:
        #ents.append((ent.text, ent.start_char, ent.end_char, ent.label_))
        # Exclude certain entity types from the returned list
        if ent.label_ not in ['CARDINAL']:
            ents.append((ent.text, ent.label_))

    return ents

# We can apply this function to the `Description` text associated with each row:

# In[18]:

df['Entities'] = df['Description'].apply(lambda x: entity_rec(x))
df.head(3)

# We can then generate a long format data frame that associates each entity tuple with each record, as identified by the record `ID`:

# In[19]:

df_entities = df.explode('Entities').reset_index(drop=True)[['ID', 'Entities']]
df_entities.head(3)

# We can then split out the entity tuple elements into separate columns, noting that the entity type recognition, as well as the entity extraction, may be a bit ropey:

# In[20]:

df_entities[['Entity', 'Type']] = df_entities['Entities'].apply(pd.Series)
df_entities.drop(columns='Entities', inplace=True)
df_entities.head(10)

# If we wanted to work on this a bit more, it would be handy to be able to recognise English county and place names as such. We could also try to munge any `DATE` elements through a robust date parser in order to get the dates into actual date objects (a quick sketch of that appears at the end of this notebook).
# 
# One other useful bit of information is the folio / page numbers.

# In[22]:

import re

TEST_STRING_2 = "Cheshire, Lancashire, Yorkshire ff 1-173 ff 174-283."

FF_PATTERN = r"ff \d+-\d+"

m = re.findall(FF_PATTERN, TEST_STRING_2, re.MULTILINE)
m

# Again, we can capture these into a long dataframe:

# In[32]:

df['Pages'] = df['Description'].apply(lambda x: re.findall(FF_PATTERN, x, re.MULTILINE))
df[['Description', 'Pages']].head(10)

# We can make the table longer by exploding multiple page references for any given record, and then also splitting out the first and last page reference:

# In[40]:

df_pages = df.explode('Pages').reset_index(drop=True)[['ID', 'Pages']].dropna()
df_pages[['Start', 'End']] = df_pages['Pages'].str.replace('ff', '').str.strip().str.split('-').apply(pd.Series)
df_pages.sort_values(['ID', 'Start'], inplace=True)
df_pages.reset_index(drop=True, inplace=True)
df_pages.head(10)

# ## Referencing Into Actual PDF Documents
# 
# When downloading a scanned collection from the National Archives, the scan associated with a reference, for example the scan associated with `HO 40/1`, may be split into several separate PDF documents.
# 
# We can merge these into a single document, which makes working with it slightly easier from a programmatic point of view, albeit at the cost of slightly heavier memory requirements when dealing with a particular collection...
# 
# The following cell finds the filenames of all the PDFs I downloaded as part of the `HO-40-1` download and sorts them.

# In[25]:

from os import listdir

reference = 'HO-40-1'

pdfs = [f'../HO - Home Office/{f}' for f in listdir('../HO - Home Office') if f.startswith(reference)]
pdfs.sort()
pdfs[:3]

# We can then merge all these separate PDFs into a single PDF and save it as a new file:

# In[26]:

from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()

for pdf in pdfs:
    merger.append(pdf)

# Save the merged PDF
merger.write(f"{reference}_result.pdf")
merger.close()

# We can view a specified page within the merged PDF as an image, converted from the PDF using ImageMagick.
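# It can help to check how many pages the merged file actually contains before picking a page number. The following is a quick sketch using the same (older) PyPDF2 class names as the merge step above; newer releases of the library (pypdf) rename these to `PdfReader` and `len(reader.pages)`:

# In[ ]:

# Count the pages in the merged PDF so we know the valid range of page numbers.
# PdfFileReader / getNumPages are the classic PyPDF2 1.x names, matching the
# PdfFileMerger import above.
from PyPDF2 import PdfFileReader

merged_pdf = PdfFileReader(f"{reference}_result.pdf")
print(f'{reference}_result.pdf contains {merged_pdf.getNumPages()} pages.')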
# In[27]:

page_num = 500

# In[28]:

# The wand package provides a Python API for the ImageMagick application
#!pip3 install --user Wand
from wand.image import Image as WImage

print(f'Displaying at PDF page {page_num}.')

WImage(filename=f'{reference}_result.pdf[{page_num}]', resolution=200)
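# Finally, returning to the named entities: as noted earlier, the `DATE` elements could be munged through a more forgiving date parser to turn them into actual date objects. The following is a minimal sketch using `dateutil`, working over the long-format `df_entities` table built above; fuzzy parsing will happily guess at partial or garbled dates, so the results need checking by eye.

# In[ ]:

# Sketch: try to coerce the extracted DATE entities into datetime objects.
# dateutil's fuzzy parsing guesses missing components (defaulting here to
# 1st January 1800), so treat the parsed values with caution.
from datetime import datetime
from dateutil import parser as date_parser

def parse_entity_date(text, default=datetime(1800, 1, 1)):
    """Attempt to parse an extracted DATE entity string; return None on failure."""
    try:
        return date_parser.parse(text, fuzzy=True, default=default)
    except (ValueError, OverflowError):
        return None

df_dates = df_entities[df_entities['Type'] == 'DATE'].copy()
df_dates['Parsed Date'] = df_dates['Entity'].apply(parse_entity_date)
df_dates.head(10)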