#!/usr/bin/env python
# coding: utf-8

# # National Archives Index Explorer
# 
# The National Archives provide a web based search interface for searching the index catalogues of various National Archives collections.
# 
# As well as a simple search box that does a free text search over all record columns (presumably?), we can also run advanced searches that can include reference and date limits.
# 
# Search results containing the index records for your search hits can be downloaded as a CSV file.
# 
# By searching for records associated with a particular collection tag / reference, we can obtain, and thence download, a copy of the collection's index records.
# 
# We can then load these records into our own database and search them using our own search tools, as well as annotating the records using things like named entity recognition.
# 
# So let's have a go at that...

# ## Obtaining the Index Data
# 
# Searching for index records associated with `HO-40-1` over the period `1810-15` leads us to a search results page with the URL:
# 
# `https://discovery.nationalarchives.gov.uk/results/r?_cr=HO%2040-1&_dss=range&_sd=1810&_ed=1815&_ro=any&_st=adv`
# 
# This HTTP GETs the URL `https://discovery.nationalarchives.gov.uk/results/r` with arguments:
# 
# - `_cr: 'HO 40-1'`
# - `_dss: 'range'`
# - `_sd: 1810`
# - `_ed: 1815`
# 
# (The URL also carries `_ro=any` and `_st=adv` arguments, but the code below gets by with just the four above.)
# 
# To download the data records, we then need to click a form button, rather than follow a web link.
# 
# We can automate this procedure by constructing the desired URL with appropriate arguments, ensuring the correct form download options are set, "clicking" the download button and capturing the response.

# In[1]:

# MechanicalSoup is a combination of a simple virtual browser (in the style of mechanize) and
# a web scraping package (BeautifulSoup)
import mechanicalsoup

# Define the URL of the search results and download page:

# In[2]:

url = 'https://discovery.nationalarchives.gov.uk/results/r'

# Specify the search limits around the collection we are interested in:

# In[3]:

params = {'_cr': 'HO 40-1', '_dss': 'range', '_sd': 1810, '_ed': 1815}

# Open the page with those parameters:

# In[4]:

browser = mechanicalsoup.StatefulBrowser()
browser.open(url, params=params)

# Configure the search form:

# In[5]:

browser.select_form('form[action="/search/download"]')
browser["expSize"] = "10"
#browser.get_current_form().print_summary()

# "Click" the download button:

# In[6]:

response = browser.submit_selected()

# Read the response into a *pandas* dataframe and preview the result, casting date fields into date format:

# In[7]:

# StringIO wraps a file pointer around a string
from io import StringIO

# pandas is a package for working with tabular datasets
import pandas as pd

# In[8]:

df = pd.read_csv(StringIO(response.text))

# Force the start and end date columns into a date format
df['Start Date'] = pd.to_datetime(df['Start Date'], errors='coerce', dayfirst=True)
df['End Date'] = pd.to_datetime(df['End Date'], errors='coerce', dayfirst=True)

df.head(3)

# ## Building Up A Larger Index
# 
# We can build up a larger index by extending our search, or by combining the downloads from multiple searches.
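# Before doing that, note that any of these downloaded frames can already be loaded into our own database and searched locally, as suggested in the introduction. The following cell is just a minimal sketch of one way to do that using SQLite; the database filename (`national_archives_index.db`), table name (`ho_index`) and search term are arbitrary examples, not part of the National Archives workflow.

# In[ ]:

# Sketch: persist the downloaded index to a local SQLite database and run a
# simple LIKE-based query over the Description column. Filenames, table name
# and search term below are illustrative only.
import sqlite3

conn = sqlite3.connect('national_archives_index.db')

# Write the dataframe to a table, replacing any previous copy
df.to_sql('ho_index', conn, if_exists='replace', index=False)

# A simple free-text-style search over the description text
query = """
SELECT ID, `Citable Reference`, Description
FROM ho_index
WHERE Description LIKE ?
"""
pd.read_sql(query, conn, params=['%Luddites%']).head()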
# Create a function to do the download of a single index:

# In[9]:

def get_index(reference, start=1810, end=1815, typ='ref'):
    """Download the index for a specified reference and convert it to a dataframe."""
    url = 'https://discovery.nationalarchives.gov.uk/results/r'

    params = {'_dss': 'range', '_sd': start, '_ed': end}
    if typ == 'search':
        params['_q'] = reference
    else:
        params['_cr'] = reference

    browser = mechanicalsoup.StatefulBrowser()
    browser.open(url, params=params)

    # No results
    if browser.get_current_page().find("div", {"class": "emphasis-block no-results"}):
        return pd.DataFrame()

    browser.select_form('form[action="/search/download"]')
    browser["expSize"] = "10"
    response = browser.submit_selected()

    _df = pd.read_csv(StringIO(response.text))

    # Force the start and end date columns into a date format
    _df['Start Date'] = pd.to_datetime(_df['Start Date'], errors='coerce', dayfirst=True)
    _df['End Date'] = pd.to_datetime(_df['End Date'], errors='coerce', dayfirst=True)

    return _df

# In[10]:

get_index('HO 42').head(3)

# Note that some searches against particular codes seem to be quite wide-ranging (rather than tightly scoped lookups *by reference*), and some responses also appear to contain transcripts in the `Description` field.

# In[45]:

# Pull out the first 500 characters of records longer than 2000 characters
[r[:500] for r in get_index('HO 42', typ='search')['Description'].to_list() if len(r) > 2000]

# We can now use that function to download and combine indexes for multiple references:

# In[11]:

search_references = ['HO 40-1', 'HO 40-2',
                     'HO 43-19', 'HO 43-20', 'HO 43-21',
                     'HO 42-110']

# In[41]:

df_combined = pd.DataFrame()

for reference in search_references:
    _df = get_index(reference)
    print(f'{reference}: {len(_df)}')
    # DataFrame.append is deprecated in recent pandas releases, so use pd.concat instead
    df_combined = pd.concat([df_combined, _df])

df_combined = df_combined.sort_values('Citable Reference').reset_index(drop=True)
df_combined.head()

# We can get a better view over the descriptions:

# In[13]:

df_combined['Description'].to_list()

# ## Extract Named Entities
# 
# The title field appears to be a subset of the description field (up to the first N characters).
# 
# We can parse named entities out of the description field to make searching the records easier.
# 
# The `spacy` natural language processing (NLP) package provides a named entity tagger that is good enough to get us started.

# In[14]:

import spacy

# In[15]:

# Install the package that provides the named entity model
#!python -m spacy download en_core_web_sm

# Here's an example of running the named entity tagger:

# In[16]:

nlp = spacy.load("en_core_web_sm")

TEST_STRING = "Joseph Radcliffe, wrote a letter to the Home Office on March 5th, 1812 about the Luddites."

doc = nlp(TEST_STRING)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

# *`GPE` is a "geo-political entity". There is also a related `NORP`: "nationalities or religious or political groups".* The numbers are the index values identifying the first and last character of the extracted string in the original string.
# 
# We can create a simple function to pull out the elements we want, returning a list of all elements extracted from a block of text.
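# Before writing that function, spaCy's bundled `displacy` visualiser gives a quick way to eyeball what the tagger has found in the example `doc` above. This is purely an optional check, and the `jupyter=True` flag assumes we are running in a notebook:

# In[ ]:

# Optional: render the entities found in the example doc inline in the notebook.
# Nothing downstream depends on this cell.
from spacy import displacy

displacy.render(doc, style="ent", jupyter=True)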
# In[17]:

def entity_rec(txt):
    """Extract entities from text and return a list of (entity text, entity type) tuples."""
    doc = nlp(txt)

    ents = []
    for ent in doc.ents:
        #ents.append((ent.text, ent.start_char, ent.end_char, ent.label_))
        # Exclude certain entity types from the returned list
        if ent.label_ not in ['CARDINAL']:
            ents.append((ent.text, ent.label_))

    return ents

# We can apply this function to the `Description` text associated with each row:

# In[18]:

df['Entities'] = df['Description'].apply(lambda x: entity_rec(x))
df.head(3)

# We can then generate a long format data frame that associates each entity tuple with each record, as identified by the record `ID`:

# In[19]:

df_entities = df.explode('Entities').reset_index(drop=True)[['ID', 'Entities']]
df_entities.head(3)

# We can then split out the entity tuple elements into separate columns, noting that the entity type recognition, as well as the entity extraction, may be a bit ropey:

# In[20]:

df_entities[['Entity', 'Type']] = df_entities['Entities'].apply(pd.Series)
df_entities.drop(columns='Entities', inplace=True)
df_entities.head(10)

# If we wanted to work on this a bit more, it would be handy to be able to recognise English county and place names as such. We could also try to munge any `DATE` elements through a robust date parser in order to get the dates into actual date objects (a quick sketch of that appears at the end of this notebook).
# 
# One other useful bit of information is the folio / page numbers.

# In[22]:

import re

TEST_STRING_2 = "Cheshire, Lancashire, Yorkshire ff 1-173 ff 174-283."

FF_PATTERN = r"ff \d+-\d+"

m = re.findall(FF_PATTERN, TEST_STRING_2, re.MULTILINE)
m

# Again, we can capture these into a long dataframe:

# In[32]:

df['Pages'] = df['Description'].apply(lambda x: re.findall(FF_PATTERN, x, re.MULTILINE))
df[['Description', 'Pages']].head(10)

# We can make the table longer by exploding multiple page references for any given record, and then also splitting out the first and last page reference:

# In[40]:

df_pages = df.explode('Pages').reset_index(drop=True)[['ID', 'Pages']].dropna()
df_pages[['Start', 'End']] = df_pages['Pages'].str.replace('ff', '').str.strip().str.split('-').apply(pd.Series)
df_pages.sort_values(['ID', 'Start'], inplace=True)
df_pages.reset_index(drop=True, inplace=True)
df_pages.head(10)

# ## Referencing Into Actual PDF Documents
# 
# When downloading a scanned collection from the National Archives, the scan associated with a reference, for example the scan associated with `HO 40/1`, may be split into several separate PDF documents.
# 
# We can merge these into a single document, which makes working with it slightly easier from a programmatic point of view, albeit at the cost of slightly heavier memory requirements when dealing with a particular collection...
# 
# The following cell finds the filenames of all the PDFs I downloaded as part of the `HO-40-1` download and sorts them.

# In[25]:

from os import listdir

reference = 'HO-40-1'

pdfs = [f'../HO - Home Office/{f}' for f in listdir('../HO - Home Office') if f.startswith(reference)]
pdfs.sort()
pdfs[:3]

# We can then merge all these separate PDFs into a single PDF and save it as a new file:

# In[26]:

from PyPDF2 import PdfFileMerger

merger = PdfFileMerger()

for pdf in pdfs:
    merger.append(pdf)

# Save the merged PDF
merger.write(f"{reference}_result.pdf")
merger.close()

# We can view a specified page within the merged PDF as an image, converted from the PDF using ImageMagick.
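# It can help to check how many pages the merged file actually contains before picking a page number. The following is a quick sketch using the same (older) PyPDF2 class names as the merge step above; newer releases of the library (pypdf) rename these to `PdfReader` and `len(reader.pages)`:

# In[ ]:

# Count the pages in the merged PDF so we know the valid range of page numbers.
# PdfFileReader / getNumPages are the classic PyPDF2 1.x names, matching the
# PdfFileMerger import above.
from PyPDF2 import PdfFileReader

merged_pdf = PdfFileReader(f"{reference}_result.pdf")
print(f'{reference}_result.pdf contains {merged_pdf.getNumPages()} pages.')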
# In[27]:

page_num = 500

# In[28]:

# The wand package provides a Python API for the ImageMagick application
#!pip3 install --user Wand
from wand.image import Image as WImage

print(f'Displaying at PDF page {page_num}.')

WImage(filename=f'{reference}_result.pdf[{page_num}]', resolution=200)
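# Finally, returning to the named entities: as noted earlier, the `DATE` elements could be munged through a more forgiving date parser to turn them into actual date objects. The following is a minimal sketch using `dateutil`, working over the long-format `df_entities` table built above; fuzzy parsing will happily guess at partial or garbled dates, so the results need checking by eye.

# In[ ]:

# Sketch: try to coerce the extracted DATE entities into datetime objects.
# dateutil's fuzzy parsing guesses missing components (defaulting here to
# 1st January 1800), so treat the parsed values with caution.
from datetime import datetime
from dateutil import parser as date_parser

def parse_entity_date(text, default=datetime(1800, 1, 1)):
    """Attempt to parse an extracted DATE entity string; return None on failure."""
    try:
        return date_parser.parse(text, fuzzy=True, default=default)
    except (ValueError, OverflowError):
        return None

df_dates = df_entities[df_entities['Type'] == 'DATE'].copy()
df_dates['Parsed Date'] = df_dates['Entity'].apply(parse_entity_date)
df_dates.head(10)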