#!/usr/bin/env python
# coding: utf-8

# # Library and Archives Canada, Naturalization Records, 1915-1946
# ## Harvest records by country
#
# See the LAC site for more [details of the database](http://www.bac-lac.gc.ca/eng/discover/immigration/citizenship-naturalization-records/naturalized-records-1915-1951/Pages/introduction.aspx) and how it was created.
#
# This notebook helps you create a dataset with records of people from a specific country.
#
# **Problems and limitations:**
#
# * the database returns a **maximum of 2000 results** for any query;
# * I'd thought you might be able to get around this by using wildcard searches in the `surname` field, but despite the example given on the search page, there are no wildcard searches; instead, all search terms are treated as substrings, matching anywhere in a field — so a surname search for 'Lee' returns 'Batslee', 'Fleenor', etc.;
# * it seems that results are ordered by the `item id`, which appears to have been assigned in date order, but there's no way of being sure about this — so the first 2000 results are *probably* the earliest results;
# * there doesn't seem to be any way of finding out what country names (or variations thereof) are in use, so you need to play around with the web interface first to find out which values work;
# * wives and children of a naturalised man are not assigned a `country` value, so they won't be picked up by a `country` search — see below for a way of possibly overcoming this...
#
# **Results:**
#
# Once harvested, you can save the harvested data as a CSV file. The CSV file will contain the following columns:
#
# * `item_id`
# * `surname`
# * `given_names`
# * `country`
# * `relation`
# * `year`
# * `reference`
# * `page`
# * `pdf_id`
# * `pdf_url`
#
# Here are the results for a harvest of 'China':
#
# * Search for 'China' — [lac-naturalisations-China.csv](lac-naturalisations-China.csv)
# * Search for 'China' supplemented with family members — [lac-naturalisations-China-with-families.csv](lac-naturalisations-China-with-families.csv)

# ### Setting things up

# In[162]:

# Import the bits and pieces that we need
import re
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm_notebook
import pandas as pd
from IPython.display import display, HTML, FileLink


# In[22]:

# Set some variables
s = requests.Session()
SEARCH_URL = 'http://www.bac-lac.gc.ca/eng/discover/immigration/citizenship-naturalization-records/naturalized-records-1915-1951/Pages/list-naturalization-1915-1939.aspx'
ITEM_URL = 'http://www.bac-lac.gc.ca/eng/discover/immigration/citizenship-naturalization-records/naturalized-records-1915-1951/Pages/item-naturalization-1915-1939.aspx'


# In[81]:

# Define some functions

def process_page(soup):
    '''
    Extract data from a page of results.
    '''
    results = []
    try:
        for row in soup.find('table', class_='result_table').find('tbody').find_all('tr'):
            cells = row.find_all('td')
            results.append({
                'item_id': cells[0].get_text(),
                'surname': cells[1].string.strip(),
                'given_names': cells[2].string.strip(),
                'country': cells[3].string.strip()
            })
    except AttributeError:
        # No results table on this page
        pass
    return results


def process_row(soup, label):
    '''
    Get the value from the row with the given label.
    '''
    try:
        value = soup.find('div', class_='genapp_item_display_label', string=re.compile(label)).find_next_sibling('div').string.strip()
    except AttributeError:
        value = ''
    return value


def process_item(item):
    '''
    Get data from an individual item page.
    '''
    response = s.get(ITEM_URL, params={'IdNumber': item['item_id']})
    soup = BeautifulSoup(response.text, 'lxml')
    for label in ['Year', 'Page', 'Reference', 'Relation']:
        item[label.lower()] = process_row(soup, label)
    pdf_link = soup.find('a', href=re.compile(r'&op=pdf'))
    item['pdf_id'] = pdf_link.string.strip()
    item['pdf_url'] = pdf_link['href']
    return item


def get_total_results(soup):
    '''
    Get the total number of results for a search.
    '''
    results_info = soup.find('div', class_='search_term_value').string.strip()
    total_results = re.search(r'^\d+', results_info).group(0)
    return int(total_results)


def harvest_results_by_country(country):
    '''
    Harvest search results for the supplied country.
    Returns a maximum of 2000 results.
    '''
    items = []
    params = {
        'CountryEn': country,
        'p_ID': 0
    }
    response = s.get(SEARCH_URL, params=params)
    soup = BeautifulSoup(response.text, 'lxml')
    total_results = get_total_results(soup)
    with tqdm_notebook(total=total_results) as pbar:
        # Results are paginated in groups of 15
        for page in range(0, total_results, 15):
            params['p_ID'] = page
            response = s.get(SEARCH_URL, params=params)
            soup = BeautifulSoup(response.text, 'lxml')
            results = process_page(soup)
            for result in tqdm_notebook(results, leave=False):
                items.append(process_item(result))
                time.sleep(0.5)
            time.sleep(0.5)
            pbar.update(len(results))
    return items
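# Since there's no way of listing the country values the database uses, it can be
# worth checking how many results a candidate value returns before committing to a
# full (and slow) harvest. Here's a minimal sketch using the functions defined
# above; the country values tried below are just examples, so substitute whatever
# variations you want to test.

# In[ ]:

def count_results(country):
    '''
    Return the total number of results for a country value, without harvesting anything.
    '''
    response = s.get(SEARCH_URL, params={'CountryEn': country, 'p_ID': 0})
    soup = BeautifulSoup(response.text, 'lxml')
    try:
        return get_total_results(soup)
    except AttributeError:
        # No results summary found, so probably an unrecognised country value
        return 0

# Try a few values to see which ones the database recognises
for test_country in ['China', 'Greece', 'Syria']:
    print(test_country, count_results(test_country))
    time.sleep(0.5)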
# ### Running the harvest

# In[96]:

# Start the harvest
# Substitute your own country value here
country = 'China'
items = harvest_results_by_country(country)


# ### Viewing the results

# In[155]:

df = pd.DataFrame(items)


# In[156]:

df.head()


# In[157]:

# How many results?
len(df)


# ### Save as CSV

# In[164]:

# Reorder columns
df = df[['item_id', 'surname', 'given_names', 'country', 'relation', 'year', 'reference', 'page', 'pdf_id', 'pdf_url']]
# Change id to numeric so we can use it to order the results
df['item_id'] = pd.to_numeric(df['item_id'])
df = df.replace('NULL', '')
df = df.sort_values(by=['item_id'])
df.to_csv('lac-naturalisations-{}.csv'.format(country), index=False)
display(FileLink('lac-naturalisations-{}.csv'.format(country)))


# ### Adding wives and children
#
# As noted above, the wives and children of a naturalised man aren't assigned a `country` value and so will be missing from the harvested data. While there are no explicit links between a naturalised man and his family, by making a couple of assumptions we can attempt to add data for wives and children.
#
# Assumptions:
#
# * wives and children will have the same surname as the naturalised man;
# * wives and children will appear immediately after the naturalised man in the registers and will therefore be assigned sequential item ids.
#
# These assumptions are based on some random poking around in the PDFs, but I can't be sure they will hold in every case.
#
# Based on these assumptions, the methodology for adding records is:
#
# * loop through all harvested records;
# * for each record, search for the surname;
# * process the search results — as everything's treated as a substring and there's no relevance ranking, we have to go through all the results, which seems really inefficient, but...
# * if the surnames match, the ids are sequential, and the country field is empty, then it looks like we have a family member — grab the full details for this record (see the sketch of this test just below).
#
# This is really slow and inefficient, because if a naturalised man has no family we end up looking through all (or the first 2000) results for his surname. Also, if there are more than 2000 results for a surname then we might miss family members — there's no way of knowing...
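# Before the full implementation, here's a minimal sketch of the matching test
# described above, run against some made-up rows. The names and ids below are
# invented purely for illustration.

# In[ ]:

def looks_like_family(candidate, item, expected_id):
    '''
    Apply the three tests described above: same surname, sequential id, no country.
    '''
    return (int(candidate['item_id']) == expected_id
            and candidate['surname'] == item['surname']
            and not candidate['country'])

# A naturalised man from the harvest (invented example)
man = {'item_id': '1001', 'surname': 'Lee', 'given_names': 'Wing', 'country': 'China'}

# Candidate rows from a surname search (also invented); note that the last row
# is the sort of substring match a search for 'Lee' would return
candidates = [
    {'item_id': '1002', 'surname': 'Lee', 'given_names': 'Mary', 'country': ''},
    {'item_id': '1003', 'surname': 'Lee', 'given_names': 'George', 'country': ''},
    {'item_id': '2345', 'surname': 'Batslee', 'given_names': 'John', 'country': ''},
]

expected_id = int(man['item_id']) + 1
for candidate in candidates:
    if looks_like_family(candidate, man, expected_id):
        print('Family member:', candidate['given_names'])
        expected_id += 1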
# In[105]:

def harvest_families(items):
    '''
    Attempt to add family members to the search results for a given country.
    '''
    new_items = items.copy()
    for item in tqdm_notebook(items):
        # Are there more records to process?
        more = True
        # Have family members been found?
        found = False
        # Look for the id following the man's id
        current_id = int(item['item_id']) + 1
        page = 0
        while more:
            response = s.get(SEARCH_URL, params={'Surname': item['surname'], 'p_ID': page})
            soup = BeautifulSoup(response.text, 'lxml')
            # Check for results
            try:
                rows = soup.find('table', class_='result_table').find('tbody').find_all('tr')
            except AttributeError:
                # No results table, so no more pages
                more = False
            else:
                # Process the rows on a page
                for row in rows:
                    cells = row.find_all('td')
                    # Check that the record has a matching surname, a sequential id, and no country
                    if (int(cells[0].get_text()) == current_id) and (cells[1].string.strip() == item['surname']) and (cells[3].string is None):
                        new_item = {
                            'item_id': cells[0].get_text(),
                            'surname': cells[1].string.strip(),
                            'given_names': cells[2].string.strip(),
                            'country': ''
                        }
                        new_item = process_item(new_item)
                        new_items.append(new_item)
                        current_id += 1
                        # We've found a family member
                        found = True
                        time.sleep(0.5)
                    else:
                        # If we've already found family members
                        if found:
                            # It seems there are no more family members, so let's get out of the loops
                            more = False
                            break
                page += 15
            time.sleep(0.5)
    return new_items


# In[106]:

new_items = harvest_families(items)


# ### Let's look at the new dataset

# In[120]:

df2 = pd.DataFrame(new_items)


# In[143]:

# How many altogether now?
len(df2)


# In[145]:

# How many wives and children are there?
len(df2.loc[df2['country'] == ''])


# In[139]:

df2.loc[df2['country'] == ''].head()


# In[140]:

# Just wives
df2.loc[df2['relation'] == 'Wife'].head()


# ### Save to CSV

# In[163]:

df2 = df2[['item_id', 'surname', 'given_names', 'country', 'relation', 'year', 'reference', 'page', 'pdf_id', 'pdf_url']]
df2['item_id'] = pd.to_numeric(df2['item_id'])
df2 = df2.replace('NULL', '')
df2 = df2.sort_values(by=['item_id'])
df2.to_csv('lac-naturalisations-{}-with-families.csv'.format(country), index=False)
display(FileLink('lac-naturalisations-{}-with-families.csv'.format(country)))
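# As a quick sanity check on the combined dataset, you can look at the breakdown
# of `relation` values for the records added by `harvest_families()` (those with
# an empty `country`). This assumes the item pages use values like 'Wife', as
# shown above; whatever other values appear will show up in the counts.

# In[ ]:

# Breakdown of relation values for the added family members
df2.loc[df2['country'] == '', 'relation'].value_counts()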