See the LAC site for more details of the database and how it was created.
This notebook helps you create a dataset with records of people from a specific country.
Problems and limitations:

- You can search on the surname field but, despite the example given on the search page, there are no wildcard searches. Instead, all search terms are treated as substrings, matching anywhere in a field, so a surname search for 'Lee' returns 'Batslee', 'Fleenor', etc.
- A search only gives you access to the first 2000 results. Results seem to be ordered by item ids, which appear to be assigned alphabetically by date, but there's no way of being sure about this, so the first 2000 results are probably the earliest results.
- The wives and children of naturalised men aren't assigned a country value, so they won't be picked up by a country search. See below for a way of possibly overcoming this...

Results:
Once the harvest is complete, you can save the data as a CSV file. The CSV file will contain the following columns:
- item_id
- surname
- given_names
- country
- relation
- year
- reference
- page
- pdf_id
- pdf_url
Here are the results for a harvest of 'China':
# Import the bits and pieces that we need
import re
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm_notebook
import pandas as pd
from IPython.display import display, HTML, FileLink
# Set some variables
s = requests.Session()
SEARCH_URL = 'http://www.bac-lac.gc.ca/eng/discover/immigration/citizenship-naturalization-records/naturalized-records-1915-1951/Pages/list-naturalization-1915-1939.aspx'
ITEM_URL = 'http://www.bac-lac.gc.ca/eng/discover/immigration/citizenship-naturalization-records/naturalized-records-1915-1951/Pages/item-naturalization-1915-1939.aspx'
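Just to make the mechanics clear: requests builds the query string from a params dict, so a country search is a single GET against SEARCH_URL. Here's a quick way to preview the URL without making a network call (the parameter values are just examples; the parameter names come from the harvesting code below):

# Preview the URL that requests will build for a country search (no network call)
from requests import Request

prepared = Request('GET', SEARCH_URL, params={'CountryEn': 'China', 'p_ID': 0}).prepare()
print(prepared.url)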
# Define some functions
def process_page(soup):
    '''
    Extract data from a page of results.
    '''
    results = []
    try:
        for row in soup.find('table', class_='result_table').find('tbody').find_all('tr'):
            cells = row.find_all('td')
            results.append({
                'item_id': cells[0].get_text(),
                'surname': cells[1].string.strip(),
                'given_names': cells[2].string.strip(),
                'country': cells[3].string.strip()
            })
    except AttributeError:
        # No results
        pass
    return results
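If you want to check what process_page extracts without hitting the site, you can feed it a fragment of HTML that mimics the results table. The row below is made up for illustration; only the table structure matters:

# Try process_page on a made-up fragment shaped like the results table
sample = '''
<table class="result_table">
  <tbody>
    <tr><td>12345</td><td>Lee</td><td>Ah Bing</td><td>China</td></tr>
  </tbody>
</table>
'''
process_page(BeautifulSoup(sample, 'lxml'))
# [{'item_id': '12345', 'surname': 'Lee', 'given_names': 'Ah Bing', 'country': 'China'}]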
def process_row(soup, label):
    '''
    Get value from the row with the given label.
    '''
    try:
        value = soup.find('div', class_='genapp_item_display_label', string=re.compile(label)).find_next_sibling('div').string.strip()
    except AttributeError:
        value = ''
    return value
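Similarly, process_row can be checked against a minimal stand-in for the label/value pairs on the item pages. The real pages are more complex, but the function only relies on finding the label div and its next sibling:

# Try process_row on a minimal label/value pair
sample = '<div class="genapp_item_display_label">Year</div><div>1923</div>'
process_row(BeautifulSoup(sample, 'lxml'), 'Year')
# '1923'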
def process_item(item):
    '''
    Get data from an individual item page.
    '''
    response = s.get(ITEM_URL, params={'IdNumber': item['item_id']})
    soup = BeautifulSoup(response.text, 'lxml')
    for label in ['Year', 'Page', 'Reference', 'Relation']:
        item[label.lower()] = process_row(soup, label)
    pdf_link = soup.find('a', href=re.compile(r'&op=pdf'))
    item['pdf_id'] = pdf_link.string.strip()
    item['pdf_url'] = pdf_link['href']
    return item
def get_total_results(soup):
    '''
    Get the total number of results for a search.
    '''
    results_info = soup.find('div', class_='search_term_value').string.strip()
    total_results = re.search(r'^\d+', results_info).group(0)
    return int(total_results)
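And a quick check of get_total_results. The exact wording of the live results summary may differ from this made-up sample, but the function only needs it to start with the total:

# Try get_total_results on a made-up results summary
sample = '<div class="search_term_value">150 results found</div>'
get_total_results(BeautifulSoup(sample, 'lxml'))
# 150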
def harvest_results_by_country(country):
    '''
    Harvest search results for the supplied country.
    Return a maximum of 2000 results.
    '''
    items = []
    params = {
        'CountryEn': country,
        'p_ID': 0
    }
    response = s.get(SEARCH_URL, params=params)
    soup = BeautifulSoup(response.text, 'lxml')
    total_results = get_total_results(soup)
    with tqdm_notebook(total=total_results) as pbar:
        # Results are paginated, 15 to a page, with p_ID giving the offset
        for page in range(0, total_results, 15):
            params['p_ID'] = page
            response = s.get(SEARCH_URL, params=params)
            soup = BeautifulSoup(response.text, 'lxml')
            results = process_page(soup)
            for result in tqdm_notebook(results, leave=False):
                items.append(process_item(result))
                time.sleep(0.5)
            time.sleep(0.5)
            pbar.update(len(results))
    return items
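A full harvest makes thousands of requests, so a dropped connection could kill it partway through. One optional safeguard, not part of the original workflow, is to tell the session to retry failed requests before you start harvesting:

# Optional: retry failed requests a few times before giving up
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount('http://', HTTPAdapter(max_retries=retries))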
# Start the harvest
# Substitute your own country value here
country = 'China'
items = harvest_results_by_country(country)
df = pd.DataFrame(items)
df.head()
# How many results?
len(df)
# Reorder columns
df = df[['item_id', 'surname', 'given_names', 'country', 'relation', 'year', 'reference', 'page', 'pdf_id', 'pdf_url']]
# Change id to numeric so we can use it to order
df['item_id'] = pd.to_numeric(df['item_id'])
df = df.replace('NULL', '')
df = df.sort_values(by=['item_id'])
df.to_csv('lac-naturalisations-{}.csv'.format(country), index=False)
display(FileLink('lac-naturalisations-{}.csv'.format(country)))
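If you come back to this notebook later, you can reload the saved file rather than running the whole harvest again. The keep_default_na option just stops pandas from turning the empty strings into NaN values:

# Reload a previously harvested dataset
df = pd.read_csv('lac-naturalisations-{}.csv'.format(country), keep_default_na=False)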
As noted above, the wives and children of a naturalised man aren't assigned a country value and so will be missing from the harvested data. While there are no explicit links between a naturalised man and his family, by making a couple of assumptions we can attempt to add data for wives and children.

Assumptions:

- The records of a man's wife and children are assigned the item ids immediately following his own.
- Family members share the naturalised man's surname.
- Family members have no country value.

These assumptions are based on some random poking around in the PDFs, but I can't be sure they will hold in every case.

Based on these assumptions, the methodology for adding records is:

- Loop through the harvested records.
- For each record, search for the person's surname.
- Page through the search results looking for a record with the next sequential item id, the same surname, and no country value.
- When a match is found, harvest its details, increment the expected id, and keep looking.
- Once the run of matching records ends (or the results run out), move on to the next harvested record.
This is really slow and inefficient: if a naturalised man has no family, we end up looking through all (or the first 2000) of the results for his surname. Also, if there are more than 2000 results for a surname, we might miss family members, and there's no way of knowing...
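One possible way of reducing the duplicated work (a sketch only, not wired into the function below) would be to cache the surname search pages, so that men who share a surname don't trigger repeat requests. harvest_families could then call get_surname_page() instead of s.get():

# Sketch: cache surname search pages so repeated surnames are only fetched once
from functools import lru_cache

@lru_cache(maxsize=None)
def get_surname_page(surname, page):
    response = s.get(SEARCH_URL, params={'Surname': surname, 'p_ID': page})
    return response.text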
def harvest_families(items):
    '''
    Attempt to add family members to the search results for a given country.
    '''
    new_items = items.copy()
    for item in tqdm_notebook(items):
        # Are there more records to process?
        more = True
        # Have family members been found?
        found = False
        # Look for the id following the man's id
        current_id = int(item['item_id']) + 1
        page = 0
        while more:
            response = s.get(SEARCH_URL, params={'Surname': item['surname'], 'p_ID': page})
            soup = BeautifulSoup(response.text, 'lxml')
            # Check for results
            try:
                rows = soup.find('table', class_='result_table').find('tbody').find_all('tr')
            except AttributeError:
                # No more results pages, so stop looking
                more = False
            else:
                # Process the rows on a page
                for row in rows:
                    cells = row.find_all('td')
                    # Check that the record has a sequential id, the same surname, and no country
                    if (int(cells[0].get_text()) == current_id) and (cells[1].string.strip() == item['surname']) and (cells[3].string is None):
                        new_item = {
                            'item_id': cells[0].get_text(),
                            'surname': cells[1].string.strip(),
                            'given_names': cells[2].string.strip(),
                            'country': ''
                        }
                        new_item = process_item(new_item)
                        new_items.append(new_item)
                        current_id += 1
                        # We've found a family member
                        found = True
                        time.sleep(0.5)
                    else:
                        # If we've already found family members
                        if found:
                            # It seems there are no more family members, so let's get out of the loops
                            more = False
                            break
                page += 15
                time.sleep(0.5)
    return new_items
new_items = harvest_families(items)
df2 = pd.DataFrame(new_items)
# How many altogether now?
len(df2)
# How many wives and children are there?
len(df2.loc[df2['country'] == ''])
df2.loc[df2['country'] == ''].head()
# Just wives
df2.loc[df2['relation'] == 'Wife'].head()
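You can also break the records without a country value down by relation, to see who the extra harvest has picked up:

# Count the added family records by their relation to the naturalised man
df2.loc[df2['country'] == '', 'relation'].value_counts()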
df2 = df2[['item_id', 'surname', 'given_names', 'country', 'relation', 'year', 'reference', 'page', 'pdf_id', 'pdf_url']]
df2['item_id'] = pd.to_numeric(df2['item_id'])
df2 = df2.replace('NULL', '')
df2 = df2.sort_values(by=['item_id'])
df2.to_csv('lac-naturalisations-{}-with-families.csv'.format(country), index=False)
display(FileLink('lac-naturalisations-{}-with-families.csv'.format(country)))