See the LAC site for more details of the database and how it was created.
This notebook helps you create a dataset with records of people from a specific country.
Problems and limitations:
- You can only search on the surname field, but despite the example given on the search page, there are no wildcard searches; instead, all search terms are treated as substrings, matching anywhere in a field — so a surname search for 'Lee' returns 'Batslee', 'Fleenor' etc.
- Only the first 2000 results of a search are accessible. Results seem to be ordered by item id, which appears to be assigned alphabetically by date, but there's no way of being sure about this — so the first 2000 results are probably the earliest results.
- The wives and children of naturalised men aren't assigned a country value, so they won't be picked up by a country search — see below for a way of possibly overcoming this...

Results:
Once harvested you can save the harvested data as a CSV file. The CSV file will contain the following columns:
item_id
surname
given_names
country
relation
year
reference
page
pdf_id
pdf_url
Here are the results for a harvest of 'China':
# Import the bits and pieces that we need
import re
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm_notebook
import pandas as pd
from IPython.display import display, HTML, FileLink
# Set some variables
# Use a single requests session so connections are reused across the many requests
s = requests.Session()
# Search results page — paginated via the p_ID parameter (15 results per page)
SEARCH_URL = 'http://www.bac-lac.gc.ca/eng/discover/immigration/citizenship-naturalization-records/naturalized-records-1915-1951/Pages/list-naturalization-1915-1939.aspx'
# Individual item page — keyed by the IdNumber parameter
ITEM_URL = 'http://www.bac-lac.gc.ca/eng/discover/immigration/citizenship-naturalization-records/naturalized-records-1915-1951/Pages/item-naturalization-1915-1939.aspx'
# Define some functions
def process_page(soup):
    '''
    Extract the search results from a single results page.

    Returns a list of dicts with 'item_id', 'surname', 'given_names'
    and 'country' keys; returns an empty list if the page contains no
    results table.
    '''
    records = []
    try:
        result_rows = soup.find('table', class_='result_table').find('tbody').find_all('tr')
        for result_row in result_rows:
            cells = result_row.find_all('td')
            records.append({
                'item_id': cells[0].get_text(),
                'surname': cells[1].string.strip(),
                'given_names': cells[2].string.strip(),
                'country': cells[3].string.strip()
            })
    except AttributeError:
        # No results table (or a malformed row) — return whatever we collected
        pass
    return records
def process_row(soup, label):
    '''
    Get the display value for the row with the given label.

    Finds the label div whose text matches `label` and returns the text
    of the following div; returns an empty string if the label or its
    value can't be found.
    '''
    label_div = soup.find('div', class_='genapp_item_display_label', string=re.compile(label))
    try:
        return label_div.find_next_sibling('div').string.strip()
    except AttributeError:
        # Label not present on this page (or value div is empty)
        return ''
def process_item(item):
    '''
    Get data from an individual item page.

    Fetches the detail page for item['item_id'], adds 'year', 'page',
    'reference', 'relation', 'pdf_id' and 'pdf_url' keys to the supplied
    dict (mutating it in place), and returns it.
    '''
    response = s.get(ITEM_URL, params={'IdNumber': item['item_id']})
    soup = BeautifulSoup(response.text, 'lxml')
    for label in ['Year', 'Page', 'Reference', 'Relation']:
        item[label.lower()] = process_row(soup, label)
    pdf_link = soup.find('a', href=re.compile(r'&op=pdf'))
    # Guard against a missing PDF link so one malformed page can't abort
    # a whole harvest — fall back to empty strings, matching the
    # fallback style of process_row().
    if pdf_link is not None:
        item['pdf_id'] = pdf_link.string.strip()
        item['pdf_url'] = pdf_link['href']
    else:
        item['pdf_id'] = ''
        item['pdf_url'] = ''
    return item
def get_total_results(soup):
    '''
    Get the total number of results for a search.

    Reads the leading digits from the search summary text and returns
    them as an int.
    '''
    summary = soup.find('div', class_='search_term_value').string.strip()
    match = re.search(r'^\d+', summary)
    return int(match.group(0))
def harvest_results_by_country(country):
    '''
    Harvest search results for the supplied country.

    Return a maximum of 2000 results (the database only exposes the
    first 2000 results of any search).
    '''
    harvested = []
    params = {
        'CountryEn': country,
        'p_ID': 0
    }
    # Initial request to find out how many results there are
    # (note that page 0 is fetched again by the loop below)
    response = s.get(SEARCH_URL, params=params)
    first_soup = BeautifulSoup(response.text, 'lxml')
    total = get_total_results(first_soup)
    with tqdm_notebook(total=total) as progress:
        # Results are paginated 15 to a page via the p_ID offset
        for offset in range(0, total, 15):
            params['p_ID'] = offset
            page_response = s.get(SEARCH_URL, params=params)
            page_soup = BeautifulSoup(page_response.text, 'lxml')
            page_results = process_page(page_soup)
            for result in tqdm_notebook(page_results, leave=False):
                harvested.append(process_item(result))
                # Pause between item requests to be nice to the server
                time.sleep(0.5)
            time.sleep(0.5)
            progress.update(len(page_results))
    return harvested
# Start the harvest
# Substitute your own country value here
country = 'China'
# Returns a list of dicts, one per naturalisation record
items = harvest_results_by_country(country)
HBox(children=(IntProgress(value=0, max=482), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=15), HTML(value='')))
HBox(children=(IntProgress(value=0, max=2), HTML(value='')))
# Convert the harvested records to a DataFrame for cleaning and export
df = pd.DataFrame(items)
# Peek at the first few rows
df.head()
country | given_names | item_id | page | pdf_id | pdf_url | reference | relation | surname | year | |
---|---|---|---|---|---|---|---|---|---|---|
0 | China | Charlie | 2711 | 364 | P22-23_364 | http://central.bac-lac.gc.ca/.item/?id=P22-23_... | Canadian Gazette 1922-1923 | Fern | 1922-1923 | |
1 | China | Mah Qong | 3997 | 389 | P22-23_389 | http://central.bac-lac.gc.ca/.item/?id=P22-23_... | Canadian Gazette 1922-1923 | Hing | 1922-1923 | |
2 | China | Jim Lee | 4910 | 406 | P22-23_406 | http://central.bac-lac.gc.ca/.item/?id=P22-23_... | Canadian Gazette 1922-1923 | Ko | 1922-1923 | |
3 | China | Frank Ho | 5426 | 416 | P22-23_416 | http://central.bac-lac.gc.ca/.item/?id=P22-23_... | Canadian Gazette 1922-1923 | Lem | 1922-1923 | |
4 | China | Chin Jeng | 5560 | 419 | P22-23_419 | http://central.bac-lac.gc.ca/.item/?id=P22-23_... | Canadian Gazette 1922-1923 | Ling | 1922-1923 |
# How many results were harvested?
len(df)
482
# Put the columns into a sensible order
column_order = ['item_id', 'surname', 'given_names', 'country', 'relation', 'year', 'reference', 'page', 'pdf_id', 'pdf_url']
df = df[column_order]
# Make the id numeric so records can be sorted into their original order
df['item_id'] = pd.to_numeric(df['item_id'])
# Blank out 'NULL' placeholder values
df = df.replace('NULL', '')
df = df.sort_values(by=['item_id'])
# Save the results as a CSV file and display a download link
csv_filename = 'lac-naturalisations-{}.csv'.format(country)
df.to_csv(csv_filename, index=False)
display(FileLink(csv_filename))
As noted above, the wives and children of a naturalised man aren't assigned a country
value and so will be missing from the harvested data. While there are no explicit links between a naturalised man and his family, by making a couple of assumptions we can attempt to add data for wives and children.
Assumptions:
These assumptions are based on some random poking around in the PDFs, but I can't be sure they will hold in every case.
Based on these assumptions, the methodology for adding records is:
This is really slow and inefficient because if a naturalised man has no family we end up looking through all (or the first 2000) results for their surname. Also, if there are more than 2000 results for a surname then we might miss family members — there's no way of knowing...
def harvest_families(items):
    '''
    Attempts to add family members to the search results for a given country.

    Wives and minor children have no country value, so they're missed by a
    country search. For each naturalised man, this searches on his surname
    and looks for records that immediately follow his (sequential item ids),
    share his surname, and have no country value, assuming these are family.

    Returns a new list containing the original items plus any family
    records found; the original list is not modified.
    '''
    new_items = items.copy()
    for item in tqdm_notebook(items):
        # Are there more pages of surname results to process?
        more = True
        # Have family members been found?
        found = False
        # Look for the id following the man's id
        current_id = int(item['item_id']) + 1
        page = 0
        while more:
            response = s.get(SEARCH_URL, params={'Surname': item['surname'], 'p_ID': page})
            # Specify the lxml parser explicitly, consistent with the rest of
            # the notebook (the default parser varies between environments)
            soup = BeautifulSoup(response.text, 'lxml')
            # Check for results
            try:
                rows = soup.find('table', class_='result_table').find('tbody').find_all('tr')
            except AttributeError:
                # No results table means we've run out of pages
                more = False
            else:
                # Process the rows on a page
                for row in rows:
                    cells = row.find_all('td')
                    # Check that the record has a sequential id, the same surname, and no country
                    if (int(cells[0].get_text()) == current_id) and (cells[1].string.strip() == item['surname']) and (cells[3].string is None):
                        new_item = {
                            'item_id': cells[0].get_text(),
                            'surname': cells[1].string.strip(),
                            'given_names': cells[2].string.strip(),
                            'country': ''
                        }
                        new_item = process_item(new_item)
                        new_items.append(new_item)
                        current_id += 1
                        # We've found a family member
                        found = True
                        time.sleep(0.5)
                    else:
                        # If we've already found family members, a non-matching
                        # record means the family run has ended
                        if found:
                            # It seems there are no more family members, so let's get out of the loops
                            more = False
                            break
                page += 15
                time.sleep(0.5)
    return new_items
new_items = harvest_families(items)
HBox(children=(IntProgress(value=0, max=482), HTML(value='')))
# Convert the augmented results (men plus families) to a DataFrame
df2 = pd.DataFrame(new_items)
# How many altogether now?
len(df2)
626
# How many wives and children are there
# (family records were added with an empty country value)
len(df2.loc[df2['country'] == ''])
144
# Peek at the first few family records
df2.loc[df2['country'] == ''].head()
item_id | surname | given_names | country | relation | year | reference | page | pdf_id | pdf_url | |
---|---|---|---|---|---|---|---|---|---|---|
482 | 6485 | Mow | Chewkong Wong | Minor child | 1922-1923 | Canadian Gazette 1922-1923 | 437 | P22-23_437 | http://central.bac-lac.gc.ca/.item/?id=P22-23_... | |
483 | 7539 | Poy | Hong Auk | Minor child | 1922-1923 | Canadian Gazette 1922-1923 | 457 | P22-23_457 | http://central.bac-lac.gc.ca/.item/?id=P22-23_... | |
484 | 13606 | Foo | Mah Kwack Hong | Minor child | 1923-1924 | Canadian Gazette 1923-1924 | 282 | P23-24_282 | http://central.bac-lac.gc.ca/.item/?id=P23-24_... | |
485 | 13607 | Foo | Mah Kwack Kee | Minor child | 1923-1924 | Canadian Gazette 1923-1924 | 282 | P23-24_282 | http://central.bac-lac.gc.ca/.item/?id=P23-24_... | |
486 | 13608 | Foo | Mah Kwack Lem | Minor child | 1923-1924 | Canadian Gazette 1923-1924 | 282 | P23-24_282 | http://central.bac-lac.gc.ca/.item/?id=P23-24_... |
# Just wives — filter on the relation value extracted from the item pages
df2.loc[df2['relation'] == 'Wife'].head()
item_id | surname | given_names | country | relation | year | reference | page | pdf_id | pdf_url | |
---|---|---|---|---|---|---|---|---|---|---|
499 | 34211 | Chin | Lee Shee | Wife | 1925-1926 | Canadian Gazette 1925-1926 | 336 | P25-26_336 | http://central.bac-lac.gc.ca/.item/?id=P25-26_... | |
500 | 34810 | Dai | Mar Sea | Wife | 1925-1926 | Canadian Gazette 1925-1926 | 347 | P25-26_347 | http://central.bac-lac.gc.ca/.item/?id=P25-26_... | |
501 | 36165 | Fong | Lee See | Wife | 1925-1926 | Canadian Gazette 1925-1926 | 373 | P25-26_373 | http://central.bac-lac.gc.ca/.item/?id=P25-26_... | |
502 | 36168 | Food | Lem | Wife | 1925-1926 | Canadian Gazette 1925-1926 | 373 | P25-26_373 | http://central.bac-lac.gc.ca/.item/?id=P25-26_... | |
503 | 37593 | Hing | Aarr Pong | Wife | 1925-1926 | Canadian Gazette 1925-1926 | 400 | P25-26_400 | http://central.bac-lac.gc.ca/.item/?id=P25-26_... |
# Apply the same tidying as before, now including the family records
ordered_columns = ['item_id', 'surname', 'given_names', 'country', 'relation', 'year', 'reference', 'page', 'pdf_id', 'pdf_url']
df2 = df2[ordered_columns]
# Numeric ids so the records sort into their original order
df2['item_id'] = pd.to_numeric(df2['item_id'])
# Blank out 'NULL' placeholder values
df2 = df2.replace('NULL', '')
df2 = df2.sort_values(by=['item_id'])
# Save to CSV and display a download link
families_csv = 'lac-naturalisations-{}-with-families.csv'.format(country)
df2.to_csv(families_csv, index=False)
display(FileLink(families_csv))