Finding all the faces in the Tribune collection

This notebook runs a facial detection script across the whole Tribune collection. It saves cropped versions of all the detected faces, and creates a data file recording the number of faces detected per image.

If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!

Some tips:

  • Code cells have boxes around them. When you hover over them a run icon appears.
  • To run a code cell either click the icon, or click on the cell and then hit Shift+Enter. The Shift+Enter combo will also move you to the next cell, so it's a quick way to work through the notebook.
  • While a cell is running a * appears in the square brackets next to the cell. Once the cell has finished running the asterisk will be replaced with a number.
  • In most cases you'll want to start from the top of the notebook and work your way down, running each cell in turn. Later cells might depend on the results of earlier ones.
  • To edit a code cell, just click on it and type stuff. Remember to run the cell once you've finished editing.

In [ ]:
import cv2
import pandas as pd
import os
from urllib.parse import urlparse
import requests
from IPython.display import display, HTML
import copy
from tqdm import tqdm_notebook
import altair as alt
In [ ]:
# Load Tribune images data
# NOTE(review): the location passed to read_csv is an empty string, so this call
# will fail as written -- fill in the path or URL of the Tribune images CSV.
# A later cell reads an 'images' column from this dataframe.
df = pd.read_csv('')
In [ ]:
# Load the pre-trained Haar cascade used for frontal face detection.
# The directory part of the path was missing (a bare `+ '...'` applied to a
# string raises TypeError), so use the cascade files that ship with the
# opencv-python package.
face_cl = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
# CascadeClassifier does not raise when the file cannot be loaded; check
# explicitly so a bad path does not surface later as "no faces found".
if face_cl.empty():
    raise IOError('Could not load haarcascade_frontalface_default.xml')

def download_image(img_url):
    """
    Download and save the specified image.

    The file is written to a 'downloads' directory inside the current working
    directory (created if it doesn't exist) and named after the last segment
    of the URL's path.

    Parameters:
        img_url: URL of the image to fetch.

    Returns:
        The path of the saved file.
    """
    download_dir = os.path.join(os.getcwd(), 'downloads')
    os.makedirs(download_dir, exist_ok=True)
    parsed = urlparse(img_url)
    filename = os.path.join(download_dir, os.path.basename(parsed.path))
    # NOTE: the HTTP status is not checked, so an error response body would be
    # saved under the image's name.
    response = requests.get(img_url, stream=True)
    with open(filename, 'wb') as fd:
        # Stream the response to disk in chunks instead of holding it in memory
        for chunk in response.iter_content(chunk_size=128):
            fd.write(chunk)
    return filename

def detect_faces(img_file, save_annotated=True):
    """
    Use OpenCV to find faces.

    Every detected face is cropped and saved in the 'faces' directory as
    '<image name without extension>-<n>.jpg', with n counting from 1.

    Parameters:
        img_file: path of the image to analyse.
        save_annotated: if True, overwrite img_file with a copy that has a
            green box drawn around each detected face.

    Returns:
        The detected faces as (x, y, w, h) rectangles. Empty if the image
        could not be read or OpenCV failed while processing it.
    """
    faces = []
    os.makedirs('faces', exist_ok=True)
    try:
        image = cv2.imread(img_file)
        if image is None:
            # imread returns None (rather than raising) for missing or unreadable files
            return faces
        # Create a greyscale copy for face detection (imread returns BGR channel order)
        grey = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Find faces!
        # Try adjusting scaleFactor and minNeighbors if results aren't what you expect.
        faces = face_cl.detectMultiScale(grey, scaleFactor=1.3, minNeighbors=4, minSize=(50, 50))
    except cv2.error:
        # Best effort: an image OpenCV can't process counts as having no faces
        return faces
    # Create a copy to annotate
    results = image.copy()
    image_name = os.path.splitext(os.path.basename(img_file))[0]
    for f, (x, y, w, h) in enumerate(faces, 1):
        # Save a cropped version of the detected face
        face = image[y: y + h, x: x + w]
        cv2.imwrite(os.path.join('faces', '{}-{}.jpg'.format(image_name, f)), face)
        # Draw a green box on the complete image
        cv2.rectangle(results, (x, y), (x + w, y + h), (0, 255, 0), 2)
    # Save the annotated image
    if save_annotated:
        cv2.imwrite(img_file, results)
    return faces

def process_images(images):
    """
    Find faces in a list of images.
    Displays the results

    Parameters:
        images: iterable of (img_id, img_url) pairs. Each image is downloaded,
            annotated in place, and shown with thumbnails of its cropped faces.
    """
    for img_id, img_url in images:
        filename = download_image(img_url)
        faces = detect_faces(filename)
        image_name = os.path.basename(filename)
        # detect_faces names its crops after the file's stem, so build the
        # thumbnail links from the same value.
        stem = os.path.splitext(image_name)[0]
        # NOTE(review): the "More details" href has no base URL in front of the id -- confirm.
        html = '<image src="downloads/{}"><br><a target="_blank" href="{}&embedded=true&toolbar=false">More details at SLNSW</a><br>'.format(image_name, img_id)
        print('I found {} faces...'.format(len(faces)))
        for i in range(1, len(faces) + 1):
            html += '<a target="_blank" href="faces/{0}-{1}.jpg"><image style="width: 100px; height: 100px; float: left; margin: 10px; object-fit: scale-down;" src="faces/{0}-{1}.jpg"></a>'.format(stem, i)
        # Render the annotated image and face thumbnails in the notebook
        display(HTML(html))
def get_image_by_id(img_id):
    """
    Process a specific image.

    Parameters:
        img_id: identifier of the image; it is used to build the image's URL
            and is passed along for the details link.
    """
    # NOTE(review): the URL template has no base address before the id -- confirm.
    images = [(img_id, '{0}.jpg'.format(img_id))]
    # Hand the single (id, url) pair to the shared download/detect/display routine
    process_images(images)
In [ ]:
def download_and_detect():
    """
    Download ALL THE IMAGES and look for faces.
    This will take a long time and use up lots of disk space.
    You can just take a slice of the images list to test it out.

    Reads the image identifiers from the 'images' column of the module-level
    dataframe `df`.

    Returns:
        A list of {'image': <file name>, 'faces': <number of faces>} dicts,
        one per image.
    """
    face_data = []
    images = df['images'].tolist()
    for image_id in tqdm_notebook(images):
        # NOTE(review): the URL template has no base address before the id -- confirm.
        img_url = '{0}.jpg'.format(image_id)
        img_file = download_image(img_url)
        faces = detect_faces(img_file, save_annotated=False)
        # Record the saved file's name, as detect_all() does for local images
        face_data.append({'image': os.path.basename(img_file), 'faces': len(faces)})
    return face_data
In [ ]:
def detect_all(image_dir='/Volumes/bigdata/mydata/SLNSW/Tribune/images'):
    """
    I've already got copies of all the images, so I'll just point the script at them.

    Parameters:
        image_dir: directory containing the '.jpg' images to scan. Defaults to
            the author's local copy of the collection.

    Returns:
        A list of {'image': <file name>, 'faces': <number of faces>} dicts,
        one per image.
    """
    face_data = []
    images = [i for i in os.listdir(image_dir) if i.endswith('.jpg')]
    for image in tqdm_notebook(images):
        img_file = os.path.join(image_dir, image)
        # Don't write annotations back over the source images
        faces = detect_faces(img_file, save_annotated=False)
        face_data.append({'image': image, 'faces': len(faces)})
    return face_data
In [ ]:
In [ ]:
# Convert to a dataframe and save as CSV
# NOTE(review): face_data is not defined in any visible cell at module level;
# presumably it should hold the per-image records built by detect_all() or
# download_and_detect() -- confirm and assign it before running this cell.
# Also note this rebinds `df`, replacing the Tribune images dataframe that
# download_and_detect() reads.
df = pd.DataFrame(face_data)
df.to_csv('faces_per_image.csv', index=False)
In [ ]:
# Write the dataframe to 'faces_per_image.csv' without the index column.
# This repeats the save performed in the previous cell.
df.to_csv('faces_per_image.csv', index=False)
In [ ]: