%matplotlib inline
import matplotlib.pyplot as plt
import sys, os, re, time
import urllib
import numpy as np
from IPython import parallel
This Flickr parsing code is adapted from here.
def extract_urls(html):
    """Extract image URLs from a page."""
    re_imageurl = re.compile(r'src="(http://farm\d+\.static\.?flickr\.com/\d+/\d+_\w+\.jpg)"', re.IGNORECASE|re.DOTALL)
    urls = re_imageurl.findall(html)
    if len(urls) == 0:
        return []
    return urls
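As a quick sanity check, we can run extract_urls on a tiny HTML snippet; the image URL below is a made-up example in the old farmN.static.flickr.com format, just to exercise the regex:
sample_html = '<img src="http://farm1.static.flickr.com/1234/5678_abcdef.jpg" />'
print extract_urls(sample_html)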
def urls_for_tag(tag='face', min_images=100, max_pages=20):
    """get urls to flickr images with given tag(s)

    scrapes the flickr search page
    """
    urls = []
    page = 1
    while len(urls) < min_images and page <= max_pages:
        url = 'http://www.flickr.com/search/?q=%s&l=cc&ss=0&ct=0&mt=photos&w=all&adv=1&m=tags&page=%i' % (tag, page)
        print "fetching %s" % url
        urlfile = urllib.urlopen(url)
        html = urlfile.read()
        urlfile.close()
        page_urls = extract_urls(html)
        urls.extend(page_urls)
        print "found %i images" % len(urls)
        if not len(page_urls):
            print "no new images"
            break
        page += 1
    return urls
urls = urls_for_tag('portrait', 500)
def download_image(url, dest_dir='images'):
    """download an image from a url into a directory

    returns the path to the downloaded image.
    """
    import os
    basename = url.rsplit('/', 1)[-1]
    dest = os.path.join(dest_dir, basename)
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    if os.path.exists(dest):
        print "already have %s" % dest
        return dest
    print "downloading %s -> %s" % (url, dest)
    urlf = urllib.urlopen(url)
    data = urlf.read()
    urlf.close()
    # write in binary mode, since this is image data
    with open(dest, 'wb') as f:
        f.write(data)
    return dest
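Assuming the scrape above returned at least one URL, a quick check that the download and its caching behave as expected:
path = download_image(urls[0])
print path
print download_image(urls[0])  # the second call should report it already has the file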
First, initialize OpenCV for simple facial detection
HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"
# if you have opencv installed via homebrew, this would be in
# /usr/local/share/OpenCV/haarcascades/
import cv
storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)
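If cv.Load fails because the cascade file isn't sitting next to the notebook, one option is to copy it in and re-run the cell above. The source path here is an assumption that depends on how OpenCV was installed (the Homebrew location from the comment is shown):
import shutil
if not os.path.exists(HAAR_CASCADE_PATH):
    # adjust this to wherever your OpenCV install keeps its haarcascades
    shutil.copy(os.path.join("/usr/local/share/OpenCV/haarcascades", HAAR_CASCADE_PATH), ".")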
Then define a few functions for extracting faces from images
def extract_faces(image, faces):
    """Returns any faces in an image as a list of numpy arrays"""
    import numpy as np
    # view the OpenCV image as a numpy array, and flip BGR -> RGB for matplotlib
    A = np.frombuffer(image.tostring(), dtype=np.uint8).reshape((image.height, image.width, image.nChannels))
    A = A[:,:,::-1]
    face_arrays = []
    for face in faces:
        # face is an (x, y, w, h) rectangle: slice out rows, then columns
        Aface = A[face[1]:face[1]+face[3], face[0]:face[0]+face[2]]
        face_arrays.append(Aface)
    return face_arrays
def detect_faces(filename):
    """Loads an image into OpenCV, and detects faces

    returns None if no faces are found,
    (filename, [list of numpy arrays]) if there are faces
    """
    image = cv.LoadImage(filename)
    faces = []
    detected = cv.HaarDetectObjects(image, cascade, storage, 1.2, 2, cv.CV_HAAR_DO_CANNY_PRUNING, (100,100))
    if detected:
        for (x,y,w,h),n in detected:
            faces.append((x,y,w,h))
    if faces:
        return filename, extract_faces(image, faces)
And finally, a two-step function that downloads an image from a URL and detects any faces in it.
def faces_in_url(url):
    """detect faces in an image downloaded from a url"""
    img_path = download_image(url)
    return detect_faces(img_path)
If the network doesn't work, you can just generate a list of paths to images on your computer. For instance, these pictures are everything in my iPhoto thumbnails directory, so they vary from roughly 320x240 to 1024x768.
import glob
library = os.path.expanduser("~/Pictures/2013.iphotolibrary")
pictures = []
for directory, subdirs, files in os.walk(os.path.join(library, 'Thumbnails')):
    for fname in files:
        if fname.endswith('.jpg'):
            pictures.append(os.path.join(directory, fname))
Or this one, which globs pictures from a particular folder:
import glob
pictures = glob.glob("images/*/*.jpg")
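Either way, it's worth checking that we actually found some files before going further:
print "%i pictures" % len(pictures)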
Let's test our face detection on the first URL that turns out to contain a face:
for url in urls:
    found = faces_in_url(url)
    if found:
        break

filename, faces = found
for face in faces:
    plt.figure()
    plt.imshow(face)
If the network isn't kind to you, we can skip the downloads, and just use pictures we have on the filesystem:
for p in pictures:
    found = detect_faces(p)
    if found:
        break

filename, faces = found
for face in faces:
    plt.figure()
    plt.imshow(face)
Hey, that looks like a face!
First, we connect our parallel Client
rc = parallel.Client()
all_engines = rc[:]
view = rc.load_balanced_view()
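This assumes a cluster is already running (e.g. started with ipcluster start -n 4 in a terminal). A quick way to confirm the engines are up is to count the ids the Client sees:
print "%i engines connected" % len(rc.ids)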
Then we initialize OpenCV on all of the engines (identical to what we did above)
%%px
# move the engines into the notebook's directory, so the relative cascade path below resolves
%cd notebooks/parallel
%%px
HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"
import os, urllib
import cv
storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)
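To make sure the initialization really happened on the engines (and not just locally), we can ask each one whether it found the cascade file:
%px print os.path.exists(HAAR_CASCADE_PATH)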
and make sure extract_faces and the other helper functions are defined everywhere
all_engines.push(dict(
    extract_faces=extract_faces,
    detect_faces=detect_faces,
    download_image=download_image,
))
Now we can iterate through all of our pictures, and detect and display any faces we find
tic = time.time()

# if you are running offline, do this one:
# f = detect_faces
# source = pictures
# or you can download each image as part of the task:
f = faces_in_url
source = urls

amr = view.map_async(f, source[:1000], ordered=False)
nfound = 0
for r in amr:
    if not r:
        continue
    filename, faces = r
    nfound += len(faces)
    print "%i faces found in %s" % (len(faces), filename)
    for face in faces:
        plt.imshow(face)
        plt.show()

toc = time.time()
print "found %i faces in %i images in %f s" % (nfound, len(amr), toc-tic)