"""Scrape Flickr for tagged photos and detect faces in them with OpenCV.

The script first runs face detection serially on a single image, then
distributes the same work across IPython.parallel engines.

It must be run inside an IPython session (``get_ipython()`` is used for the
``%matplotlib`` and ``%%px`` magics) with a running IPython.parallel cluster
and the legacy OpenCV ``cv`` bindings installed.
"""

# Equivalent of the ``%matplotlib inline`` line magic, spelled as plain Python
# so this file stays syntactically valid outside a notebook cell.
get_ipython().run_line_magic('matplotlib', 'inline')

import glob
import os
import re
import sys
import time
import urllib

import cv
import matplotlib.pyplot as plt
import numpy as np
from IPython import parallel

# Matches the ``src`` attribute of Flickr-hosted JPEGs on a search page.
# Dots are escaped so they only match literal dots in the host name and the
# file extension.
_IMAGE_URL_RE = re.compile(
    r'src="(http://farm\d+\.static\.?flickr\.com/\d+/\d+_\w+\.jpg)"',
    re.IGNORECASE)


def extract_urls(html):
    """Extract image URLs from a page.

    Returns a (possibly empty) list of URL strings found in ``html``.
    """
    return _IMAGE_URL_RE.findall(html)


def urls_for_tag(tag='face', min_images=100, max_pages=20):
    """Get URLs of Flickr images with the given tag(s).

    Scrapes the Flickr search pages one at a time until at least
    ``min_images`` URLs have been collected, ``max_pages`` pages have been
    fetched, or a page yields no image URLs.

    Returns the list of collected image URLs (may exceed ``min_images``).
    """
    urls = []
    page = 1
    while len(urls) < min_images and page <= max_pages:
        url = ('http://www.flickr.com/search/?q=%s&l=cc&ss=0&ct=0&mt=photos'
               '&w=all&adv=1&m=tags&page=%i' % (tag, page))
        print("fetching %s" % url)
        urlfile = urllib.urlopen(url)
        try:
            html = urlfile.read()
        finally:
            # Release the connection even if the read fails.
            urlfile.close()
        page_urls = extract_urls(html)
        urls.extend(page_urls)
        print("found %i images" % len(urls))
        if not page_urls:
            print("no new images")
            break
        page += 1
    return urls


urls = urls_for_tag('portrait', 500)


def download_image(url, dest_dir='images'):
    """Download an image from a URL into a directory.

    The file is named after the last path component of ``url``. If a file
    with that name already exists in ``dest_dir`` it is not downloaded again.

    Returns the path to the (new or existing) local file.
    """
    # Imported inside the function because this function is also pushed to
    # the parallel engines and executed there.
    import os
    basename = url.rsplit('/', 1)[-1]
    dest = os.path.join(dest_dir, basename)
    # Several engines may run this at the same time, so create the directory
    # unconditionally and tolerate it already existing instead of checking
    # first (which would race).
    try:
        os.makedirs(dest_dir)
    except OSError:
        if not os.path.isdir(dest_dir):
            raise
    if os.path.exists(dest):
        print("already have %s" % dest)
        return dest
    print("downloading %s -> %s" % (url, dest))
    urlf = urllib.urlopen(url)
    try:
        data = urlf.read()
    finally:
        urlf.close()
    # JPEG data is binary: write in binary mode so no newline translation
    # can corrupt the file.
    with open(dest, 'wb') as f:
        f.write(data)
    return dest


HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"
# if you have opencv installed via homebrew, this would be in
# /usr/local/share/OpenCV/haarcascades/

storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)


def extract_faces(image, faces):
    """Return the faces in an image as a list of numpy arrays.

    ``image`` is an OpenCV image (its ``height``, ``width``, ``nChannels``
    and ``tostring()`` are used); ``faces`` is a sequence of
    ``(x, y, w, h)`` rectangles. One array slice is returned per rectangle.
    """
    # Imported inside the function because the engine-side setup code does
    # not import numpy, and this function is pushed to the engines.
    import numpy as np
    pixels = np.frombuffer(image.tostring(), dtype=np.uint8).reshape(
        (image.height, image.width, image.nChannels))
    # Reverse the channel axis (presumably BGR -> RGB for matplotlib;
    # confirm against the OpenCV loader in use).
    pixels = pixels[:, :, ::-1]
    return [pixels[y:y + h, x:x + w] for (x, y, w, h) in faces]


def detect_faces(filename):
    """Load an image into OpenCV and detect faces in it.

    Returns ``(filename, [list of numpy arrays])`` if at least one face is
    detected, and None if no faces are found.
    """
    image = cv.LoadImage(filename)
    # Positional arguments after ``storage``: scale factor, minimum
    # neighbors, flags, minimum object size (see the OpenCV legacy API).
    detected = cv.HaarDetectObjects(image, cascade, storage, 1.2, 2,
                                    cv.CV_HAAR_DO_CANNY_PRUNING, (100, 100))
    faces = []
    if detected:
        # Each detection is ((x, y, w, h), n); only the rectangle is kept.
        faces = [(x, y, w, h) for (x, y, w, h), _n in detected]
    if faces:
        return filename, extract_faces(image, faces)
    return None


def faces_in_url(url):
    """Detect faces in an image downloaded from a URL."""
    img_path = download_image(url)
    return detect_faces(img_path)


# Offline image source, option 1: thumbnails from a local iPhoto library.
library = os.path.expanduser("~/Pictures/2013.iphotolibrary")
pictures = []
for directory, subdirs, files in os.walk(os.path.join(library, 'Thumbnails')):
    for fname in files:
        if fname.endswith('.jpg'):
            pictures.append(os.path.join(directory, fname))

# Offline image source, option 2: this assignment replaces the list built
# above.
# NOTE(review): download_image() saves directly into images/, which this
# two-level pattern does not match -- confirm the intended directory layout.
pictures = glob.glob("images/*/*.jpg")

# Serial demo: show the faces from the first downloaded image that has any.
found = None
for url in urls:
    found = faces_in_url(url)
    if found:
        break
# Nothing to show if there were no URLs or no image contained a face.
if found:
    filename, faces = found
    for face in faces:
        plt.figure()
        plt.imshow(face)

# Same demo using the local pictures instead of downloads.
found = None
for p in pictures:
    found = detect_faces(p)
    if found:
        break
if found:
    filename, faces = found
    for face in faces:
        plt.figure()
        plt.imshow(face)

# Connect to the cluster: a direct view on all engines for setup, and a
# load-balanced view for distributing the detection tasks.
rc = parallel.Client()
all_engines = rc[:]
view = rc.load_balanced_view()

# Code run on the engines via the ``%%px`` cell magic: it gives each engine
# its own imports, cascade and storage, which the pushed functions look up
# as globals.
_ENGINE_SETUP = '''HAAR_CASCADE_PATH = "haarcascade_frontalface_default.xml"
import os, urllib
import cv
storage = cv.CreateMemStorage()
cascade = cv.Load(HAAR_CASCADE_PATH)
'''
get_ipython().run_cell_magic('px', '', _ENGINE_SETUP)

# faces_in_url itself is shipped with the map call below; the helpers it
# calls by name must already exist in each engine's namespace.
all_engines.push(dict(
    extract_faces=extract_faces,
    detect_faces=detect_faces,
    download_image=download_image,
))

tic = time.time()

# if you are running offline, do this one:
# f = detect_faces
# source = pictures

# or you can download each image as part of the task:
f = faces_in_url
source = urls

# ordered=False yields results as they complete rather than in input order.
amr = view.map_async(f, source[:1000], ordered=False)
nfound = 0
for r in amr:
    if not r:
        # No faces were detected in this image.
        continue
    filename, faces = r
    nfound += len(faces)
    print("%i faces found in %s" % (len(faces), filename))
    for face in faces:
        plt.imshow(face)
        plt.show()

toc = time.time()

print("found %i faces in %i images in %f s" % (nfound, len(amr), toc - tic))