# Outdated -- use fbi_crime_data instead.
# This notebook scrapes US crime data from http://www.disastercenter.com/crime/
# and merges it into a single CSV file.
import requests
from fnmatch import fnmatch
import re
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import numpy as np
# Widen pandas' console output so wide frames print without truncation.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 50)

# Header names for the scraped crime tables (each data row has 12 cells).
columns = [
    'Year', 'Population', 'Index', 'Violent', 'Property', 'Murder',
    'Rape', 'Robbery', 'Assault', 'Burglary', 'Larceny', 'Vehicle',
]
# Gather the URLs for each state's page.
# Virginia's page links to every other state's page; scrape it for those
# links, then key each URL by the state's two-letter code, which is the
# part of the filename before 'crim' (e.g. 'vacrime.htm' -> 'va').
index_html = requests.get('http://www.disastercenter.com/crime/vacrime.htm').text
dom = BeautifulSoup(index_html, "html5lib")
state_links = []
for anchor in dom.findAll('a'):
    if fnmatch(anchor['href'], 'http://www.disastercenter.com/crime/*.htm'):
        state_links.append(anchor['href'])
urls = {link.split('/')[-1].split('crim')[0]: link for link in state_links}
def number(element):
    """Parse the numeric value out of a table cell.

    The site wraps cell text in either a <font> or a <small> tag.  Strips
    thousands separators, non-breaking spaces, and footnote markers
    (``*`` and ``'``) before converting to float.

    Returns NaN for cells with no parseable number (missing tag, empty
    contents, or non-numeric text).
    """
    try:
        # Prefer the <font> wrapper; fall back to <small>.
        node = 'font' if element.find('font') else 'small'
        d = element.find(node).contents[0]
        d = d.replace(',', '').replace(u'\xa0', '').replace('*', '').replace("'", '')
        return float(d)
    except (AttributeError, IndexError, TypeError, ValueError):
        # Narrowed from a bare except: missing tag (AttributeError on
        # None.contents), empty contents (IndexError), non-string cell
        # (TypeError), or non-numeric text (ValueError).
        return np.nan
def crime_stats(url):
    """Scrape one state's crime-rate table into a DataFrame.

    Locates the table that follows the heading containing '100,000',
    keeps only rows with exactly 12 <td> cells, drops the first two
    (header) rows, and parses every cell with number().

    Returns a DataFrame with the module-level `columns` headers.
    """
    dom = BeautifulSoup(requests.get(url).text, "html5lib")
    table = dom.find('center', text=re.compile('100,000')).find_next('table')
    # list() because under Python 3 filter/map return one-shot iterators,
    # so rows[2:] would raise TypeError; harmless under Python 2.
    rows = list(filter(lambda x: len(x.findAll('td')) == 12, table.find_all('tr')))
    data = [list(map(number, r.findAll('td'))) for r in rows[2:]]
    return pd.DataFrame(data, columns=columns)
states = {}
for state, url in urls.items():
print state,
if state in ['oh']: # Ohio is a malformed table. Ignore it.
continue
states[state] = crime_stats(url)
# Sample output of the progress print above:
# va co ca al ar vt il ga in ia az id ct nh nj nm tx la nc nd ne tn ny pa ak nv wa de dc wi wv hi ok fl wy me md ma oh ut mo mn mi kn us ri mt ms sc ky or sd
# Tag each frame with its state code so rows stay identifiable after the
# merge, then concatenate everything and write the combined CSV.
for code in states:
    states[code]['state'] = code
data = pd.concat(states.values(), ignore_index=True)
data.to_csv('crime.csv', index=False)