#!/usr/bin/env python
# coding: utf-8

# # Some gun violence analysis with Wikipedia data

# As [requested by John Stokes](https://twitter.com/jonst0kes/status/282330530412888064),
# here are per-capita numbers for gun-related homicides,
# relating to GDP and total homicides,
# so the situation in the United States can be put in context relative to other nations.

# Main data source is UNODC (via Wikipedia [here](http://en.wikipedia.org/wiki/List_of_countries_by_intentional_homicide_rate)
# and [here](http://en.wikipedia.org/wiki/List_of_countries_by_firearm-related_death_rate)).
# 
# GDP data from World Bank, again [via Wikipedia](http://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28PPP%29_per_capita).
# 
# If the numbers on Wikipedia are inaccurate, or their relationship is not sound
# (e.g. numbers taken from different years, during which significant change occurred)
# then obviously none of this analysis is valid.
# 
# To summarize the data,
# every possible way you look at it the US is lousy at preventing gun violence.
# Even when compared to significantly more violent places,
# gun violence in the US is a serious problem,
# and when compared to similarly wealthy places,
# the US is an outstanding disaster.

# **UPDATE:** the relationship of the gun data and totals does not seem to be valid.
# [FBI data](http://www2.fbi.gov/ucr/cius2009/offenses/violent_crime/index.html) suggests that
# the relative contribution of guns to homicides in the US is 47%,
# but relating these two data sources gives 80%.
# Internal comparisons should still be fine, but 'fraction' analysis has been stricken.
# 
# **UPDATE:** this is an updated version of [the original notebook](http://nbviewer.ipython.org/gist/minrk/4358066/Gun%20Data.ipynb), run with data provided by Haruo Kamioka.

# In[1]:


# Notebook-only setup: get_ipython is neither defined nor imported in this
# file, so this line relies on the IPython/Jupyter session providing it.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
# NOTE(review): `sns` is not referenced by name in the visible code —
# presumably imported for its effect on plot styling; confirm before removing.
import seaborn as sns

from IPython.display import display, set_matplotlib_formats
# Request the 'retina' figure format for inline plots.
set_matplotlib_formats('retina')

import pandas
# Use a display precision of 2 when pandas renders numbers.
pandas.set_option('display.precision', 2)


# Some utility functions for display

# In[2]:


def plot_percent(df, limit=10):
    """Plot the 'Gun Percent' column for the first `limit` rows of `df`.

    The y-axis is pinned to the 0-100 range, the figure is titled
    "% Gun Homicide", and it is shown immediately.
    """
    gun_share = df['Gun Percent']
    gun_share[:limit].plot()
    plt.title("% Gun Homicide")
    plt.ylim(0, 100)
    plt.show()


# In[3]:


def plot_percapita(df, limit=10):
    """Draw a stacked bar chart of homicide rates for the first rows of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Homicides' and 'Gun Homicides' columns.
    limit : int or None
        Number of leading rows to plot; None plots every row.

    The caller's frame is not modified; the figure is shown immediately.
    """
    # Plain column selection + .iloc replaces DataFrame.ix, which was removed
    # in pandas 1.0.  The explicit copy keeps the column edits below away
    # from the caller's frame.
    rates = df[['Homicides', 'Gun Homicides']].iloc[:limit].copy()
    # NOTE: despite its label, this column holds the remainder after gun
    # homicides are subtracted, so the full height of each stacked bar equals
    # the row's 'Homicides' value.
    rates['Total Homicides'] = rates['Homicides'] - rates['Gun Homicides']
    del rates['Homicides']
    # The sort_columns= argument was removed from DataFrame.plot in
    # pandas 2.0; order the columns explicitly to keep the same stacking.
    rates = rates[sorted(rates.columns)]
    rates.plot(kind='bar', stacked=True)
    plt.ylabel("per 100k")
    plt.show()


# In[4]:


def display_relevant(df, limit=10):
    """Display the homicide columns and source notes for the first rows of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Homicides', 'Gun Homicides' and 'Sources and notes'.
    limit : int or None
        Number of leading rows to show; None shows every row.
    """
    columns = ['Homicides', 'Gun Homicides', 'Sources and notes']
    # DataFrame.ix was removed in pandas 1.0; select the columns by label and
    # the leading rows by position instead.
    display(df[columns].iloc[:limit])


# Load the data

# In[5]:


# Each CSV is ';'-separated and indexed by its 'Country' column; the GDP file
# is additionally parsed with ',' as the thousands separator.
totals = pandas.read_csv('totals.csv', index_col='Country', sep=';')
guns = pandas.read_csv('guns.csv', index_col='Country', sep=';')
gdp = pandas.read_csv('gdp.csv', index_col='Country', sep=';', thousands=',')

# Give the columns unambiguous names:
gdp.columns = ['Rank', 'GDP', 'Year']
# pop() removes 'Homicides'; the assignment re-adds it as the last column
# under its new name.
guns['Gun Homicides'] = guns.pop('Homicides')
# 'Rate' stays in place; 'Homicides' is an additional copy of it.
totals['Homicides'] = totals['Rate']


# In[6]:


# Join the three tables on their shared index; GDP columns whose names clash
# with existing ones get a ' (GDP)' suffix.
data = totals.join(guns)
data = data.join(gdp, rsuffix=' (GDP)')
# Gun homicides as a percentage of all homicides.
data['Gun Percent'] = (100 * data['Gun Homicides']) / data['Homicides']
# Discard the columns that are not used, then every row with a missing value.
data = data.drop(['Unintentional', 'Undetermined', 'Suicides'], axis=1)
data = data.dropna()


# In[7]:


# Bare expression: as the last line of a notebook cell it renders the table;
# when this file is run as a plain script it has no visible effect.
data


# Of all sampled countries (Found data for 68 countries),
# the US is in the top 15 in Gun Homicides per capita.
# 
# Numbers are per 100k.

# In[8]:


# DataFrame.sort() was removed in pandas 0.20; sort_values() replaces it.
# Highest gun-homicide rate first.
data = data.sort_values("Gun Homicides", ascending=False)
display_relevant(data, 15)


# Take top 30 Countries by GDP

# In[9]:


# DataFrame.sort() was removed in pandas 0.20; sort_values() replaces it.
# The sort is ascending, so the last 30 rows are the highest-GDP countries.
top = data.sort_values('GDP').iloc[-30:]


# and rank them by Gun Homicides per capita:

# In[10]:


# DataFrame.sort() was removed in pandas 0.20; sort_values() replaces it.
top_by_guns = top.sort_values("Gun Homicides", ascending=False)
display_relevant(top_by_guns, 5)
plot_percapita(top_by_guns, 10)


# **NOTE:** these bar graphs should not be interpreted as fractions of a total,
# as the two data sources do not appear to be comparable.
# But the red and blue bar graphs should still be internally comparable.

# The US is easily #1 of 30 wealthiest countries in Gun Homicides per capita,
# by a factor of 4:1

# Adding USA, Canada, and Mexico to all of Europe,
# USA is a strong #2 behind Mexico in total gun homicides per-capita

# In[11]:


# sys is required for the stdout flush below and is not imported anywhere
# else in the visible file (the original cell raised NameError).
import sys

# Boolean row mask: every country whose Region is 'Europe', plus the three
# named North American countries.
index = (data['Region'] == 'Europe') | data.index.isin(
    ['United States', 'Canada', 'Mexico']
)
selected = data[index]

print("By Total Gun Homicides")
# Presumably flushed so the heading is emitted before the figure is drawn.
sys.stdout.flush()

# DataFrame.sort() was removed in pandas 0.20; sort_values() replaces it.
by_guns = selected.sort_values("Gun Homicides", ascending=False)
plot_percapita(by_guns, limit=25)
# limit=None shows every selected row.
display_relevant(selected, limit=None)


# Let's just compare US, Canada, and UK:

# In[12]:


# .loc replaces DataFrame.ix (removed in pandas 1.0) for label-based row
# selection.  The explicit copy makes `select` an independent frame, because
# later cells assign to its columns.
select = data.loc[['United States', 'Canada', 'United Kingdom']].copy()
plot_percapita(select)


# Normalize to the US numbers (inverse)

# In[13]:


# Replace each rate with (US rate / that country's rate), so the
# 'United States' row becomes the reference value.
select['Homicides'] = select.loc['United States', 'Homicides'] / select['Homicides']
select['Gun Homicides'] = select.loc['United States', 'Gun Homicides'] / select['Gun Homicides']
display_relevant(select)


# In[14]:


# NOTE(review): these statements are textually identical to the previous cell
# (In[13]).  After that cell each column's 'United States' entry is US/US, so
# applying the same division again turns every value into its reciprocal
# (country rate / US rate).  The prose that follows quotes "times more likely"
# ratios — confirm whether this repeat is intentional.
select['Homicides'] = select['Homicides']['United States'] / select['Homicides']
select['Gun Homicides'] = select['Gun Homicides']['United States'] / select['Gun Homicides']
display_relevant(select)


# So, you are 2.6 times more likely to be killed in the US than Canada,
# and 3.5 times more likely than in the UK.
# That's bad, but not extreme.
# 
# However, you are 4.9 times more likely to be killed *with a gun* in the US than Canada,
# and almost 100 times more likely than in the UK.  That is pretty extreme.
# 

# Countries represented:

# In[15]:


# One line per country remaining in `data` after the joins and dropna().
for country in list(data.index):
    print(country)