#!/usr/bin/env python
# coding: utf-8

# # Stacked bar charts

# We've only been looking at the largest cities by population, but we have a whole bunch of other data we can use.

# In[68]:


import matplotlib.pyplot as plt
import pandas

# Load the city table, using the "AccentCity" column as the row index,
# then preview the first five rows.
df = pandas.read_csv("US_cities.csv", index_col="AccentCity")
df.head(5)


# We have over 4000 cities and towns in the database, definitely too many for a bar chart. But we could look at the population data at the state level. First, let's try just a simple groupby in pandas. 

# In[86]:


# Total population per region: group the rows by "Region" and sum the
# "Population" column, keeping the result as a one-column DataFrame.
states_pop = df.groupby(by=["Region"])[["Population"]].sum()
states_pop.head(5)


# In[70]:


# One bar per region showing its summed population.
states_pop.plot(kind="bar")
plt.show()


# Let's make a stacked bar chart that shows how much each state's three largest cities contribute to its population compared to the rest of the state. First, we need to identify the three largest cities in each state. To do that, we'll use the pandas groupby method.

# In[71]:


# Boolean mask over df's rows: True where a city's population equals the
# maximum population within its region. transform() broadcasts each region's
# maximum back onto every row of that region, so the comparison is row-aligned.
# The string alias "max" is used instead of the builtin max: passing the
# builtin is deprecated in recent pandas and emits a FutureWarning.
# NOTE: cities tied for the regional maximum are all flagged.
largest_cities_idx = (
    df.groupby(by=["Region"], sort=False)["Population"].transform("max")
    == df["Population"]
)
df[largest_cities_idx][:5]


# In[72]:


def max2(cities):
    """Return the second-largest value in ``cities``, or None if it has fewer than 2."""
    return nth_largest(cities, 2)


def max3(cities):
    """Return the third-largest value in ``cities``, or None if it has fewer than 3."""
    return nth_largest(cities, 3)


def nth_largest(cities, n):
    """Return the n-th largest value of a pandas Series.

    Args:
        cities: Series of comparable values (here, city populations).
        n: 1-based rank to fetch (1 = largest, 2 = second largest, ...).

    Returns:
        The n-th largest value, or None when the Series holds fewer than
        ``n`` values.
    """
    top_n = cities.nlargest(n)
    if len(top_n) < n:
        return None
    # Select by position. ``top_n[n - 1]`` is a *label* lookup: on an
    # integer-indexed Series it picks the wrong element, and on any other
    # index it relies on a deprecated positional fallback.
    return top_n.iloc[n - 1]
# Row masks for the runner-up cities: a row is flagged when its population
# equals the second (resp. third) largest population found in its region.
second_largest_idx = df["Population"].eq(
    df.groupby(by=["Region"], sort=False)["Population"].transform(max2)
)
third_largest_idx = df["Population"].eq(
    df.groupby(by=["Region"], sort=False)["Population"].transform(max3)
)
df.loc[second_largest_idx].head(5)


# Now we just need to sum the rest of the cities that we haven't counted yet.

# In[88]:


# Every city that is not one of its region's three largest.
smaller_cities = df[
    ~(largest_cities_idx | second_largest_idx | third_largest_idx)
].copy()

# Assemble one column per bar segment, indexed by region, so that a single
# plot call can stack them. Plotting each group separately (as before) draws
# independent figures, where stacked=True has nothing to stack.
stacked_pop = pandas.DataFrame({
    "Largest city": df[largest_cities_idx].groupby("Region")["Population"].sum(),
    "Second largest city": df[second_largest_idx].groupby("Region")["Population"].sum(),
    "Third largest city": df[third_largest_idx].groupby("Region")["Population"].sum(),
    "Rest of region": smaller_cities.groupby("Region")["Population"].sum(),
}).fillna(0)  # regions lacking a 2nd/3rd city or any remainder contribute 0

stacked_pop.plot.bar(stacked=True)
plt.show()


# In[89]:


# Notebook cell output: evaluates to the full DataFrame so the notebook renders
# it. As a bare expression it has no visible effect when run as a script.
df